Files
very_short_lightning/1_get_ef_data_bak.py
2025-07-28 11:08:04 +08:00

86 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import os
import glob
import numpy as np
# --- 1. Data loading and preprocessing ---
# Load and merge every Parquet file matching the pattern below.
# NOTE: adjust the pattern to the real data location; this path is only an example.
# The paths are sorted so the concatenation order (and therefore which row
# survives de-duplication below) does not depend on filesystem listing order.
parquet_paths: list[str] = sorted(glob.glob("./data/ef/212/212_*.parquet"))
if not parquet_paths:
    # Without this guard pd.concat fails with an opaque
    # "No objects to concatenate" error.
    raise FileNotFoundError("No Parquet files match ./data/ef/212/212_*.parquet")
df_list: list[pd.DataFrame] = [pd.read_parquet(path) for path in parquet_paths]
df: pd.DataFrame = pd.concat(df_list, ignore_index=True)
# Rename the columns and parse the timestamps.
df = df.rename(columns={"HappenTime": "time", "ElectricField": "ef"})
df["time"] = pd.to_datetime(df["time"])
# A stable sort keeps rows with equal timestamps in load order, so
# keep='first' deterministically retains the earliest-loaded row.
df = df.sort_values('time', kind='mergesort').drop_duplicates(subset='time', keep='first')
# Build a continuous one-second index from the first to the last timestamp.
full_time_index: pd.DatetimeIndex = pd.date_range(start=df["time"].min(), end=df["time"].max(), freq="s")
# NOTE(review): reindex keeps only rows whose timestamp lands exactly on this
# grid (anchored at the earliest timestamp); confirm the source timestamps are
# second-aligned, otherwise off-grid samples are dropped here.
df = df.set_index("time").reindex(full_time_index).rename_axis("time")
# Fix 1: fill the NaN values introduced by reindex (missing seconds) using
# linear interpolation; limit_direction='both' also fills leading/trailing gaps.
df['ef'] = df['ef'].interpolate(method='linear', limit_direction='both')
df = df.reset_index()
# --- 2. Statistical feature extraction ---
# Rolling-window lengths, in samples. The series was reindexed to one sample
# per second above, so each value is also the window length in seconds (the
# feature columns are named mean_<win>s accordingly).
window_sizes: list[int] = [2, 5, 10, 30, 60, 300, 600, 1200]
def rolling_window(a: np.ndarray, window: int) -> np.ndarray:
    """Return every complete sliding window of length ``window`` over a 1-D array.

    Args:
        a: One-dimensional input array.
        window: Number of consecutive elements in each window (must be >= 1).

    Returns:
        A read-only 2-D view of shape ``(a.size - window + 1, window)`` whose
        row ``i`` equals ``a[i:i + window]``; no element data is copied.
        When ``window`` is larger than ``a.size`` there are no complete
        windows and an empty array of shape ``(0, window)`` is returned.

    Raises:
        ValueError: If ``a`` is not one-dimensional or ``window`` is < 1.
    """
    a = np.asarray(a)
    if a.ndim != 1:
        # The previous as_strided implementation silently built windows that
        # ran past the end of the buffer for multi-dimensional input.
        raise ValueError(f"rolling_window expects a 1-D array, got {a.ndim}-D")
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    if window > a.size:
        return np.empty((0, window), dtype=a.dtype)
    # sliding_window_view validates its arguments and returns a read-only
    # view, unlike a hand-built as_strided call.
    return np.lib.stride_tricks.sliding_window_view(a, window)
def pad_result(res: np.ndarray, win: int) -> np.ndarray:
    """Left-pad a rolling-window result with NaN.

    A window of length ``win`` yields ``win - 1`` fewer values than the
    input series, so that many NaNs are prepended to restore the original
    length.

    Args:
        res: Per-window result array.
        win: Window size that produced ``res``.

    Returns:
        ``res`` preceded by ``win - 1`` NaN values.
    """
    leading_nans = np.full(win - 1, np.nan)
    return np.concatenate((leading_nans, res))
# Start the feature table with just the timestamp column.
stat_features_all: pd.DataFrame = pd.DataFrame({'time': df['time']})
ef: np.ndarray = df['ef'].values
sample_count: int = len(ef)
# Only the rolling mean is computed, one column per window size.
for win in window_sizes:
    column_name: str = f'mean_{win}s'
    if sample_count >= win:
        # Mean over each window, left-padded with NaN back to full length.
        window_means: np.ndarray = rolling_window(ef, win).mean(axis=1)
        stat_features_all[column_name] = pad_result(window_means, win)
    else:
        # Series is shorter than the window: the whole column is NaN.
        stat_features_all[column_name] = np.full(sample_count, np.nan)
# Persist the features, creating the output directory if needed.
save_path: str = "./data/preprocessed_ef/ef_212.parquet"
output_dir: str = os.path.dirname(save_path)
os.makedirs(output_dir, exist_ok=True)
stat_features_all.to_parquet(save_path, index=True)
print(f"Saved to {save_path}")
print(stat_features_all.head())