"""Build rolling-mean features from electric-field parquet files.

Pipeline:
1. Load every parquet file matching ``INPUT_GLOB`` and merge them.
2. Put the samples on a gap-free one-second time grid and linearly
   interpolate the electric-field value over the gaps.
3. For every window in ``window_sizes`` compute the trailing rolling mean.
4. Write the feature table to ``save_path``.
"""
import glob
import os

import numpy as np
import pandas as pd

# Glob pattern of the input files. Adjust to the actual data location.
INPUT_GLOB: str = "./data/ef/212/212_*.parquet"

# Rolling-window lengths, in samples (one sample per second after reindexing).
window_sizes: list[int] = [2, 5, 10, 30, 60, 300, 600, 1200]

# Destination of the feature table.
save_path: str = "./data/preprocessed_ef/ef_212.parquet"


def load_ef_series(pattern: str) -> pd.DataFrame:
    """Load, merge and regularise the electric-field samples.

    Args:
        pattern: Glob pattern selecting the input parquet files.

    Returns:
        A DataFrame with a ``time`` column holding one row per second between
        the first and last sample, and an ``ef`` column whose gaps have been
        filled by linear interpolation.

    Raises:
        FileNotFoundError: If no file matches ``pattern``.
    """
    # glob order is OS-dependent; sort so that "keep the first duplicate"
    # below always picks the same row for the same set of files.
    paths = sorted(glob.glob(pattern))
    if not paths:
        raise FileNotFoundError(f"No parquet files match {pattern!r}")

    df = pd.concat([pd.read_parquet(path) for path in paths], ignore_index=True)

    # Rename the columns and convert the timestamps.
    df = df.rename(columns={"HappenTime": "time", "ElectricField": "ef"})
    df["time"] = pd.to_datetime(df["time"])

    # A stable sort keeps rows with equal timestamps in load order, which
    # makes keep="first" deterministic.
    df = df.sort_values("time", kind="mergesort").drop_duplicates(
        subset="time", keep="first"
    )

    # Continuous per-second index from the first to the last sample.
    # NOTE: reindex keeps only rows whose timestamp lies exactly on this grid.
    full_time_index = pd.date_range(
        start=df["time"].min(), end=df["time"].max(), freq="s"
    )
    df = df.set_index("time").reindex(full_time_index).rename_axis("time")

    # Fill the NaN values introduced by reindex.
    df["ef"] = df["ef"].interpolate(method="linear", limit_direction="both")
    return df.reset_index()


def rolling_window(a: np.ndarray, window: int) -> np.ndarray:
    """Return every length-``window`` slice of a 1-D array as one row.

    Args:
        a: One-dimensional input array (any array-like is accepted).
        window: Window length; must satisfy ``1 <= window <= len(a)``.

    Returns:
        A read-only view of shape ``(len(a) - window + 1, window)``.

    Raises:
        ValueError: If ``a`` is not one-dimensional or ``window`` is out of
            range.
    """
    arr = np.asarray(a)
    if arr.ndim != 1:
        raise ValueError(f"expected a 1-D array, got {arr.ndim} dimensions")
    if window < 1 or window > arr.size:
        raise ValueError(
            f"window must be between 1 and {arr.size}, got {window}"
        )
    # sliding_window_view validates the shape and returns a non-writable
    # view, unlike a hand-rolled as_strided call.
    return np.lib.stride_tricks.sliding_window_view(arr, window)


def pad_result(res: np.ndarray, win: int) -> np.ndarray:
    """Left-pad ``res`` with NaN so it is as long as the original series.

    Args:
        res: Per-window results (one value per complete window).
        win: Window length that produced ``res``.

    Returns:
        ``res`` preceded by ``win - 1`` NaN values.
    """
    return np.concatenate([np.full(win - 1, np.nan), res])


def rolling_mean(ef: np.ndarray, win: int) -> np.ndarray:
    """Compute the trailing rolling mean of ``ef`` over ``win`` samples.

    Args:
        ef: One-dimensional series of values.
        win: Window length in samples.

    Returns:
        An array as long as ``ef``. The first ``win - 1`` entries are NaN; if
        the series is shorter than ``win`` every entry is NaN.
    """
    values = np.asarray(ef)
    if values.size < win:
        # Too little data for even one complete window.
        return np.full(values.size, np.nan)
    return pad_result(rolling_window(values, win).mean(axis=1), win)


def compute_mean_features(df: pd.DataFrame, windows: list[int]) -> pd.DataFrame:
    """Build the rolling-mean feature table.

    Args:
        df: Frame with ``time`` and ``ef`` columns.
        windows: Window lengths, in samples.

    Returns:
        A DataFrame with the ``time`` column of ``df`` plus one
        ``mean_{win}s`` column per requested window.
    """
    features = pd.DataFrame({"time": df["time"]})
    ef = df["ef"].to_numpy()
    for win in windows:
        features[f"mean_{win}s"] = rolling_mean(ef, win)
    return features


def main() -> None:
    """Run the whole pipeline: load, compute features, save, report."""
    df = load_ef_series(INPUT_GLOB)
    stat_features_all = compute_mean_features(df, window_sizes)

    # os.makedirs("") raises, so only create a directory when there is one.
    out_dir = os.path.dirname(save_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    stat_features_all.to_parquet(save_path, index=True)

    print(f"Saved to \n{save_path}")
    print(stat_features_all.head())


if __name__ == "__main__":
    main()