86 lines
2.5 KiB
Python
86 lines
2.5 KiB
Python
import pandas as pd
|
||
import os
|
||
import glob
|
||
import numpy as np
|
||
|
||
# --- 1. Data loading and preprocessing ---

# Load and merge the Parquet files.
# Make sure this glob pattern matches your actual data files; the path below
# is only an example.
# The matches are sorted because glob returns them in arbitrary order, and the
# concatenation order decides which row wins when timestamps are duplicated.
parquet_pattern: str = "./data/ef/212/212_*.parquet"
parquet_paths: list[str] = sorted(glob.glob(parquet_pattern))
if not parquet_paths:
    # pd.concat([]) would fail with the opaque "No objects to concatenate".
    raise FileNotFoundError(f"No parquet files match {parquet_pattern!r}")

df_list: list[pd.DataFrame] = [pd.read_parquet(path) for path in parquet_paths]
df: pd.DataFrame = pd.concat(df_list, ignore_index=True)

# Rename the columns and convert the timestamps to datetime.
df = df.rename(columns={"HappenTime": "time", "ElectricField": "ef"})
df["time"] = pd.to_datetime(df["time"])
# mergesort is stable: rows sharing a timestamp keep their concatenation
# order, so keep='first' always retains the same row on every run.
df = df.sort_values('time', kind='mergesort').drop_duplicates(subset='time', keep='first')

# Build a continuous one-second time index spanning the first to last sample.
full_time_index: pd.DatetimeIndex = pd.date_range(start=df["time"].min(), end=df["time"].max(), freq="s")
# NOTE(review): reindex keeps only rows whose timestamp lies exactly on this
# one-second grid; confirm the source timestamps are whole-second aligned.
df = df.set_index("time").reindex(full_time_index).rename_axis("time")

# --- Fix 1: handle NaN (missing) values ---
# Fill the gaps that reindex introduced in 'ef' by linear interpolation;
# limit_direction='both' also fills leading and trailing NaNs.
# Only the 'ef' column is filled; any other columns keep their NaNs.
df['ef'] = df['ef'].interpolate(method='linear', limit_direction='both')
df = df.reset_index()
|
||
|
||
# --- 2. Statistical feature extraction ---

# Rolling-window lengths, in samples. The series is resampled to one sample
# per second above, so each length is also a duration in seconds.
window_sizes: list[int] = [
    2,
    5,
    10,
    30,
    60,
    300,
    600,
    1200,
]
|
||
|
||
|
||
def rolling_window(a: np.ndarray, window: int) -> np.ndarray:
    """Return a read-only 2-D view of every length-``window`` slice of ``a``.

    Row ``i`` of the result is ``a[i:i + window]``. No data is copied: the
    rows overlap in memory, which is why the view is returned read-only
    (writing through an overlapping view would silently change several rows).

    Args:
        a: One-dimensional input array.
        window: Number of consecutive elements per row; must satisfy
            ``1 <= window <= a.size``.

    Returns:
        A view of shape ``(a.size - window + 1, window)``.

    Raises:
        ValueError: If ``a`` is not one-dimensional or ``window`` is outside
            ``1..a.size``.
    """
    a = np.asarray(a)
    if a.ndim != 1:
        raise ValueError(f"rolling_window expects a 1-D array, got {a.ndim}-D")
    if not 1 <= window <= a.size:
        raise ValueError(
            f"window must be between 1 and {a.size}, got {window}"
        )
    # sliding_window_view is NumPy's safe wrapper for this pattern; unlike a
    # raw as_strided call it validates its arguments and defaults to a
    # non-writeable result.
    return np.lib.stride_tricks.sliding_window_view(a, window)
|
||
|
||
|
||
def pad_result(res: np.ndarray, win: int) -> np.ndarray:
    """Left-pad a rolling-window result with NaN.

    A rolling computation over a window of ``win`` elements yields
    ``win - 1`` fewer values than its input; prepending that many NaNs makes
    the result the same length as the original data.

    Args:
        res: Result array to pad.
        win: Window size used to produce ``res``.

    Returns:
        A new array consisting of ``win - 1`` NaNs followed by ``res``.
    """
    leading_nans = np.full(shape=win - 1, fill_value=np.nan)
    return np.concatenate((leading_nans, res))
|
||
|
||
|
||
# Initialise the feature table with the time column only.
stat_features_all: pd.DataFrame = pd.DataFrame({'time': df['time']})
ef: np.ndarray = df['ef'].values

# Only the rolling mean is computed, one column per window size.
for win in window_sizes:
    column_name = f'mean_{win}s'
    if win <= len(ef):
        # Mean over each window, left-padded with NaN to the original length.
        windows = rolling_window(ef, win)
        stat_features_all[column_name] = pad_result(windows.mean(axis=1), win)
    else:
        # The series is shorter than the window: the whole column is NaN.
        stat_features_all[column_name] = np.full(len(ef), np.nan)

# Save the result, creating the output directory if needed.
save_path: str = "./data/preprocessed_ef/ef_212.parquet"
output_dir = os.path.dirname(save_path)
os.makedirs(output_dir, exist_ok=True)
stat_features_all.to_parquet(save_path, index=True)
print(f"Saved to {save_path}")
print(stat_features_all.head())
|