This commit is contained in:
lvhao
2025-07-28 11:08:04 +08:00
parent 07ab95ff51
commit 47a6cc00e7
11 changed files with 1470 additions and 0 deletions

85
1_get_ef_data_bak.py Normal file
View File

@@ -0,0 +1,85 @@
import pandas as pd
import os
import glob
import numpy as np
# --- 1. Data loading & preprocessing ---
# Load and merge all Parquet shards for station 212.
# NOTE: adjust the glob pattern to your actual file locations.
paths: list[str] = glob.glob("./data/ef/212/212_*.parquet")
if not paths:
    # Fail fast with a clear message instead of pd.concat's opaque
    # "No objects to concatenate" ValueError.
    raise FileNotFoundError("No parquet files matched ./data/ef/212/212_*.parquet")
df_list: list[pd.DataFrame] = [pd.read_parquet(p) for p in paths]
df: pd.DataFrame = pd.concat(df_list, ignore_index=True)
# Rename columns and parse timestamps.
df = df.rename(columns={"HappenTime": "time", "ElectricField": "ef"})
df["time"] = pd.to_datetime(df["time"])
# Keep the first record for any duplicated second.
df = df.sort_values('time').drop_duplicates(subset='time', keep='first')
# Build a gap-free 1-second index spanning the whole recording.
full_time_index: pd.DatetimeIndex = pd.date_range(start=df["time"].min(), end=df["time"].max(), freq="s")
df = df.set_index("time").reindex(full_time_index).rename_axis("time")
# --- Fix 1: handle NaN values ---
# Linearly interpolate the NaNs introduced by reindexing;
# limit_direction='both' also fills leading/trailing gaps.
df['ef'] = df['ef'].interpolate(method='linear', limit_direction='both')
df = df.reset_index()
# --- 2. Statistical feature extraction ---
# Rolling-window lengths, in seconds.
window_sizes: list[int] = [2, 5, 10, 30, 60, 300, 600, 1200]
def rolling_window(a: np.ndarray, window: int) -> np.ndarray:
    """
    Build a 2-D array of overlapping sliding windows over a 1-D array.

    Row i is the view ``a[i:i+window]``; no data is copied.

    Args:
        a: 1-D input array.
        window: Window length (must satisfy 1 <= window <= a.size).

    Returns:
        A read-only 2-D view of shape ``(a.size - window + 1, window)``.

    Raises:
        ValueError: If ``window`` is negative or larger than ``a.size``.
    """
    # sliding_window_view is NumPy's documented, safe replacement for the
    # hand-rolled as_strided construction: it validates its arguments and
    # returns a read-only view, so an accidental write through the aliased
    # windows cannot silently corrupt memory. Values and shape are
    # identical to the previous as_strided implementation.
    return np.lib.stride_tricks.sliding_window_view(a, window)
def pad_result(res: np.ndarray, win: int) -> np.ndarray:
    """
    Left-pad a rolling-window result with NaNs to the original series length.

    A window of size ``win`` produces ``win - 1`` fewer samples than its
    input; prepending that many NaNs realigns the result with the input.

    Args:
        res: Aggregated per-window values.
        win: Window size used to produce ``res``.

    Returns:
        Array of length ``res.size + win - 1`` whose first ``win - 1``
        entries are NaN.
    """
    padding = np.full(win - 1, np.nan)
    return np.concatenate([padding, res])
# The feature table starts with the timestamp column.
stat_features_all: pd.DataFrame = pd.DataFrame({'time': df['time']})
ef: np.ndarray = df['ef'].values
# Compute only the rolling-mean feature, one column per window size.
for win in window_sizes:
    col = f'mean_{win}s'
    if len(ef) < win:
        # Series shorter than the window: the feature is undefined everywhere.
        stat_features_all[col] = np.full(len(ef), np.nan)
    else:
        # Per-window means, NaN-padded on the left to the series length.
        means = rolling_window(ef, win).mean(axis=1)
        stat_features_all[col] = pad_result(means, win)
# Persist the feature table.
save_path: str = "./data/preprocessed_ef/ef_212.parquet"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
stat_features_all.to_parquet(save_path, index=True)
print(f"Saved to {save_path}")
print(stat_features_all.head())