init
This commit is contained in:
85
1_get_ef_data_bak.py
Normal file
85
1_get_ef_data_bak.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import pandas as pd
|
||||
import os
|
||||
import glob
|
||||
import numpy as np
|
||||
|
||||
# --- 1. Data loading and preprocessing ---

# Load every Parquet shard and merge them into one frame.
# NOTE: the glob pattern below is only an example; point it at the real data.
# The matches are sorted so the concatenation order -- and therefore which of
# two rows sharing a timestamp survives de-duplication -- is reproducible.
parquet_paths: list[str] = sorted(glob.glob("./data/ef/212/212_*.parquet"))
if not parquet_paths:
    # pd.concat([]) would otherwise fail with "No objects to concatenate",
    # which hides the real cause (no input files were found).
    raise FileNotFoundError(
        "No parquet files match ./data/ef/212/212_*.parquet"
    )
df_list: list[pd.DataFrame] = [pd.read_parquet(path) for path in parquet_paths]
df: pd.DataFrame = pd.concat(df_list, ignore_index=True)

# Rename the columns and parse the timestamps.
df = df.rename(columns={"HappenTime": "time", "ElectricField": "ef"})
df["time"] = pd.to_datetime(df["time"])
# mergesort is stable: rows with equal timestamps stay in load order, so
# keep='first' selects the same row on every run.
df = df.sort_values('time', kind='mergesort').drop_duplicates(subset='time', keep='first')

# Build a gap-free, one-entry-per-second index from the first to the last
# timestamp, and align the data to it (missing seconds become NaN rows).
full_time_index: pd.DatetimeIndex = pd.date_range(start=df["time"].min(), end=df["time"].max(), freq="s")
df = df.set_index("time").reindex(full_time_index).rename_axis("time")

# --- Fix 1: handle NaN (missing) values ---
# Fill the NaN values introduced by the reindex using linear interpolation.
df['ef'] = df['ef'].interpolate(method='linear', limit_direction='both')
df = df.reset_index()
|
||||
|
||||
# --- 2. Statistical feature extraction ---

# Rolling-window lengths in samples; the longer ones are spelled as
# minutes * 60 for readability.
window_sizes: list[int] = [2, 5, 10, 30, 60, 5 * 60, 10 * 60, 20 * 60]
|
||||
|
||||
|
||||
def rolling_window(a: np.ndarray, window: int) -> np.ndarray:
    """
    Build a 2-D view of every run of ``window`` consecutive samples in ``a``.

    Row ``i`` of the result is ``a[i:i + window]``. No data is copied; the
    result is a read-only view, so the overlapping windows cannot be
    modified by accident.

    Args:
        a: One-dimensional input array.
        window: Window length in samples; must satisfy
            ``1 <= window <= a.size``.

    Returns:
        Read-only array view of shape ``(a.size - window + 1, window)``.

    Raises:
        ValueError: If ``a`` is not one-dimensional or ``window`` is out of
            range.
    """
    if a.ndim != 1:
        # Hand-computed strides on an N-D array would read unrelated memory.
        raise ValueError(f"rolling_window expects a 1-D array, got {a.ndim}-D")
    if not 1 <= window <= a.size:
        raise ValueError(
            f"window must be between 1 and a.size ({a.size}), got {window}"
        )
    # sliding_window_view performs the same striding as a manual as_strided
    # call, but bounds-checks the request and returns a non-writeable view.
    return np.lib.stride_tricks.sliding_window_view(a, window)
|
||||
|
||||
|
||||
def pad_result(res: np.ndarray, win: int) -> np.ndarray:
    """
    Left-pad a rolling-window result with NaN.

    ``win - 1`` NaN values are placed in front of ``res``, which restores the
    length of the series the windows were taken from.

    Args:
        res: Result array to pad.
        win: Window size that produced ``res``.

    Returns:
        New array holding ``win - 1`` NaN values followed by ``res``.
    """
    leading = np.empty(win - 1)
    leading.fill(np.nan)
    return np.concatenate((leading, res))
|
||||
|
||||
|
||||
# Initialise the feature frame with the timestamp column only.
stat_features_all: pd.DataFrame = pd.DataFrame({'time': df['time']})
ef: np.ndarray = df['ef'].values

# Only the rolling mean is computed, one column per window size.
for win in window_sizes:
    column = f'mean_{win}s'
    if win <= len(ef):
        # Mean over each window, NaN-padded at the front so the column has
        # the same length as the input series.
        window_means = rolling_window(ef, win).mean(axis=1)
        stat_features_all[column] = pad_result(window_means, win)
    else:
        # Series shorter than the window: the whole column is NaN.
        stat_features_all[column] = np.full(len(ef), np.nan)

# Save the result, creating the output directory if it does not exist.
save_path: str = "./data/preprocessed_ef/ef_212.parquet"
output_dir = os.path.dirname(save_path)
os.makedirs(output_dir, exist_ok=True)
stat_features_all.to_parquet(save_path, index=True)
print(f"Saved to {save_path}")
print(stat_features_all.head())
|
||||
Reference in New Issue
Block a user