"""Count lightning flashes near a fixed site in the 30 minutes after each 5-second tick.

Pipeline: read the flash table, build a 5-second time grid spanning the whole
data set, keep flashes inside a +/-0.15 degree box around the site, and for
every grid tick ``t`` count the flashes with ``t <= time <= t + 30 min``.
The result is printed in summary form and written to CSV.
"""
import os

import pandas as pd

FLASH_PATH = "./data/flash/FLASH_2024.parquet"
SAVE_PATH = "./data/preprocessed_flash/212/flash_count.csv"

# Sample of the raw table (column names are the literal parquet column names:
# time, microseconds, latitude, longitude, current, return strokes, ...,
# error-ellipse semi-major-axis orientation, flag, steepness, time difference,
# RawID, GUID):
#
#    时间               微秒     纬度       经度        电流  回击 ... 误差椭圆长半轴方向 标志 陡度 时间差 RawID                      GUID
# 0  1/1/2024 03:16:40  5888618  25.850505  106.476598  -17.1  1   ... 118               0    0    0      1050961381,1050961380;     165612266
# 1  1/1/2024 03:42:33  7173002  25.684971  106.972500   73.1  1   ... 104               0    0    0      1050979230,1050979226,...  165612269
# 2  1/1/2024 04:11:28  1344128  25.544373  106.873714   45.7  1   ... 87                0    0    0      1050991636,1050991653,...  165612271
# 3  1/1/2024 04:37:00  2312000  26.191353  105.803525  -12.9  1   ... 118               0    0    0      1051001690,1051001680;     165612285
# 4  1/1/2024 05:22:22  1754035  26.291300  106.622184  -29.1  1   ... 0                 0    0    0      1051017776,1051017786,...  165612299

# Site location (degrees) and half-width of the square search box around it.
lon = 108.068889
lat = 28.118333
HALF_BOX_DEG = 0.15

# Grid spacing and length of the look-ahead window.
STEP = pd.Timedelta(seconds=5)
LOOKAHEAD = pd.Timedelta(minutes=30)


def parse_event_times(df):
    """Return full-precision event timestamps from the '时间' and '微秒' columns.

    NOTE(review): the format string is day-first; the sample row '1/1/2024'
    cannot distinguish day-first from month-first -- confirm with real data.
    NOTE(review): sample '微秒' values such as 5888618 exceed 1,000,000, so the
    column may not be plain microseconds (possibly 0.1 us units) -- confirm
    against the data provider's specification before trusting sub-second times.
    """
    whole_seconds = pd.to_datetime(df['时间'], format='%d/%m/%Y %H:%M:%S')
    return whole_seconds + pd.to_timedelta(df['微秒'], unit='us')


def build_five_second_grid(times):
    """Return a DatetimeIndex of 5-second ticks covering ``times``.

    The first tick is the earliest timestamp truncated to the second and then
    moved forward to the next multiple of 5 seconds; the last tick is the
    latest timestamp moved back to a multiple of 5 seconds. The index is empty
    when that leaves no tick in between.

    Raises:
        ValueError: if ``times`` holds no valid timestamp.
    """
    first = times.min()
    last = times.max()
    if pd.isna(first) or pd.isna(last):
        raise ValueError("no valid event timestamps to build a time grid from")

    start = first.replace(microsecond=0, nanosecond=0)
    overshoot = start.second % 5
    if overshoot:
        start += pd.Timedelta(seconds=5 - overshoot)

    end = last.replace(microsecond=0, nanosecond=0)
    end -= pd.Timedelta(seconds=end.second % 5)

    # A Timedelta frequency avoids the 'S'/'s' alias differences between
    # pandas versions.
    return pd.date_range(start=start, end=end, freq=STEP)


def select_near_site(df, site_lat=lat, site_lon=lon, half_width=HALF_BOX_DEG):
    """Return the rows of ``df`` inside the square box around the site.

    Both box edges are inclusive, in degrees of latitude ('纬度') and
    longitude ('经度').
    """
    in_lat = df['纬度'].between(site_lat - half_width, site_lat + half_width)
    in_lon = df['经度'].between(site_lon - half_width, site_lon + half_width)
    return df[in_lat & in_lon]


def count_future_flashes(event_times, grid, lookahead=LOOKAHEAD):
    """Count, for every grid tick ``t``, events with ``t <= time <= t + lookahead``.

    Events are counted by exact timestamp with two binary searches on the
    sorted event times, so off-grid (sub-second) and duplicate timestamps are
    all counted and the window length does not depend on the grid spacing.

    Args:
        event_times: datetime-like values (Series, Index or array); NaT is ignored.
        grid: datetime-like tick times.
        lookahead: window length as a ``pd.Timedelta``.

    Returns:
        A float64 numpy array aligned with ``grid`` (float keeps the CSV
        format identical to the earlier rolling-sum based output).
    """
    events = pd.DatetimeIndex(event_times).dropna().sort_values()
    # Work on plain datetime64[ns] arrays so both sides share one resolution.
    sorted_events = events.values.astype('datetime64[ns]')
    ticks = pd.DatetimeIndex(grid)
    window_start = ticks.values.astype('datetime64[ns]')
    window_end = (ticks + lookahead).values.astype('datetime64[ns]')

    first_inside = sorted_events.searchsorted(window_start, side='left')
    first_after = sorted_events.searchsorted(window_end, side='right')
    return (first_after - first_inside).astype('float64')


def main():
    """Load the flash table, compute the look-ahead counts, print and save them."""
    df = pd.read_parquet(FLASH_PATH)
    # Keep only the columns of interest and merge the sub-second part into the time.
    df = df[['时间', '微秒', '纬度', '经度', '电流']].copy()
    df['时间'] = parse_event_times(df)

    # The grid spans the whole data set, not just the flashes near the site.
    grid = build_five_second_grid(df['时间'])
    nearby = select_near_site(df)

    result_df = pd.DataFrame({
        'time': grid,
        'flash_count': count_future_flashes(nearby['时间'], grid),
    })

    print("结果DataFrame的前10行:")
    print(result_df.head(10))
    print(f"\n总时间点数:{len(result_df)}")
    print(f"最大闪电次数:{result_df['flash_count'].max()}")

    os.makedirs(os.path.dirname(SAVE_PATH), exist_ok=True)
    result_df.to_csv(SAVE_PATH, index=False)


if __name__ == "__main__":
    main()