79 lines
3.4 KiB
Python
79 lines
3.4 KiB
Python
import pandas as pd
|
||
|
||
import os
|
||
# Load the 2024 lightning-location records.
df = pd.read_parquet("./data/flash/FLASH_2024.parquet")

# Sample of the raw table (headers are the dataset's own column names):
#
#   时间 微秒 纬度 经度 电流 回击 ... 误差椭圆长半轴方向 标志 陡度 时间差 RawID GUID
# 0 1/1/2024 03:16:40 5888618 25.850505 106.476598 -17.1 1 ... 118 0 0 0 1050961381,1050961380; 165612266
# 1 1/1/2024 03:42:33 7173002 25.684971 106.972500 73.1 1 ... 104 0 0 0 1050979230,1050979226,1050979231;1050979227,10... 165612269
# 2 1/1/2024 04:11:28 1344128 25.544373 106.873714 45.7 1 ... 87 0 0 0 1050991636,1050991653,1050991654;1050991651,10... 165612271
# 3 1/1/2024 04:37:00 2312000 26.191353 105.803525 -12.9 1 ... 118 0 0 0 1051001690,1051001680; 165612285
# 4 1/1/2024 05:22:22 1754035 26.291300 106.622184 -29.1 1 ... 0 0 0 0 1050017776,1051017786,1051017785; 165612299

# Keep only the columns used downstream:
# time, sub-second offset, latitude, longitude, current.
wanted_columns = ['时间', '微秒', '纬度', '经度', '电流']
df = df.loc[:, wanted_columns].copy()

# Merge the sub-second column into the second-resolution timestamp.
# NOTE(review): the sample values of '微秒' above exceed 999,999, so the column
# may not hold plain microseconds (possibly 0.1 us ticks) -- confirm the unit.
# NOTE(review): the format string assumes day/month/year order; the sample rows
# (1/1/2024) cannot confirm it.
base_time = pd.to_datetime(df['时间'], format='%d/%m/%Y %H:%M:%S')
sub_second = pd.to_timedelta(df['微秒'], unit='us')
df['时间'] = base_time + sub_second

# Longitude / latitude (degrees) of the target location.
lon = 108.068889
lat = 28.118333

# Build the 5-second time grid spanning the data (every grid point has a
# seconds value that is a multiple of 5).
#
# df['时间'] is already datetime64 after the parsing step earlier in the script,
# so no second conversion is needed here.
#
# First grid point: earliest timestamp truncated to whole seconds, then rounded
# up to a multiple of 5 s. Last grid point: latest timestamp rounded down to a
# multiple of 5 s. floor/ceil also clear any sub-microsecond residue, which
# replace(microsecond=0) would leave behind.
start_time = df['时间'].min().floor('s').ceil('5s')
end_time = df['时间'].max().floor('5s')

# Lower-case 's' is the supported seconds alias; upper-case 'S' is deprecated
# in recent pandas releases.
time_range = pd.date_range(start=start_time, end=end_time, freq='5s')

# Restrict to flashes inside a +/-0.15 degree box around (lat, lon), bounds
# included, then order them chronologically and index them by timestamp.
half_width = 0.15
lat_in_box = df['纬度'].between(lat - half_width, lat + half_width)
lon_in_box = df['经度'].between(lon - half_width, lon + half_width)
geo_mask = lat_in_box & lon_in_box

# sort_values returns a new frame, so the filtered data is independent of df.
df_geo = df.loc[geo_mask].sort_values('时间').set_index('时间')

# For every 5-second grid time t, count the flashes with
# t <= flash time <= t + 30 minutes.
#
# The earlier approach reindexed the events onto the 5-second grid and took a
# 1801-row reversed rolling sum. That had three problems:
#   * reindex keeps only events whose timestamp is exactly a grid point, so
#     nearly every (microsecond-precision) event was dropped;
#   * reindex raises when two events share a timestamp;
#   * an integer rolling window counts rows, so 1801 rows on a 5-second grid
#     span about 2.5 hours, not 30 minutes.
# Binary search over the sorted event times counts each window exactly.
full_time_index = pd.date_range(start=time_range.min(), end=time_range.max(), freq='5s')
look_ahead = pd.Timedelta(minutes=30)

# searchsorted requires sorted input without NaT; enforce both here rather than
# relying on how df_geo was prepared.
event_times = df_geo.index.dropna().sort_values()

# Position of the first event at or after t, and one past the last event at or
# before t + 30 min; their difference is the number of events in the window.
window_start = event_times.searchsorted(full_time_index, side='left')
window_stop = event_times.searchsorted(full_time_index + look_ahead, side='right')

# float64 matches the dtype the rolling sum used to produce, so the printed
# summary and the CSV keep the same number format.
flash_count = pd.Series(
    (window_stop - window_start).astype('float64'),
    index=full_time_index,
)

# Assemble one row per 5-second grid time together with its flash count.
result_columns = {
    'time': full_time_index,
    'flash_count': flash_count.to_numpy(),
}
result_df = pd.DataFrame(result_columns)

# Short summary on stdout: first rows, number of grid points, largest count.
print("结果DataFrame的前10行:")
print(result_df.head(10))
total_points = len(result_df)
peak_count = result_df['flash_count'].max()
print(f"\n总时间点数:{total_points}")
print(f"最大闪电次数:{peak_count}")

# Write the table to CSV, creating the output directory first if it is missing.
save_path = "./data/preprocessed_flash/212/flash_count.csv"
output_dir = os.path.dirname(save_path)
os.makedirs(output_dir, exist_ok=True)
result_df.to_csv(save_path, index=False)