Files
very_short_lightning/2_get_flash.py
2025-07-28 11:08:04 +08:00

79 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
import os
df = pd.read_parquet("./data/flash/FLASH_2024.parquet")
'''
时间 微秒 纬度 经度 电流 回击 ... 误差椭圆长半轴方向 标志 陡度 时间差 RawID GUID
0 1/1/2024 03:16:40 5888618 25.850505 106.476598 -17.1 1 ... 118 0 0 0 1050961381,1050961380; 165612266
1 1/1/2024 03:42:33 7173002 25.684971 106.972500 73.1 1 ... 104 0 0 0 1050979230,1050979226,1050979231;1050979227,10... 165612269
2 1/1/2024 04:11:28 1344128 25.544373 106.873714 45.7 1 ... 87 0 0 0 1050991636,1050991653,1050991654;1050991651,10... 165612271
3 1/1/2024 04:37:00 2312000 26.191353 105.803525 -12.9 1 ... 118 0 0 0 1051001690,1051001680; 165612285
4 1/1/2024 05:22:22 1754035 26.291300 106.622184 -29.1 1 ... 0 0 0 0 1051017776,1051017786,1051017785; 165612299
'''
# 只保留需要的列并解析时间
df = df[['时间', '微秒', '纬度', '经度', '电流']].copy()
# 解析时间列,将微秒信息合并到时间中
df['时间'] = pd.to_datetime(df['时间'], format='%d/%m/%Y %H:%M:%S') + pd.to_timedelta(df['微秒'], unit='us')
lon = 108.068889
lat = 28.118333
# 将时间列转换为datetime格式
df['时间'] = pd.to_datetime(df['时间'])
# 创建整5秒的时间范围即每个时间点的秒数都为0或5的倍数
start_time = df['时间'].min().replace(microsecond=0)
if start_time.second % 5 != 0:
start_time = start_time + pd.Timedelta(seconds=(5 - start_time.second % 5))
end_time = df['时间'].max().replace(microsecond=0)
if end_time.second % 5 != 0:
end_time = end_time - pd.Timedelta(seconds=(end_time.second % 5))
time_range = pd.date_range(start=start_time, end=end_time, freq='5S')
# 向量化加速先筛选地理范围再用pandas的rolling count方法高效统计时间窗口内的闪电次数
# 先筛选地理范围0.15度内)
geo_mask = (
(df['纬度'] >= lat - 0.15) & (df['纬度'] <= lat + 0.15) &
(df['经度'] >= lon - 0.15) & (df['经度'] <= lon + 0.15)
)
df_geo = df[geo_mask].copy()
# 按时间排序
df_geo = df_geo.sort_values('时间').reset_index(drop=True)
# 设置时间为索引
df_geo = df_geo.set_index('时间')
# 用pandas的rolling+count方法先构造一个以秒为频率的时间序列
full_time_index = pd.date_range(start=time_range.min(), end=time_range.max(), freq='5S')
# 新建一个Series所有闪电事件置1其他为0
flash_series = pd.Series(1, index=df_geo.index)
# 重新索引到完整时间轴NaN填0
flash_series = flash_series.reindex(full_time_index, fill_value=0)
# 计算未来30分钟1800秒内的闪电次数
# rolling的窗口是“右闭左开”所以要用window=1801含当前点
flash_count = flash_series[::-1].rolling(window=1801, min_periods=1).sum()[::-1]
# 只保留每5秒的时间点
result_df = pd.DataFrame({
'time': full_time_index,
'flash_count': flash_count.values
})
print("结果DataFrame的前10行")
print(result_df.head(10))
print(f"\n总时间点数:{len(result_df)}")
print(f"最大闪电次数:{result_df['flash_count'].max()}")
save_path = "./data/preprocessed_flash/212/flash_count.csv"
os.makedirs(os.path.dirname(save_path), exist_ok=True)
result_df.to_csv(save_path, index=False)