ProcessDfGps.py 5.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139
  1. import pandas as pd
  2. import numpy as np
  3. from datetime import datetime
  4. from datetime import timedelta
  5. from ProcessDfBms import *
  6. from math import radians, cos, sin, asin, sqrt
  7. def cal_unrecorded_gps(df_in,df_bms):
  8. '''筛选出现gps时间断点的数据,用df_bms数据补齐,df_in为df_gps表格。'''
  9. #未记录到的odo总和
  10. accum_unrecorded_odo=0
  11. #设置丢失的判断条件,获得信息丢失行的index
  12. condition1=df_in['deltatime']>60*3#时间间隔大于3分钟。说明数据掉线了。
  13. condition2=(df_in['deltatime']>90*1)&(df_in['distance']>1000)#时间间隔大于*分钟,且Distance间隔大于*,代表掉线了。
  14. signal_start_list=df_in.loc[condition1|condition2,:].index.to_list()#信息丢失行
  15. #如果第0行属于信息丢失行,则删除,因为需要index-1行
  16. try:
  17. signal_start_list.remove(0)
  18. except:
  19. pass
  20. else:
  21. pass
  22. #筛选出所有GPS信号丢失,对应的开始时间-结束时间对。
  23. if len(signal_start_list)>0:
  24. signal_end_list=[num-1 for num in signal_start_list]#信息丢失行的前一行,此处可能如果是首行,可能会有bug。
  25. pick_gps_list=[0]+signal_start_list+signal_end_list+[len(df_in)-1]#首行+尾行+信号开始行+信号结束行
  26. pick_gps_list=sorted(pick_gps_list)#重新排序
  27. #有出现信号断点的行,则进行以下计算。
  28. if len(signal_start_list)>0:
  29. #针对每个时间对,计算unrecorded odo
  30. for start_time_index,end_time_index in zip(signal_start_list,signal_end_list):
  31. last_end_time=df_in.loc[end_time_index,'time']
  32. this_start_time=df_in.loc[start_time_index,'time']
  33. #print('gps signal loss from: '+str(last_end_time)+'-to-'+str(this_start_time))
  34. #使用cal_delatasoc计算预估里程
  35. unrecorded_odo=cal_deltasoc(df_bms,last_end_time,this_start_time)
  36. accum_unrecorded_odo+=unrecorded_odo
  37. #print('accum_unrecorded_odo:'+str(accum_unrecorded_odo))
  38. else:
  39. pass
  40. return accum_unrecorded_odo
  41. def df_add_avgspeed(df_in):
  42. '''Add a columns:avgspeed ,input df must have deltatime,distance column.'''
  43. for i in range(len(df_in)):
  44. #首行默认为0
  45. if i==0:
  46. df_in.loc[i,'avgspeed']=0
  47. #从第二行开始,计算平均速度
  48. else:
  49. deltatime=df_in.loc[i,'deltatime']
  50. distance=df_in.loc[i,'distance']
  51. avgspeed=(distance/1000)/(deltatime/3600)
  52. df_in.loc[i,'avgspeed']=avgspeed
  53. return df_in
  54. def read_df_gps(path):
  55. df_gps=pd.read_csv(path, encoding='gbk')#编码方式gbk
  56. #重置表头
  57. df_gps.rename(columns = {"时间戳": "time", "纬度":"lat", "经度":"lng",
  58. "卫星数":"sat_num", "海拔m":"height","速度[km/h]":"speed"}, inplace=True)
  59. #时间格式调整
  60. df_gps['time']=pd.to_datetime(df_gps['time'])
  61. #对gps进行清洗
  62. df_gps=df_add_distance(df_gps)#增加distance列
  63. condition=df_gps['distance']<20000#删除GPS漂移过远的点,可能为GPS错误值
  64. df_gps=df_gps.loc[condition,:].copy()#删除condition中,avgspd过大的部分,很可能伴随着GPS的漂移。
  65. df_gps=df_gps.reset_index(drop=True)#重置index
  66. #进行预处理
  67. df_gps=df_add_distance(df_gps)#增加distance列,再算一次distance
  68. df_gps=df_add_deltatime(df_gps)#增加deltatime列
  69. df_gps=df_add_avgspeed(df_gps)#增加avgspeed列
  70. #df_gps.to_excel('df_gps.xlsx',sheet_name='Sheet1')
  71. return df_gps
  72. def preprocess_Df_Gps(df_gps):
  73. '''对Df_Gps进行预处理'''
  74. #重置表头
  75. df_gps.rename(columns = {"时间戳": "time", "纬度":"lat", "经度":"lng",
  76. "卫星数":"sat_num", "海拔m":"height","速度[km/h]":"speed"}, inplace=True)
  77. #删除含有空数据的行
  78. df_gps=df_gps.dropna(subset=['time','lat','lng'])
  79. #删除时间重复的行,保留第一次出现的行
  80. df_gps=df_gps.drop_duplicates(subset=['time'],keep='first')
  81. #时间格式调整
  82. df_gps['time']=pd.to_datetime(df_gps['time'])
  83. #对gps进行清洗
  84. df_gps=df_add_distance(df_gps)#增加distance列
  85. condition=df_gps['distance']<20000#删除GPS漂移过远的点,可能为GPS错误值
  86. df_gps=df_gps.loc[condition,:].copy()#删除condition中,avgspd过大的部分,很可能伴随着GPS的漂移。
  87. df_gps=df_gps.reset_index(drop=True)#重置index
  88. #进行预处理
  89. df_gps=df_add_distance(df_gps)#增加distance列,再算一次distance
  90. df_gps=df_add_deltatime(df_gps)#增加deltatime列
  91. df_gps=df_gps.loc[df_gps['deltatime']>0.01,:].copy()#删除deltatime=0的列,两个时间戳相同,无法求速度。
  92. df_gps=df_add_avgspeed(df_gps)#增加avgspeed列
  93. #df_gps.to_excel('df_gps.xlsx',sheet_name='Sheet1')
  94. return df_gps
  95. def df_add_distance(df_in):
  96. '''Add a columns:distance,input df must have lng,lat columns.'''
  97. for i in range(len(df_in)):
  98. #首行默认为0
  99. if i==0:
  100. df_in.loc[i,'distance']=0
  101. #从第二行开始,计算i行到i-1行,GPS距离之差
  102. else:
  103. lon1=df_in.loc[i-1,'lng']
  104. lat1=df_in.loc[i-1,'lat']
  105. lon2=df_in.loc[i,'lng']
  106. lat2=df_in.loc[i,'lat']
  107. distance=haversine(lon1,lat1,lon2,lat2)#haversine公式计算距离差
  108. df_in.loc[i,'distance']=distance
  109. return df_in
  110. def haversine(lon1, lat1, lon2, lat2):
  111. """
  112. Calculate the great circle distance between two points
  113. on the earth (specified in decimal degrees)
  114. """
  115. # 将十进制度数转化为弧度
  116. lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
  117. # haversine公式
  118. dlon = lon2 - lon1
  119. dlat = lat2 - lat1
  120. a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
  121. c = 2 * asin(sqrt(a))
  122. r = 6371 # 地球平均半径,单位为公里
  123. return c * r * 1000