DataPreProcess.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. '''
  2. 数据预处理类
  3. '''
  4. __author__ = 'wlm'
  5. CONF_PATH = 'D:\\Platform\\platform\\CONFIGURE\\'
  6. import sys
  7. sys.path.append(CONF_PATH)
  8. import PathSetting
  9. sys.path.append(PathSetting.backend_path)
  10. from os import defpath
  11. import pandas as pd
  12. import numpy as np
  13. import pdb
  14. from numba import jit
  15. import Tools
  16. class DataPreProcess:
  17. def __init__(self):
  18. self.tools = Tools.Tools()
  19. pass
    # def data_split(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
    #                drive_stand_threshold=120, charge_stand_threshold=300,
    #                default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
    #                stand_time_threshold = 1800):
    #     '''
    #     Data segmentation entry point; calls _data_split_by_status and _data_split_by_time.
    #     _data_split_by_status splits the data into charge, drive, stand and none segments;
    #     _data_split_by_time further splits each segment wherever the timestamps jump.
    #     '''
  29. def time_filter(self, df_bms, df_gps):
  30. df_bms.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  31. df_gps.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  32. df_bms = df_bms.reset_index(drop=True)
  33. df_gps = df_gps.reset_index(drop=True)
  34. return df_bms, df_gps
    def data_split_by_status(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
                             drive_stand_threshold=120, charge_stand_threshold=300):
        '''
        Split the raw BMS data into charge, drive, stand and none segments.

        Status rules implemented below:
          1. charge: current is non-zero, the first row's '充电状态' is 2 or 3 and no row has current > 0;
             or a zero-current run shorter than charge_stand_threshold that follows a charge segment.
          2. drive: current is non-zero, the first row's '充电状态' is 2 or 3 and some row has current > 0;
             or a zero-current run shorter than drive_stand_threshold that follows a drive segment.
          3. stand: a zero-current run that opens the data, or that lasts at least the threshold,
             or that follows a 'none' segment.
          4. none: any other non-zero-current run.
        --------------Input-------------:
        dfin: data frame with columns '时间戳', '总电流[A]' and '充电状态'.
              NOTE(review): the code reads label 0 and compares index labels with 0, so it
              presumably expects a default 0..n-1 index - confirm with callers.
        drive_interval_threshold: seconds; a drive run is merged into the preceding drive segment
              when the time gap in front of the current run is below this value.
        charge_interval_threshold: seconds; same as above for charge runs.
        drive_stand_threshold: seconds; a zero-current run shorter than this is merged into the
              preceding drive segment.
        charge_stand_threshold: seconds; a zero-current run shorter than this is merged into the
              preceding charge segment.
        --------------Output------------:
        A copy of dfin with three extra columns:
        data_split_by_crnt: run number obtained by splitting on current == 0 / != 0
        data_split_by_status: segment number after applying the status rules
        data_status: status label of the row ('charge', 'drive', 'stand' or 'none')
        '''
        # Step 1: split the rows into runs according to whether the current is zero.
        df = dfin.copy()
        df['时间戳'] = pd.to_datetime(df['时间戳'])
        crnt_zero_or_not = df['总电流[A]']==0
        last_crnt_flag = crnt_zero_or_not[0]
        temp = 1
        group_id = [temp]
        for cur_crnt_flag in crnt_zero_or_not[1:]:
            # A flip of the zero / non-zero flag starts a new run.
            if last_crnt_flag ^ cur_crnt_flag:
                temp = temp + 1
            last_crnt_flag = cur_crnt_flag
            group_id.append(temp)
        df['data_split_by_crnt'] = group_id
        # Step 2: decide the status of every run from the charging status and from how long
        # the current stayed at zero.
        temp = 1  # next segment id to hand out
        last_status = ""  # status of the last row labelled so far
        status_id = []
        status_list = []
        data_number_list = sorted(list(set(df['data_split_by_crnt'])))
        for data_number in data_number_list:
            df_sel = df[df['data_split_by_crnt'] == data_number]
            origin_index = list(df_sel.index)  # labels of this run inside df
            df_sel = df_sel.reset_index(drop=True)
            temp_2 = 0
            # Non-zero current: the run may hold charge, drive or none segments.
            if df_sel.loc[0,'总电流[A]'] != 0:
                # The BMS status can change inside a run, so split it a second time on
                # whether '充电状态' equals 3.
                status_drive_or_not = df_sel['充电状态']==3
                last_status_flag = status_drive_or_not[0]
                temp_2 = 0
                group_id_2 = [temp_2]
                for cur_status_flag in status_drive_or_not[1:]:
                    if last_status_flag ^ cur_status_flag:
                        temp_2 = temp_2 + 1
                    last_status_flag = cur_status_flag
                    group_id_2.append(temp_2)
                # Walk through the sub-runs produced by the second split.
                temp_2 = 0  # offset of the current sub-run relative to temp
                last_status_2 = last_status
                df_sel['index'] = group_id_2
                data_number_list_2 = sorted(list(set(group_id_2)))
                for data_number_2 in data_number_list_2:
                    df_sel_2 = df_sel[df_sel['index'] == data_number_2]
                    df_sel_2 = df_sel_2.reset_index(drop=True)
                    # Choose charge or drive from the BMS status and the sign of the current.
                    # Status 2 or 3 and no current > 0: charge.
                    if df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) == 0:
                        cur_status = 'charge'
                    # Status 2 or 3 and at least one current > 0: drive.
                    elif df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) > 0:
                        cur_status = 'drive'
                    # Anything else: none.
                    else:
                        cur_status = 'none'
                    status_list.extend([cur_status] * len(df_sel_2))
                    # Decide whether this sub-run continues the previous segment id.
                    if origin_index[0] == 0:  # run at the very start of the data: assign the id directly
                        status_id.extend([temp + temp_2]*len(df_sel_2))
                    else:  # check whether to merge with the previous segment
                        # Gap between the first row of the current run and the row before it.
                        deltaT = (df.loc[origin_index[0], '时间戳'] - df.loc[origin_index[0]-1, '时间戳']).total_seconds()
                        # Same status (drive) and gap below the threshold: merge.
                        if last_status_2 == 'drive' and cur_status == last_status_2 and deltaT < drive_interval_threshold:
                            temp_2 = temp_2 - 1
                            status_id.extend([temp + temp_2]*len(df_sel_2))
                        # Same status (charge) and gap below the threshold: merge.
                        elif last_status_2 == 'charge' and cur_status == last_status_2 and deltaT < charge_interval_threshold:
                            temp_2 = temp_2 - 1
                            status_id.extend([temp + temp_2]*len(df_sel_2))
                        else:
                            status_id.extend([temp + temp_2]*len(df_sel_2))
                    temp_2 = temp_2 + 1
                    last_status_2 = status_list[-1]
                # Step back to the last offset actually used, so that the update of temp
                # at the end of the outer loop lands on the next free id.
                temp_2 = temp_2 - 1
            else:
                # Zero current: the run becomes stand, or is merged into charge / drive.
                if origin_index[0] == 0:  # run at the very start of the data: always stand
                    status_id.extend([temp]*len(df_sel))
                    status_list.extend(['stand'] * len(df_sel))
                else:  # not the start of the data
                    # Duration of this zero-current run in seconds.
                    cur_deltaT = (df.loc[origin_index[-1], '时间戳'] - df.loc[origin_index[0], '时间戳']).total_seconds()
                    if last_status == 'charge':  # previous status was charge
                        if cur_deltaT < charge_stand_threshold:  # shorter than the threshold: merge into the charge segment
                            status_list.extend(['charge'] * len(df_sel))
                            temp = temp - 1
                            status_id.extend([temp]*len(df_sel))
                        else:  # threshold reached: stand
                            status_id.extend([temp]*len(df_sel))
                            status_list.extend(['stand'] * len(df_sel))
                    elif last_status == 'drive':  # previous status was drive
                        if cur_deltaT < drive_stand_threshold:  # shorter than the threshold: merge into the drive segment
                            status_list.extend(['drive'] * len(df_sel))
                            temp = temp - 1
                            status_id.extend([temp]*len(df_sel))
                        else:  # threshold reached: stand
                            status_id.extend([temp]*len(df_sel))
                            status_list.extend(['stand'] * len(df_sel))
                    elif last_status == 'none':  # previous status unknown
                        status_id.extend([temp] * len(df_sel))
                        status_list.extend(['stand'] * len(df_sel))
                    # NOTE(review): no fallback branch - any other last_status appends nothing
                    # for this run, which would leave the lists shorter than df.
            temp = temp + temp_2 + 1
            last_status = status_list[-1]  # status of the run just processed
        df['data_split_by_status'] = status_id
        df['data_status'] = status_list
        return df
  160. def data_split_by_time(self, dfin, default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
  161. stand_time_threshold = 1800):
  162. '''
  163. # 该函数用来解决数据丢失问题导致的分段序号异常,
  164. # 将经过data_split_by_status分段后的数据,每个段内两行数据的时间跳变如果超过阈值,则继续分为两段
  165. --------------输入参数-------------:
  166. dfin: 调用data_split_by_status之后的函数
  167. default_time_threshold: 默认时间阈值,如果状态内部时间跳变大于该值,则划分为两段
  168. drive_time_threshold: 行车时间阈值,如果行车状态内部时间跳变大于该值,则划分为两段
  169. charge_time_threshold: 充电时间阈值,如果充电状态内部时间跳变大于该值,则划分为两段
  170. stand_time_threshold:静置时间阈值,如果静置状态内部时间跳变大于该值,则划分为两段
  171. --------------输出-----------------:
  172. 在输入数据后面,增加data_split_by_status_time 一列
  173. data_split_by_status_time: 按照状态和时间分段后的序号
  174. '''
  175. data_id = []
  176. temp = 1
  177. data_number_list = sorted(list(set(dfin['data_split_by_status'])))
  178. for data_number in data_number_list:
  179. # if data_number == 1203:
  180. # pdb.set_trace()
  181. status = list(dfin[dfin['data_split_by_status']==data_number]['data_status'])[0]
  182. cur_indexes = dfin[dfin['data_split_by_status']==data_number].index
  183. time_array = np.array(dfin[dfin['data_split_by_status']==data_number]['时间戳'])
  184. time_diff = np.diff(time_array)
  185. time_diff = time_diff.astype(np.int64)
  186. time_interval = default_time_threshold
  187. if status == 'drive':
  188. time_interval = drive_time_threshold
  189. elif status == 'charge':
  190. time_interval = charge_time_threshold
  191. elif status == 'stand':
  192. time_interval = stand_time_threshold
  193. time_diff_index = (np.argwhere(((time_diff/1e9) > time_interval)==True))[:,0]
  194. time_diff_origin_index = cur_indexes[time_diff_index]+1
  195. if len(time_diff_index) == 0:
  196. data_id.extend([temp] * len(cur_indexes))
  197. temp += 1
  198. else:
  199. last_index = cur_indexes[0]
  200. for index, cur_index in enumerate(time_diff_origin_index):
  201. if index == len(time_diff_origin_index)-1: # 如果是最后一个index,则
  202. data_id.extend([temp]* (cur_index-last_index))
  203. last_index = cur_index
  204. temp += 1
  205. data_id.extend([temp]* (cur_indexes[-1]-last_index+1))
  206. else:
  207. data_id.extend([temp]* (cur_index-last_index))
  208. last_index = cur_index
  209. temp += 1
  210. dfin['data_split_by_status_time'] = data_id
  211. return dfin
  212. def combine_drive_stand(self, dfin):
  213. '''
  214. 合并放电和静置段:将两次充电之间的所有数据段合并为一段, 状态分为 charge 和not charge
  215. ---------------输入----------
  216. dfin: 调用data_split_by_status()后输出的bms数据
  217. ---------------输出----------
  218. 在输入数据后面,增加data_split_by_status_after_combine, data_status_after_combine 两列
  219. data_split_by_status_after_combine: 将两次充电间的数据合并后的段序号
  220. data_status_after_combine: 每段数据的状态标识
  221. '''
  222. df = dfin.copy()
  223. data_split_by_status_1 = []
  224. data_status_1 = []
  225. number = 1
  226. first_flag = True
  227. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  228. for data_number in data_number_list:
  229. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  230. cur_status = status[0]
  231. if first_flag:
  232. first_flag = False
  233. elif (last_status not in ['charge'] and cur_status in ['charge']) or (last_status in ['charge'] and cur_status not in ['charge']):
  234. number += 1
  235. data_split_by_status_1.extend([number]*len(status))
  236. if cur_status in ['charge']:
  237. data_status_1.extend(['charge']*len(status))
  238. else:
  239. data_status_1.extend(['not charge']*len(status))
  240. last_status = cur_status
  241. df['data_split_by_status_after_combine'] = data_split_by_status_1
  242. df['data_status_after_combine'] = data_status_1
  243. return df
  244. def cal_stand_time(self, dfin):
  245. '''
  246. # 计算静置时间
  247. # 将每次行车或充电的前后静置时间,赋值给stand_time 列, 单位为分钟
  248. ----------------输入参数---------
  249. dfin: 调用data_split_by_status()后输出的bms数据
  250. ----------------输出参数----------
  251. 在输入数据后面,增加stand_time列
  252. stand_time : 在行车段或充电段的起止两个位置处,表明开始前和结束后的静置时长,单位为分钟
  253. '''
  254. df = dfin.copy()
  255. stand_time = []
  256. first_flag = True
  257. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  258. for index, data_number in enumerate(data_number_list):
  259. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  260. time = list(df[df['data_split_by_status_time']==data_number]['时间戳'])
  261. cur_status = status[0]
  262. cur_delta_time = (time[-1]-time[0]).total_seconds() / 60.0 # 分钟
  263. if len(status) >= 2:
  264. if first_flag:
  265. first_flag = False
  266. if index < len(data_number_list)-1:
  267. if cur_status in ['charge', 'drive']:
  268. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  269. stand_time.extend([None]*(len(status)-1))
  270. if next_status == 'stand':
  271. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  272. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  273. else:
  274. stand_time.extend([0])
  275. else:
  276. stand_time.extend([None]*len(status))
  277. else:
  278. stand_time.extend([None]*len(status))
  279. else:
  280. if cur_status in ['charge', 'drive']:
  281. if last_status == 'stand':
  282. stand_time.extend([last_delta_time])
  283. else:
  284. stand_time.extend([0])
  285. stand_time.extend([None]*(len(status)-2))
  286. if index < len(data_number_list)-1:
  287. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  288. if next_status == 'stand':
  289. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  290. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  291. else:
  292. stand_time.extend([0])
  293. else:
  294. stand_time.extend([None])
  295. else:
  296. stand_time.extend([None]*len(status))
  297. else:
  298. stand_time.extend([None])
  299. last_status = cur_status
  300. last_delta_time = cur_delta_time
  301. df['stand_time'] = stand_time
  302. return df
  303. # 输入GPS数据,返回本段数据的累积里程,及平均时速(如果两点之间)
  304. @jit
  305. def _cal_odo_speed(self, lat_list, long_list, time_list):
  306. '''
  307. 输入:经度列表, 纬度列表, 时间列表;
  308. 输出:每两个经纬度坐标之间的距离,以及速度 的数组
  309. '''
  310. dis_array = []
  311. speed_array = []
  312. for i in range(len(lat_list)-1):
  313. dis = self.tools.cal_distance(lat_list[i],long_list[i], lat_list[i+1],long_list[i+1])
  314. dis_array.append(dis)
  315. deltaT = abs(time_list[i] - time_list[i+1]).total_seconds()
  316. speed_array.append(dis * 3600.0/deltaT)
  317. return np.array(dis_array), np.array(speed_array)
  318. def gps_data_judge(self, df_bms, df_gps, time_diff_thre=300, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  319. '''
  320. GPS数据可靠性判断函数(基于combine前的分段)
  321. GPS数据出现以下情况时,判定为不可靠:
  322. 1)如果该段对应的地理位置数据 少于2 个,则认为不可靠
  323. 2)如果截取的GPS数据的起止时间,与BMS数据段的起止时间相差超过阈值,则认为不可靠
  324. 3)如果行车段 累积里程超过阈值,车速超过阈值
  325. 4) 如果非行车段 车速超过阈值
  326. --------------输入参数--------------:
  327. time_diff_thre: 时间差阈值
  328. odo_sum_thre: 累积里程阈值
  329. drive_spd_thre: 行车车速阈值
  330. parking_spd_thre: 非行车状态车速阈值
  331. --------------输出参数--------------:
  332. df_bms 增加一列gps_rely, 表明对应的GPS数据是否可靠。
  333. 1:可靠
  334. <0: 表示不可靠的原因
  335. df_gps 增加两列odo, speed, 分别表示前后两点间的距离和速度
  336. '''
  337. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  338. res_record = {'drive':0, 'charge':0, 'stand':0, 'none':0, 'total':0}
  339. rely_list = []
  340. df_gps['odo'] = [None] * len(df_gps)
  341. df_gps['speed'] = [None] * len(df_gps)
  342. data_number_list = sorted(list(set(df_bms['data_split_by_status_time'])))
  343. for data_number in data_number_list[:]:
  344. df_sel = df_bms[df_bms['data_split_by_status_time'] == data_number]
  345. df_sel = df_sel.reset_index(drop=True)
  346. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel.loc[len(df_sel)-1,'时间戳'])]
  347. origin_index = list(df_sel_gps.index)
  348. df_sel_gps = df_sel_gps.reset_index(drop=True)
  349. # 如果当前段数据对应的地理位置数据少于2个
  350. if len(df_sel_gps) <= 1:
  351. rely_list.extend([-1]*len(df_sel))
  352. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  353. continue
  354. # 如果GPS 起止时间段和BMS数据相差超过阈值
  355. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  356. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel.loc[len(df_sel)-1,'时间戳']).total_seconds() > time_diff_thre:
  357. rely_list.extend([-2]*len(df_sel))
  358. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  359. continue
  360. # 计算该段数据每两点之间的里程以及速度
  361. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  362. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  363. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  364. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  365. rely_list.extend([-3]*len(df_sel))
  366. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  367. continue
  368. # 如果停车,且 平均时速超过阈值,则不可靠
  369. if (str(df_sel.loc[0, 'data_status']) == 'charge' or str(df_sel.loc[0, 'data_status']) == 'stand') and avg_speed > parking_spd_thre :
  370. rely_list.extend([-4]*len(df_sel))
  371. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  372. continue
  373. # 剩下的记录为可靠
  374. rely_list.extend([1]*len(df_sel))
  375. df_gps.loc[origin_index[1:], 'odo'] = dis_array
  376. df_gps.loc[origin_index[1:], 'speed'] = speed_array
  377. df_bms['gps_rely'] = rely_list
  378. res_record['total'] = (res_record['drive'] + res_record['charge'] + res_record['stand'] + res_record['none'] )/df_bms['data_split_by_status_time'].max()
  379. if len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time'])) > 0:
  380. res_record['drive'] = (res_record['drive'])/len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time']))
  381. if len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time'])) > 0:
  382. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time']))
  383. if len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time'])) > 0:
  384. res_record['stand'] = (res_record['stand'])/len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time']))
  385. if len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time'])) > 0:
  386. res_record['none'] = (res_record['none'])/len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time']))
  387. return df_bms, df_gps, res_record
  388. def data_gps_judge_after_combine(self, df_bms, df_gps, time_diff_thre=600, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  389. '''
  390. GPS数据可靠性判断函数2 (基于combine后的分段) 判别方式同data_gps_judge
  391. '''
  392. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  393. res_record = {'not charge':0, 'charge':0, 'total':0} # 不可靠的比例
  394. rely_list = []
  395. df_gps['odo_after_combine'] = [None] * len(df_gps)
  396. df_gps['speed_after_combine'] = [None] * len(df_gps)
  397. data_number_list = sorted(list(set(df_bms['data_split_by_status_after_combine'])))
  398. for data_number in data_number_list[:]:
  399. df_sel = df_bms[df_bms['data_split_by_status_after_combine'] == data_number]
  400. df_sel = df_sel.reset_index(drop=True)
  401. # 尝试采用drive段的开始和结束时间选择GPS数据,因为stand时GPS数据可能存在丢失,影响里程的计算
  402. df_sel_drive = df_sel[df_sel['data_status']=='drive'] #
  403. df_sel_drive = df_sel_drive.reset_index(drop=True)
  404. if df_sel_drive.empty:
  405. df_sel_1 = df_sel
  406. else:
  407. df_sel_1 = df_sel_drive
  408. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel_1.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel_1.loc[len(df_sel_1)-1,'时间戳'])]
  409. origin_index = list(df_sel_gps.index)
  410. df_sel_gps = df_sel_gps.reset_index(drop=True)
  411. # 如果当前段数据对应的地理位置数据少于2个
  412. if len(df_sel_gps) <= 1:
  413. rely_list.extend([-1]*len(df_sel))
  414. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  415. continue
  416. # 如果GPS 起止时间段和BMS数据相差超过阈值
  417. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel_1.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  418. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel_1.loc[len(df_sel_1)-1,'时间戳']).total_seconds() > time_diff_thre:
  419. rely_list.extend([-2]*len(df_sel))
  420. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  421. continue
  422. # 计算该段数据每两点之间的里程以及速度
  423. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  424. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  425. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  426. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  427. rely_list.extend([-3]*len(df_sel))
  428. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  429. continue
  430. # 如果充电,且 平均时速超过阈值,则不可靠
  431. if str(df_sel.loc[0, 'data_status_after_combine']) == 'charge' and avg_speed > parking_spd_thre:
  432. rely_list.extend([-4]*len(df_sel))
  433. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  434. continue
  435. # 剩下的记录为可靠
  436. rely_list.extend([1]*len(df_sel))
  437. df_gps.loc[origin_index[1:], 'odo_after_combine'] = dis_array
  438. df_gps.loc[origin_index[1:], 'speed_after_combine'] = speed_array
  439. df_bms['gps_rely_after_combine'] = rely_list
  440. res_record['total'] = (res_record['not charge'] + res_record['charge'])/df_bms['data_split_by_status_after_combine'].max()
  441. if len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine'])) > 0:
  442. res_record['not charge'] = (res_record['not charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine']))
  443. if len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine'])) > 0 :
  444. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine']))
  445. return df_bms, df_gps, res_record