DataPreProcess.py 28 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489
  1. '''
  2. 数据预处理类
  3. '''
  4. __author__ = 'lmstack'
  5. # import CONFIGURE.PathSetting as PathSetting
  6. # import sys
  7. # sys.path.append(PathSetting.backend_path)
  8. # from LIB.BACKEND
  9. from os import defpath
  10. import pandas as pd
  11. import numpy as np
  12. import pdb
  13. from numba import jit
  14. from LIB.BACKEND import Tools
  15. class DataPreProcess:
  16. def __init__(self):
  17. self.tools = Tools.Tools()
  18. pass
  19. # def data_split(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
  20. # drive_stand_threshold=120, charge_stand_threshold=300,
  21. # default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
  22. # stand_time_threshold = 1800):
  23. # '''
  24. # 数据分段函数,会调用_data_split_by_status和_data_split_by_time函数。
  25. # 其中_data_split_by_status 将数据分为charge、drive、stand、和none段;
  26. # _data_split_by_time 将每个段内的数据,根据时间跳变继续分段。
  27. # '''
  28. def time_filter(self, df_bms, df_gps):
  29. df_bms.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  30. df_gps.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  31. df_bms = df_bms.reset_index(drop=True)
  32. df_gps = df_gps.reset_index(drop=True)
  33. return df_bms, df_gps
  34. def data_split_by_status(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
  35. drive_stand_threshold=120, charge_stand_threshold=300):
  36. '''
  37. # 数据预处理分段, 将原始数据段分为 charge、drive、stand、none段
  38. # 状态判断
  39. # 1、drive:(状态为2或3 且 存在电流>0 ) 或 (电流持续为0 且 持续时间<阈值 且 上一段数据为行车)
  40. # 2、charge:(状态为2或3 且 不存在电流>0 ) 或 (电流持续为0 且 持续时间<阈值 且 上一段数据为充电)
  41. # 3、stand:(电流持续为0 且 是数据段的第一段) 或 (电流持续为0 且 持续时间>阈值)
  42. # 4、none: 其他
  43. --------------输入参数-------------:
  44. drive_interval_threshold: 行车段拼接阈值,如果两段行车的间隔时间小于该值,则两段行车合并。
  45. charge_interval_threshold: 充电段拼接阈值,如果两段充电的间隔时间小于该值,则两段充电合并。
  46. drive_stand_threshold: 静置段合并至行车段阈值,如果静置时间小于该值,则合并到上一段的行车中。
  47. charge_stand_threshold: 静置段合并至充电段阈值,如果静置时间小于该值,则合并到上一段的充电中。
  48. --------------输出-----------------:
  49. 在原始数据后面,增加data_split_by_crnt, data_split_by_status, data_status 三列
  50. data_split_by_crnt: 按电流分段的序号
  51. data_split_by_status:按电流和状态分段的序号
  52. data_status: 状态标识
  53. '''
  54. # 首先根据电流是否为0 ,将数据分段
  55. df = dfin.copy()
  56. df['时间戳'] = pd.to_datetime(df['时间戳'])
  57. crnt_zero_or_not = df['总电流[A]']==0
  58. last_crnt_flag = crnt_zero_or_not[0]
  59. temp = 1
  60. group_id = [temp]
  61. for cur_crnt_flag in crnt_zero_or_not[1:]:
  62. if last_crnt_flag ^ cur_crnt_flag:
  63. temp = temp + 1
  64. last_crnt_flag = cur_crnt_flag
  65. group_id.append(temp)
  66. df['data_split_by_crnt'] = group_id
  67. # 然后判断每个段内的 充电状态及电流=0持续时长,决定当前状态
  68. temp = 1
  69. last_status = ""
  70. status_id = []
  71. status_list = []
  72. data_number_list = sorted(list(set(df['data_split_by_crnt'])))
  73. for data_number in data_number_list:
  74. df_sel = df[df['data_split_by_crnt'] == data_number]
  75. origin_index = list(df_sel.index)
  76. df_sel = df_sel.reset_index(drop=True)
  77. temp_2 = 0
  78. # 如果当前数据段的电流非0,则可能分为charge、drive或none段
  79. if df_sel.loc[0,'总电流[A]'] != 0:
  80. # 电流 分段中可能存在状态变化的时刻, 内部根据状态进行分段.
  81. # 该数据段内部,根据bms状态信号进行二次分段
  82. status_drive_or_not = df_sel['充电状态']==3
  83. last_status_flag = status_drive_or_not[0]
  84. temp_2 = 0
  85. group_id_2 = [temp_2]
  86. for cur_status_flag in status_drive_or_not[1:]:
  87. if last_status_flag ^ cur_status_flag:
  88. temp_2 = temp_2 + 1
  89. last_status_flag = cur_status_flag
  90. group_id_2.append(temp_2)
  91. # 遍历二次状态分段
  92. temp_2 = 0
  93. last_status_2 = last_status
  94. df_sel['index'] = group_id_2
  95. data_number_list_2 = sorted(list(set(group_id_2)))
  96. for data_number_2 in data_number_list_2:
  97. df_sel_2 = df_sel[df_sel['index'] == data_number_2]
  98. df_sel_2 = df_sel_2.reset_index(drop=True)
  99. # 根据bms状态 及 电流符号决定是charge还是drive
  100. # 如果状态为2或3, 且电流均<=0 则记为充电
  101. if df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) == 0:
  102. cur_status = 'charge'
  103. # 如果状态为2或3,且存在电流>0 则记为行车
  104. elif df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) > 0:
  105. cur_status = 'drive'
  106. # 否则 记为none
  107. else:
  108. cur_status = 'none'
  109. status_list.extend([cur_status] * len(df_sel_2))
  110. # 状态id号与前面电流为0的相同状态进行合并, 均判断应不应该与上一段合并
  111. if origin_index[0] == 0: # 如果是所有数据的起始段数据,则直接赋值id号
  112. status_id.extend([temp + temp_2]*len(df_sel_2))
  113. else: # 判断是否与上一段数据合并
  114. deltaT = (df.loc[origin_index[0], '时间戳'] - df.loc[origin_index[0]-1, '时间戳']).total_seconds()
  115. # 如果 状态一致, 且 间隔时间小于阈值,则合并
  116. if last_status_2 == 'drive' and cur_status == last_status_2 and deltaT < drive_interval_threshold:
  117. temp_2 = temp_2 - 1
  118. status_id.extend([temp + temp_2]*len(df_sel_2))
  119. # 如果状态一致, 且 间隔时间小于阈值,则合并
  120. elif last_status_2 == 'charge' and cur_status == last_status_2 and deltaT < charge_interval_threshold:
  121. temp_2 = temp_2 - 1
  122. status_id.extend([temp + temp_2]*len(df_sel_2))
  123. else:
  124. status_id.extend([temp + temp_2]*len(df_sel_2))
  125. temp_2 = temp_2 + 1
  126. last_status_2 = status_list[-1]
  127. temp_2 = temp_2 - 1
  128. else:
  129. # 如果当前数据段的电流为0,则可能分为stand,charge、drive或none段
  130. if origin_index[0] == 0: # 如果是数据的起始,则无论长短,都认为是stand
  131. status_id.extend([temp]*len(df_sel))
  132. status_list.extend(['stand'] * len(df_sel))
  133. else: # 不是数据的起始
  134. cur_deltaT = (df.loc[origin_index[-1], '时间戳'] - df.loc[origin_index[0], '时间戳']).total_seconds()
  135. if last_status == 'charge': # 如果上一个状态为充电
  136. if cur_deltaT < charge_stand_threshold: # 如果本次电流为0的持续时间小于 阈值,则合并
  137. status_list.extend(['charge'] * len(df_sel))
  138. temp = temp - 1
  139. status_id.extend([temp]*len(df_sel))
  140. else: # 否则超过了阈值,记为stand
  141. status_id.extend([temp]*len(df_sel))
  142. status_list.extend(['stand'] * len(df_sel))
  143. elif last_status == 'drive': # 如果上一个状态为行车
  144. if cur_deltaT < drive_stand_threshold: # 如果本次电流为0的持续时间小于 阈值,则合并
  145. status_list.extend(['drive'] * len(df_sel))
  146. temp = temp - 1
  147. status_id.extend([temp]*len(df_sel))
  148. else: # 否则超过了阈值,记为stand
  149. status_id.extend([temp]*len(df_sel))
  150. status_list.extend(['stand'] * len(df_sel))
  151. elif last_status == 'none': # 如果上一个状态未知
  152. status_id.extend([temp] * len(df_sel))
  153. status_list.extend(['stand'] * len(df_sel))
  154. temp = temp + temp_2 + 1
  155. last_status = status_list[-1] # 上一组状态
  156. df['data_split_by_status'] = status_id
  157. df['data_status'] = status_list
  158. return df
  159. def data_split_by_time(self, dfin, default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
  160. stand_time_threshold = 1800):
  161. '''
  162. # 该函数用来解决数据丢失问题导致的分段序号异常,
  163. # 将经过data_split_by_status分段后的数据,每个段内两行数据的时间跳变如果超过阈值,则继续分为两段
  164. --------------输入参数-------------:
  165. dfin: 调用data_split_by_status之后的函数
  166. default_time_threshold: 默认时间阈值,如果状态内部时间跳变大于该值,则划分为两段
  167. drive_time_threshold: 行车时间阈值,如果行车状态内部时间跳变大于该值,则划分为两段
  168. charge_time_threshold: 充电时间阈值,如果充电状态内部时间跳变大于该值,则划分为两段
  169. stand_time_threshold:静置时间阈值,如果静置状态内部时间跳变大于该值,则划分为两段
  170. --------------输出-----------------:
  171. 在输入数据后面,增加data_split_by_status_time 一列
  172. data_split_by_status_time: 按照状态和时间分段后的序号
  173. '''
  174. data_id = []
  175. temp = 1
  176. data_number_list = sorted(list(set(dfin['data_split_by_status'])))
  177. for data_number in data_number_list:
  178. # if data_number == 1203:
  179. # pdb.set_trace()
  180. status = list(dfin[dfin['data_split_by_status']==data_number]['data_status'])[0]
  181. cur_indexes = dfin[dfin['data_split_by_status']==data_number].index
  182. time_array = np.array(dfin[dfin['data_split_by_status']==data_number]['时间戳'])
  183. time_diff = np.diff(time_array)
  184. time_diff = time_diff.astype(np.int64)
  185. time_interval = default_time_threshold
  186. if status == 'drive':
  187. time_interval = drive_time_threshold
  188. elif status == 'charge':
  189. time_interval = charge_time_threshold
  190. elif status == 'stand':
  191. time_interval = stand_time_threshold
  192. time_diff_index = (np.argwhere(((time_diff/1e9) > time_interval)==True))[:,0]
  193. time_diff_origin_index = cur_indexes[time_diff_index]+1
  194. if len(time_diff_index) == 0:
  195. data_id.extend([temp] * len(cur_indexes))
  196. temp += 1
  197. else:
  198. last_index = cur_indexes[0]
  199. for index, cur_index in enumerate(time_diff_origin_index):
  200. if index == len(time_diff_origin_index)-1: # 如果是最后一个index,则
  201. data_id.extend([temp]* (cur_index-last_index))
  202. last_index = cur_index
  203. temp += 1
  204. data_id.extend([temp]* (cur_indexes[-1]-last_index+1))
  205. else:
  206. data_id.extend([temp]* (cur_index-last_index))
  207. last_index = cur_index
  208. temp += 1
  209. dfin['data_split_by_status_time'] = data_id
  210. return dfin
  211. def combine_drive_stand(self, dfin):
  212. '''
  213. 合并放电和静置段:将两次充电之间的所有数据段合并为一段, 状态分为 charge 和not charge
  214. ---------------输入----------
  215. dfin: 调用data_split_by_status()后输出的bms数据
  216. ---------------输出----------
  217. 在输入数据后面,增加data_split_by_status_after_combine, data_status_after_combine 两列
  218. data_split_by_status_after_combine: 将两次充电间的数据合并后的段序号
  219. data_status_after_combine: 每段数据的状态标识
  220. '''
  221. df = dfin.copy()
  222. data_split_by_status_1 = []
  223. data_status_1 = []
  224. number = 1
  225. first_flag = True
  226. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  227. for data_number in data_number_list:
  228. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  229. cur_status = status[0]
  230. if first_flag:
  231. first_flag = False
  232. elif (last_status not in ['charge'] and cur_status in ['charge']) or (last_status in ['charge'] and cur_status not in ['charge']):
  233. number += 1
  234. data_split_by_status_1.extend([number]*len(status))
  235. if cur_status in ['charge']:
  236. data_status_1.extend(['charge']*len(status))
  237. else:
  238. data_status_1.extend(['not charge']*len(status))
  239. last_status = cur_status
  240. df['data_split_by_status_after_combine'] = data_split_by_status_1
  241. df['data_status_after_combine'] = data_status_1
  242. return df
  243. def cal_stand_time(self, dfin):
  244. '''
  245. # 计算静置时间
  246. # 将每次行车或充电的前后静置时间,赋值给stand_time 列, 单位为分钟
  247. ----------------输入参数---------
  248. dfin: 调用data_split_by_status()后输出的bms数据
  249. ----------------输出参数----------
  250. 在输入数据后面,增加stand_time列
  251. stand_time : 在行车段或充电段的起止两个位置处,表明开始前和结束后的静置时长,单位为分钟
  252. '''
  253. df = dfin.copy()
  254. stand_time = []
  255. first_flag = True
  256. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  257. for index, data_number in enumerate(data_number_list):
  258. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  259. time = list(df[df['data_split_by_status_time']==data_number]['时间戳'])
  260. cur_status = status[0]
  261. cur_delta_time = (time[-1]-time[0]).total_seconds() / 60.0 # 分钟
  262. if len(status) >= 2:
  263. if first_flag:
  264. first_flag = False
  265. if index < len(data_number_list)-1:
  266. if cur_status in ['charge', 'drive']:
  267. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  268. stand_time.extend([None]*(len(status)-1))
  269. if next_status == 'stand':
  270. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  271. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  272. else:
  273. stand_time.extend([0])
  274. else:
  275. stand_time.extend([None]*len(status))
  276. else:
  277. stand_time.extend([None]*len(status))
  278. else:
  279. if cur_status in ['charge', 'drive']:
  280. if last_status == 'stand':
  281. stand_time.extend([last_delta_time])
  282. else:
  283. stand_time.extend([0])
  284. stand_time.extend([None]*(len(status)-2))
  285. if index < len(data_number_list)-1:
  286. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  287. if next_status == 'stand':
  288. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  289. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  290. else:
  291. stand_time.extend([0])
  292. else:
  293. stand_time.extend([None])
  294. else:
  295. stand_time.extend([None]*len(status))
  296. else:
  297. stand_time.extend([None])
  298. last_status = cur_status
  299. last_delta_time = cur_delta_time
  300. df['stand_time'] = stand_time
  301. return df
  302. # 输入GPS数据,返回本段数据的累积里程,及平均时速(如果两点之间)
  303. @jit
  304. def _cal_odo_speed(self, lat_list, long_list, time_list):
  305. '''
  306. 输入:经度列表, 纬度列表, 时间列表;
  307. 输出:每两个经纬度坐标之间的距离,以及速度 的数组
  308. '''
  309. dis_array = []
  310. speed_array = []
  311. for i in range(len(lat_list)-1):
  312. dis = self.tools.cal_distance(lat_list[i],long_list[i], lat_list[i+1],long_list[i+1])
  313. dis_array.append(dis)
  314. deltaT = abs(time_list[i] - time_list[i+1]).total_seconds()
  315. speed_array.append(dis * 3600.0/deltaT)
  316. return np.array(dis_array), np.array(speed_array)
  317. def gps_data_judge(self, df_bms, df_gps, time_diff_thre=300, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  318. '''
  319. GPS数据可靠性判断函数(基于combine前的分段)
  320. GPS数据出现以下情况时,判定为不可靠:
  321. 1)如果该段对应的地理位置数据 少于2 个,则认为不可靠
  322. 2)如果截取的GPS数据的起止时间,与BMS数据段的起止时间相差超过阈值,则认为不可靠
  323. 3)如果行车段 累积里程超过阈值,车速超过阈值
  324. 4) 如果非行车段 车速超过阈值
  325. --------------输入参数--------------:
  326. time_diff_thre: 时间差阈值
  327. odo_sum_thre: 累积里程阈值
  328. drive_spd_thre: 行车车速阈值
  329. parking_spd_thre: 非行车状态车速阈值
  330. --------------输出参数--------------:
  331. df_bms 增加一列gps_rely, 表明对应的GPS数据是否可靠。
  332. 1:可靠
  333. <0: 表示不可靠的原因
  334. df_gps 增加两列odo, speed, 分别表示前后两点间的距离和速度
  335. '''
  336. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  337. res_record = {'drive':0, 'charge':0, 'stand':0, 'none':0, 'total':0}
  338. rely_list = []
  339. df_gps['odo'] = [None] * len(df_gps)
  340. df_gps['speed'] = [None] * len(df_gps)
  341. data_number_list = sorted(list(set(df_bms['data_split_by_status_time'])))
  342. for data_number in data_number_list[:]:
  343. df_sel = df_bms[df_bms['data_split_by_status_time'] == data_number]
  344. df_sel = df_sel.reset_index(drop=True)
  345. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel.loc[len(df_sel)-1,'时间戳'])]
  346. origin_index = list(df_sel_gps.index)
  347. df_sel_gps = df_sel_gps.reset_index(drop=True)
  348. # 如果当前段数据对应的地理位置数据少于2个
  349. if len(df_sel_gps) <= 1:
  350. rely_list.extend([-1]*len(df_sel))
  351. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  352. continue
  353. # 如果GPS 起止时间段和BMS数据相差超过阈值
  354. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  355. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel.loc[len(df_sel)-1,'时间戳']).total_seconds() > time_diff_thre:
  356. rely_list.extend([-2]*len(df_sel))
  357. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  358. continue
  359. # 计算该段数据每两点之间的里程以及速度
  360. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  361. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  362. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  363. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  364. rely_list.extend([-3]*len(df_sel))
  365. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  366. continue
  367. # 如果停车,且 平均时速超过阈值,则不可靠
  368. if (str(df_sel.loc[0, 'data_status']) == 'charge' or str(df_sel.loc[0, 'data_status']) == 'stand') and avg_speed > parking_spd_thre :
  369. rely_list.extend([-4]*len(df_sel))
  370. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  371. continue
  372. # 剩下的记录为可靠
  373. rely_list.extend([1]*len(df_sel))
  374. df_gps.loc[origin_index[1:], 'odo'] = dis_array
  375. df_gps.loc[origin_index[1:], 'speed'] = speed_array
  376. df_bms['gps_rely'] = rely_list
  377. res_record['total'] = (res_record['drive'] + res_record['charge'] + res_record['stand'] + res_record['none'] )/df_bms['data_split_by_status_time'].max()
  378. if len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time'])) > 0:
  379. res_record['drive'] = (res_record['drive'])/len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time']))
  380. if len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time'])) > 0:
  381. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time']))
  382. if len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time'])) > 0:
  383. res_record['stand'] = (res_record['stand'])/len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time']))
  384. if len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time'])) > 0:
  385. res_record['none'] = (res_record['none'])/len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time']))
  386. return df_bms, df_gps, res_record
  387. def data_gps_judge_after_combine(self, df_bms, df_gps, time_diff_thre=600, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  388. '''
  389. GPS数据可靠性判断函数2 (基于combine后的分段) 判别方式同data_gps_judge
  390. '''
  391. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  392. res_record = {'not charge':0, 'charge':0, 'total':0} # 不可靠的比例
  393. rely_list = []
  394. df_gps['odo_after_combine'] = [None] * len(df_gps)
  395. df_gps['speed_after_combine'] = [None] * len(df_gps)
  396. data_number_list = sorted(list(set(df_bms['data_split_by_status_after_combine'])))
  397. for data_number in data_number_list[:]:
  398. df_sel = df_bms[df_bms['data_split_by_status_after_combine'] == data_number]
  399. df_sel = df_sel.reset_index(drop=True)
  400. # 尝试采用drive段的开始和结束时间选择GPS数据,因为stand时GPS数据可能存在丢失,影响里程的计算
  401. df_sel_drive = df_sel[df_sel['data_status']=='drive'] #
  402. df_sel_drive = df_sel_drive.reset_index(drop=True)
  403. if df_sel_drive.empty:
  404. df_sel_1 = df_sel
  405. else:
  406. df_sel_1 = df_sel_drive
  407. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel_1.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel_1.loc[len(df_sel_1)-1,'时间戳'])]
  408. origin_index = list(df_sel_gps.index)
  409. df_sel_gps = df_sel_gps.reset_index(drop=True)
  410. # 如果当前段数据对应的地理位置数据少于2个
  411. if len(df_sel_gps) <= 1:
  412. rely_list.extend([-1]*len(df_sel))
  413. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  414. continue
  415. # 如果GPS 起止时间段和BMS数据相差超过阈值
  416. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel_1.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  417. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel_1.loc[len(df_sel_1)-1,'时间戳']).total_seconds() > time_diff_thre:
  418. rely_list.extend([-2]*len(df_sel))
  419. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  420. continue
  421. # 计算该段数据每两点之间的里程以及速度
  422. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  423. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  424. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  425. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  426. rely_list.extend([-3]*len(df_sel))
  427. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  428. continue
  429. # 如果充电,且 平均时速超过阈值,则不可靠
  430. if str(df_sel.loc[0, 'data_status_after_combine']) == 'charge' and avg_speed > parking_spd_thre:
  431. rely_list.extend([-4]*len(df_sel))
  432. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  433. continue
  434. # 剩下的记录为可靠
  435. rely_list.extend([1]*len(df_sel))
  436. df_gps.loc[origin_index[1:], 'odo_after_combine'] = dis_array
  437. df_gps.loc[origin_index[1:], 'speed_after_combine'] = speed_array
  438. df_bms['gps_rely_after_combine'] = rely_list
  439. res_record['total'] = (res_record['not charge'] + res_record['charge'])/df_bms['data_split_by_status_after_combine'].max()
  440. if len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine'])) > 0:
  441. res_record['not charge'] = (res_record['not charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine']))
  442. if len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine'])) > 0 :
  443. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine']))
  444. return df_bms, df_gps, res_record