DataPreProcess.py 56 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979
  1. <<<<<<< HEAD
  2. '''
  3. 数据预处理类
  4. '''
  5. __author__ = 'lmstack'
  6. import CONFIGURE.PathSetting as PathSetting
  7. import sys
  8. sys.path.append(PathSetting.backend_path)
  9. from os import defpath
  10. import pandas as pd
  11. import numpy as np
  12. import pdb
  13. from numba import jit
  14. import Tools
  15. class DataPreProcess:
  16. def __init__(self):
  17. self.tools = Tools.Tools()
  18. pass
  19. # def data_split(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
  20. # drive_stand_threshold=120, charge_stand_threshold=300,
  21. # default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
  22. # stand_time_threshold = 1800):
  23. # '''
  24. # 数据分段函数,会调用_data_split_by_status和_data_split_by_time函数。
  25. # 其中_data_split_by_status 将数据分为charge、drive、stand、和none段;
  26. # _data_split_by_time 将每个段内的数据,根据时间跳变继续分段。
  27. # '''
  28. def time_filter(self, df_bms, df_gps):
  29. df_bms.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  30. df_gps.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  31. df_bms = df_bms.reset_index(drop=True)
  32. df_gps = df_gps.reset_index(drop=True)
  33. return df_bms, df_gps
  34. def data_split_by_status(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
  35. drive_stand_threshold=120, charge_stand_threshold=300):
  36. '''
  37. # 数据预处理分段, 将原始数据段分为 charge、drive、stand、none段
  38. # 状态判断
  39. # 1、drive:(状态为2或3 且 存在电流>0 ) 或 (电流持续为0 且 持续时间<阈值 且 上一段数据为行车)
  40. # 2、charge:(状态为2或3 且 不存在电流>0 ) 或 (电流持续为0 且 持续时间<阈值 且 上一段数据为充电)
  41. # 3、stand:(电流持续为0 且 是数据段的第一段) 或 (电流持续为0 且 持续时间>阈值)
  42. # 4、none: 其他
  43. --------------输入参数-------------:
  44. drive_interval_threshold: 行车段拼接阈值,如果两段行车的间隔时间小于该值,则两段行车合并。
  45. charge_interval_threshold: 充电段拼接阈值,如果两段充电的间隔时间小于该值,则两段充电合并。
  46. drive_stand_threshold: 静置段合并至行车段阈值,如果静置时间小于该值,则合并到上一段的行车中。
  47. charge_stand_threshold: 静置段合并至充电段阈值,如果静置时间小于该值,则合并到上一段的充电中。
  48. --------------输出-----------------:
  49. 在原始数据后面,增加data_split_by_crnt, data_split_by_status, data_status 三列
  50. data_split_by_crnt: 按电流分段的序号
  51. data_split_by_status:按电流和状态分段的序号
  52. data_status: 状态标识
  53. '''
  54. # 首先根据电流是否为0 ,将数据分段
  55. df = dfin.copy()
  56. df['时间戳'] = pd.to_datetime(df['时间戳'])
  57. crnt_zero_or_not = df['总电流[A]']==0
  58. last_crnt_flag = crnt_zero_or_not[0]
  59. temp = 1
  60. group_id = [temp]
  61. for cur_crnt_flag in crnt_zero_or_not[1:]:
  62. if last_crnt_flag ^ cur_crnt_flag:
  63. temp = temp + 1
  64. last_crnt_flag = cur_crnt_flag
  65. group_id.append(temp)
  66. df['data_split_by_crnt'] = group_id
  67. # 然后判断每个段内的 充电状态及电流=0持续时长,决定当前状态
  68. temp = 1
  69. last_status = ""
  70. status_id = []
  71. status_list = []
  72. data_number_list = sorted(list(set(df['data_split_by_crnt'])))
  73. for data_number in data_number_list:
  74. df_sel = df[df['data_split_by_crnt'] == data_number]
  75. origin_index = list(df_sel.index)
  76. df_sel = df_sel.reset_index(drop=True)
  77. temp_2 = 0
  78. # 如果当前数据段的电流非0,则可能分为charge、drive或none段
  79. if df_sel.loc[0,'总电流[A]'] != 0:
  80. # 电流 分段中可能存在状态变化的时刻, 内部根据状态进行分段.
  81. # 该数据段内部,根据bms状态信号进行二次分段
  82. status_drive_or_not = df_sel['充电状态']==3
  83. last_status_flag = status_drive_or_not[0]
  84. temp_2 = 0
  85. group_id_2 = [temp_2]
  86. for cur_status_flag in status_drive_or_not[1:]:
  87. if last_status_flag ^ cur_status_flag:
  88. temp_2 = temp_2 + 1
  89. last_status_flag = cur_status_flag
  90. group_id_2.append(temp_2)
  91. # 遍历二次状态分段
  92. temp_2 = 0
  93. last_status_2 = last_status
  94. df_sel['index'] = group_id_2
  95. data_number_list_2 = sorted(list(set(group_id_2)))
  96. for data_number_2 in data_number_list_2:
  97. df_sel_2 = df_sel[df_sel['index'] == data_number_2]
  98. df_sel_2 = df_sel_2.reset_index(drop=True)
  99. # 根据bms状态 及 电流符号决定是charge还是drive
  100. # 如果状态为2或3, 且电流均<=0 则记为充电
  101. if df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) == 0:
  102. cur_status = 'charge'
  103. # 如果状态为2或3,且存在电流>0 则记为行车
  104. elif df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) > 0:
  105. cur_status = 'drive'
  106. # 否则 记为none
  107. else:
  108. cur_status = 'none'
  109. status_list.extend([cur_status] * len(df_sel_2))
  110. # 状态id号与前面电流为0的相同状态进行合并, 均判断应不应该与上一段合并
  111. if origin_index[0] == 0: # 如果是所有数据的起始段数据,则直接赋值id号
  112. status_id.extend([temp + temp_2]*len(df_sel_2))
  113. else: # 判断是否与上一段数据合并
  114. deltaT = (df.loc[origin_index[0], '时间戳'] - df.loc[origin_index[0]-1, '时间戳']).total_seconds()
  115. # 如果 状态一致, 且 间隔时间小于阈值,则合并
  116. if last_status_2 == 'drive' and cur_status == last_status_2 and deltaT < drive_interval_threshold:
  117. temp_2 = temp_2 - 1
  118. status_id.extend([temp + temp_2]*len(df_sel_2))
  119. # 如果状态一致, 且 间隔时间小于阈值,则合并
  120. elif last_status_2 == 'charge' and cur_status == last_status_2 and deltaT < charge_interval_threshold:
  121. temp_2 = temp_2 - 1
  122. status_id.extend([temp + temp_2]*len(df_sel_2))
  123. else:
  124. status_id.extend([temp + temp_2]*len(df_sel_2))
  125. temp_2 = temp_2 + 1
  126. last_status_2 = status_list[-1]
  127. temp_2 = temp_2 - 1
  128. else:
  129. # 如果当前数据段的电流为0,则可能分为stand,charge、drive或none段
  130. if origin_index[0] == 0: # 如果是数据的起始,则无论长短,都认为是stand
  131. status_id.extend([temp]*len(df_sel))
  132. status_list.extend(['stand'] * len(df_sel))
  133. else: # 不是数据的起始
  134. cur_deltaT = (df.loc[origin_index[-1], '时间戳'] - df.loc[origin_index[0], '时间戳']).total_seconds()
  135. if last_status == 'charge': # 如果上一个状态为充电
  136. if cur_deltaT < charge_stand_threshold: # 如果本次电流为0的持续时间小于 阈值,则合并
  137. status_list.extend(['charge'] * len(df_sel))
  138. temp = temp - 1
  139. status_id.extend([temp]*len(df_sel))
  140. else: # 否则超过了阈值,记为stand
  141. status_id.extend([temp]*len(df_sel))
  142. status_list.extend(['stand'] * len(df_sel))
  143. elif last_status == 'drive': # 如果上一个状态为行车
  144. if cur_deltaT < drive_stand_threshold: # 如果本次电流为0的持续时间小于 阈值,则合并
  145. status_list.extend(['drive'] * len(df_sel))
  146. temp = temp - 1
  147. status_id.extend([temp]*len(df_sel))
  148. else: # 否则超过了阈值,记为stand
  149. status_id.extend([temp]*len(df_sel))
  150. status_list.extend(['stand'] * len(df_sel))
  151. elif last_status == 'none': # 如果上一个状态未知
  152. status_id.extend([temp] * len(df_sel))
  153. status_list.extend(['stand'] * len(df_sel))
  154. temp = temp + temp_2 + 1
  155. last_status = status_list[-1] # 上一组状态
  156. df['data_split_by_status'] = status_id
  157. df['data_status'] = status_list
  158. return df
  159. def data_split_by_time(self, dfin, default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
  160. stand_time_threshold = 1800):
  161. '''
  162. # 该函数用来解决数据丢失问题导致的分段序号异常,
  163. # 将经过data_split_by_status分段后的数据,每个段内两行数据的时间跳变如果超过阈值,则继续分为两段
  164. --------------输入参数-------------:
  165. dfin: 调用data_split_by_status之后的函数
  166. default_time_threshold: 默认时间阈值,如果状态内部时间跳变大于该值,则划分为两段
  167. drive_time_threshold: 行车时间阈值,如果行车状态内部时间跳变大于该值,则划分为两段
  168. charge_time_threshold: 充电时间阈值,如果充电状态内部时间跳变大于该值,则划分为两段
  169. stand_time_threshold:静置时间阈值,如果静置状态内部时间跳变大于该值,则划分为两段
  170. --------------输出-----------------:
  171. 在输入数据后面,增加data_split_by_status_time 一列
  172. data_split_by_status_time: 按照状态和时间分段后的序号
  173. '''
  174. data_id = []
  175. temp = 1
  176. data_number_list = sorted(list(set(dfin['data_split_by_status'])))
  177. for data_number in data_number_list:
  178. # if data_number == 1203:
  179. # pdb.set_trace()
  180. status = list(dfin[dfin['data_split_by_status']==data_number]['data_status'])[0]
  181. cur_indexes = dfin[dfin['data_split_by_status']==data_number].index
  182. time_array = np.array(dfin[dfin['data_split_by_status']==data_number]['时间戳'])
  183. time_diff = np.diff(time_array)
  184. time_diff = time_diff.astype(np.int64)
  185. time_interval = default_time_threshold
  186. if status == 'drive':
  187. time_interval = drive_time_threshold
  188. elif status == 'charge':
  189. time_interval = charge_time_threshold
  190. elif status == 'stand':
  191. time_interval = stand_time_threshold
  192. time_diff_index = (np.argwhere(((time_diff/1e9) > time_interval)==True))[:,0]
  193. time_diff_origin_index = cur_indexes[time_diff_index]+1
  194. if len(time_diff_index) == 0:
  195. data_id.extend([temp] * len(cur_indexes))
  196. temp += 1
  197. else:
  198. last_index = cur_indexes[0]
  199. for index, cur_index in enumerate(time_diff_origin_index):
  200. if index == len(time_diff_origin_index)-1: # 如果是最后一个index,则
  201. data_id.extend([temp]* (cur_index-last_index))
  202. last_index = cur_index
  203. temp += 1
  204. data_id.extend([temp]* (cur_indexes[-1]-last_index+1))
  205. else:
  206. data_id.extend([temp]* (cur_index-last_index))
  207. last_index = cur_index
  208. temp += 1
  209. dfin['data_split_by_status_time'] = data_id
  210. return dfin
  211. def combine_drive_stand(self, dfin):
  212. '''
  213. 合并放电和静置段:将两次充电之间的所有数据段合并为一段, 状态分为 charge 和not charge
  214. ---------------输入----------
  215. dfin: 调用data_split_by_status()后输出的bms数据
  216. ---------------输出----------
  217. 在输入数据后面,增加data_split_by_status_after_combine, data_status_after_combine 两列
  218. data_split_by_status_after_combine: 将两次充电间的数据合并后的段序号
  219. data_status_after_combine: 每段数据的状态标识
  220. '''
  221. df = dfin.copy()
  222. data_split_by_status_1 = []
  223. data_status_1 = []
  224. number = 1
  225. first_flag = True
  226. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  227. for data_number in data_number_list:
  228. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  229. cur_status = status[0]
  230. if first_flag:
  231. first_flag = False
  232. elif (last_status not in ['charge'] and cur_status in ['charge']) or (last_status in ['charge'] and cur_status not in ['charge']):
  233. number += 1
  234. data_split_by_status_1.extend([number]*len(status))
  235. if cur_status in ['charge']:
  236. data_status_1.extend(['charge']*len(status))
  237. else:
  238. data_status_1.extend(['not charge']*len(status))
  239. last_status = cur_status
  240. df['data_split_by_status_after_combine'] = data_split_by_status_1
  241. df['data_status_after_combine'] = data_status_1
  242. return df
  243. def cal_stand_time(self, dfin):
  244. '''
  245. # 计算静置时间
  246. # 将每次行车或充电的前后静置时间,赋值给stand_time 列, 单位为分钟
  247. ----------------输入参数---------
  248. dfin: 调用data_split_by_status()后输出的bms数据
  249. ----------------输出参数----------
  250. 在输入数据后面,增加stand_time列
  251. stand_time : 在行车段或充电段的起止两个位置处,表明开始前和结束后的静置时长,单位为分钟
  252. '''
  253. df = dfin.copy()
  254. stand_time = []
  255. first_flag = True
  256. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  257. for index, data_number in enumerate(data_number_list):
  258. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  259. time = list(df[df['data_split_by_status_time']==data_number]['时间戳'])
  260. cur_status = status[0]
  261. cur_delta_time = (time[-1]-time[0]).total_seconds() / 60.0 # 分钟
  262. if len(status) >= 2:
  263. if first_flag:
  264. first_flag = False
  265. if index < len(data_number_list)-1:
  266. if cur_status in ['charge', 'drive']:
  267. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  268. stand_time.extend([None]*(len(status)-1))
  269. if next_status == 'stand':
  270. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  271. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  272. else:
  273. stand_time.extend([0])
  274. else:
  275. stand_time.extend([None]*len(status))
  276. else:
  277. stand_time.extend([None]*len(status))
  278. else:
  279. if cur_status in ['charge', 'drive']:
  280. if last_status == 'stand':
  281. stand_time.extend([last_delta_time])
  282. else:
  283. stand_time.extend([0])
  284. stand_time.extend([None]*(len(status)-2))
  285. if index < len(data_number_list)-1:
  286. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  287. if next_status == 'stand':
  288. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  289. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  290. else:
  291. stand_time.extend([0])
  292. else:
  293. stand_time.extend([None])
  294. else:
  295. stand_time.extend([None]*len(status))
  296. else:
  297. stand_time.extend([None])
  298. last_status = cur_status
  299. last_delta_time = cur_delta_time
  300. df['stand_time'] = stand_time
  301. return df
  302. # 输入GPS数据,返回本段数据的累积里程,及平均时速(如果两点之间)
  303. @jit
  304. def _cal_odo_speed(self, lat_list, long_list, time_list):
  305. '''
  306. 输入:经度列表, 纬度列表, 时间列表;
  307. 输出:每两个经纬度坐标之间的距离,以及速度 的数组
  308. '''
  309. dis_array = []
  310. speed_array = []
  311. for i in range(len(lat_list)-1):
  312. dis = self.tools.cal_distance(lat_list[i],long_list[i], lat_list[i+1],long_list[i+1])
  313. dis_array.append(dis)
  314. deltaT = abs(time_list[i] - time_list[i+1]).total_seconds()
  315. speed_array.append(dis * 3600.0/deltaT)
  316. return np.array(dis_array), np.array(speed_array)
  317. def gps_data_judge(self, df_bms, df_gps, time_diff_thre=300, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  318. '''
  319. GPS数据可靠性判断函数(基于combine前的分段)
  320. GPS数据出现以下情况时,判定为不可靠:
  321. 1)如果该段对应的地理位置数据 少于2 个,则认为不可靠
  322. 2)如果截取的GPS数据的起止时间,与BMS数据段的起止时间相差超过阈值,则认为不可靠
  323. 3)如果行车段 累积里程超过阈值,车速超过阈值
  324. 4) 如果非行车段 车速超过阈值
  325. --------------输入参数--------------:
  326. time_diff_thre: 时间差阈值
  327. odo_sum_thre: 累积里程阈值
  328. drive_spd_thre: 行车车速阈值
  329. parking_spd_thre: 非行车状态车速阈值
  330. --------------输出参数--------------:
  331. df_bms 增加一列gps_rely, 表明对应的GPS数据是否可靠。
  332. 1:可靠
  333. <0: 表示不可靠的原因
  334. df_gps 增加两列odo, speed, 分别表示前后两点间的距离和速度
  335. '''
  336. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  337. res_record = {'drive':0, 'charge':0, 'stand':0, 'none':0, 'total':0}
  338. rely_list = []
  339. df_gps['odo'] = [None] * len(df_gps)
  340. df_gps['speed'] = [None] * len(df_gps)
  341. data_number_list = sorted(list(set(df_bms['data_split_by_status_time'])))
  342. for data_number in data_number_list[:]:
  343. df_sel = df_bms[df_bms['data_split_by_status_time'] == data_number]
  344. df_sel = df_sel.reset_index(drop=True)
  345. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel.loc[len(df_sel)-1,'时间戳'])]
  346. origin_index = list(df_sel_gps.index)
  347. df_sel_gps = df_sel_gps.reset_index(drop=True)
  348. # 如果当前段数据对应的地理位置数据少于2个
  349. if len(df_sel_gps) <= 1:
  350. rely_list.extend([-1]*len(df_sel))
  351. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  352. continue
  353. # 如果GPS 起止时间段和BMS数据相差超过阈值
  354. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  355. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel.loc[len(df_sel)-1,'时间戳']).total_seconds() > time_diff_thre:
  356. rely_list.extend([-2]*len(df_sel))
  357. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  358. continue
  359. # 计算该段数据每两点之间的里程以及速度
  360. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  361. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  362. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  363. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  364. rely_list.extend([-3]*len(df_sel))
  365. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  366. continue
  367. # 如果停车,且 平均时速超过阈值,则不可靠
  368. if (str(df_sel.loc[0, 'data_status']) == 'charge' or str(df_sel.loc[0, 'data_status']) == 'stand') and avg_speed > parking_spd_thre :
  369. rely_list.extend([-4]*len(df_sel))
  370. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  371. continue
  372. # 剩下的记录为可靠
  373. rely_list.extend([1]*len(df_sel))
  374. df_gps.loc[origin_index[1:], 'odo'] = dis_array
  375. df_gps.loc[origin_index[1:], 'speed'] = speed_array
  376. df_bms['gps_rely'] = rely_list
  377. res_record['total'] = (res_record['drive'] + res_record['charge'] + res_record['stand'] + res_record['none'] )/df_bms['data_split_by_status_time'].max()
  378. if len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time'])) > 0:
  379. res_record['drive'] = (res_record['drive'])/len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time']))
  380. if len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time'])) > 0:
  381. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time']))
  382. if len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time'])) > 0:
  383. res_record['stand'] = (res_record['stand'])/len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time']))
  384. if len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time'])) > 0:
  385. res_record['none'] = (res_record['none'])/len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time']))
  386. return df_bms, df_gps, res_record
  387. def data_gps_judge_after_combine(self, df_bms, df_gps, time_diff_thre=600, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  388. '''
  389. GPS数据可靠性判断函数2 (基于combine后的分段) 判别方式同data_gps_judge
  390. '''
  391. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  392. res_record = {'not charge':0, 'charge':0, 'total':0} # 不可靠的比例
  393. rely_list = []
  394. df_gps['odo_after_combine'] = [None] * len(df_gps)
  395. df_gps['speed_after_combine'] = [None] * len(df_gps)
  396. data_number_list = sorted(list(set(df_bms['data_split_by_status_after_combine'])))
  397. for data_number in data_number_list[:]:
  398. df_sel = df_bms[df_bms['data_split_by_status_after_combine'] == data_number]
  399. df_sel = df_sel.reset_index(drop=True)
  400. # 尝试采用drive段的开始和结束时间选择GPS数据,因为stand时GPS数据可能存在丢失,影响里程的计算
  401. df_sel_drive = df_sel[df_sel['data_status']=='drive'] #
  402. df_sel_drive = df_sel_drive.reset_index(drop=True)
  403. if df_sel_drive.empty:
  404. df_sel_1 = df_sel
  405. else:
  406. df_sel_1 = df_sel_drive
  407. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel_1.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel_1.loc[len(df_sel_1)-1,'时间戳'])]
  408. origin_index = list(df_sel_gps.index)
  409. df_sel_gps = df_sel_gps.reset_index(drop=True)
  410. # 如果当前段数据对应的地理位置数据少于2个
  411. if len(df_sel_gps) <= 1:
  412. rely_list.extend([-1]*len(df_sel))
  413. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  414. continue
  415. # 如果GPS 起止时间段和BMS数据相差超过阈值
  416. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel_1.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  417. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel_1.loc[len(df_sel_1)-1,'时间戳']).total_seconds() > time_diff_thre:
  418. rely_list.extend([-2]*len(df_sel))
  419. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  420. continue
  421. # 计算该段数据每两点之间的里程以及速度
  422. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  423. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  424. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  425. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  426. rely_list.extend([-3]*len(df_sel))
  427. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  428. continue
  429. # 如果充电,且 平均时速超过阈值,则不可靠
  430. if str(df_sel.loc[0, 'data_status_after_combine']) == 'charge' and avg_speed > parking_spd_thre:
  431. rely_list.extend([-4]*len(df_sel))
  432. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  433. continue
  434. # 剩下的记录为可靠
  435. rely_list.extend([1]*len(df_sel))
  436. df_gps.loc[origin_index[1:], 'odo_after_combine'] = dis_array
  437. df_gps.loc[origin_index[1:], 'speed_after_combine'] = speed_array
  438. df_bms['gps_rely_after_combine'] = rely_list
  439. res_record['total'] = (res_record['not charge'] + res_record['charge'])/df_bms['data_split_by_status_after_combine'].max()
  440. if len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine'])) > 0:
  441. res_record['not charge'] = (res_record['not charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine']))
  442. if len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine'])) > 0 :
  443. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine']))
  444. return df_bms, df_gps, res_record
  445. =======
  446. '''
  447. 数据预处理类
  448. '''
  449. __author__ = 'Wang Liming'
  450. import CONFIGURE.PathSetting as PathSetting
  451. import sys
  452. sys.path.append(PathSetting.backend_path)
  453. from os import defpath
  454. import pandas as pd
  455. import numpy as np
  456. import pdb
  457. from numba import jit
  458. import Tools
  459. class DataPreProcess:
  460. def __init__(self):
  461. self.tools = Tools.Tools()
  462. pass
  463. # def data_split(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
  464. # drive_stand_threshold=120, charge_stand_threshold=300,
  465. # default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
  466. # stand_time_threshold = 1800):
  467. # '''
  468. # 数据分段函数,会调用_data_split_by_status和_data_split_by_time函数。
  469. # 其中_data_split_by_status 将数据分为charge、drive、stand、和none段;
  470. # _data_split_by_time 将每个段内的数据,根据时间跳变继续分段。
  471. # '''
  472. def time_filter(self, df_bms, df_gps):
  473. df_bms.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  474. df_gps.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
  475. df_bms = df_bms.reset_index(drop=True)
  476. df_gps = df_gps.reset_index(drop=True)
  477. return df_bms, df_gps
  478. def data_split_by_status(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
  479. drive_stand_threshold=120, charge_stand_threshold=300):
  480. '''
  481. # 数据预处理分段, 将原始数据段分为 charge、drive、stand、none段
  482. # 状态判断
  483. # 1、drive:(状态为2或3 且 存在电流>0 ) 或 (电流持续为0 且 持续时间<阈值 且 上一段数据为行车)
  484. # 2、charge:(状态为2或3 且 不存在电流>0 ) 或 (电流持续为0 且 持续时间<阈值 且 上一段数据为充电)
  485. # 3、stand:(电流持续为0 且 是数据段的第一段) 或 (电流持续为0 且 持续时间>阈值)
  486. # 4、none: 其他
  487. --------------输入参数-------------:
  488. drive_interval_threshold: 行车段拼接阈值,如果两段行车的间隔时间小于该值,则两段行车合并。
  489. charge_interval_threshold: 充电段拼接阈值,如果两段充电的间隔时间小于该值,则两段充电合并。
  490. drive_stand_threshold: 静置段合并至行车段阈值,如果静置时间小于该值,则合并到上一段的行车中。
  491. charge_stand_threshold: 静置段合并至充电段阈值,如果静置时间小于该值,则合并到上一段的充电中。
  492. --------------输出-----------------:
  493. 在原始数据后面,增加data_split_by_crnt, data_split_by_status, data_status 三列
  494. data_split_by_crnt: 按电流分段的序号
  495. data_split_by_status:按电流和状态分段的序号
  496. data_status: 状态标识
  497. '''
  498. # 首先根据电流是否为0 ,将数据分段
  499. df = dfin.copy()
  500. df['时间戳'] = pd.to_datetime(df['时间戳'])
  501. crnt_zero_or_not = df['总电流[A]']==0
  502. last_crnt_flag = crnt_zero_or_not[0]
  503. temp = 1
  504. group_id = [temp]
  505. for cur_crnt_flag in crnt_zero_or_not[1:]:
  506. if last_crnt_flag ^ cur_crnt_flag:
  507. temp = temp + 1
  508. last_crnt_flag = cur_crnt_flag
  509. group_id.append(temp)
  510. df['data_split_by_crnt'] = group_id
  511. # 然后判断每个段内的 充电状态及电流=0持续时长,决定当前状态
  512. temp = 1
  513. last_status = ""
  514. status_id = []
  515. status_list = []
  516. data_number_list = sorted(list(set(df['data_split_by_crnt'])))
  517. for data_number in data_number_list:
  518. df_sel = df[df['data_split_by_crnt'] == data_number]
  519. origin_index = list(df_sel.index)
  520. df_sel = df_sel.reset_index(drop=True)
  521. temp_2 = 0
  522. # 如果当前数据段的电流非0,则可能分为charge、drive或none段
  523. if df_sel.loc[0,'总电流[A]'] != 0:
  524. # 电流 分段中可能存在状态变化的时刻, 内部根据状态进行分段.
  525. # 该数据段内部,根据bms状态信号进行二次分段
  526. status_drive_or_not = df_sel['充电状态']==3
  527. last_status_flag = status_drive_or_not[0]
  528. temp_2 = 0
  529. group_id_2 = [temp_2]
  530. for cur_status_flag in status_drive_or_not[1:]:
  531. if last_status_flag ^ cur_status_flag:
  532. temp_2 = temp_2 + 1
  533. last_status_flag = cur_status_flag
  534. group_id_2.append(temp_2)
  535. # 遍历二次状态分段
  536. temp_2 = 0
  537. last_status_2 = last_status
  538. df_sel['index'] = group_id_2
  539. data_number_list_2 = sorted(list(set(group_id_2)))
  540. for data_number_2 in data_number_list_2:
  541. df_sel_2 = df_sel[df_sel['index'] == data_number_2]
  542. df_sel_2 = df_sel_2.reset_index(drop=True)
  543. # 根据bms状态 及 电流符号决定是charge还是drive
  544. # 如果状态为2或3, 且电流均<=0 则记为充电
  545. if df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) == 0:
  546. cur_status = 'charge'
  547. # 如果状态为2或3,且存在电流>0 则记为行车
  548. elif df_sel_2.loc[0, '充电状态'] in [2, 3] and len(df_sel_2[df_sel_2['总电流[A]'] > 0]) > 0:
  549. cur_status = 'drive'
  550. # 否则 记为none
  551. else:
  552. cur_status = 'none'
  553. status_list.extend([cur_status] * len(df_sel_2))
  554. # 状态id号与前面电流为0的相同状态进行合并, 均判断应不应该与上一段合并
  555. if origin_index[0] == 0: # 如果是所有数据的起始段数据,则直接赋值id号
  556. status_id.extend([temp + temp_2]*len(df_sel_2))
  557. else: # 判断是否与上一段数据合并
  558. deltaT = (df.loc[origin_index[0], '时间戳'] - df.loc[origin_index[0]-1, '时间戳']).total_seconds()
  559. # 如果 状态一致, 且 间隔时间小于阈值,则合并
  560. if last_status_2 == 'drive' and cur_status == last_status_2 and deltaT < drive_interval_threshold:
  561. temp_2 = temp_2 - 1
  562. status_id.extend([temp + temp_2]*len(df_sel_2))
  563. # 如果状态一致, 且 间隔时间小于阈值,则合并
  564. elif last_status_2 == 'charge' and cur_status == last_status_2 and deltaT < charge_interval_threshold:
  565. temp_2 = temp_2 - 1
  566. status_id.extend([temp + temp_2]*len(df_sel_2))
  567. else:
  568. status_id.extend([temp + temp_2]*len(df_sel_2))
  569. temp_2 = temp_2 + 1
  570. last_status_2 = status_list[-1]
  571. temp_2 = temp_2 - 1
  572. else:
  573. # 如果当前数据段的电流为0,则可能分为stand,charge、drive或none段
  574. if origin_index[0] == 0: # 如果是数据的起始,则无论长短,都认为是stand
  575. status_id.extend([temp]*len(df_sel))
  576. status_list.extend(['stand'] * len(df_sel))
  577. else: # 不是数据的起始
  578. cur_deltaT = (df.loc[origin_index[-1], '时间戳'] - df.loc[origin_index[0], '时间戳']).total_seconds()
  579. if last_status == 'charge': # 如果上一个状态为充电
  580. if cur_deltaT < charge_stand_threshold: # 如果本次电流为0的持续时间小于 阈值,则合并
  581. status_list.extend(['charge'] * len(df_sel))
  582. temp = temp - 1
  583. status_id.extend([temp]*len(df_sel))
  584. else: # 否则超过了阈值,记为stand
  585. status_id.extend([temp]*len(df_sel))
  586. status_list.extend(['stand'] * len(df_sel))
  587. elif last_status == 'drive': # 如果上一个状态为行车
  588. if cur_deltaT < drive_stand_threshold: # 如果本次电流为0的持续时间小于 阈值,则合并
  589. status_list.extend(['drive'] * len(df_sel))
  590. temp = temp - 1
  591. status_id.extend([temp]*len(df_sel))
  592. else: # 否则超过了阈值,记为stand
  593. status_id.extend([temp]*len(df_sel))
  594. status_list.extend(['stand'] * len(df_sel))
  595. elif last_status == 'none': # 如果上一个状态未知
  596. status_id.extend([temp] * len(df_sel))
  597. status_list.extend(['stand'] * len(df_sel))
  598. temp = temp + temp_2 + 1
  599. last_status = status_list[-1] # 上一组状态
  600. df['data_split_by_status'] = status_id
  601. df['data_status'] = status_list
  602. return df
  603. def data_split_by_time(self, dfin, default_time_threshold = 300, drive_time_threshold=300, charge_time_threshold=300,
  604. stand_time_threshold = 1800):
  605. '''
  606. # 该函数用来解决数据丢失问题导致的分段序号异常,
  607. # 将经过data_split_by_status分段后的数据,每个段内两行数据的时间跳变如果超过阈值,则继续分为两段
  608. --------------输入参数-------------:
  609. dfin: 调用data_split_by_status之后的函数
  610. default_time_threshold: 默认时间阈值,如果状态内部时间跳变大于该值,则划分为两段
  611. drive_time_threshold: 行车时间阈值,如果行车状态内部时间跳变大于该值,则划分为两段
  612. charge_time_threshold: 充电时间阈值,如果充电状态内部时间跳变大于该值,则划分为两段
  613. stand_time_threshold:静置时间阈值,如果静置状态内部时间跳变大于该值,则划分为两段
  614. --------------输出-----------------:
  615. 在输入数据后面,增加data_split_by_status_time 一列
  616. data_split_by_status_time: 按照状态和时间分段后的序号
  617. '''
  618. data_id = []
  619. temp = 1
  620. data_number_list = sorted(list(set(dfin['data_split_by_status'])))
  621. for data_number in data_number_list:
  622. # if data_number == 1203:
  623. # pdb.set_trace()
  624. status = list(dfin[dfin['data_split_by_status']==data_number]['data_status'])[0]
  625. cur_indexes = dfin[dfin['data_split_by_status']==data_number].index
  626. time_array = np.array(dfin[dfin['data_split_by_status']==data_number]['时间戳'])
  627. time_diff = np.diff(time_array)
  628. time_diff = time_diff.astype(np.int64)
  629. time_interval = default_time_threshold
  630. if status == 'drive':
  631. time_interval = drive_time_threshold
  632. elif status == 'charge':
  633. time_interval = charge_time_threshold
  634. elif status == 'stand':
  635. time_interval = stand_time_threshold
  636. time_diff_index = (np.argwhere(((time_diff/1e9) > time_interval)==True))[:,0]
  637. time_diff_origin_index = cur_indexes[time_diff_index]+1
  638. if len(time_diff_index) == 0:
  639. data_id.extend([temp] * len(cur_indexes))
  640. temp += 1
  641. else:
  642. last_index = cur_indexes[0]
  643. for index, cur_index in enumerate(time_diff_origin_index):
  644. if index == len(time_diff_origin_index)-1: # 如果是最后一个index,则
  645. data_id.extend([temp]* (cur_index-last_index))
  646. last_index = cur_index
  647. temp += 1
  648. data_id.extend([temp]* (cur_indexes[-1]-last_index+1))
  649. else:
  650. data_id.extend([temp]* (cur_index-last_index))
  651. last_index = cur_index
  652. temp += 1
  653. dfin['data_split_by_status_time'] = data_id
  654. return dfin
  655. def combine_drive_stand(self, dfin):
  656. '''
  657. 合并放电和静置段:将两次充电之间的所有数据段合并为一段, 状态分为 charge 和not charge
  658. ---------------输入----------
  659. dfin: 调用data_split_by_status()后输出的bms数据
  660. ---------------输出----------
  661. 在输入数据后面,增加data_split_by_status_after_combine, data_status_after_combine 两列
  662. data_split_by_status_after_combine: 将两次充电间的数据合并后的段序号
  663. data_status_after_combine: 每段数据的状态标识
  664. '''
  665. df = dfin.copy()
  666. data_split_by_status_1 = []
  667. data_status_1 = []
  668. number = 1
  669. first_flag = True
  670. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  671. for data_number in data_number_list:
  672. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  673. cur_status = status[0]
  674. if first_flag:
  675. first_flag = False
  676. elif (last_status not in ['charge'] and cur_status in ['charge']) or (last_status in ['charge'] and cur_status not in ['charge']):
  677. number += 1
  678. data_split_by_status_1.extend([number]*len(status))
  679. if cur_status in ['charge']:
  680. data_status_1.extend(['charge']*len(status))
  681. else:
  682. data_status_1.extend(['not charge']*len(status))
  683. last_status = cur_status
  684. df['data_split_by_status_after_combine'] = data_split_by_status_1
  685. df['data_status_after_combine'] = data_status_1
  686. return df
  687. def cal_stand_time(self, dfin):
  688. '''
  689. # 计算静置时间
  690. # 将每次行车或充电的前后静置时间,赋值给stand_time 列, 单位为分钟
  691. ----------------输入参数---------
  692. dfin: 调用data_split_by_status()后输出的bms数据
  693. ----------------输出参数----------
  694. 在输入数据后面,增加stand_time列
  695. stand_time : 在行车段或充电段的起止两个位置处,表明开始前和结束后的静置时长,单位为分钟
  696. '''
  697. df = dfin.copy()
  698. stand_time = []
  699. first_flag = True
  700. data_number_list = sorted(list(set(df['data_split_by_status_time'])))
  701. for index, data_number in enumerate(data_number_list):
  702. status = list(df[df['data_split_by_status_time']==data_number]['data_status'])
  703. time = list(df[df['data_split_by_status_time']==data_number]['时间戳'])
  704. cur_status = status[0]
  705. cur_delta_time = (time[-1]-time[0]).total_seconds() / 60.0 # 分钟
  706. if len(status) >= 2:
  707. if first_flag:
  708. first_flag = False
  709. if index < len(data_number_list)-1:
  710. if cur_status in ['charge', 'drive']:
  711. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  712. stand_time.extend([None]*(len(status)-1))
  713. if next_status == 'stand':
  714. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  715. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  716. else:
  717. stand_time.extend([0])
  718. else:
  719. stand_time.extend([None]*len(status))
  720. else:
  721. stand_time.extend([None]*len(status))
  722. else:
  723. if cur_status in ['charge', 'drive']:
  724. if last_status == 'stand':
  725. stand_time.extend([last_delta_time])
  726. else:
  727. stand_time.extend([0])
  728. stand_time.extend([None]*(len(status)-2))
  729. if index < len(data_number_list)-1:
  730. next_status = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['data_status'])[0]
  731. if next_status == 'stand':
  732. next_time = list(df[df['data_split_by_status_time']==data_number_list[index+1]]['时间戳'])
  733. stand_time.extend([(next_time[-1]-next_time[0]).total_seconds() / 60.0])
  734. else:
  735. stand_time.extend([0])
  736. else:
  737. stand_time.extend([None])
  738. else:
  739. stand_time.extend([None]*len(status))
  740. else:
  741. stand_time.extend([None])
  742. last_status = cur_status
  743. last_delta_time = cur_delta_time
  744. df['stand_time'] = stand_time
  745. return df
  746. # 输入GPS数据,返回本段数据的累积里程,及平均时速(如果两点之间)
  747. @jit
  748. def _cal_odo_speed(self, lat_list, long_list, time_list):
  749. '''
  750. 输入:经度列表, 纬度列表, 时间列表;
  751. 输出:每两个经纬度坐标之间的距离,以及速度 的数组
  752. '''
  753. dis_array = []
  754. speed_array = []
  755. for i in range(len(lat_list)-1):
  756. dis = self.tools.cal_distance(lat_list[i],long_list[i], lat_list[i+1],long_list[i+1])
  757. dis_array.append(dis)
  758. deltaT = abs(time_list[i] - time_list[i+1]).total_seconds()
  759. speed_array.append(dis * 3600.0/deltaT)
  760. return np.array(dis_array), np.array(speed_array)
  761. def gps_data_judge(self, df_bms, df_gps, time_diff_thre=300, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  762. '''
  763. GPS数据可靠性判断函数(基于combine前的分段)
  764. GPS数据出现以下情况时,判定为不可靠:
  765. 1)如果该段对应的地理位置数据 少于2 个,则认为不可靠
  766. 2)如果截取的GPS数据的起止时间,与BMS数据段的起止时间相差超过阈值,则认为不可靠
  767. 3)如果行车段 累积里程超过阈值,车速超过阈值
  768. 4) 如果非行车段 车速超过阈值
  769. --------------输入参数--------------:
  770. time_diff_thre: 时间差阈值
  771. odo_sum_thre: 累积里程阈值
  772. drive_spd_thre: 行车车速阈值
  773. parking_spd_thre: 非行车状态车速阈值
  774. --------------输出参数--------------:
  775. df_bms 增加一列gps_rely, 表明对应的GPS数据是否可靠。
  776. 1:可靠
  777. <0: 表示不可靠的原因
  778. df_gps 增加两列odo, speed, 分别表示前后两点间的距离和速度
  779. '''
  780. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  781. res_record = {'drive':0, 'charge':0, 'stand':0, 'none':0, 'total':0}
  782. rely_list = []
  783. df_gps['odo'] = [None] * len(df_gps)
  784. df_gps['speed'] = [None] * len(df_gps)
  785. data_number_list = sorted(list(set(df_bms['data_split_by_status_time'])))
  786. for data_number in data_number_list[:]:
  787. df_sel = df_bms[df_bms['data_split_by_status_time'] == data_number]
  788. df_sel = df_sel.reset_index(drop=True)
  789. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel.loc[len(df_sel)-1,'时间戳'])]
  790. origin_index = list(df_sel_gps.index)
  791. df_sel_gps = df_sel_gps.reset_index(drop=True)
  792. # 如果当前段数据对应的地理位置数据少于2个
  793. if len(df_sel_gps) <= 1:
  794. rely_list.extend([-1]*len(df_sel))
  795. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  796. continue
  797. # 如果GPS 起止时间段和BMS数据相差超过阈值
  798. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  799. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel.loc[len(df_sel)-1,'时间戳']).total_seconds() > time_diff_thre:
  800. rely_list.extend([-2]*len(df_sel))
  801. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  802. continue
  803. # 计算该段数据每两点之间的里程以及速度
  804. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  805. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  806. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  807. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  808. rely_list.extend([-3]*len(df_sel))
  809. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  810. continue
  811. # 如果停车,且 平均时速超过阈值,则不可靠
  812. if (str(df_sel.loc[0, 'data_status']) == 'charge' or str(df_sel.loc[0, 'data_status']) == 'stand') and avg_speed > parking_spd_thre :
  813. rely_list.extend([-4]*len(df_sel))
  814. res_record[str(df_sel.loc[0, 'data_status'])] = res_record[str(df_sel.loc[0, 'data_status'])] + 1
  815. continue
  816. # 剩下的记录为可靠
  817. rely_list.extend([1]*len(df_sel))
  818. df_gps.loc[origin_index[1:], 'odo'] = dis_array
  819. df_gps.loc[origin_index[1:], 'speed'] = speed_array
  820. df_bms['gps_rely'] = rely_list
  821. res_record['total'] = (res_record['drive'] + res_record['charge'] + res_record['stand'] + res_record['none'] )/df_bms['data_split_by_status_time'].max()
  822. if len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time'])) > 0:
  823. res_record['drive'] = (res_record['drive'])/len(set(df_bms[df_bms['data_status']=='drive']['data_split_by_status_time']))
  824. if len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time'])) > 0:
  825. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status']=='charge']['data_split_by_status_time']))
  826. if len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time'])) > 0:
  827. res_record['stand'] = (res_record['stand'])/len(set(df_bms[df_bms['data_status']=='stand']['data_split_by_status_time']))
  828. if len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time'])) > 0:
  829. res_record['none'] = (res_record['none'])/len(set(df_bms[df_bms['data_status']=='none']['data_split_by_status_time']))
  830. return df_bms, df_gps, res_record
  831. def data_gps_judge_after_combine(self, df_bms, df_gps, time_diff_thre=600, odo_sum_thre=200, drive_spd_thre=80, parking_spd_thre=2):
  832. '''
  833. GPS数据可靠性判断函数2 (基于combine后的分段) 判别方式同data_gps_judge
  834. '''
  835. df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
  836. res_record = {'not charge':0, 'charge':0, 'total':0} # 不可靠的比例
  837. rely_list = []
  838. df_gps['odo_after_combine'] = [None] * len(df_gps)
  839. df_gps['speed_after_combine'] = [None] * len(df_gps)
  840. data_number_list = sorted(list(set(df_bms['data_split_by_status_after_combine'])))
  841. for data_number in data_number_list[:]:
  842. df_sel = df_bms[df_bms['data_split_by_status_after_combine'] == data_number]
  843. df_sel = df_sel.reset_index(drop=True)
  844. # 尝试采用drive段的开始和结束时间选择GPS数据,因为stand时GPS数据可能存在丢失,影响里程的计算
  845. df_sel_drive = df_sel[df_sel['data_status']=='drive'] #
  846. df_sel_drive = df_sel_drive.reset_index(drop=True)
  847. if df_sel_drive.empty:
  848. df_sel_1 = df_sel
  849. else:
  850. df_sel_1 = df_sel_drive
  851. df_sel_gps = df_gps[(df_gps['时间戳']>=df_sel_1.loc[0,'时间戳']) & (df_gps['时间戳']<=df_sel_1.loc[len(df_sel_1)-1,'时间戳'])]
  852. origin_index = list(df_sel_gps.index)
  853. df_sel_gps = df_sel_gps.reset_index(drop=True)
  854. # 如果当前段数据对应的地理位置数据少于2个
  855. if len(df_sel_gps) <= 1:
  856. rely_list.extend([-1]*len(df_sel))
  857. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  858. continue
  859. # 如果GPS 起止时间段和BMS数据相差超过阈值
  860. if abs(df_sel_gps.loc[0, '时间戳'] - df_sel_1.loc[0,'时间戳']).total_seconds() > time_diff_thre or \
  861. abs(df_sel_gps.loc[len(df_sel_gps)-1, '时间戳'] - df_sel_1.loc[len(df_sel_1)-1,'时间戳']).total_seconds() > time_diff_thre:
  862. rely_list.extend([-2]*len(df_sel))
  863. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  864. continue
  865. # 计算该段数据每两点之间的里程以及速度
  866. dis_array, speed_array = self._cal_odo_speed(df_sel_gps['纬度'], df_sel_gps['经度'], df_sel_gps['时间戳'])
  867. # 如果 累积里程异常 或 平均车速异常 或两点间车速异常
  868. avg_speed = np.sum(dis_array) *3600.0 / abs(df_sel_gps.loc[0, '时间戳'] - df_sel_gps.loc[len(df_sel_gps)-1, '时间戳']).total_seconds()
  869. if np.sum(dis_array) > odo_sum_thre or avg_speed > drive_spd_thre or (speed_array > drive_spd_thre).any():
  870. rely_list.extend([-3]*len(df_sel))
  871. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  872. continue
  873. # 如果充电,且 平均时速超过阈值,则不可靠
  874. if str(df_sel.loc[0, 'data_status_after_combine']) == 'charge' and avg_speed > parking_spd_thre:
  875. rely_list.extend([-4]*len(df_sel))
  876. res_record[str(df_sel.loc[0, 'data_status_after_combine'])] = res_record[str(df_sel.loc[0, 'data_status_after_combine'])] + 1
  877. continue
  878. # 剩下的记录为可靠
  879. rely_list.extend([1]*len(df_sel))
  880. df_gps.loc[origin_index[1:], 'odo_after_combine'] = dis_array
  881. df_gps.loc[origin_index[1:], 'speed_after_combine'] = speed_array
  882. df_bms['gps_rely_after_combine'] = rely_list
  883. res_record['total'] = (res_record['not charge'] + res_record['charge'])/df_bms['data_split_by_status_after_combine'].max()
  884. if len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine'])) > 0:
  885. res_record['not charge'] = (res_record['not charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='not charge']['data_split_by_status_after_combine']))
  886. if len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine'])) > 0 :
  887. res_record['charge'] = (res_record['charge'])/len(set(df_bms[df_bms['data_status_after_combine']=='charge']['data_split_by_status_after_combine']))
  888. return df_bms, df_gps, res_record
  889. >>>>>>> 65a87ae16013552e359df047df19f46fc4e6eb08