"""Data preprocessing for BMS (battery) and GPS records."""

__author__ = 'Wang Liming'

import CONFIGURE.PathSetting as PathSetting
import sys
sys.path.append(PathSetting.backend_path)
from os import defpath
import pandas as pd
import numpy as np
import pdb
from numba import jit
import Tools


class DataPreProcess:
    """Segment BMS data by status/time and judge the reliability of GPS data.

    Column names used throughout ('时间戳' timestamp, '总电流[A]' total
    current, '充电状态' charge state, '纬度' latitude, '经度' longitude) are
    part of the data schema and must be present in the frames passed in.
    """

    def __init__(self):
        self.tools = Tools.Tools()

    # NOTE: a combined ``data_split`` entry point (status split followed by
    # time split, taking the union of the thresholds of data_split_by_status
    # and data_split_by_time) was sketched here but never implemented. Call
    # data_split_by_status() and then data_split_by_time() instead.

    @staticmethod
    def _run_ids(flags, start=0):
        """Return run ids (a list of ints) that increase whenever *flags* changes value."""
        flags = np.asarray(flags, dtype=bool)
        if flags.size == 0:
            return []
        changes = flags[1:] != flags[:-1]
        return (start + np.concatenate(([0], np.cumsum(changes)))).tolist()

    def time_filter(self, df_bms, df_gps):
        """Drop rows with duplicated timestamps from both frames.

        The de-duplication is done in place on the frames passed in (the
        first row of each duplicated timestamp is kept); the returned frames
        additionally have a fresh 0-based index.

        Returns:
            (df_bms, df_gps) with unique timestamps and a reset index.
        """
        df_bms.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
        df_gps.drop_duplicates(subset=['时间戳'], keep='first', inplace=True)
        df_bms = df_bms.reset_index(drop=True)
        df_gps = df_gps.reset_index(drop=True)
        return df_bms, df_gps

    def data_split_by_status(self, dfin, drive_interval_threshold=120, charge_interval_threshold=300,
                             drive_stand_threshold=120, charge_stand_threshold=300):
        """Split raw BMS data into charge / drive / stand / none segments.

        Status rules:
          1. drive:  (charge state is 2 or 3 and some current > 0), or
                     (current stays 0 for less than the threshold and the
                     previous segment is drive).
          2. charge: (charge state is 2 or 3 and no current > 0), or
                     (current stays 0 for less than the threshold and the
                     previous segment is charge).
          3. stand:  current stays 0 and it is the first segment, or current
                     stays 0 for at least the threshold.
          4. none:   everything else.

        Args:
            dfin: raw BMS frame; not modified (a copy is returned).
            drive_interval_threshold: two drive segments separated by less
                than this many seconds are merged.
            charge_interval_threshold: two charge segments separated by less
                than this many seconds are merged.
            drive_stand_threshold: a zero-current span shorter than this many
                seconds is merged into the preceding drive segment.
            charge_stand_threshold: a zero-current span shorter than this many
                seconds is merged into the preceding charge segment.

        Returns:
            A copy of *dfin* (same index) with '时间戳' converted to datetime
            and three added columns:
              data_split_by_crnt:   segment number when splitting by
                                    "current is zero" only.
              data_split_by_status: segment number after applying status and
                                    merge rules.
              data_status:          status label of each row.
        """
        df = dfin.copy()
        df['时间戳'] = pd.to_datetime(df['时间戳'])
        # The algorithm below addresses "first row" and "previous row" by
        # label, so run it on a 0-based index and restore the caller's index
        # at the end.
        df = df.reset_index(drop=True)

        # Step 1: split on whether the current is zero.
        df['data_split_by_crnt'] = self._run_ids(df['总电流[A]'] == 0, start=1)

        # Step 2: decide the status of every current-based segment.
        temp = 1            # next fresh status-segment id
        last_status = ""    # status of the last row of the previous segment
        status_id = []
        status_list = []
        for data_number in sorted(set(df['data_split_by_crnt'])):
            df_sel = df[df['data_split_by_crnt'] == data_number]
            origin_index = list(df_sel.index)
            df_sel = df_sel.reset_index(drop=True)
            is_first_segment = origin_index[0] == 0
            temp_2 = 0
            if df_sel.loc[0, '总电流[A]'] != 0:
                # Non-zero current: charge, drive or none. The BMS state may
                # change inside the segment, so split again on "state == 3".
                sub_ids = np.array(self._run_ids(df_sel['充电状态'] == 3))
                if not is_first_segment:
                    # Gap between this segment's first row and the row before it.
                    deltaT = (df.loc[origin_index[0], '时间戳']
                              - df.loc[origin_index[0] - 1, '时间戳']).total_seconds()
                last_status_2 = last_status
                for sub_id in sorted(set(sub_ids.tolist())):
                    df_sel_2 = df_sel[sub_ids == sub_id]
                    rows = len(df_sel_2)
                    state_ok = df_sel_2['充电状态'].iloc[0] in [2, 3]
                    has_positive = (df_sel_2['总电流[A]'] > 0).any()
                    if state_ok and not has_positive:
                        # State 2/3 and every current <= 0: charging.
                        cur_status = 'charge'
                    elif state_ok and has_positive:
                        # State 2/3 and some current > 0: driving.
                        cur_status = 'drive'
                    else:
                        cur_status = 'none'
                    status_list.extend([cur_status] * rows)

                    # Merge with the previous piece when the status matches
                    # and the gap is below the corresponding threshold.
                    if not is_first_segment:
                        same = cur_status == last_status_2
                        if last_status_2 == 'drive' and same and deltaT < drive_interval_threshold:
                            temp_2 = temp_2 - 1
                        elif last_status_2 == 'charge' and same and deltaT < charge_interval_threshold:
                            temp_2 = temp_2 - 1
                    status_id.extend([temp + temp_2] * rows)
                    temp_2 = temp_2 + 1
                    last_status_2 = status_list[-1]
                # Undo the increment made after the last sub-segment.
                temp_2 = temp_2 - 1
            else:
                # Zero current: stand, or a short pause merged into the
                # preceding charge/drive segment.
                rows = len(df_sel)
                merged_status = None
                if not is_first_segment:
                    cur_deltaT = (df.loc[origin_index[-1], '时间戳']
                                  - df.loc[origin_index[0], '时间戳']).total_seconds()
                    if last_status == 'charge' and cur_deltaT < charge_stand_threshold:
                        merged_status = 'charge'
                    elif last_status == 'drive' and cur_deltaT < drive_stand_threshold:
                        merged_status = 'drive'
                if merged_status is not None:
                    # Reuse the id of the preceding segment.
                    temp = temp - 1
                    status_list.extend([merged_status] * rows)
                else:
                    # First segment, pause too long, or previous status
                    # unknown: a stand segment of its own.
                    status_list.extend(['stand'] * rows)
                status_id.extend([temp] * rows)
            temp = temp + temp_2 + 1
            last_status = status_list[-1]

        df['data_split_by_status'] = status_id
        df['data_status'] = status_list
        df.index = dfin.index
        return df

    def data_split_by_time(self, dfin, default_time_threshold=300, drive_time_threshold=300,
                           charge_time_threshold=300, stand_time_threshold=1800):
        """Split status segments further wherever the timestamp jumps.

        This compensates for data loss: inside each segment produced by
        data_split_by_status(), a gap between two consecutive rows larger
        than the threshold for that status starts a new segment.

        Args:
            dfin: output of data_split_by_status(); modified in place.
            default_time_threshold: gap threshold (seconds) for statuses
                other than drive/charge/stand.
            drive_time_threshold: gap threshold (seconds) for drive segments.
            charge_time_threshold: gap threshold (seconds) for charge segments.
            stand_time_threshold: gap threshold (seconds) for stand segments.

        Returns:
            *dfin* itself with the added column 'data_split_by_status_time',
            the segment number after the status and time splits. Every piece
            receives its own number.
        """
        thresholds = {
            'drive': drive_time_threshold,
            'charge': charge_time_threshold,
            'stand': stand_time_threshold,
        }
        segment_of_row = dfin['data_split_by_status'].to_numpy()
        data_id = np.zeros(len(dfin), dtype=np.int64)
        next_id = 1
        for data_number in sorted(set(dfin['data_split_by_status'])):
            positions = np.flatnonzero(segment_of_row == data_number)
            segment = dfin.iloc[positions]
            status = segment['data_status'].iloc[0]
            time_interval = thresholds.get(status, default_time_threshold)

            # Gaps between consecutive rows, in seconds (unit independent).
            gaps = np.diff(np.array(segment['时间戳'])) / np.timedelta64(1, 's')
            # Row positions (within the segment) that start a new piece.
            breaks = np.flatnonzero(gaps > time_interval) + 1

            piece_offset = np.zeros(len(positions), dtype=np.int64)
            piece_offset[breaks] = 1
            data_id[positions] = next_id + np.cumsum(piece_offset)
            # Advance past ALL pieces of this segment, including the tail
            # piece, so the next segment never shares an id with it.
            next_id += len(breaks) + 1
        dfin['data_split_by_status_time'] = data_id
        return dfin

    def combine_drive_stand(self, dfin):
        """Merge everything between two charges into one 'not charge' segment.

        Args:
            dfin: BMS frame carrying 'data_split_by_status_time' and
                'data_status' (i.e. after data_split_by_time()); rows are
                expected to be ordered by segment number. Not modified.

        Returns:
            A copy of *dfin* with two added columns:
              data_split_by_status_after_combine: segment number after merging.
              data_status_after_combine: 'charge' or 'not charge'.
        """
        df = dfin.copy()
        combined_ids = []
        combined_status = []
        number = 1
        last_is_charge = None  # None until the first segment has been seen
        for _, segment in df.groupby('data_split_by_status_time', sort=True):
            rows = len(segment)
            is_charge = segment['data_status'].iloc[0] == 'charge'
            # A new combined segment starts on every charge <-> not-charge flip.
            if last_is_charge is not None and is_charge != last_is_charge:
                number += 1
            combined_ids.extend([number] * rows)
            combined_status.extend(['charge' if is_charge else 'not charge'] * rows)
            last_is_charge = is_charge
        df['data_split_by_status_after_combine'] = combined_ids
        df['data_status_after_combine'] = combined_status
        return df

    def cal_stand_time(self, dfin):
        """Attach the stand time before/after each drive or charge segment.

        For every drive/charge segment with at least two rows, the first row
        receives the duration of the stand segment immediately before it and
        the last row the duration of the stand segment immediately after it
        (0 when the neighbour is not a stand segment, None when there is no
        neighbour). All other rows are None.

        Args:
            dfin: BMS frame carrying 'data_split_by_status_time' and
                'data_status' (i.e. after data_split_by_time()); rows are
                expected to be ordered by segment number. Not modified.

        Returns:
            A copy of *dfin* with the added column 'stand_time' (minutes).
        """
        df = dfin.copy()
        segments = [
            (list(group['data_status']), list(group['时间戳']))
            for _, group in df.groupby('data_split_by_status_time', sort=True)
        ]

        def stand_minutes(segment):
            """Duration in minutes if *segment* is a stand segment, else 0."""
            status, time = segment
            if status[0] != 'stand':
                return 0
            return (time[-1] - time[0]).total_seconds() / 60.0

        stand_time = []
        last_position = len(segments) - 1
        for index, (status, _time) in enumerate(segments):
            rows = len(status)
            if rows < 2 or status[0] not in ('charge', 'drive'):
                stand_time.extend([None] * rows)
                continue
            # Whether a predecessor exists is decided by position, so a
            # single-row leading segment does not hide the stand time of the
            # segment that follows it.
            before = None if index == 0 else stand_minutes(segments[index - 1])
            after = stand_minutes(segments[index + 1]) if index < last_position else None
            stand_time.append(before)
            stand_time.extend([None] * (rows - 2))
            stand_time.append(after)
        df['stand_time'] = stand_time
        return df

    def _cal_odo_speed(self, lat_list, long_list, time_list):
        """Return distance and speed between consecutive GPS fixes.

        Not compiled with numba: the method uses ``self`` and pandas
        timestamps, which numba cannot type in nopython mode.

        Args:
            lat_list: latitudes.
            long_list: longitudes.
            time_list: timestamps; consecutive values must differ, otherwise
                the speed computation divides by zero.

        Returns:
            (dis_array, speed_array): two numpy arrays with one entry per
            consecutive pair of fixes. Distances come from
            ``self.tools.cal_distance``; speed is ``distance * 3600 / seconds``
            (presumably km and km/h - TODO confirm against Tools).
        """
        lats = list(lat_list)
        longs = list(long_list)
        times = list(time_list)
        dis_array = []
        speed_array = []
        for i in range(len(lats) - 1):
            dis = self.tools.cal_distance(lats[i], longs[i], lats[i + 1], longs[i + 1])
            dis_array.append(dis)
            deltaT = abs(times[i] - times[i + 1]).total_seconds()
            speed_array.append(dis * 3600.0 / deltaT)
        return np.array(dis_array), np.array(speed_array)

    def _judge_gps_window(self, df_gps, start_time, end_time, parked,
                          time_diff_thre, odo_sum_thre, drive_spd_thre, parking_spd_thre):
        """Judge the GPS fixes inside [start_time, end_time]; return (code, labels, dis, speed)."""
        in_window = (df_gps['时间戳'] >= start_time) & (df_gps['时间戳'] <= end_time)
        gps = df_gps[in_window]
        labels = list(gps.index)
        gps = gps.reset_index(drop=True)

        # -1: fewer than two fixes in the window.
        if len(gps) <= 1:
            return -1, labels, None, None

        first_fix = gps.loc[0, '时间戳']
        last_fix = gps.loc[len(gps) - 1, '时间戳']
        # -2: GPS coverage starts too late or ends too early.
        if abs(first_fix - start_time).total_seconds() > time_diff_thre or \
                abs(last_fix - end_time).total_seconds() > time_diff_thre:
            return -2, labels, None, None

        dis_array, speed_array = self._cal_odo_speed(gps['纬度'], gps['经度'], gps['时间戳'])
        total_distance = np.sum(dis_array)
        avg_speed = total_distance * 3600.0 / abs(first_fix - last_fix).total_seconds()
        # -3: implausible total distance, average speed or point-to-point speed.
        if total_distance > odo_sum_thre or avg_speed > drive_spd_thre \
                or (speed_array > drive_spd_thre).any():
            return -3, labels, dis_array, speed_array
        # -4: supposedly parked but moving.
        if parked and avg_speed > parking_spd_thre:
            return -4, labels, dis_array, speed_array
        return 1, labels, dis_array, speed_array

    def gps_data_judge(self, df_bms, df_gps, time_diff_thre=300, odo_sum_thre=200,
                       drive_spd_thre=80, parking_spd_thre=2):
        """Judge GPS reliability per segment (segments before combining).

        The GPS data of a segment is unreliable when:
          -1: fewer than 2 GPS fixes fall inside the segment;
          -2: the first/last GPS fix differs from the segment start/end by
              more than *time_diff_thre* seconds;
          -3: total distance exceeds *odo_sum_thre*, or the average or any
              point-to-point speed exceeds *drive_spd_thre*;
          -4: the segment is charge/stand and the average speed exceeds
              *parking_spd_thre*.

        Args:
            df_bms: BMS frame with 'data_split_by_status_time', 'data_status'
                and datetime '时间戳'; rows ordered by segment number.
                Modified in place.
            df_gps: GPS frame with '时间戳', '纬度', '经度'. Modified in place.
            time_diff_thre: time difference threshold (seconds).
            odo_sum_thre: total distance threshold.
            drive_spd_thre: speed threshold while driving.
            parking_spd_thre: speed threshold while not driving.

        Returns:
            (df_bms, df_gps, res_record):
              df_bms gains 'gps_rely' (1 reliable, <0 the reason code above);
              df_gps gains 'odo' and 'speed' (distance/speed from the previous
              fix, filled for reliable segments only);
              res_record maps each status and 'total' to the share of
              unreliable segments.
        """
        df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
        res_record = {'drive': 0, 'charge': 0, 'stand': 0, 'none': 0, 'total': 0}
        rely_list = []
        df_gps['odo'] = [None] * len(df_gps)
        df_gps['speed'] = [None] * len(df_gps)
        for data_number in sorted(set(df_bms['data_split_by_status_time'])):
            df_sel = df_bms[df_bms['data_split_by_status_time'] == data_number]
            df_sel = df_sel.reset_index(drop=True)
            status = str(df_sel.loc[0, 'data_status'])
            code, labels, dis_array, speed_array = self._judge_gps_window(
                df_gps, df_sel.loc[0, '时间戳'], df_sel.loc[len(df_sel) - 1, '时间戳'],
                status in ('charge', 'stand'),
                time_diff_thre, odo_sum_thre, drive_spd_thre, parking_spd_thre)
            rely_list.extend([code] * len(df_sel))
            if code == 1:
                # The first fix has no predecessor, hence labels[1:].
                df_gps.loc[labels[1:], 'odo'] = dis_array
                df_gps.loc[labels[1:], 'speed'] = speed_array
            else:
                res_record[status] = res_record[status] + 1
        df_bms['gps_rely'] = rely_list

        # Turn the unreliable counts into ratios.
        res_record['total'] = (res_record['drive'] + res_record['charge']
                               + res_record['stand'] + res_record['none']
                               ) / df_bms['data_split_by_status_time'].max()
        for status in ('drive', 'charge', 'stand', 'none'):
            count = len(set(df_bms[df_bms['data_status'] == status]['data_split_by_status_time']))
            if count > 0:
                res_record[status] = res_record[status] / count
        return df_bms, df_gps, res_record

    def data_gps_judge_after_combine(self, df_bms, df_gps, time_diff_thre=600, odo_sum_thre=200,
                                     drive_spd_thre=80, parking_spd_thre=2):
        """Judge GPS reliability per combined segment (after combine_drive_stand).

        Uses the same reason codes as gps_data_judge(); code -4 applies to
        'charge' segments only. When a combined segment contains drive rows,
        the GPS window is taken from the first to the last drive row, because
        GPS data may be missing while standing and would distort the distance.

        Args:
            df_bms: BMS frame with 'data_split_by_status_after_combine',
                'data_status_after_combine', 'data_status' and datetime
                '时间戳'; rows ordered by combined segment number. Modified
                in place.
            df_gps: GPS frame with '时间戳', '纬度', '经度'. Modified in place.
            time_diff_thre: time difference threshold (seconds).
            odo_sum_thre: total distance threshold.
            drive_spd_thre: speed threshold while driving.
            parking_spd_thre: speed threshold while charging.

        Returns:
            (df_bms, df_gps, res_record):
              df_bms gains 'gps_rely_after_combine';
              df_gps gains 'odo_after_combine' and 'speed_after_combine';
              res_record maps 'not charge', 'charge' and 'total' to the share
              of unreliable segments.
        """
        df_gps['时间戳'] = pd.to_datetime(df_gps['时间戳'])
        res_record = {'not charge': 0, 'charge': 0, 'total': 0}
        rely_list = []
        df_gps['odo_after_combine'] = [None] * len(df_gps)
        df_gps['speed_after_combine'] = [None] * len(df_gps)
        for data_number in sorted(set(df_bms['data_split_by_status_after_combine'])):
            df_sel = df_bms[df_bms['data_split_by_status_after_combine'] == data_number]
            df_sel = df_sel.reset_index(drop=True)
            df_sel_drive = df_sel[df_sel['data_status'] == 'drive']
            window = df_sel if df_sel_drive.empty else df_sel_drive
            # Positional access: the drive subset keeps the labels of df_sel,
            # so label 0 / len-1 need not exist (or be the right rows) in it.
            start_time = window['时间戳'].iloc[0]
            end_time = window['时间戳'].iloc[-1]
            status = str(df_sel.loc[0, 'data_status_after_combine'])
            code, labels, dis_array, speed_array = self._judge_gps_window(
                df_gps, start_time, end_time, status == 'charge',
                time_diff_thre, odo_sum_thre, drive_spd_thre, parking_spd_thre)
            rely_list.extend([code] * len(df_sel))
            if code == 1:
                # The first fix has no predecessor, hence labels[1:].
                df_gps.loc[labels[1:], 'odo_after_combine'] = dis_array
                df_gps.loc[labels[1:], 'speed_after_combine'] = speed_array
            else:
                res_record[status] = res_record[status] + 1
        df_bms['gps_rely_after_combine'] = rely_list

        # Turn the unreliable counts into ratios.
        res_record['total'] = (res_record['not charge'] + res_record['charge']
                               ) / df_bms['data_split_by_status_after_combine'].max()
        for status in ('not charge', 'charge'):
            count = len(set(df_bms[df_bms['data_status_after_combine'] == status]
                            ['data_split_by_status_after_combine']))
            if count > 0:
                res_record[status] = res_record[status] / count
        return df_bms, df_gps, res_record