hz_data_clean.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. import os
  2. import pandas as pd
  3. import numpy as np
  4. def clean_dead_value(series, num_dead_thresh):
  5. slide_list = [series.index[0]]
  6. slide_list_all = []
  7. for i in range(series.index[0],series.index[-1]):
  8. j = i + 1
  9. diff = series[j] - series[i]
  10. if diff == 0:
  11. slide_list.append(j)
  12. else:
  13. slide_list.clear()
  14. slide_list.append(j)
  15. if len(slide_list) >= num_dead_thresh:
  16. target_list = slide_list.copy()
  17. slide_list_all.append(target_list)
  18. index= [] # 将找到的满足条件的index合并
  19. for i in range(len(slide_list_all) - 1):
  20. if set(slide_list_all[i]) < set(slide_list_all[i + 1]):
  21. index.append(i)
  22. m = {i: element for i, element in enumerate(slide_list_all)}
  23. [m.pop(i) for i in index]
  24. return list(m.values())
  25. def crnt_insert(df_data_temp):
  26. df_data_temp['time'] = pd.to_datetime(df_data_temp['time'])
  27. df_data_temp.sort_values(by = ['time'], inplace = True)
  28. interval = pd.Timedelta(minutes = 5)
  29. threshold = pd.Timedelta(minutes = 20)
  30. inserted_data = pd.DataFrame(columns=df_data_temp.columns)
  31. for i in range(1, len(df_data_temp)):
  32. current_time = df_data_temp.iloc[i]['time']
  33. prev_time = df_data_temp.iloc[i-1]['time']
  34. time_diff = current_time - prev_time
  35. if time_diff >= threshold:
  36. num_intervals = int(time_diff / interval) - 1
  37. new_times = pd.date_range(start=prev_time + interval, end=current_time - interval, freq=interval)
  38. new_rows = pd.DataFrame({col: np.nan for col in df_data_temp.columns}, index=range(len(new_times)))
  39. new_rows['time'] = new_times
  40. inserted_data = inserted_data.append(new_rows)
  41. df_data_temp = pd.concat([df_data_temp, inserted_data], ignore_index=True).sort_values('time')
  42. # df_data_temp['PackCrnt'] = df_data_temp['PackCrnt'].interpolate()
  43. df_data_temp['packcrnt'].fillna(0.2, inplace = True)
  44. df_data_temp.drop_duplicates(subset=['time'], keep = 'first', inplace = True)
  45. df_data_temp.sort_values(by = ['time'], inplace = True)
  46. df_data_temp.reset_index(drop = True, inplace = True)
  47. return df_data_temp
  48. def split_data(df_data2):
  49. df_data_ori = df_data2.copy()
  50. df_data_ori.reset_index(drop = True, inplace = True)
  51. df_data = crnt_insert(df_data_ori)
  52. df_data['crnt_flg'] = 0
  53. df_data.loc[df_data['packcrnt'] > 0.05*102, 'crnt_flg'] = 1
  54. df_data.loc[df_data['packcrnt'] < -0.05*102, 'crnt_flg'] = -1
  55. df_data['sts_flg'] = 2
  56. df_sts_chrg = pd.DataFrame(columns=list(df_data.columns))
  57. df_crnt_flg = df_data['crnt_flg']
  58. num_dead_thresh = 15
  59. indexs_to_delelte = clean_dead_value(df_crnt_flg, num_dead_thresh)#获得连续数据所在的行
  60. rest_num = len(indexs_to_delelte)
  61. if rest_num > 0:#仅有一个连续数据时
  62. for splice_item in range(0, rest_num):#rest_num
  63. df_data_temp = df_data.iloc[indexs_to_delelte[splice_item][0]:(indexs_to_delelte[splice_item][-1]+1)]#获得电流连续数据
  64. df_data_temp.reset_index(drop = True, inplace = True)
  65. # cal_ah_temp = cal_ah(df_data_temp)
  66. delta_soc = df_data_temp['packsoc'].iloc[-1] - df_data_temp['packsoc'].iloc[0]
  67. df_time_temp = pd.to_datetime(df_data_temp['time'])
  68. delta_time = (df_time_temp.iloc[-1] - df_time_temp.iloc[0])/pd.Timedelta(1, 'hours')
  69. if all(df_data_temp['crnt_flg'] == 0):#静置判断
  70. if delta_time > 0.17:
  71. df_data_temp['sts_flg'] = 0
  72. df_sts_chrg = df_sts_chrg.append(df_data_temp)
  73. df_sts_chrg.reset_index(drop = True, inplace = True)
  74. elif all(df_data_temp['crnt_flg'] == -1) and (delta_soc > 2):#充电判断0.1*self.capty
  75. df_data_temp['sts_flg'] = 1
  76. df_time_temp = pd.to_datetime(df_data_temp['time'])
  77. df_soc_temp = df_data_temp['packsoc']
  78. delta_soc = round((df_soc_temp.iloc[-1] - df_soc_temp.iloc[0]), 3)
  79. delta_time = (df_time_temp.iloc[-1] - df_time_temp.iloc[0])/pd.Timedelta(1, 'hours')
  80. if delta_time >= 0.03:
  81. rate = round(delta_soc/(100*delta_time), 3)
  82. else:
  83. rate = 0
  84. if abs(rate) > 0.3:
  85. status = 'offboard_charge'
  86. else:
  87. status = 'onboard_charge'
  88. if df_soc_temp.iloc[-1] >= 99:
  89. ful_chrg_flg = 1
  90. else:
  91. ful_chrg_flg = 0
  92. df_data_temp['charge_method']= status
  93. df_data_temp['full_charge_flg']= ful_chrg_flg
  94. df_sts_chrg = df_sts_chrg.append(df_data_temp)
  95. df_sts_chrg.reset_index(drop = True, inplace = True)
  96. # elif all(df_data_temp['crnt_flg'] == 1) and (cal_ah_temp > 0.1*self.capty):#充电判断
  97. # df_data_temp['sts_flg'] = 1
  98. # df_sts_chrg = df_sts_chrg.append(df_data_temp)
  99. # df_sts_chrg.reset_index(drop = True, inplace = True)
  100. df_dschrg = pd.concat([df_data, df_sts_chrg, df_sts_chrg]).drop_duplicates(subset = ['time', 'packsoc'], keep = False)
  101. data_temp = pd.concat([df_dschrg, df_sts_chrg])
  102. data_temp.sort_values(by = 'time', inplace = True)
  103. data_temp.reset_index(drop = True, inplace = True)
  104. mask = (data_temp['sts_flg'] == 2) & (data_temp['sts_flg'].shift(13) == 0) & (data_temp['sts_flg'].shift(-13) == 0)
  105. data_temp.loc[mask, 'sts_flg'] = 0#静置中的单点行车修改为静置
  106. df_chrg_temp = pd.DataFrame()
  107. chrgr_splice_num = []
  108. if len(df_sts_chrg) > 30:
  109. #---------------------------------充电过程判断--------------------------------------
  110. df_chrg_temp = data_temp.loc[(data_temp['sts_flg'] == 1)]#获取电池充电段
  111. if not df_chrg_temp.empty:
  112. df_chrg_temp.reset_index(inplace = True, drop = True)
  113. chrgr_time = pd.to_datetime(df_chrg_temp['time'])
  114. delta_time = (np.diff(chrgr_time)/pd.Timedelta(1, 'min'))#
  115. pos = np.where(delta_time > 30)
  116. splice_num = []
  117. if len(pos[0]) >= 1:
  118. pos_ful_tem = np.insert(pos, 0, 0)
  119. pos_len = len(pos_ful_tem)
  120. data_len = len(chrgr_time)
  121. pos_ful = np.insert(pos_ful_tem, pos_len, data_len-1)
  122. for item in range(0,len(pos_ful)-1):
  123. splice_num.extend(item*np.ones(pos_ful[item +1]-pos_ful[item]))
  124. splice_num = np.insert(splice_num, 0, 0)
  125. else:
  126. splice_num = np.zeros(len(chrgr_time))
  127. pos_ful = np.array([0])
  128. if len(splice_num) > 0:
  129. df_chrg_temp['chrgr_rest'] = splice_num
  130. chrgr_splice_num = np.unique(df_chrg_temp['chrgr_rest'])#判断有几段充电数据
  131. if not data_temp.empty:
  132. stat_delta_flg = np.diff(data_temp['sts_flg'])#计算时间差的分钟数
  133. delta_time = np.diff(pd.to_datetime(data_temp['time']))/pd.Timedelta(1, 'min')
  134. stat_pos = np.where((stat_delta_flg != 0) | (delta_time > 40))#充电数据分段,大于10min时,认为是两个充电过程
  135. sts_splice_num = []
  136. if len(stat_pos[0]) >= 1:
  137. pos_ful_tem = np.insert(stat_pos, 0, 0)
  138. pos_len = len(pos_ful_tem)
  139. data_len = len(data_temp)
  140. pos_ful = np.insert(pos_ful_tem, pos_len, data_len-1)
  141. for item in range(0,len(pos_ful)-1):
  142. sts_splice_num.extend(item*np.ones(pos_ful[item +1]-pos_ful[item]))
  143. sts_splice_num = np.insert(sts_splice_num, 0, 0)
  144. else:
  145. sts_splice_num = np.zeros(len(data_temp))
  146. pos_ful = np.array([0])
  147. if len(sts_splice_num) > 0:
  148. data_temp['state_bms'] = sts_splice_num
  149. state_splice_num = np.unique(data_temp['state_bms'])#判断有几段充电数据
  150. return data_temp, df_chrg_temp, chrgr_splice_num, state_splice_num
  151. def datacleaning(df):
  152. if not df.empty:
  153. # df['Time'] = pd.to_datetime(list(df['Time']), utc=True, unit='ms').tz_convert('Asia/Shanghai')
  154. df=df.replace('', np.nan)
  155. df.dropna(axis=0,subset = [ "time", "cellvoltage", "celltemp", "packcrnt"], inplace=True)
  156. df['time'] = pd.to_datetime(df['time'], unit='s') #+ pd.Timedelta(hours=8)
  157. df.drop(df.index[(df['packvoltage'] > 1000) | abs(df['packcrnt'] > 999) | (df['packsoc'] > 100) ], inplace=True)
  158. df.drop(df.index[(df['packvoltage'] < 0.001) & (abs(df['packcrnt']) < 0.001) ], inplace=True)
  159. if not df.empty:
  160. df.sort_values(by="time", inplace=True)
  161. df.drop_duplicates(subset="time", inplace=True)
  162. df.reset_index(drop=True, inplace=True)
  163. #num_lst = df.loc[:,'CellVoltTotalCount'].value_counts() #统计电芯数量这一列每个元素出现的个数
  164. CellVoltNums = 96 #num_lst.idxmax() # 找出电芯数量出现最多的次数
  165. #num_lst = df.loc[:,'CellTempTotalCount'].value_counts() #统计电芯数量这一列每个元素出现的个数
  166. CellTempNums = 16#num_lst.idxmax() # 找出电芯数量出现最多的次数
  167. cellvolt_name=['cellvoltage'+str(x) for x in range(1, CellVoltNums+1)]
  168. celltemp_name=['celltemp'+str(x) for x in range(1, CellTempNums+1)]
  169. df_volt=pd.DataFrame([x[0].split(",") for x in np.array(df[['cellvoltage']])]).iloc[:,list(range(CellVoltNums))]
  170. df_volt.columns = cellvolt_name
  171. df_volt=df_volt.astype('float')
  172. cellvoltmax = df_volt.max(axis=1)
  173. cellvoltmin = df_volt.min(axis=1)
  174. df_volt[['cellvoltmax','cellvoltmin']] = pd.concat([cellvoltmax,cellvoltmin], axis=1)
  175. df_temp=pd.DataFrame([x[0].split(",") for x in np.array(df[['celltemp']])]).iloc[:,list(range(CellTempNums))]
  176. df_temp.columns = celltemp_name
  177. df_temp=df_temp.astype('float')
  178. celltempmax = df_temp.max(axis=1)
  179. celltempmin = df_temp.min(axis=1)
  180. df_temp[['celltempmax','celltempmin']] = pd.concat([celltempmax,celltempmin], axis=1)
  181. df=pd.concat([df,df_volt,df_temp],axis=1)
  182. df.drop(df.index[(df['cellvoltmin'] <0.1)], inplace=True)
  183. df.reset_index(inplace=True, drop=True)
  184. # df_table = df.drop_duplicates(subset=['SN'], keep='first', ignore_index=True)
  185. # df_table = df_table.set_index('SN')
  186. else:
  187. df = pd.DataFrame()
  188. #df_table = pd.DataFrame()
  189. cellvolt_name = []
  190. celltemp_name = []
  191. return df, cellvolt_name, celltemp_name #df_table,
  192. else:
  193. return pd.DataFrame(), [], [] # pd.DataFrame(),
  194. def get_all_files_in_folder(folder_path):
  195. all_files = []
  196. for root, dirs, files in os.walk(folder_path):
  197. for file in files:
  198. file_path = os.path.join(root, file)
  199. all_files.append(file_path)
  200. return all_files
  201. def concat_csv_files(csv_files):
  202. dfs = []
  203. for file in csv_files:
  204. df = pd.read_csv(file)
  205. dfs.append(df)
  206. return pd.concat(dfs, ignore_index=True)
  207. sn_list=['LUZAGAAA9LA012791','LUZAGAAA2LA014379','LUZAGAAA1MA020272','LUZAGAAA0MA020540','LUZAGAAA3MA033346','LUZAGAAA0MA035149', 'LUZAGAAA7MA043572','LUZAGAAA0MA070726','LUZAGAAA9NA020991','LUZAGAAA9NA022448']
  208. for i in range (0,len(sn_list)):
  209. folder_path = "/data_highspeed/common/hz/lifecycle/{}.csv".format(sn_list[i]) # 替换为实际的文件夹路径
  210. out_put_folder_path="{}_charge.feather".format(sn_list[i]) #/data/common/hz/lifecycle/LUZAGAAAXKA008957/
  211. # file_list = get_all_files_in_folder(folder_path)
  212. # if len(file_list) > 0:
  213. # concatenated_df = concat_csv_files(file_list)
  214. print('读取第'+str(i)+'个电池'+sn_list[i])
  215. concatenated_df=pd.read_csv(folder_path)
  216. if concatenated_df is not None:
  217. print('清洗第'+str(i)+'个电池'+sn_list[i])
  218. df_cleaned,cellvolt_name,celltemp_name=datacleaning(concatenated_df)
  219. print('分段第'+str(i)+'个电池'+sn_list[i])
  220. data_temp, df_chrg_splited, chrgr_splice_num, state_splice_num=split_data(df_cleaned)
  221. print('写入第'+str(i)+'个电池'+sn_list[i])
  222. df_chrg_splited.to_feather(out_put_folder_path)
  223. print("Finished.")
  224. else:
  225. print("No CSV files found in the folder.")