# data_preprocess.py
import os
import shutil
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm
  7. pd.set_option('display.max_columns', 500)
  8. pd.set_option('display.max_rows', 500)
  9. path = '/home/chenenze/hz_user/'
  10. out_path = './dataframes/'
  11. temp_thresh_max_c = 35
  12. temp_thresh_min_c = 10
  13. mean_crnt_thresh_c = 1
  14. temp_thresh_max_d = 35
  15. temp_thresh_min_d = 10
  16. mean_crnt_thresh_d = 0.1
  17. if __name__ == '__main__':
  18. test_vin_list = os.listdir(path)
  19. df_vin_pack_cell_info = pd.read_csv('./vin_pack_cell_info.csv')
  20. df_vin_pack_cell_info = df_vin_pack_cell_info.drop_duplicates(subset=['vin'], keep='first')
  21. df_vin_pack_cell_info = df_vin_pack_cell_info.set_index('vin')
  22. # remove all the folders in out_path
  23. # if os.path.exists(out_path):
  24. # shutil.rmtree(out_path)
  25. for test_vin in test_vin_list:
  26. pack_model = df_vin_pack_cell_info.loc[test_vin]['pack_model_code']
  27. if pack_model != '2101TBC':
  28. continue
  29. if not os.path.exists(out_path+pack_model+'/charge/'):
  30. os.makedirs(out_path+pack_model+'/charge/')
  31. if not os.path.exists(out_path+pack_model+'/drive/'):
  32. os.makedirs(out_path+pack_model+'/drive/')
  33. crg_file_list = [file for file in os.listdir(path+test_vin+'/') if file.endswith('charge_proc_di.feather')]
  34. for crg_file in tqdm(crg_file_list):
  35. df_user = pd.DataFrame(columns=['vin', 'pack_model', 'week', 'temp_max_25', 'temp_max_75', 'temp_35', 'temp_min_25', 'temp_min_75', 'temp_10', \
  36. 'chrgah', 'meancrnt', 'sts_flg', 'full_chrg_flg', 'cellvol_max_25', 'cellvol_max_75', 'packvol_max_25', 'packvol_max_75', 'temp_time_15', 'temp_time_30'])
  37. vin = crg_file.split('_')[0]
  38. df_data = pd.read_feather(path+test_vin+'/' + crg_file)
  39. # convert dt from int to datetime
  40. df_data['dt'] = df_data['dt'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d'))
  41. df_data = df_data.loc[df_data['dt'] >= df_data['dt'].max() - pd.Timedelta(days=60)]
  42. if len(df_data) == 0:
  43. continue
  44. df_data['wk'] = df_data['dt'].apply(lambda x: x.isocalendar()[1])
  45. pack_model = df_data['pack_model'][0]
  46. df_data_dt_list = list(df_data.groupby('wk'))
  47. for date, df_data_dt in df_data_dt_list:
  48. temp_max_list = df_data_dt['temp_max'].tolist()
  49. temp_max_25 = np.quantile(temp_max_list, 0.25)
  50. temp_max_75 = np.quantile(temp_max_list, 0.75)
  51. temp_35 = len([temp for temp in temp_max_list if temp > temp_thresh_max_c])
  52. temp_min_list = df_data_dt['temp_min'].tolist()
  53. temp_min_25 = np.quantile(temp_min_list, 0.25)
  54. temp_min_75 = np.quantile(temp_min_list, 0.75)
  55. temp_10 = len([temp for temp in temp_min_list if temp < temp_thresh_min_c])
  56. chrgah = df_data_dt['chrgah'].sum()
  57. meancrnt = len([crnt for crnt in df_data_dt['meancrnt'].tolist() if crnt > mean_crnt_thresh_c])
  58. sts_flg = len([sts for sts in df_data_dt['sts_flg'].tolist() if sts == 1])
  59. full_chrg_flg = len([full_chrg for full_chrg in df_data_dt['full_chrg_flg'].tolist() if full_chrg == 1])
  60. cellvol_max_list = df_data_dt.loc[df_data_dt['full_chrg_flg'] == 1]['cellvol_max'].tolist()
  61. if len(cellvol_max_list) == 0:
  62. cellvol_max_25 = 0
  63. cellvol_max_75 = 0
  64. else:
  65. cellvol_max_25 = np.quantile(cellvol_max_list, 0.25)
  66. cellvol_max_75 = np.quantile(cellvol_max_list, 0.75)
  67. packvol_max_list = df_data_dt.loc[df_data_dt['full_chrg_flg'] == 1]['packvol_max'].tolist()
  68. if len(packvol_max_list) == 0:
  69. packvol_max_25 = 0
  70. packvol_max_75 = 0
  71. else:
  72. packvol_max_25 = np.quantile(packvol_max_list, 0.25)
  73. packvol_max_75 = np.quantile(packvol_max_list, 0.75)
  74. temp_time_15 = (df_data_dt['temp_time_1']+df_data_dt['temp_time_2']+df_data_dt['temp_time_3']+df_data_dt['temp_time_4']).sum()
  75. temp_time_30 = (df_data_dt['temp_time_6']+df_data_dt['temp_time_7']).sum()
  76. temp_max_25 = round(temp_max_25, 2)
  77. temp_max_75 = round(temp_max_75, 2)
  78. temp_min_25 = round(temp_min_25, 2)
  79. temp_min_75 = round(temp_min_75, 2)
  80. cellvol_max_25 = round(cellvol_max_25, 2)
  81. cellvol_max_75 = round(cellvol_max_75, 2)
  82. packvol_max_25 = round(packvol_max_25, 2)
  83. packvol_max_75 = round(packvol_max_75, 2)
  84. temp_35 = round(temp_35, 2)
  85. temp_10 = round(temp_10, 2)
  86. chrgah = round(chrgah, 2)
  87. meancrnt = round(meancrnt, 2)
  88. sts_flg = round(sts_flg, 2)
  89. full_chrg_flg = round(full_chrg_flg, 2)
  90. temp_time_15 = round(temp_time_15, 2)
  91. temp_time_30 = round(temp_time_30, 2)
  92. df_user.loc[len(df_user)] = [vin, pack_model, date, temp_max_25, temp_max_75, temp_35, temp_min_25, temp_min_75, temp_10, \
  93. chrgah, meancrnt, sts_flg, full_chrg_flg, cellvol_max_25, cellvol_max_75, packvol_max_25, packvol_max_75, temp_time_15, temp_time_30]
  94. if os.path.exists(out_path+pack_model+'/charge/' +f'{vin}.csv'):
  95. df_user = pd.concat([df_user, pd.read_csv(out_path+pack_model+'/charge/' +f'{vin}.csv')])
  96. df_user.to_csv(out_path+pack_model+'/charge/' +f'{vin}.csv', index=False)
  97. drv_file_list = [file for file in os.listdir(path+test_vin+'/') if file.endswith('drive_proc_di.feather')]
  98. for drv_file in tqdm(drv_file_list):
  99. df_user = pd.DataFrame(columns=['vin', 'pack_model', 'week', 'temp_max_25', 'temp_max_75', 'temp_35', 'temp_min_25', 'temp_min_75', 'temp_10', \
  100. 'delta_odo', 'dschrgah', 'meancrnt','temp_time_15', 'temp_time_30', 'spd_mean', 'accon_mean', 'fst_acc', 'maxspd'])
  101. vin = drv_file.split('_')[0]
  102. df_data = pd.read_feather(path+test_vin+'/' + drv_file)
  103. # convert dt from int to datetime
  104. df_data['dt'] = df_data['dt'].apply(lambda x: datetime.strptime(str(x), '%Y%m%d'))
  105. df_data = df_data.loc[df_data['dt'] >= df_data['dt'].max() - pd.Timedelta(days=90)]
  106. if len(df_data) == 0:
  107. continue
  108. df_data['wk'] = df_data['dt'].apply(lambda x: x.isocalendar()[1])
  109. pack_model = df_data['pack_model'][0]
  110. df_data_dt_list = list(df_data.groupby('wk'))
  111. for date, df_data_dt in df_data_dt_list:
  112. temp_max_list = df_data_dt['temp_max'].tolist()
  113. temp_max_25 = np.quantile(temp_max_list, 0.25)
  114. temp_max_75 = np.quantile(temp_max_list, 0.75)
  115. temp_35 = len([temp for temp in temp_max_list if temp > temp_thresh_max_d])
  116. temp_min_list = df_data_dt['temp_min'].tolist()
  117. temp_min_25 = np.quantile(temp_min_list, 0.25)
  118. temp_min_75 = np.quantile(temp_min_list, 0.75)
  119. temp_10 = len([temp for temp in temp_min_list if temp < temp_thresh_min_d])
  120. delta_odo = df_data_dt['delta_odo'].sum()
  121. dschrg_ah = df_data_dt['dschrg_ah'].sum()
  122. spd_mean = df_data_dt['spd_mean'].max()
  123. accon_mean = df_data_dt['accon_mean'].max()
  124. fst_acc = (df_data_dt['fst_acc_pls']+df_data_dt['fst_acc_mus']+df_data_dt['fst_acc_trn']).sum()
  125. meancrnt = len([crnt for crnt in df_data_dt['meancrnt'].tolist() if crnt > mean_crnt_thresh_d])
  126. maxspd = df_data_dt['maxspd'].max()
  127. temp_time_15 = (df_data_dt['temp_time_1']+df_data_dt['temp_time_2']+df_data_dt['temp_time_3']+df_data_dt['temp_time_4']).sum()
  128. temp_time_30 = (df_data_dt['temp_time_6']+df_data_dt['temp_time_7']).sum()
  129. temp_max_25 = round(temp_max_25, 2)
  130. temp_max_75 = round(temp_max_75, 2)
  131. temp_min_25 = round(temp_min_25, 2)
  132. temp_min_75 = round(temp_min_75, 2)
  133. temp_35 = round(temp_35, 2)
  134. temp_10 = round(temp_10, 2)
  135. delta_odo = round(delta_odo, 2)
  136. dschrg_ah = round(dschrg_ah, 2)
  137. spd_mean = round(spd_mean, 2)
  138. accon_mean = round(accon_mean, 2)
  139. fst_acc = round(fst_acc, 2)
  140. meancrnt = round(meancrnt, 2)
  141. maxspd = round(maxspd, 2)
  142. temp_time_15 = round(temp_time_15, 2)
  143. temp_time_30 = round(temp_time_30, 2)
  144. df_user.loc[len(df_user)] = [vin, pack_model, date, temp_max_25, temp_max_75, temp_35, temp_min_25, temp_min_75, temp_10, \
  145. delta_odo, dschrg_ah, meancrnt, temp_time_15, temp_time_30, spd_mean, accon_mean, fst_acc, maxspd]
  146. if os.path.exists(out_path+pack_model+'/drive/' +f'{vin}.csv'):
  147. df_user = pd.concat([df_user, pd.read_csv(out_path+pack_model+'/drive/' +f'{vin}.csv')])
  148. df_user.to_csv(out_path+pack_model+'/drive/' +f'{vin}.csv', index=False)