get_group_analysis.py 3.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485
  1. import pandas as pd
  2. import os
  3. from tqdm import tqdm
  4. from matplotlib import pyplot as plt
  5. import numpy as np
  6. # import defaultdict
  7. from collections import defaultdict
  8. from tqdm import tqdm
  9. pd.set_option('display.max_columns', 500)
  10. pd.set_option('display.max_rows', 500)
  11. # create a dict
  12. dict_col = {'fst_acc_rank_2':'driving_behavior',
  13. 'dschrgah_rank_2': 'discharge_ah',
  14. 'temp_min_75_rank_2' : 'temp_min',
  15. 'delta_odo_rank_2' : 'odo',
  16. 'temp_35_rank_2' : 'high_temp',
  17. 'temp_max_25_rank_2' : 'temp_max',
  18. 'temp_time_30_rank_2' : 'high_temp',
  19. 'spd_mean_rank_2' : 'driving_speed',
  20. 'maxspd_rank_2' : 'driving_speed',
  21. 'temp_max_75_rank_2' : 'temp_max',
  22. 'temp_min_25_rank_2' : 'temp_min',
  23. 'temp_10_rank_2' : 'low_temp',
  24. 'accon_mean_rank_2' : 'driving_behavior',
  25. 'meancrnt_rank_2' : 'current',
  26. 'temp_time_15_rank_2' : 'low_temp',
  27. 'cellvol_max_75_rank_2' : 'cellvol',
  28. 'sts_flg_rank_2': 'charge_status',
  29. 'chrgah_rank_2': 'charge_ah',
  30. 'packvol_max_25_rank_2': 'packvol',
  31. 'packvol_max_75_rank_2': 'packvol',
  32. 'cellvol_max_25_rank_2': 'cellvol',
  33. 'full_chrg_flg_rank_2': 'charge_status',
  34. }
  35. default_value = 'other'
  36. dict_col = defaultdict(lambda: default_value, dict_col)
  37. def eva(x):
  38. try:
  39. return list(eval(x))
  40. except:
  41. return [0,0,0]
  42. df_new = pd.DataFrame()
  43. vin_list = os.listdir('./data/')
  44. for vin in tqdm(vin_list):
  45. df_new = pd.DataFrame()
  46. for name in ['drive', 'charge']:
  47. df = pd.read_csv(f'./{name}/group_{vin}.csv')
  48. # select all the columns with 'rank' in the name
  49. df_rank = df.filter(regex='rank')
  50. df_basic = df.drop(columns=(df_rank.columns.tolist() + df.columns[:3].tolist()))
  51. df_rank.columns = [col+'_2' for col in df_rank.columns]
  52. # set df_basic to round 3
  53. df_basic = df_basic.round(3)
  54. df = pd.concat([df[df.columns[:3]], df_basic, df_rank], axis=1)
  55. #sort column names
  56. df = df.reindex((df.columns[:3].tolist() + sorted(df.columns[3:])), axis=1)
  57. # select all the columns with 'rank' in the name
  58. df_rank = df.filter(regex='rank_2')
  59. df_rank_arr = np.array(df_rank)
  60. # find index that has at least one element that is > 0 and < 20
  61. vins = df.iloc[np.where((df_rank_arr > 0) & (df_rank_arr < 20))[0]]['vin'].tolist()
  62. weeks = df.iloc[np.where((df_rank_arr > 0) & (df_rank_arr < 20))[0]]['week'].tolist()
  63. cols = df_rank.columns[np.where((df_rank_arr > 0) & (df_rank_arr < 20))[1]].tolist()
  64. cols_cat = [dict_col[col] for col in cols]
  65. value = df_rank_arr[np.where((df_rank_arr > 0) & (df_rank_arr < 20))]
  66. # make a new dataframe
  67. df_new = pd.concat([df_new, pd.DataFrame({'vin':vins, 'mode':name, 'week':weeks, 'col':cols, 'cols_cat':cols_cat, 'value':value})], axis=0)
  68. df_new.reset_index(drop=True, inplace=True)
  69. df_new = df_new.loc[df_new.groupby(['vin', 'mode', 'week', 'cols_cat'])['value'].idxmin()].reset_index(drop=True)
  70. df_new_1 = df_new.groupby(['mode', 'vin', 'cols_cat'])['value'].count().unstack().fillna(0).astype(int)
  71. df_new_2 = df_new.groupby(['vin', 'cols_cat'])['value'].count().unstack().fillna(0).astype(int)
  72. # sum all the columns grouped by mode
  73. df_new_1.loc[('drive','total'), :] = df_new_1.loc['drive'].sum(axis=0)
  74. df_new_1.loc[('charge','total'), :] = df_new_1.loc['charge'].sum(axis=0)
  75. # sum all the rows
  76. df_new_2['total'] = df_new_2.sum(axis=1)
  77. df_new_2 = df_new_2.sort_values(by='total', ascending=False)
  78. df_new_1.to_csv(f'./df_new_{vin}_1.csv')
  79. df_new_2.to_csv(f'./df_new_{vin}_2.csv')