Data_Cleaning_oop_thr.py 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136
  1. import pandas as pd
  2. import numpy as np
  3. class DataCleaning:
  4. def __init__(self, df_in):
  5. self.df_in = df_in
  6. def process_data_two(self):
  7. i = 0
  8. while i < len(self.df_in) - 1:
  9. row1 = self.df_in.iloc[i]
  10. row2 = self.df_in.iloc[i+1]
  11. if row1['time'] == row2['time']:
  12. if row2['latitude'] == 0:
  13. if row1['latitude'] != 0:
  14. self.df_in.loc[i+1, 'latitude'] = row1['latitude']
  15. self.df_in.loc[i+1, 'longitude'] = row1['longitude']
  16. self.df_in.loc[i+1, 'mileage'] = row1['mileage']
  17. self.df_in.loc[i+1, 'speed'] = row1['speed']
  18. self.df_in.iloc[i] = self.df_in.iloc[i+1]
  19. if row1['latitude'] == 0:
  20. if row2['latitude'] != 0:
  21. self.df_in.loc[i, 'latitude'] = row2['latitude']
  22. self.df_in.loc[i, 'longitude'] = row2['longitude']
  23. self.df_in.loc[i, 'mileage'] = row2['mileage']
  24. self.df_in.loc[i, 'speed'] = row2['speed']
  25. self.df_in.iloc[i+1] = self.df_in.iloc[i]
  26. elif row1['time'] != row2['time']:
  27. self.df_in.drop(i, inplace=True)
  28. self.df_in.reset_index(drop=True, inplace=True)
  29. if i < len(self.df_in) - 1:
  30. row1 = self.df_in.iloc[i]
  31. row2 = self.df_in.iloc[i+1]
  32. if row2['latitude'] == 0:
  33. if row1['latitude'] != 0:
  34. self.df_in.loc[i+1, 'latitude'] = row1['latitude']
  35. self.df_in.loc[i+1, 'longitude'] = row1['longitude']
  36. self.df_in.loc[i+1, 'mileage'] = row1['mileage']
  37. self.df_in.loc[i+1, 'speed'] = row1['speed']
  38. self.df_in.iloc[i] = self.df_in.iloc[i+1]
  39. if row1['latitude'] == 0:
  40. if row2['latitude'] != 0:
  41. self.df_in.loc[i, 'latitude'] = row2['latitude']
  42. self.df_in.loc[i, 'longitude'] = row2['longitude']
  43. self.df_in.loc[i, 'mileage'] = row2['mileage']
  44. self.df_in.loc[i, 'speed'] = row2['speed']
  45. self.df_in.iloc[i+1] = self.df_in.iloc[i]
  46. i += 2
  47. self.df_in = self.df_in.iloc[::2].reset_index(drop=True)
  48. self.df_in = self.df_in.replace('[]', np.nan)
  49. self.df_in.dropna(axis=0, subset=["cell_voltage", "cell_temp"], inplace=True)
  50. self.df_in.drop(
  51. self.df_in.index[(self.df_in['pack_volt'] < 0.001) | (self.df_in['pack_volt'] > 1000) | (self.df_in['pack_soc'] > 100) |
  52. (self.df_in['pack_soc'] < 0) | (self.df_in['pack_crnt'] > 1000) | (self.df_in['pack_crnt'] < -1000)], inplace=True)
  53. self.df_in = self.df_in.reset_index(drop=True)
  54. if not self.df_in.empty:
  55. df_cell_volt = pd.DataFrame([x[0].strip("[]").replace("'", "").split(",") for x in
  56. np.array(self.df_in[['cell_voltage']])]).iloc[:, list(range(384))]
  57. df_cell_volt = df_cell_volt.astype('float')
  58. df_cell_tem = pd.DataFrame([x[0].strip("[]").replace("'", "").split(",") for x in
  59. np.array(self.df_in[['cell_temp']])]).iloc[:, list(range(64))]
  60. df_cell_tem = df_cell_tem.astype('float')
  61. cellvoltmaxlist = df_cell_volt.max(axis=1)
  62. celltempmaxlist = df_cell_tem.max(axis=1)
  63. cellvoltminlist = df_cell_volt.min(axis=1)
  64. celltempminlist = df_cell_tem.min(axis=1)
  65. self.df_in = self.df_in.assign(cell_volt_max=cellvoltmaxlist)
  66. self.df_in = self.df_in.assign(cell_volt_min=cellvoltminlist)
  67. self.df_in = self.df_in.assign(cell_temp_max=celltempmaxlist)
  68. self.df_in = self.df_in.assign(cell_temp_min=celltempminlist)
  69. def revise_status_codes(self, c_soc_dif_p=0.3, c_order_delta=1200, s_soc_dif_p=0, s_order_delta=120):
  70. self.df_in["flag"] = "d"
  71. self.df_in.loc[self.df_in["pack_crnt"] < 0, "flag"] = "c"
  72. self.df_in.loc[self.df_in["pack_crnt"] == 0, "flag"] = "s"
  73. self.df_in['flag_block'] = (self.df_in["flag"].shift(1) != self.df_in["flag"]).astype(int).cumsum()
  74. df_in_c = self.df_in[self.df_in["flag"] == "c"]
  75. df_in_c_soc_b = df_in_c[["pack_soc", "flag_block"]].groupby(["flag_block"]).first()
  76. df_in_c_soc_e = df_in_c[["pack_soc", "flag_block"]].groupby(["flag_block"]).last()
  77. df_in_c_time_b = df_in_c[["time", "flag_block"]].groupby(["flag_block"]).first()
  78. df_in_c_time_e = df_in_c[["time", "flag_block"]].groupby(["flag_block"]).last()
  79. frames = [df_in_c_soc_b, df_in_c_soc_e, df_in_c_time_b, df_in_c_time_e]
  80. df_in_c_choice = pd.concat(frames, axis=1, join='inner')
  81. df_in_c_choice = df_in_c_choice.reset_index()
  82. df_in_c_choice.columns = ["charge_block", "soc_first", "soc_last", "time_first", "time_last"]
  83. df_in_c_choice["soc_dif_p"] = (df_in_c_choice["soc_last"] - df_in_c_choice["soc_first"]) / df_in_c_choice[
  84. "soc_first"]
  85. df_in_c_choice["order_delta"] = pd.to_timedelta(
  86. pd.to_datetime(df_in_c_choice["time_last"]) - pd.to_datetime(df_in_c_choice["time_first"])).dt.total_seconds()
  87. df_in_c_choice["order_delta_h"] = round(df_in_c_choice["order_delta"] / 3600, 2)
  88. df_in_c_choice["rate"] = (df_in_c_choice["soc_last"] - df_in_c_choice["soc_first"]) / df_in_c_choice[
  89. "order_delta_h"]
  90. df_in_c_choice_result1 = df_in_c_choice[
  91. (df_in_c_choice["soc_dif_p"] >= c_soc_dif_p) & (df_in_c_choice["order_delta"] >= c_order_delta) & (
  92. df_in_c_choice["rate"] > 0.1)]
  93. df_in_c_choice_result2 = df_in_c_choice[
  94. (df_in_c_choice["soc_dif_p"] >= c_soc_dif_p) & (df_in_c_choice["order_delta"] >= c_order_delta) & (
  95. df_in_c_choice["rate"] <= 0.1)]
  96. df_in_c21 = df_in_c[(df_in_c["flag_block"].isin(df_in_c_choice_result1["charge_block"]))]
  97. df_in_c22 = df_in_c[(df_in_c["flag_block"].isin(df_in_c_choice_result2["charge_block"]))]
  98. self.df_in["bms_sta"] = 3
  99. self.df_in.loc[df_in_c21.index, "bms_sta"] = 21
  100. self.df_in.loc[df_in_c22.index, "bms_sta"] = 22
  101. df_in_s = self.df_in[self.df_in["flag"] == "s"]
  102. df_in_s_soc_b = df_in_s[["pack_soc", "flag_block"]].groupby(["flag_block"]).first()
  103. df_in_s_soc_e = df_in_s[["pack_soc", "flag_block"]].groupby(["flag_block"]).last()
  104. df_in_s_time_b = df_in_s[["time", "flag_block"]].groupby(["flag_block"]).first()
  105. df_in_s_time_e = df_in_s[["time", "flag_block"]].groupby(["flag_block"]).last()
  106. frames = [df_in_s_soc_b, df_in_s_soc_e, df_in_s_time_b, df_in_s_time_e]
  107. df_in_s_choice = pd.concat(frames, axis=1, join='inner')
  108. df_in_s_choice = df_in_s_choice.reset_index()
  109. df_in_s_choice.columns = ["charge_block", "soc_first", "soc_last", "time_first", "time_last"]
  110. df_in_s_choice["soc_dif_p"] = (df_in_s_choice["soc_last"] - df_in_s_choice["soc_first"]) / df_in_s_choice[
  111. "soc_first"]
  112. df_in_s_choice["order_delta"] = pd.to_timedelta(
  113. pd.to_datetime(df_in_s_choice["time_last"]) - pd.to_datetime(df_in_s_choice["time_first"])).dt.total_seconds()
  114. df_in_s_choice_result = df_in_s_choice[df_in_s_choice["order_delta"] >= s_order_delta]
  115. df_in_s2 = df_in_s[(df_in_s["flag_block"].isin(df_in_s_choice_result["charge_block"]))]
  116. self.df_in.loc[df_in_s2.index, "bms_sta"] = 0
  117. self.df_in = self.df_in.drop(['flag', 'flag_block'], axis=1)
  118. return self.df_in