import numpy as np
import pandas as pd


class DataCleaning:
    """Clean a vehicle/battery log held in a DataFrame and derive ``bms_sta``.

    The frame is expected to provide the columns the methods read:
    ``time``, ``latitude``, ``longitude``, ``mileage``, ``speed``,
    ``cell_voltage``, ``cell_temp``, ``pack_volt``, ``pack_soc`` and
    ``pack_crnt``.

    Both processing methods rebind ``self.df_in`` to a new frame; the frame
    passed to the constructor is never modified.
    """

    # Columns copied between the two rows that share one timestamp.
    _GPS_COLUMNS = ("latitude", "longitude", "mileage", "speed")

    def __init__(self, df_in):
        """Store *df_in* (not copied) as the frame to be processed."""
        self.df_in = df_in

    def process_data_two(self, n_cell_volt=384, n_cell_temp=64):
        """Collapse same-timestamp row pairs, drop bad rows, add cell extrema.

        Steps, in order:

        1. Adjacent rows with equal ``time`` are collapsed into one row; when
           exactly one of them has ``latitude == 0`` it receives the other
           row's GPS columns and is the row that survives.
        2. The string ``'[]'`` is replaced by NaN everywhere and rows with a
           missing ``cell_voltage`` or ``cell_temp`` are dropped.
        3. Rows with ``pack_volt`` outside [0.001, 1000], ``pack_soc`` outside
           [0, 100] or ``pack_crnt`` outside [-1000, 1000] are dropped.
        4. If any rows remain, ``cell_volt_max``, ``cell_volt_min``,
           ``cell_temp_max`` and ``cell_temp_min`` are added. When no rows
           remain these four columns are not created.

        Args:
            n_cell_volt: Maximum number of leading ``cell_voltage`` entries
                used per row. Rows with fewer entries are accepted.
            n_cell_temp: Same, for ``cell_temp``.

        Returns:
            None. The result is stored in ``self.df_in`` with a fresh
            0..n-1 index.

        Raises:
            ValueError: If a cell entry cannot be converted to ``float``.
        """
        # Work on a positionally indexed copy so that the pairing logic does
        # not depend on the caller's index labels.
        frame = self.df_in.reset_index(drop=True)
        keep, donors = self._plan_pair_merge(
            frame["time"].tolist(), frame["latitude"].tolist()
        )

        gps_columns = list(self._GPS_COLUMNS)
        gps = frame.iloc[donors][gps_columns].reset_index(drop=True)
        merged = frame.iloc[keep].reset_index(drop=True)
        merged = merged.assign(**{name: gps[name] for name in gps_columns})

        merged = merged.replace('[]', np.nan)
        merged = merged.dropna(axis=0, subset=["cell_voltage", "cell_temp"])

        volt = merged["pack_volt"]
        soc = merged["pack_soc"]
        crnt = merged["pack_crnt"]
        implausible = (
            (volt < 0.001) | (volt > 1000)
            | (soc > 100) | (soc < 0)
            | (crnt > 1000) | (crnt < -1000)
        )
        merged = merged.loc[~implausible].reset_index(drop=True)

        if not merged.empty:
            volts = self._parse_cell_lists(merged["cell_voltage"], n_cell_volt)
            temps = self._parse_cell_lists(merged["cell_temp"], n_cell_temp)
            merged = merged.assign(
                cell_volt_max=volts.max(axis=1),
                cell_volt_min=volts.min(axis=1),
                cell_temp_max=temps.max(axis=1),
                cell_temp_min=temps.min(axis=1),
            )

        self.df_in = merged

    @staticmethod
    def _plan_pair_merge(times, latitudes):
        """Decide which rows survive pairing and where their GPS comes from.

        Returns two equally long lists of row positions: ``keep`` (surviving
        rows) and ``donors`` (the row supplying each survivor's GPS columns).
        """
        # NOTE(review): the nesting of the pair-merge rules was reconstructed
        # from whitespace-collapsed source; in particular the "both latitudes
        # are 0 -> keep the second row" case should be confirmed.
        keep = []
        donors = []
        pos = 0
        total = len(times)
        while pos < total - 1:
            first, second = pos, pos + 1
            if times[first] == times[second]:
                if latitudes[second] == 0:
                    keep.append(second)
                    donors.append(first if latitudes[first] != 0 else second)
                elif latitudes[first] == 0:
                    keep.append(first)
                    donors.append(second)
                else:
                    keep.append(first)
                    donors.append(first)
                pos += 2
            else:
                # Unpaired row: skip it and re-check the pair that starts at
                # the next row instead of merging that pair unconditionally.
                pos += 1
        if pos == total - 1:
            # A final row without a partner is kept unchanged.
            keep.append(pos)
            donors.append(pos)
        return keep, donors

    @staticmethod
    def _parse_cell_lists(column, width):
        """Parse list-like strings such as ``"['3.1', '3.2']"`` into floats.

        Only the first *width* entries of each row are converted. Shorter
        rows are padded with NaN in the returned frame.
        """
        parsed = []
        for raw in column:
            tokens = str(raw).strip("[]").replace("'", "").split(",")
            parsed.append([float(token) for token in tokens[:width]])
        return pd.DataFrame(parsed, dtype=float)

    def revise_status_codes(self, c_soc_dif_p=0.3, c_order_delta=1200,
                            s_soc_dif_p=0, s_order_delta=120):
        """Add a ``bms_sta`` column derived from runs of equal current sign.

        Consecutive rows are grouped into blocks by the sign of
        ``pack_crnt`` (negative, zero, anything else). For each block the
        first/last non-null ``pack_soc`` and ``time`` are taken, and:

        * negative-current block whose relative SOC change
          ``(soc_last - soc_first) / soc_first`` is >= *c_soc_dif_p* and whose
          duration is >= *c_order_delta* seconds: ``21`` when the SOC change
          per hour is > 0.1, ``22`` when it is <= 0.1;
        * zero-current block lasting >= *s_order_delta* seconds: ``0``;
        * every other row: ``3``.

        Args:
            c_soc_dif_p: Minimum relative SOC change of a negative-current
                block.
            c_order_delta: Minimum duration in seconds of a negative-current
                block.
            s_soc_dif_p: Accepted for backward compatibility; not used by any
                check.
            s_order_delta: Minimum duration in seconds of a zero-current
                block.

        Returns:
            The new frame, which is also stored in ``self.df_in``. Columns
            named ``flag`` or ``flag_block`` are not part of the result.
        """
        frame = self.df_in
        current = frame["pack_crnt"]
        charging = (current < 0).to_numpy()
        idle = (current == 0).to_numpy()

        # 1 = negative current, 2 = zero current, 0 = everything else.
        state = pd.Series(np.where(charging, 1, np.where(idle, 2, 0)))
        # A new block starts wherever the state differs from the previous row.
        block_id = state.ne(state.shift()).cumsum().to_numpy()

        status = np.full(len(frame), 3, dtype="int64")

        charge = self._summarise_blocks(frame, block_id, charging)
        gain = charge["soc_last"] - charge["soc_first"]
        relative_gain = gain / charge["soc_first"]
        hours = (charge["order_delta"] / 3600).round(2)
        rate = gain / hours
        qualifies = (relative_gain >= c_soc_dif_p) & (
            charge["order_delta"] >= c_order_delta
        )
        above = charge.index[(qualifies & (rate > 0.1)).to_numpy()].to_numpy()
        below = charge.index[(qualifies & (rate <= 0.1)).to_numpy()].to_numpy()
        status[charging & np.isin(block_id, above)] = 21
        status[charging & np.isin(block_id, below)] = 22

        still = self._summarise_blocks(frame, block_id, idle)
        long_enough = (still["order_delta"] >= s_order_delta).to_numpy()
        resting = still.index[long_enough].to_numpy()
        status[idle & np.isin(block_id, resting)] = 0

        result = frame.drop(columns=["flag", "flag_block"], errors="ignore")
        self.df_in = result.assign(bms_sta=status)
        return self.df_in

    @staticmethod
    def _summarise_blocks(frame, block_id, selected):
        """Return per-block ``soc_first``, ``soc_last`` and ``order_delta``.

        *block_id* and *selected* are positional arrays as long as *frame*.
        The result is indexed by block id; ``order_delta`` is the time between
        the block's first and last non-null ``time`` in seconds.
        """
        if not selected.any():
            return pd.DataFrame(
                {"soc_first": [], "soc_last": [], "order_delta": []},
                dtype=float,
            )
        rows = frame.loc[selected, ["pack_soc", "time"]]
        grouped = rows.groupby(block_id[selected])
        first = grouped.first()
        last = grouped.last()
        elapsed = pd.to_datetime(last["time"]) - pd.to_datetime(first["time"])
        return pd.DataFrame(
            {
                "soc_first": first["pack_soc"],
                "soc_last": last["pack_soc"],
                "order_delta": elapsed.dt.total_seconds(),
            }
        )