train2.py

# Automated training
import pymysql
import datetime
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from random import shuffle
import tensorflow.keras as keras
import random
import pickle
# Feature engineering
def features_total(dataset2, capacity):
    dataset2 = dataset2.drop(['PackSoh','InsulationRssPos','InsulationRssNeg','AccumChrgWh','AccumChrgAh','AccumDsChgAh','imei','InsulationRss','cellvoltmax','cellvoltmin','celltempmax','celltempmin'], axis=1, errors='ignore')
    cellvolt_list = [s for s in list(dataset2) if 'CellVoltage' in s]  # cell voltage columns
    celltemp_name = [s for s in list(dataset2) if 'CellTemp' in s]  # cell temperature columns
    celltemp_name2 = [s for s in list(dataset2) if '温度' in s]  # other temperature columns
    dataset2['PackCrnt'] = list(map(lambda x: x/float(capacity), list(dataset2['PackCrnt'])))
    dataset2['volt_diff'] = list(np.array(dataset2[cellvolt_list].max(axis=1)) - np.array(dataset2[cellvolt_list].min(axis=1)))  # voltage spread
    dataset2 = dataset2.reindex(columns=['Time','sn','PackCrnt','PackVolt','BMSSta','volt_diff','PackSoc'] + cellvolt_list + celltemp_name + celltemp_name2)
    dataset2[cellvolt_list] = dataset2[cellvolt_list]*1000
    dataset2['volt_last'] = list(dataset2[cellvolt_list[-1]])  # last cell voltage
    dataset2['volt_first'] = list(dataset2[cellvolt_list[0]])  # first cell voltage
    dataset2['volt_last2'] = list(dataset2[cellvolt_list[-2]])  # second-to-last cell voltage
    dataset2['volt_first2'] = list(dataset2[cellvolt_list[1]])  # second cell voltage
    dataset2['volt_max'] = dataset2[cellvolt_list].max(axis=1)  # max cell voltage
    dataset2['volt_min'] = dataset2[cellvolt_list].min(axis=1)  # min cell voltage
    dataset2['volt_mean'] = round(dataset2[cellvolt_list].mean(axis=1), 3)  # mean cell voltage per row
    dataset2['volt_sigma'] = list(dataset2[cellvolt_list].apply(lambda x: np.std(x.values), axis=1))  # voltage dispersion
    cell_volt_max = list(dataset2[cellvolt_list].apply(lambda x: np.argmax(x.values)+1, axis=1))
    cell_volt_min = list(dataset2[cellvolt_list].apply(lambda x: np.argmin(x.values)+1, axis=1))
    volt_max2 = dataset2[cellvolt_list].apply(lambda x: sorted(x)[-2], axis=1)
    volt_min2 = dataset2[cellvolt_list].apply(lambda x: sorted(x)[1], axis=1)
    dataset2['volt_max2'] = volt_max2  # second-highest voltage
    dataset2['volt_min2'] = volt_min2  # second-lowest voltage
    dataset2['volt_max_mean'] = list(np.array(dataset2[cellvolt_list].max(axis=1)) - np.array(round(dataset2[cellvolt_list].mean(axis=1), 3)))  # max minus mean voltage
    dataset2['volt_min_mean'] = list(np.array(round(dataset2[cellvolt_list].mean(axis=1), 3)) - np.array(dataset2[cellvolt_list].min(axis=1)))  # mean minus min voltage
    dataset2['volt_min_diff'] = list(np.array(volt_min2) - np.array(dataset2[cellvolt_list].min(axis=1)))  # gap between the two lowest voltages
    dataset2['volt_mm_diff'] = (np.array(dataset2[cellvolt_list].max(axis=1)) - np.array(volt_max2)) + (np.array(volt_min2) - np.array(dataset2[cellvolt_list].min(axis=1)))  # gap between the two highest plus gap between the two lowest voltages
    dataset2['mm_volt_cont'] = list(np.array(cell_volt_max) - np.array(cell_volt_min))
    dataset2['mm_volt_cont'] = list(map(lambda x: 1 if (abs(x) == 1) | (abs(x) == len(cellvolt_list)-1) else 0, list(dataset2['mm_volt_cont'])))  # whether the max- and min-voltage cells are adjacent
    dataset2['temp_max'] = dataset2[celltemp_name].max(axis=1)  # max cell temperature
    dataset2['temp_min'] = dataset2[celltemp_name].min(axis=1)  # min cell temperature
    dataset2['temp_mean'] = round(dataset2[celltemp_name].mean(axis=1), 3)  # mean cell temperature per row
    dataset2['temp_diff'] = list(np.array(dataset2['temp_max']) - np.array(dataset2['temp_min']))  # cell temperature spread
    dataset2['temp2_max'] = dataset2[celltemp_name2].max(axis=1)  # max other temperature
    dataset2['temp2_min'] = dataset2[celltemp_name2].min(axis=1)  # min other temperature
    dataset2['temp2_mean'] = round(dataset2[celltemp_name2].mean(axis=1), 3)  # mean other temperature per row
    dataset2['temp2_diff'] = list(np.array(dataset2['temp2_max']) - np.array(dataset2['temp2_min']))  # other temperature spread
    dataset2 = dataset2.drop(celltemp_name + cellvolt_list + celltemp_name2, axis=1)
    return dataset2
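
# Illustrative sketch only (not from the original project): a tiny toy frame
# showing the column layout features_total() appears to expect. The column
# values, the '环境温度' column and the helper name _example_features_total
# are assumptions for demonstration purposes.
def _example_features_total():
    toy = pd.DataFrame({
        'Time': ['2021-01-01 00:00:00', '2021-01-01 00:00:10', '2021-01-01 00:00:20'],
        'sn': ['SN001'] * 3,
        'PackCrnt': [10.0, 12.0, 11.0],
        'PackVolt': [350.0, 351.0, 349.5],
        'BMSSta': [1, 1, 1],
        'PackSoc': [80.0, 79.9, 79.8],
        'CellVoltage1': [3.650, 3.651, 3.652],   # picked up via the 'CellVoltage' substring
        'CellVoltage2': [3.648, 3.649, 3.650],
        'CellVoltage3': [3.652, 3.653, 3.651],
        'CellTemp1': [25.0, 25.1, 25.2],         # picked up via the 'CellTemp' substring
        'CellTemp2': [24.8, 24.9, 25.0],
        '环境温度': [22.0, 22.1, 22.2],           # picked up via the '温度' substring
    })
    return features_total(toy, capacity=100)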
# Build the fault time series: resample each split to a fixed frequency
def makedataset(dataset2, freq):
    df_bms = pd.DataFrame()
    for sp in list(set(dataset2['split'])):
        set2 = dataset2[dataset2['split'] == sp]
        set2.reset_index(drop=True, inplace=True)
        data_set = pd.DataFrame()
        start = set2.loc[0, 'Time']
        end = set2.loc[len(set2)-1, 'Time']
        data_set['Time'] = pd.date_range(start=start, end=end, freq=freq)  # one record every `freq`
        data_set['Time'] = list(map(lambda x: str(x), list(data_set['Time'])))
        set2['Time'] = list(map(lambda x: str(x), list(set2['Time'])))
        dfbms = pd.merge(data_set, set2, left_on='Time', right_on='Time', how='left')
        dfbms = dfbms.ffill()
        dfbms = dfbms.bfill()
        df_bms = pd.concat([df_bms, dfbms])  # DataFrame.append was removed in recent pandas
    df_bms.reset_index(drop=True, inplace=True)
    return df_bms
# Shuffle by vehicle sn and split into train / test sets (80 / 20)
def shuffle_data(dataset_faults):
    sn_fau = list(set(dataset_faults['sn']))
    shuffle(sn_fau)
    newtrain = dataset_faults[dataset_faults['sn'].isin(sn_fau[:int(0.8*len(sn_fau))])]
    newtest = dataset_faults[dataset_faults['sn'].isin(sn_fau[int(0.8*len(sn_fau)):])]
    newtrain.reset_index(drop=True, inplace=True)
    newtest.reset_index(drop=True, inplace=True)
    return newtrain, newtest
# Standardize the training data
def scaler_train(train):
    Xtrain = train.drop(['Time','sn','split'], axis=1)
    Xsc_colnames = list(Xtrain.columns)
    scaler = StandardScaler()
    scaler.fit(Xtrain)  # store the mean and standard deviation of the training set
    Xsc = scaler.transform(np.array(Xtrain))
    Xsc = pd.DataFrame(Xsc)
    Xsc.columns = Xsc_colnames
    Xsc['split'] = train['split'].values
    return Xsc, scaler
# Standardize the test data with the scaler fitted on the training set
def scaler_test_train(test, scaler):
    Xtest = test.drop(['Time','sn','split'], axis=1)
    Xsc_colnames = list(Xtest.columns)
    Xtsc = scaler.transform(np.array(Xtest))
    Xtsc = pd.DataFrame(Xtsc)
    Xtsc.columns = Xsc_colnames
    Xtsc['split'] = test['split'].values
    return Xtsc
# Sliding time-window construction
def create_dataset(data_set, data_train, time_steps=6):  # data_set is the scaled DataFrame, data_train the unscaled one
    a = []
    aa = np.empty(shape=[0, 3])
    index = pd.DataFrame()
    List_n_split = sorted(list(set(data_set['split'])))
    for k in List_n_split:
        dataset = data_set[data_set['split'] == k]
        datatrain = data_train[data_train['split'] == k]
        if len(dataset) > time_steps:
            dataset2 = dataset.reset_index(drop=True)
            dataset = dataset.drop(['split'], axis=1)
            dataX = []
            index_step = []
            for i in range(len(dataset)-time_steps):
                v1 = dataset.iloc[i:(i+time_steps)].values
                dataX.append(v1)
                index_step.append(i)
            dataset3 = dataset2.iloc[:len(dataset2)-time_steps]
            newdatatrain = datatrain[:len(dataset3)]
            dataX2 = np.array(dataX)
            a.append(dataX2)
            index = pd.concat([index, newdatatrain])  # DataFrame.append was removed in recent pandas
    if len(a) > 0:
        aa = np.vstack(a)
    index.reset_index(drop=True, inplace=True)
    return aa, index
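
# Illustrative sketch only (not from the original project): how create_dataset()
# turns a scaled frame plus its unscaled counterpart into windows of shape
# (n_windows, time_steps, n_features). The toy frames and the helper name
# _example_create_dataset are assumptions.
def _example_create_dataset():
    scaled = pd.DataFrame({
        'PackCrnt': np.random.randn(20),
        'volt_diff': np.random.randn(20),
        'split': [0] * 20,
    })
    raw = pd.DataFrame({
        'Time': pd.date_range('2021-01-01', periods=20, freq='10S').astype(str),
        'sn': ['SN001'] * 20,
        'split': [0] * 20,
    })
    windows, conf = create_dataset(scaled, raw, time_steps=6)
    # windows.shape == (14, 6, 2): 20 rows minus 6 steps, 6 time steps, 2 feature columns
    return windows, conf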
# Model training: LSTM autoencoder that reconstructs its own input
def model_train(X, units=60, batch_size=128, epochs=15):
    optimizer = 'adam'  # gradient-based optimizer with the default learning rate
    loss = 'mae'  # mean absolute error
    dropout = 0.30  # dropout rate
    reg = 0.001
    callback = keras.callbacks.EarlyStopping(monitor='loss', patience=2)  # stop once the loss stops improving for 2 epochs
    model = keras.Sequential()
    # input layer: input shape = (N, X.shape[1], X.shape[2])
    model.add(keras.layers.LSTM(units=units, input_shape=(X.shape[1], X.shape[2]), return_sequences=True, kernel_regularizer=keras.regularizers.l2(reg), activity_regularizer=keras.regularizers.l1(reg)))
    model.add(keras.layers.Dropout(rate=dropout))
    # return_sequences=True emits a value at every time step: many-to-many, output shape (N, X.shape[1], units)
    # return_sequences=False would emit shape (N, units)
    # output layer: input shape = (N, X.shape[1], units)
    model.add(keras.layers.TimeDistributed(keras.layers.Dense(X.shape[2])))
    # Dense output shape = (N, X.shape[2])
    # TimeDistributed applies the Dense layer at every time step: output shape (N, X.shape[1], X.shape[2])
    model.compile(loss=loss, optimizer=optimizer)
    # model.fit(X, X, epochs=epochs, batch_size=batch_size, validation_data=(x_test, x_test), shuffle=False, callbacks=[callback])
    model.fit(X, X, epochs=epochs, batch_size=batch_size, validation_split=0.1, shuffle=False, callbacks=[callback])
    return model
# Model prediction: per-column reconstruction-loss statistics
def prediction(model, xtest, Xtsc, conftest):
    test_pred = model.predict(xtest)
    test_loss = np.mean(np.abs(test_pred - xtest), axis=1)  # mean absolute error per window and feature
    col = list(Xtsc.columns)
    col.remove('split')
    test_loss = pd.DataFrame(data=test_loss)
    test_loss.columns = col
    conftest['loss_sum'] = test_loss.sum(axis=1)
    conftest['loss_max'] = test_loss.max(axis=1)
    maxcol = test_loss.quantile(q=0.95, axis=0, numeric_only=True, interpolation='linear')
    mincol = test_loss.quantile(q=0.02, axis=0, numeric_only=True, interpolation='linear')
    test_loss_test = conftest[['loss_max','loss_sum']].mean(axis=0)
    test_loss_test2 = conftest[['loss_max','loss_sum']].max(axis=0)
    test_loss_test3 = conftest[['loss_max','loss_sum']].min(axis=0)
    test_loss_test4 = conftest[['loss_max','loss_sum']].quantile(q=0.9, axis=0, numeric_only=True, interpolation='linear')
    conftest2 = conftest[conftest['loss_max'].notnull()]
    test_loss_test5 = conftest2[['loss_max','loss_sum']].quantile(q=0.05, axis=0, numeric_only=True, interpolation='linear')
    # conftest.to_csv('conftest.csv')
    # test_loss.to_csv('test_loss.csv')
    return test_loss_test, test_loss_test2, test_loss_test3, test_loss_test4, test_loss_test5, maxcol, mincol
# Cross-validate model stability
def cross_val(data_bms5, time_steps, units, batch_size, df_nor):
    # shuffle the training data
    train, test = shuffle_data(data_bms5)
    Xsc, scaler = scaler_train(train)
    Xtsc = scaler_test_train(test, scaler)
    # sliding time windows
    xtrain, conftrain = create_dataset(Xsc, train, time_steps=time_steps)
    xtest, conftest = create_dataset(Xtsc, test, time_steps=time_steps)
    # training
    model = model_train(xtrain, units=units, batch_size=batch_size, epochs=50)
    # per-column loss on the validation (fault) set
    test_loss_fault = prediction(model, xtest, Xtsc, conftest)
    Xtsc_nor = scaler_test_train(df_nor, scaler)
    xtest_nor, conftest_nor = create_dataset(Xtsc_nor, df_nor, time_steps=time_steps)
    test_loss_nor = prediction(model, xtest_nor, Xtsc_nor, conftest_nor)
    cols = list(test_loss_nor[6].index)
    list_delta_loss = []
    for k in range(len(test_loss_fault[5])):
        delta_loss = test_loss_nor[6].iloc[k] - test_loss_fault[5].iloc[k]
        list_delta_loss.append(delta_loss)
    list_delta_loss_serie = pd.Series(list_delta_loss, index=cols)
    key_col = list_delta_loss_serie.idxmax()
    delta_loss_serie_sort = list_delta_loss_serie.sort_values()
    # print(delta_loss_serie_sort)
    return key_col, model, scaler, test_loss_fault, delta_loss_serie_sort
# Automated training pipeline
def train(datatest, dataset_nor):
    # feature engineering: the inputs are expected to already carry the engineered columns
    datatest.reset_index(drop=True, inplace=True)
    # dataset_nor2 = dataset_nor[dataset_nor['temp2_max'].notnull()]
    dataset_nor.reset_index(drop=True, inplace=True)
    # choose the resampling frequency and sliding-window length from the fault duration
    deltatime = []
    for sp in list(datatest['split'].drop_duplicates()):
        fault = datatest[datatest['split'] == sp]
        fault.reset_index(drop=True, inplace=True)
        delta_time = (datetime.datetime.strptime(str(fault.loc[len(fault)-1, 'Time']), '%Y-%m-%d %H:%M:%S') - datetime.datetime.strptime(str(fault.loc[0, 'Time']), '%Y-%m-%d %H:%M:%S')).total_seconds()
        deltatime.append(delta_time)
    time_delta = np.median(deltatime)  # seconds; connector over-temperature: 110.8, NTC temperature drift: 594, low charge: 396077, B-board sampling failure: 373650
    if time_delta > 259200:  # longer than 3 days
        freq = 'T'
        df_fault = makedataset(datatest, freq)  # 1-minute step
        df_nor = makedataset(dataset_nor, freq)
        time_steps = 10  # 10 min
    else:
        freq = '10S'
        df_fault = makedataset(datatest, freq)  # 10-second step
        df_nor = makedataset(dataset_nor, freq)
        time_steps = 6 if time_delta < 180 else 30 if time_delta < 600 else 60  # 1 min / 5 min / 10 min
    # choose model parameters
    units = 30
    # print(len(df_fault))
    # batch size grows with the data volume; connector over-temperature: 25768, NTC temperature drift: 152363
    batch_size = 16 if len(df_fault) < 20000 else 32 if len(df_fault) < 50000 else 64 if len(df_fault) < 100000 else 96 if len(df_fault) < 200000 else 128
    # 10 repeated random splits (10-fold style cross-validation)
    list_model = []
    list_scaler = []
    list_key_col = []
    list_loss_fault = []
    list_deltaloss_sort = []
    for k in range(10):
        key_col, model, scaler, test_loss_fault, delta_loss_serie_sort = cross_val(df_fault, time_steps, units, batch_size, df_nor)
        list_model.append(model)
        list_scaler.append(scaler)
        list_key_col.append(key_col)
        list_loss_fault.append(test_loss_fault)
        list_deltaloss_sort.append(delta_loss_serie_sort)
    # assess model stability: a single distinctive feature column
    # if (list_key_col.count(list_key_col[0]) == len(list_key_col)) & (np.array(list_key_deltaloss).all()>0):
    maxlabel = max(list_key_col, key=list_key_col.count)
    list_key_deltaloss = [list_deltaloss_sort[i][maxlabel] for i in list(range(10))]
    if (list_key_col.count(maxlabel) > 6) and (np.array(list_key_deltaloss) > 0).all():
        # loss threshold
        a = []
        for j in range(10):
            if list(list_deltaloss_sort[j].index)[-1] == maxlabel:
                b = list_deltaloss_sort[j][maxlabel]
                a.append(b)
        index_delmax = list_key_deltaloss.index(max(a))
        test_loss_fault, model, scaler = list_loss_fault[index_delmax], list_model[index_delmax], list_scaler[index_delmax]
        loss_th_max = round(test_loss_fault[5][maxlabel], 2)
        # key_col = list(set(list_key_col))
        # aa = []
        # for k in key_col:
        #     for i in range(len(list_deltaloss_sort)):
        #         a = list_deltaloss_sort[i][k]
        #         aa.append(a)
        # if np.array(aa).all()>0:  # assess model stability: mixed feature columns
        #     # loss threshold
        #     bb = []
        #     for i in range(len(list_deltaloss_sort)):
        #         b = sum(list(list_deltaloss_sort[i][key_col]))
        #         bb.append(b)
        #     index_delmax = bb.index(max(bb))
        #     test_loss_fault, model, scaler, deltaloss_sort = list_loss_fault[index_delmax], list_model[index_delmax], list_scaler[index_delmax], list_deltaloss_sort[index_delmax]
        #     loss_th_max = []
        #     for key in key_col:
        #         loss_max = round(test_loss_fault[5][key], 2) + float(deltaloss_sort[key])/10
        #         loss_th_max.append(loss_max)
        # Plan2
    else:
        # print('model is not stable')
        model, scaler, loss_th_max = '', '', ''
    return model, scaler, loss_th_max, time_steps, maxlabel, freq
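
# Illustrative sketch only (not from the original project): one possible way to
# drive the pipeline end to end. The CSV paths and output file names are
# assumptions; the original presumably loads datatest / dataset_nor from MySQL
# (pymysql is imported above) with the engineered features already attached.
if __name__ == '__main__':
    datatest = pd.read_csv('fault_data.csv')      # fault snippets with 'Time', 'sn', 'split' and engineered features
    dataset_nor = pd.read_csv('normal_data.csv')  # normal-vehicle reference data with the same columns
    model, scaler, loss_th_max, time_steps, maxlabel, freq = train(datatest, dataset_nor)
    if model != '':  # train() returns empty strings when the model is judged unstable
        model.save('fault_model.h5')              # persist the Keras model
        with open('fault_scaler.pkl', 'wb') as f:
            pickle.dump(scaler, f)                # persist the fitted StandardScaler
        print(maxlabel, loss_th_max, time_steps, freq)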