# anomalyPCA.py

# Thermal-runaway early warning: PCA-based anomaly index
import pandas as pd
import numpy as np
from scipy.signal import savgol_filter
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA
# Feature selection and cleaning
def makedataset1(df_data):
    # Drop status/metadata columns that are irrelevant to thermal-runaway detection
    df_data = df_data.drop(['Unnamed: 0', '总电流[A]', 'GSM信号', '外电压', 'SOH[%]', '开关状态', '充电状态', '故障等级', '故障代码', '绝缘电阻', '上锁状态', '加热状态', '单体均衡状态', '总输出状态'], axis=1, errors='ignore')
    df_data = df_data.drop(["单体温度" + str(i) for i in range(1, 5)], axis=1, errors='ignore')
    df_data = df_data.drop(["其他温度" + str(i) for i in range(1, 7)], axis=1, errors='ignore')
    # Keep only rows where every cell voltage lies in a plausible range
    listV = [s for s in list(df_data) if '单体电压' in s]
    for i in range(1, len(listV) + 1):
        df_data = df_data[(df_data['单体电压' + str(i)] > 2200) & (df_data['单体电压' + str(i)] < 4800)]
    df_data = df_data[df_data['SOC[%]'] > 20]
    # Truncate the timestamp string and average all rows that share the truncated key
    df_data['时间'] = [df_data.loc[i, '时间戳'][0:15] for i in df_data.index]
    df_data = df_data.drop('时间戳', axis=1)
    data_set = df_data.groupby('时间').mean(numeric_only=False)
    # Savitzky-Golay smoothing of every aggregated column
    for k in data_set.columns:
        data_set[k] = savgol_filter(data_set[k], 3, 2)
    return data_set
# Derived statistical features
def makedataset2(df_data):
    data_set = makedataset1(df_data)
    listV = [s for s in list(df_data) if '单体电压' in s]
    cell_cols = ["单体电压" + str(i) for i in range(1, len(listV) + 1)]
    data_set["最低单体电压"] = data_set[cell_cols].min(axis=1)
    data_set["最高单体电压"] = data_set[cell_cols].max(axis=1)
    data_set["平均单体电压"] = data_set[cell_cols].mean(axis=1)
    data_set["最大单体压差"] = data_set["最高单体电压"] - data_set["最低单体电压"]
    data_set["低压差"] = data_set["平均单体电压"] - data_set["最低单体电压"]
    # Replace the individual cell voltages with the aggregate statistics above
    data_set = data_set.drop(cell_cols, axis=1)
    return data_set
# Standardization with a robust scaler (median/IQR, less sensitive to outliers)
def process(data_set):
    features = data_set.columns
    sX = RobustScaler(copy=True)
    data_set2 = data_set.copy()
    data_set2.loc[:, features] = sX.fit_transform(data_set2[features])
    return data_set2
# Anomaly index: per-row squared reconstruction error, min-max normalized to [0, 1]
def anomalyScores(originalDF, reducedDF):
    loss = np.sum((np.array(originalDF) - np.array(reducedDF)) ** 2, axis=1)
    loss = pd.Series(data=loss, index=originalDF.index)
    loss = (loss - np.min(loss)) / (np.max(loss) - np.min(loss))
    return loss
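
# A minimal illustration of anomalyScores on toy data (not part of the original
# pipeline): rows identical to their reconstruction get score 0, the worst row gets 1.
#   orig  = pd.DataFrame([[1.0, 2.0], [1.0, 2.0], [4.0, 6.0]])
#   recon = pd.DataFrame([[1.0, 2.0], [1.1, 2.1], [1.0, 2.0]])
#   anomalyScores(orig, recon).tolist()  # -> approximately [0.0, 0.0008, 1.0]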
# Build the PCA model
def anomalyPCA(x_train_pro):
    n_components = 4
    whiten = True
    random_state = 2
    pca = PCA(n_components=n_components, whiten=whiten, random_state=random_state)
    pca.fit(x_train_pro)
    return pca
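
# Optional sketch (not in the original code): report how much variance the retained
# components explain, as a sanity check on the fixed n_components=4 choice.
def explained_variance_report(pca):
    ratios = pca.explained_variance_ratio_
    for i, r in enumerate(ratios, start=1):
        print(f"PC{i}: {r:.3f}")
    print(f"total: {ratios.sum():.3f}")
    return ratios.sum()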
# Compute the PCA anomaly index for a (preprocessed) data set
def transform(df_data_pro, model, df_data):
    # Project onto the principal components
    X_train = model.transform(df_data_pro)
    X_train = pd.DataFrame(data=X_train, index=df_data_pro.index)
    # Reconstruct back into the original feature space
    X_train_inverse = model.inverse_transform(X_train)
    X_train_inverse = pd.DataFrame(data=X_train_inverse, index=df_data_pro.index)
    # Anomaly index from the reconstruction error
    anomalyScoresModel = anomalyScores(df_data_pro, X_train_inverse)
    df_data2 = df_data.copy()
    # The score column name embeds str(model); detect_outliers() looks it up by the same string
    if len(anomalyScoresModel) > 15:
        anomalyScoresModel = savgol_filter(anomalyScoresModel, 15, 3)
        df_data2['anomalyScores_' + str(model)] = anomalyScoresModel
    else:
        df_data2['anomalyScores_' + str(model)] = 0
    return df_data2
# Outlier detection: flag test scores that are z-score outliers relative to the
# training scores AND exceed the maximum score seen during training
def detect_outliers(data, pred, threshold=3):
    anomaly = data['anomalyScores_PCA(n_components=4, random_state=2, whiten=True)']
    anomalypred = pred['anomalyScores_PCA(n_components=4, random_state=2, whiten=True)']
    mean_d = np.mean(anomaly.values)
    std_d = np.std(anomaly.values)
    max_score = np.max(anomaly.values)
    outlier_frames = []
    for k in anomalypred.index:
        z_score = (anomalypred[k] - mean_d) / std_d
        if (np.abs(z_score) > threshold) and (anomalypred[k] > max_score):
            outlier_frames.append(pred[anomalypred.values == anomalypred[k]])
    # DataFrame.append was removed in pandas 2.0, so collect and concatenate instead
    outliers2 = pd.concat(outlier_frames) if outlier_frames else pd.DataFrame()
    return outliers2
# Train both PCA models: one on the cleaned raw features, one on the derived statistics
def train_model(data_train):
    x_train1 = makedataset1(data_train)
    x_train2 = makedataset2(data_train)
    x_train_pro1 = process(x_train1)
    x_train_pro2 = process(x_train2)
    pca1 = anomalyPCA(x_train_pro1)
    pca2 = anomalyPCA(x_train_pro2)
    res1 = transform(x_train_pro1, pca1, x_train1)
    res2 = transform(x_train_pro2, pca2, x_train2)
    return pca1, pca2, res1, res2
# Score new data with the trained models
def prediction(data_test, pca1, pca2):
    x_test1 = makedataset1(data_test)
    x_test2 = makedataset2(data_test)
    x_test_pro1 = process(x_test1)
    x_test_pro2 = process(x_test2)
    pred1 = transform(x_test_pro1, pca1, x_test1)
    pred2 = transform(x_test_pro2, pca2, x_test2)
    return pred1, pred2
# Box-plot style upper threshold for the '低压差' (mean-minus-min cell voltage) feature
def boxplot_fill(res2):
    col = res2['低压差']
    # IQR: difference between the 75th and 25th percentiles
    iqr = col.quantile(0.75) - col.quantile(0.25)
    # Upper threshold for outliers derived from the IQR
    u_th = col.quantile(0.75) + 2 * iqr  # upper bound
    return u_th
# Final anomaly decision: keep only time windows flagged by both models,
# with SOC above 50% and '低压差' above the box-plot threshold
def check_anomaly(outliers1, outliers2, res2):
    outliers = pd.merge(outliers1, outliers2, on='时间')
    outliers = outliers[outliers['SOC[%]_x'] > 50]
    outliers = outliers.drop(['总电压[V]_y', '单体压差_y', 'SOC[%]_y'], axis=1)
    u_th = boxplot_fill(res2)
    outliers = outliers[outliers['低压差'] > u_th]
    return outliers
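
# End-to-end usage sketch. The CSV paths are hypothetical, and it is assumed the files
# contain the column names referenced above ('时间戳', 'SOC[%]', '单体电压1', ...);
# adapt both to the actual data source.
if __name__ == "__main__":
    data_train = pd.read_csv("train_vehicle.csv")  # hypothetical path: known-normal history
    data_test = pd.read_csv("test_vehicle.csv")    # hypothetical path: data to be screened
    pca1, pca2, res1, res2 = train_model(data_train)
    pred1, pred2 = prediction(data_test, pca1, pca2)
    outliers1 = detect_outliers(res1, pred1)
    outliers2 = detect_outliers(res2, pred2)
    anomalies = check_anomaly(outliers1, outliers2, res2)
    print(anomalies)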