賽題地址:
https://www.datafountain.cn/competitions/451
賽題任務:
依據提供的12臺風力電機1年的10min間隔SCADA執行資料,包括時間戳資訊、風速資訊和功率資訊等,利用機器學習相關技術,建立魯棒的風電機組異常資料檢測模型,用於識別並剔除潛在的異常資料,提高資料品質。
此任務未給出異常資料標籤,視為聚類任務,為引導選手向賽題需求對接,現簡單闡述異常資料定義。異常資料是由風機執行過程與設計執行工況出現較大偏離時產生,如風速儀測風異常導致採集的功率散點明顯偏離設計風功率。
資料介紹:
https://www.datafountain.cn/competitions/451/datasets
以下是線上 f1 0.858方案:
#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt
# Load the raw SCADA records and the per-turbine parameter table, join them
# on the turbine id, and start every row with label 0 (0 = normal,
# 1 = anomalous, as used by the rules that follow).
data_df = pd.read_csv('../data/dataset.csv')
fan_info = pd.read_csv(
    '../data/12faninfo.csv',
    # Column names are supplied explicitly rather than read from the file.
    names=[
        "WindNumber",
        "fan_diam",
        "rated_power",
        "speed_in",
        "speed_out",
        "speed_min",
        "speed_max",
        "speed_range",
    ],
)
data_fan_df = pd.merge(data_df, fan_info, on='WindNumber').assign(label=0)
# Rule-based anomaly screening.
# Every rule below is an independent boolean condition; a row is flagged
# (label = 1) as soon as any one of them holds.
wind = data_fan_df['WindSpeed']
power = data_fan_df['Power']
rotor = data_fan_df['RotorSpeed']

rule_hits = (
    # Any of the three measured columns is negative.
    (wind < 0)
    | (power < 0)
    | (rotor < 0)
    # Wind speed below cut-in, yet power is produced.
    | ((wind < data_fan_df['speed_in']) & (power > 0))
    # Wind speed at or above cut-in, yet power is zero or negative.
    | ((wind >= data_fan_df['speed_in']) & (power <= 0))
    # Wind speed above cut-out, yet power is produced.
    | ((wind > data_fan_df['speed_out']) & (power > 0))
    # Power exceeds 1.2 times the rated power.
    | (power > 1.2 * data_fan_df['rated_power'])
    # Rotor speed outside the [0.8 * speed_min, 1.2 * speed_max] band.
    | (rotor > 1.2 * data_fan_df['speed_max'])
    | (rotor < 0.8 * data_fan_df['speed_min'])
)
data_fan_df.loc[rule_hits, 'label'] = 1
# Snapshot of the full table with the former row index exposed as an
# 'index' column; that column is the join key used later to carry labels
# back from the filtered subset. (The original ran this statement twice.)
data_fan_df_cp = data_fan_df.reset_index()
# Working subset: only rows the rule-based screening left at label 0.
# The mask is taken from the same frame it indexes, so the selection does
# not rely on two different frames sharing an identical index.
data_fan_df_filter = data_fan_df_cp[data_fan_df_cp['label'] == 0].copy()
# Quantile-band filter.
# For every ordered pair (corr_col, cut_col) of distinct feature columns and
# for every turbine: bin the rows by cut_col, take the 75% quantile of
# corr_col inside each bin, and flag rows whose corr_col lies below
# low_ratio * Q75 or above hig_ratio * Q75. Flags accumulate in
# data_fan_df_filter['label'], so later passes only see rows that are still
# labelled 0.
low_ratio = 0.5
hig_ratio = 1.5
for corr_col in tqdm(['Power', 'RotorSpeed', 'WindSpeed']):
    for cut_col in ['Power', 'RotorSpeed', 'WindSpeed']:
        if corr_col == cut_col:
            continue

        # Rows still considered normal at the start of this pass.
        data_df = data_fan_df_filter[data_fan_df_filter['label'] == 0].copy()
        cut_col_name = cut_col + '_bin'
        corr_col_name = corr_col + 'Qt75'
        # The bin count depends only on the table as it stands at the start
        # of the pass, so it is computed once instead of once per turbine.
        cut_bin = int(data_fan_df_filter[cut_col].max()) * 5

        # Per-turbine (index, label) frames, concatenated once after the
        # loop instead of growing a DataFrame with repeated pd.concat.
        per_fan_labels = []
        for i in data_df['WindNumber'].unique():
            temp_data_df = data_df[data_df['WindNumber'] == i].copy()
            print('cut col and bin is :', cut_col, cut_bin, 'temp data rows:', temp_data_df.shape[0])
            # Replace each interval by its left edge so the bin can be used
            # both as a merge key and as the x coordinate of the plot.
            temp_data_df[cut_col_name] = pd.cut(
                temp_data_df[cut_col], cut_bin, duplicates='drop'
            ).apply(lambda x: x.left)
            # observed=True skips empty bins; they can never match a row in
            # the inner merge below, so the merged result is unchanged.
            quantile_df = pd.DataFrame(
                temp_data_df.groupby([cut_col_name], observed=True)[corr_col].quantile(0.75)
            )
            quantile_df.columns = [corr_col_name]
            temp_data_df = temp_data_df.merge(quantile_df, on=cut_col_name)

            # Rows outside the [low_ratio, hig_ratio] * Q75 band.
            outlier_mask = (
                (temp_data_df[corr_col] < low_ratio * temp_data_df[corr_col_name])
                | (temp_data_df[corr_col] > hig_ratio * temp_data_df[corr_col_name])
            )
            temp_data_df.loc[outlier_mask, 'label'] = 1
            per_fan_labels.append(temp_data_df[['index', 'label']].copy())

            outlier_df = temp_data_df[outlier_mask]
            print(outlier_df.shape)
            # Scatter the flagged points first, then all points of this
            # turbine on top with low opacity.
            plt.scatter(outlier_df[cut_col_name], outlier_df[corr_col])
            plt.scatter(temp_data_df[cut_col_name], temp_data_df[corr_col], alpha=0.1)
            plt.xlabel(cut_col)
            plt.ylabel(corr_col)
            plt.show()

        if not per_fan_labels:
            # No row is left at label 0, so this pass has nothing to update.
            continue

        cut_label = pd.concat(per_fan_labels, axis=0)
        cut_label.columns = ['index', 'labelcut']
        # Rows that were not part of this pass (already flagged) get no
        # labelcut from the merge; treat that as "no new flag".
        data_fan_df_filter = data_fan_df_filter.merge(cut_label, how='outer', on='index')
        data_fan_df_filter['labelcut'] = data_fan_df_filter['labelcut'].fillna(0)
        data_fan_df_filter['label'] = data_fan_df_filter['label'] + data_fan_df_filter['labelcut']
        # Clamp 1 + 1 back to 1 so label stays a 0/1 flag.
        data_fan_df_filter['label'] = data_fan_df_filter['label'].replace(2, 1)
        data_fan_df_filter.drop(['labelcut'], axis=1, inplace=True)
# Carry the quantile-filter verdicts back onto the full table and write the
# submission file.
filter_res = data_fan_df_filter[['index', 'label']].rename(columns={'label': 'labelres'})
data_fan_df_cp = data_fan_df_cp.merge(filter_res, how='outer', on='index')
# Rows absent from the filtered table were excluded from it because the
# rule-based screening had already flagged them, so they count as 1.
data_fan_df_cp['labelres'] = data_fan_df_cp['labelres'].fillna(1)
combined_label = data_fan_df_cp['label'] + data_fan_df_cp['labelres']
# A row flagged by both stages sums to 2; clamp it back to 1.
data_fan_df_cp['label'] = combined_label.replace(2, 1)
data_fan_df_cp = data_fan_df_cp.drop(columns=['labelres'])
data_sub = data_fan_df_cp[['WindNumber', 'Time', 'label']].copy()
data_sub.to_csv('../subs/sub_6_cut_low0.5_hight1.5.csv', index=None)
以上提交結果,線上約為:0.85824