煉丹記之國家電投2020風電機組異常資料識別與清洗 baseline f1=0.858分享

2020-09-22 11:00:34

賽題地址:

https://www.datafountain.cn/competitions/451

賽題任務:

依據提供的12臺風力電機1年的10min間隔SCADA執行資料,包括時間戳資訊、風速資訊和功率資訊等,利用機器學習相關技術,建立魯棒的風電機組異常資料檢測模型,用於識別並剔除潛在的異常資料,提高資料品質。
此任務未給出異常資料標籤,視為聚類任務,為引導選手向賽題需求對接,現簡單闡述異常資料定義。異常資料是由風機執行過程與設計執行工況出現較大偏離時產生,如風速儀測風異常導致採集的功率散點明顯偏離設計風功率。

資料介紹:

https://www.datafountain.cn/competitions/451/datasets

以下是線上 f1 0.858方案:

#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import numpy as np
from tqdm import tqdm
from matplotlib import pyplot as plt

# Column names applied to the turbine specification table. Because `names`
# is passed without `header`, pandas treats the file as header-less.
fan_info_columns = [
    "WindNumber",
    "fan_diam",
    "rated_power",
    "speed_in",
    "speed_out",
    "speed_min",
    "speed_max",
    "speed_range",
]

# Raw SCADA records and the per-turbine specification table.
data_df = pd.read_csv('../data/dataset.csv')
fan_info = pd.read_csv('../data/12faninfo.csv', names=fan_info_columns)

# Attach each turbine's specification to its records (inner join on the
# turbine id), then start with every record marked normal (label 0); the
# rules that follow flip anomalous records to 1.
data_fan_df = pd.merge(data_df, fan_info, on='WindNumber')
data_fan_df['label'] = 0

# Rule-based anomalies: any record matching at least one rule gets label 1.
wind_speed = data_fan_df['WindSpeed']
power = data_fan_df['Power']
rotor_speed = data_fan_df['RotorSpeed']

rule_masks = [
    # Any of the three measured signals is negative.
    wind_speed < 0,
    power < 0,
    rotor_speed < 0,
    # Wind speed below cut-in, yet power is being produced.
    (wind_speed < data_fan_df['speed_in']) & (power > 0),
    # Wind speed at or above cut-in, yet power is zero or negative.
    (wind_speed >= data_fan_df['speed_in']) & (power <= 0),
    # Wind speed above cut-out, yet power is being produced.
    (wind_speed > data_fan_df['speed_out']) & (power > 0),
    # Power exceeds 1.2x the rated power.
    power > 1.2 * data_fan_df['rated_power'],
    # Rotor speed outside the [0.8 * speed_min, 1.2 * speed_max] band.
    rotor_speed > 1.2 * data_fan_df['speed_max'],
    rotor_speed < 0.8 * data_fan_df['speed_min'],
]

# The masks depend only on the signal/spec columns, never on 'label', so
# evaluating them up front and applying them in order gives the same result
# as interleaved compare-and-assign statements.
for rule_mask in rule_masks:
    data_fan_df.loc[rule_mask, 'label'] = 1

# Materialise the row position as an explicit 'index' column so that results
# of later per-turbine processing can be merged back onto the full table.
# (The original performed this reset twice; once is sufficient.)
data_fan_df_cp = data_fan_df.reset_index()

# Only records that passed every rule-based check go on to the quantile
# stage. The mask is taken from the same frame that is being filtered, so it
# does not rely on two different frames happening to share an index.
data_fan_df_filter = data_fan_df_cp[data_fan_df_cp['label'] == 0].copy()

# Quantile-band filter. For every ordered pair of distinct signals, bin one
# signal (cut_col) per turbine and flag the records whose other signal
# (corr_col) lies outside [low_ratio, hig_ratio] x the 75th percentile of
# corr_col within that bin. Flags accumulate in data_fan_df_filter['label'],
# so each pass only examines records that are still considered normal.
signal_cols = ['Power', 'RotorSpeed', 'WindSpeed']
low_ratio = 0.5
hig_ratio = 1.5

for corr_col in tqdm(signal_cols):
    for cut_col in signal_cols:
        if corr_col == cut_col:
            continue

        data_df = data_fan_df_filter[data_fan_df_filter['label'] == 0].copy()
        if data_df.empty:
            # Nothing left to examine; without this guard the pass would
            # fail when naming the columns of an empty result.
            continue

        cut_col_name = cut_col + '_bin'
        corr_col_name = corr_col + 'Qt75'
        # Number of bins: five per integer unit of the signal's overall
        # maximum. It does not depend on the turbine, so compute it once.
        cut_bin = int(data_fan_df_filter[cut_col].max()) * 5
        # Per-turbine verdicts are collected here and concatenated once,
        # instead of growing a DataFrame inside the loop.
        turbine_labels = []

        for i in data_df['WindNumber'].unique():
            temp_data_df = data_df[data_df['WindNumber'] == i].copy()
            print('cut col and bin is :', cut_col, cut_bin, 'temp data rows:', temp_data_df.shape[0])
            # Represent each bin by its left edge.
            temp_data_df[cut_col_name] = pd.cut(temp_data_df[cut_col], cut_bin, duplicates='drop').apply(lambda x: x.left)

            # 75th percentile of corr_col inside every bin of cut_col.
            bin_quantiles = pd.DataFrame(temp_data_df.groupby([cut_col_name])[corr_col].quantile(0.75))
            bin_quantiles.columns = [corr_col_name]

            temp_data_df = temp_data_df.merge(bin_quantiles, on=cut_col_name)
            temp_x = temp_data_df[cut_col_name]
            temp_y = temp_data_df[corr_col]

            # The mask is needed for both labelling and plotting; build once.
            outlier_mask = (
                (temp_data_df[corr_col] < low_ratio * temp_data_df[corr_col_name])
                | (temp_data_df[corr_col] > hig_ratio * temp_data_df[corr_col_name])
            )
            temp_data_df.loc[outlier_mask, 'label'] = 1
            turbine_labels.append(temp_data_df[['index', 'label']].copy())

            outlier_df = temp_data_df[outlier_mask]
            print(outlier_df.shape)
            # Scatter the flagged points, then all points (faded) on top.
            plt.scatter(outlier_df[cut_col_name], outlier_df[corr_col])
            plt.scatter(temp_x, temp_y, alpha=0.1)
            plt.xlabel(cut_col)
            plt.ylabel(corr_col)
            plt.show()

        cut_label = pd.concat(turbine_labels, axis=0)
        cut_label.columns = ['index', 'labelcut']
        # Records that were not examined in this pass (already flagged) get
        # no 'labelcut' from the merge; treat that as "no new flag".
        data_fan_df_filter = data_fan_df_filter.merge(cut_label, how='outer', on='index')
        data_fan_df_filter['labelcut'] = data_fan_df_filter['labelcut'].fillna(0)
        data_fan_df_filter['label'] = data_fan_df_filter['label'] + data_fan_df_filter['labelcut']
        data_fan_df_filter['label'] = data_fan_df_filter['label'].replace(2, 1)
        data_fan_df_filter.drop(['labelcut'], axis=1, inplace=True)

# Fold the quantile-stage verdicts back into the full table, keyed by the
# 'index' column.
filter_res = data_fan_df_filter[['index', 'label']].rename(columns={'label': 'labelres'})
data_fan_df_cp = data_fan_df_cp.merge(filter_res, how='outer', on='index')
# Rows missing from the quantile stage get no 'labelres' from the merge;
# they are filled with 1 (anomalous).
data_fan_df_cp['labelres'] = data_fan_df_cp['labelres'].fillna(1)

# A record is anomalous if either stage flagged it. The merge/fillna above
# upcasts the column to float, so cap the sum at 1 and cast back to int;
# otherwise the submission file would contain 0.0/1.0 instead of 0/1.
combined_label = data_fan_df_cp['label'] + data_fan_df_cp['labelres']
data_fan_df_cp['label'] = combined_label.clip(upper=1).astype(int)
data_fan_df_cp.drop(['labelres'], axis=1, inplace=True)

# Submission file: one row per record with its turbine id, timestamp and
# 0/1 anomaly label; the DataFrame index is not written.
data_sub = data_fan_df_cp[['WindNumber', 'Time', 'label']].copy()
data_sub.to_csv('../subs/sub_6_cut_low0.5_hight1.5.csv', index=False)

以上提交結果,線上約為:0.85824