
The usual stuff I do with LightGBM

Parameter tuning with Optuna, plus cross-validation (CV)

LightGBM without the scikit-learn interface

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
import optuna
import lightgbm as lgb

def get_evaluate(y_test, predict):
    """Compute AUC, precision and recall from predicted probabilities."""
    # AUC is computed from the raw probabilities
    fpr, tpr, thr_arr = metrics.roc_curve(y_test, predict)
    auc = metrics.auc(fpr, tpr)

    # precision/recall need hard labels, so round the probabilities at 0.5
    pred_labels = np.rint(predict)
    precision = metrics.precision_score(y_test, pred_labels)
    recall = metrics.recall_score(y_test, pred_labels)

    return auc, precision, recall

def objective(trial):
    """Optuna objective: train on a sub-split of the current fold's training
    data and return the validation AUC."""
    # X_train / y_train are the training portion of the current outer CV fold
    train_x, test_x, train_y, test_y = train_test_split(X_train, y_train, test_size=0.25,
                                                        shuffle=True, stratify=y_train)
    dtrain = lgb.Dataset(train_x, label=train_y)

    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1.0, log=True)
    }

    # boosting-type-specific hyperparameters
    if param['boosting_type'] == 'dart':
        param['drop_rate'] = trial.suggest_float('drop_rate', 1e-8, 1.0, log=True)
        param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)
    if param['boosting_type'] == 'goss':
        param['top_rate'] = trial.suggest_float('top_rate', 0.0, 1.0)
        param['other_rate'] = trial.suggest_float('other_rate', 0.0, 1.0 - param['top_rate'])

    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(test_x)

    # AUC is computed from the predicted probabilities, not rounded labels
    fpr, tpr, thr_arr = metrics.roc_curve(test_y, preds)
    auc = metrics.auc(fpr, tpr)

    return auc

k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

lgbm_params = {'objective': 'binary'}

auc_list = []
precision_list = []
recall_list = []

for train_index, test_index in skf.split(X, y):

    X_train = X.iloc[train_index]
    y_train = y.iloc[train_index]
    X_test = X.iloc[test_index]
    y_test = y.iloc[test_index]

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    print('Number of finished trials: {}'.format(len(study.trials)))

    print('Best trial:')
    trial = study.best_trial

    print('  Value: {}'.format(trial.value))

    print('  Params: ')
    for key, value in trial.params.items():
        print('    {}: {}'.format(key, value))    

    # Parameters found by the Optuna search (copy, then add the fixed objective)
    lgbm_params = dict(trial.params)
    lgbm_params['objective'] = 'binary'

    # Here, valid is used for model evaluation (early stopping) and eval for hold-out
    # validation; choose the split sizes based on the size of your dataset
    X_eval, X_valid, y_eval, y_valid = train_test_split(X_test, y_test, random_state=90, 
                                                        shuffle=True, stratify=y_test, test_size=0.3)

    # Build the LightGBM datasets
    lgb_train = lgb.Dataset(X_train, y_train)

    # Validation set used for early stopping
    lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

    # early_stopping_rounds was removed from lgb.train in LightGBM 4.x,
    # so early stopping is passed as a callback instead
    model = lgb.train(lgbm_params,
                      lgb_train,
                      valid_sets=[lgb_valid],
                      num_boost_round=100000,
                      callbacks=[lgb.early_stopping(stopping_rounds=10)])

    # Evaluate on the hold-out eval split (X_valid was already used for early stopping)
    predict = model.predict(X_eval, num_iteration=model.best_iteration)

    auc, precision, recall = get_evaluate(y_eval, predict)

    print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))

    auc_list.append(auc)
    precision_list.append(precision)
    recall_list.append(recall)

# Average the metrics over the k folds
print('K-fold mean AUC:{}, precision:{}, recall:{}'.format(np.mean(auc_list),
                                                           np.mean(precision_list),
                                                           np.mean(recall_list)))

LightGBM with the scikit-learn interface

Maybe next time.
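
Until that post exists, here is a minimal sketch of what the scikit-learn-interface version might look like. This is an assumption about the rough shape, not the final write-up: it reuses splits named like the ones above (X_train/y_train, X_valid/y_valid for early stopping, X_eval/y_eval for hold-out evaluation), which you would need to create yourself.

# Minimal sketch (assumed shape, not the final write-up): the same training step
# via the scikit-learn interface, using splits like X_train/X_valid/X_eval above.
import lightgbm as lgb
from sklearn import metrics

clf = lgb.LGBMClassifier(objective='binary', n_estimators=100000)
clf.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        eval_metric='auc',
        callbacks=[lgb.early_stopping(stopping_rounds=10)])

# predict_proba returns class probabilities; column 1 is the positive class
proba = clf.predict_proba(X_eval)[:, 1]
print('AUC:', metrics.roc_auc_score(y_eval, proba))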