LightGBMでよく使う定型コードのメモ
パラメータチューニング(optuna)、CV
scikit-learn インターフェースを使わずLightGBM
# NOTE(review): this script assumes `X` (features) and `y` (binary target)
# are already defined as pandas objects (it uses .iloc) before the CV loop
# runs — they are not created here.
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold
import optuna
import lightgbm as lgb


def get_evaluate(y_test, predict):
    """Return (auc, precision, recall) for true labels vs. predicted scores.

    AUC is computed from the raw predicted probabilities; precision and
    recall require hard class labels, so the probabilities are rounded
    (threshold 0.5) first.
    """
    fpr, tpr, _thresholds = metrics.roc_curve(y_test, predict)
    auc = metrics.auc(fpr, tpr)
    # BUG FIX: the original passed raw probabilities straight to
    # precision_score / recall_score (which expect binary labels) and
    # returned the undefined name `recall_score`, raising NameError.
    pred_labels = np.rint(predict)
    precision = metrics.precision_score(y_test, pred_labels)
    recall = metrics.recall_score(y_test, pred_labels)
    return auc, precision, recall


def objective(trial):
    """Optuna objective: train LightGBM on a random 75/25 split of the
    current fold's training data (module-level X_train / y_train) and
    return the validation AUC to maximize.
    """
    train_x, test_x, train_y, test_y = train_test_split(
        X_train, y_train, test_size=0.25, shuffle=True, stratify=y_train)
    dtrain = lgb.Dataset(train_x, label=train_y)
    param = {
        'objective': 'binary',
        'metric': 'auc',
        'verbosity': -1,
        # 'boosting' is an accepted LightGBM alias for 'boosting_type',
        # so trial.params can later be passed to lgb.train unchanged.
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt', 'dart', 'goss']),
        'num_leaves': trial.suggest_int('num_leaves', 10, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0),
    }
    if param['boosting_type'] == 'dart':
        param['drop_rate'] = trial.suggest_loguniform('drop_rate', 1e-8, 1.0)
        param['skip_drop'] = trial.suggest_loguniform('skip_drop', 1e-8, 1.0)
    if param['boosting_type'] == 'goss':
        param['top_rate'] = trial.suggest_uniform('top_rate', 0.0, 1.0)
        param['other_rate'] = trial.suggest_uniform(
            'other_rate', 0.0, 1.0 - param['top_rate'])
    gbm = lgb.train(param, dtrain)
    preds = gbm.predict(test_x)
    # BUG FIX: AUC must be computed on the raw scores; rounding them to
    # 0/1 first (as the original did via np.rint) throws away the ranking
    # information that AUC measures.
    fpr, tpr, _thresholds = metrics.roc_curve(test_y, preds)
    return metrics.auc(fpr, tpr)


if __name__ == '__main__':
    # Stratified k-fold CV. Inside each fold: Optuna searches
    # hyperparameters, a final model is trained with early stopping on a
    # validation slice, and metrics are computed on a hold-out slice.
    k = 5
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=0)

    auc_list = []
    precision_list = []
    recall_list = []
    for train_index, test_index in skf.split(X, y):
        X_train = X.iloc[train_index]
        y_train = y.iloc[train_index]
        X_test = X.iloc[test_index]
        y_test = y.iloc[test_index]

        study = optuna.create_study(direction='maximize')
        study.optimize(objective, n_trials=100)
        print('Number of finished trials: {}'.format(len(study.trials)))
        print('Best trial:')
        trial = study.best_trial
        print('  Value: {}'.format(trial.value))
        print('  Params: ')
        for key, value in trial.params.items():
            print('    {}: {}'.format(key, value))

        # Parameters found by Optuna; add back the fixed objective.
        trial.params['objective'] = 'binary'
        lgbm_params = trial.params

        # valid: used for early stopping / model selection.
        # eval: untouched hold-out used for the reported metrics.
        # The split size should be tuned to the dataset.
        X_eval, X_valid, y_eval, y_valid = train_test_split(
            X_test, y_test, random_state=90, shuffle=True,
            stratify=y_test, test_size=0.3)

        lgb_train = lgb.Dataset(X_train, y_train)
        lgb_valid = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

        # NOTE(review): early_stopping_rounds as a train() kwarg was removed
        # in LightGBM 4.x; on newer versions use
        # callbacks=[lgb.early_stopping(10)] instead — confirm the installed
        # version.
        model = lgb.train(lgbm_params, lgb_train, valid_sets=lgb_valid,
                          num_boost_round=100000, early_stopping_rounds=10)

        # BUG FIX: score on the hold-out eval slice, not on X_test — X_test
        # contains the valid rows used for early stopping, which inflates
        # the reported metrics (and contradicts the stated intent above).
        predict = model.predict(X_eval, num_iteration=model.best_iteration)

        auc, precision, recall = get_evaluate(y_eval, predict)
        print('AUC:{}, precision:{}, recall:{}'.format(auc, precision, recall))
        auc_list.append(auc)
        precision_list.append(precision)
        recall_list.append(recall)

    # Mean of each metric over the k folds
    print('Kfold平均 AUC:{}, precision:{}, recall:{}'.format(
        np.mean(auc_list), np.mean(precision_list), np.mean(recall_list)))
scikit-learn インターフェースでのLightGBM
(未記載:後日追記予定)