
Source: https://xfliu1998.github.io/2022/03/17/6.2-GBDT-XGBoost-LightGBM/


GBDT / XGBoost / LightGBM

Created 2022-03-17 | Updated 2022-03-24 | Machine Learning

Series notes

If I have seen further, it is by standing on the shoulders of giants.

GBDT code example

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets, ensemble
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# load data
diabetes = datasets.load_diabetes()
X, y = diabetes.data, diabetes.target

# data preprocessing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=13
)
params = {
    "n_estimators": 500,       # number of boosting iterations
    "max_depth": 4,            # maximum depth of each tree
    "min_samples_split": 5,    # minimum samples required to split an internal node
    "learning_rate": 0.01,     # how much the contribution of each tree is shrunk
    "loss": "squared_error",   # loss function to optimize
}

# fit regression model
reg = ensemble.GradientBoostingRegressor(**params)
reg.fit(X_train, y_train)
mse = mean_squared_error(y_test, reg.predict(X_test))
print("The mean squared error (MSE) on test set: {:.4f}".format(mse)) # 3009.1324

# plot training deviance
test_score = np.zeros((params["n_estimators"],), dtype=np.float64)
for i, y_pred in enumerate(reg.staged_predict(X_test)):
    # reg.loss_ was removed in recent scikit-learn versions; with the
    # "squared_error" loss the deviance is just the mean squared error
    test_score[i] = mean_squared_error(y_test, y_pred)

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title("Deviance")
plt.plot(
    np.arange(params["n_estimators"]) + 1,
    reg.train_score_,
    "b-",
    label="Training Set Deviance",
)
plt.plot(
    np.arange(params["n_estimators"]) + 1, test_score, "r-", label="Test Set Deviance"
)
plt.legend(loc="upper right")
plt.xlabel("Boosting Iterations")
plt.ylabel("Deviance")
fig.tight_layout()
plt.show()

GBDT regression results (figure: training vs. test set deviance over boosting iterations)
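Note that the `permutation_importance` import in the listing above is never used. A minimal sketch of how it could be applied to the fitted model follows; the `n_repeats` and `random_state` values are illustrative choices, not from the original post:

# permutation importance of each feature, measured on the held-out test set
result = permutation_importance(
    reg, X_test, y_test, n_repeats=10, random_state=42
)
sorted_idx = result.importances_mean.argsort()[::-1]
for idx in sorted_idx:
    print(f"{diabetes.feature_names[idx]}: {result.importances_mean[idx]:.4f}")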

XGBoost code example

import xgboost as xgb
# read in data (the agaricus files ship with the XGBoost repo in LIBSVM text
# format; recent XGBoost versions require the format to be stated explicitly)
dtrain = xgb.DMatrix('demo/data/agaricus.txt.train?format=libsvm')
dtest = xgb.DMatrix('demo/data/agaricus.txt.test?format=libsvm')
# specify parameters via map
param = {'max_depth': 2, 'eta': 1, 'objective': 'binary:logistic'}
num_round = 2
bst = xgb.train(param, dtrain, num_round)
# make prediction
preds = bst.predict(dtest)
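A short continuation, not in the original snippet, showing one way to score these predictions. With `binary:logistic` the outputs are probabilities, so thresholding at 0.5 gives class labels:

import numpy as np

# compare thresholded predictions against the true labels stored in the DMatrix
labels = dtest.get_label()
pred_labels = (preds > 0.5).astype(int)
print('test error: %.4f' % np.mean(pred_labels != labels))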

LightGBM code example

import lightgbm as lgb
import numpy as np
import scipy
import h5py

# Data Interface: a Dataset can be built from a file or from in-memory arrays
train_data = lgb.Dataset('train.svm.bin')  # from a LightGBM binary file
data = np.random.rand(500, 10)  # 500 samples, each with 10 features
label = np.random.randint(2, size=500)  # binary target
train_data = lgb.Dataset(data, label=label)  # from numpy arrays (replaces the above)

'''
# Other ways to create a Dataset
csr = scipy.sparse.csr_matrix((dat, (row, col)))
train_data = lgb.Dataset(csr)

class HDFSequence(lgb.Sequence):
    def __init__(self, hdf_dataset, batch_size):
        self.data = hdf_dataset
        self.batch_size = batch_size

    def __getitem__(self, idx):
        return self.data[idx]

    def __len__(self):
        return len(self.data)

f = h5py.File('train.hdf5', 'r')
train_data = lgb.Dataset(HDFSequence(f['X'], 8192), label=f['Y'][:])

train_data = lgb.Dataset('train.svm.txt')
train_data.save_binary('train.bin')

validation_data = train_data.create_valid('validation.svm')
# validation_data = lgb.Dataset('validation.svm', reference=train_data)

train_data = lgb.Dataset(data, label=label, feature_name=['c1', 'c2', 'c3'], categorical_feature=['c3'])

w = np.random.rand(500, )
train_data = lgb.Dataset(data, label=label, weight=w)
'''

# Setting Parameters
param = {'num_leaves': 31, 'objective': 'binary'}
param['metric'] = 'auc'

# Training (the original snippet assumed validation_data was created by one of
# the commented-out lines above; a concrete validation set is built here so
# that the code runs as-is)
valid_x = np.random.rand(100, 10)
valid_y = np.random.randint(2, size=100)
validation_data = lgb.Dataset(valid_x, label=valid_y, reference=train_data)
num_round = 10
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data])
bst.save_model('model.txt')
json_model = bst.dump_model()
bst = lgb.Booster(model_file='model.txt') # init model

# CV
lgb.cv(param, train_data, num_round, nfold=5)

# Early Stopping
bst = lgb.train(param, train_data, num_round, valid_sets=[validation_data], callbacks=[lgb.early_stopping(stopping_rounds=5)])
bst.save_model('model.txt', num_iteration=bst.best_iteration)

# Prediction
# 7 samples, each with 10 features
data = np.random.rand(7, 10)
ypred = bst.predict(data)
# ypred = bst.predict(data, num_iteration=bst.best_iteration) # if early stopping
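As a final sketch not present in the original post, the trained booster can also report per-feature importances through the Booster API:

# gain-based feature importance from the trained booster
importance = bst.feature_importance(importance_type='gain')
for name, imp in zip(bst.feature_name(), importance):
    print('%s: %.2f' % (name, imp))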
