连续折半迭代

本示例说明了连续折半搜索（HalvingGridSearchCV 和 HalvingRandomSearchCV）如何从多个候选者中迭代选择最佳参数组合。

# Authors: The scikit-learn developers
# SPDX-License-Identifier: BSD-3-Clause

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import randint

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv  # noqa: F401
from sklearn.model_selection import HalvingRandomSearchCV

我们首先定义参数空间并训练一个 HalvingRandomSearchCV 实例。

# A single seeded RandomState instance is handed to the data generator,
# the forest and the search below.
rng = np.random.RandomState(0)

# Synthetic classification data: 400 samples with 12 features each.
X, y = datasets.make_classification(
    n_samples=400,
    n_features=12,
    random_state=rng,
)

# Parameter space explored by the randomized halving search: plain lists
# enumerate discrete choices, ``randint`` objects are scipy distributions.
param_dist = dict(
    max_depth=[3, None],
    max_features=randint(1, 6),
    min_samples_split=randint(2, 11),
    bootstrap=[True, False],
    criterion=["gini", "entropy"],
)

# Base estimator whose hyper-parameters are being tuned.
clf = RandomForestClassifier(n_estimators=20, random_state=rng)

# factor=2 is the halving rate passed to the search.
rsh = HalvingRandomSearchCV(
    estimator=clf,
    param_distributions=param_dist,
    factor=2,
    random_state=rng,
)
# Kept as a bare trailing expression so notebook-style runners can display
# the fitted estimator.
rsh.fit(X, y)

HalvingRandomSearchCV(estimator=RandomForestClassifier(n_estimators=20,
                                                       random_state=RandomState(MT19937) at 0x7FAD12B5D040),
                      factor=2,
                      param_distributions={'bootstrap': [True, False],
                                           'criterion': ['gini', 'entropy'],
                                           'max_depth': [3, None],
                                           'max_features': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7fad2247a8f0>,
                                           'min_samples_split': <scipy.stats._distn_infrastructure.rv_discrete_frozen object at 0x7fad294c2a40>},
                      random_state=RandomState(MT19937) at 0x7FAD12B5D040)

在 Jupyter 环境中，请重新运行此单元格以显示 HTML 表示或信任此 notebook。
在 GitHub 上，HTML 表示无法渲染，请尝试使用 nbviewer.org 加载此页面。

HalvingRandomSearchCV

HalvingRandomSearchCV（文档）— 已拟合

参数

	estimator	RandomForestC...x7FAD12B5D040)
	param_distributions	{'bootstrap': [True, False], 'criterion': ['gini', 'entropy'], 'max_depth': [3, None], 'max_features': <scipy.stats....x7fad2247a8f0>, ...}
	n_candidates	'exhaust'
	factor	2
	resource	'n_samples'
	max_resources	'auto'
	min_resources	'smallest'
	aggressive_elimination	False
	cv	5
	scoring	None
	refit	True
	error_score	nan
	return_train_score	True
	random_state	RandomState(M...0x7FAD12B5D040
	n_jobs	None
	verbose	0

best_estimator_: RandomForestClassifier

RandomForestClassifier(bootstrap=False, criterion='entropy', max_features=4,
                       min_samples_split=5, n_estimators=20,
                       random_state=RandomState(MT19937) at 0x7FAD12B5F940)

RandomForestClassifier

RandomForestClassifier（文档）

参数

	n_estimators	20
	criterion	'entropy'
	max_depth	None
	min_samples_split	5
	min_samples_leaf	1
	min_weight_fraction_leaf	0.0
	max_features	4
	max_leaf_nodes	None
	min_impurity_decrease	0.0
	bootstrap	False
	oob_score	False
	n_jobs	None
	random_state	RandomState(M...0x7FAD12B5F940
	verbose	0
	warm_start	False
	class_weight	None
	ccp_alpha	0.0
	max_samples	None
	monotonic_cst	None

我们现在可以使用搜索估计器的 cv_results_ 属性来检查和绘制搜索的演变过程。

# Load the search's recorded results into a DataFrame.
results = pd.DataFrame(rsh.cv_results_)

# The string form of each parameter setting serves as the column label
# when pivoting below.
results["params_str"] = results["params"].apply(str)

# Keep a single row per (parameter setting, iteration) pair so the pivot
# has unique index/column combinations.
results = results.drop_duplicates(subset=("params_str", "iter"))

# Rows: iteration number; columns: one per candidate; values: mean test score.
mean_scores = results.pivot(
    index="iter",
    columns="params_str",
    values="mean_test_score",
)
ax = mean_scores.plot(legend=False, alpha=0.6)

# One tick per iteration, annotated with the resources and the number of
# candidates used at that iteration.
iterations = range(rsh.n_iterations_)
labels = [
    "\n".join(
        (
            f"iter={it}",
            f"n_samples={rsh.n_resources_[it]}",
            f"n_candidates={rsh.n_candidates_[it]}",
        )
    )
    for it in iterations
]

ax.set_title("Scores of candidates over iterations")
ax.set_xlabel("iterations", fontsize=15)
ax.set_ylabel("mean test score", fontsize=15)
ax.set_xticks(iterations)
ax.set_xticklabels(labels, rotation=45, multialignment="left")
plt.tight_layout()
plt.show()

每次迭代的候选数量和资源量

在第一次迭代中，使用少量资源。这里的资源是用于训练估计器的样本数量。所有候选者都会被评估。

在第二次迭代中，只评估得分最高的那一半候选者。分配的资源数量翻倍：候选者在两倍数量的样本上进行评估。

这个过程重复进行，直到最后一次迭代，只剩下 2 个候选者。最佳候选者是在最后一次迭代中得分最高的候选者。

脚本总运行时间：0 分钟 4.778 秒

连续折半迭代#

每次迭代的候选数量和资源量#

本页