TSNE 中的近似最近邻#
此示例演示如何在管道中链接 KNeighborsTransformer 和 TSNE。它还展示了如何包装 nmslib
和 pynndescent
包以替换 KNeighborsTransformer 并执行近似最近邻。这些包可以使用 pip install nmslib pynndescent
安装。
注意:在 KNeighborsTransformer 中,我们使用包括每个训练点作为其自身邻居的定义,该定义包含在 n_neighbors
的计数中,并且出于兼容性原因,当 mode == 'distance'
时,会计算一个额外的邻居。请注意,我们在建议的 nmslib
包装器中执行相同的操作。
# Author: Tom Dupre la Tour
# License: BSD 3 clause
首先,我们尝试导入包,并在它们缺失的情况下向用户发出警告。
我们定义了一个包装类,用于将 scikit-learn API 实现到 nmslib
,以及一个加载函数。
import joblib
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import fetch_openml
from sklearn.utils import shuffle
class NMSlibTransformer(TransformerMixin, BaseEstimator):
"""Wrapper for using nmslib as sklearn's KNeighborsTransformer"""
def __init__(self, n_neighbors=5, metric="euclidean", method="sw-graph", n_jobs=-1):
self.n_neighbors = n_neighbors
self.method = method
self.metric = metric
self.n_jobs = n_jobs
def fit(self, X):
self.n_samples_fit_ = X.shape[0]
# see more metric in the manual
# https://github.com/nmslib/nmslib/tree/master/manual
space = {
"euclidean": "l2",
"cosine": "cosinesimil",
"l1": "l1",
"l2": "l2",
}[self.metric]
self.nmslib_ = nmslib.init(method=self.method, space=space)
self.nmslib_.addDataPointBatch(X.copy())
self.nmslib_.createIndex()
return self
def transform(self, X):
n_samples_transform = X.shape[0]
# For compatibility reasons, as each sample is considered as its own
# neighbor, one extra neighbor will be computed.
n_neighbors = self.n_neighbors + 1
if self.n_jobs < 0:
# Same handling as done in joblib for negative values of n_jobs:
# in particular, `n_jobs == -1` means "as many threads as CPUs".
num_threads = joblib.cpu_count() + self.n_jobs + 1
else:
num_threads = self.n_jobs
results = self.nmslib_.knnQueryBatch(
X.copy(), k=n_neighbors, num_threads=num_threads
)
indices, distances = zip(*results)
indices, distances = np.vstack(indices), np.vstack(distances)
indptr = np.arange(0, n_samples_transform * n_neighbors + 1, n_neighbors)
kneighbors_graph = csr_matrix(
(distances.ravel(), indices.ravel(), indptr),
shape=(n_samples_transform, self.n_samples_fit_),
)
return kneighbors_graph
def load_mnist(n_samples):
"""Load MNIST, shuffle the data, and return only n_samples."""
mnist = fetch_openml("mnist_784", as_frame=False)
X, y = shuffle(mnist.data, mnist.target, random_state=2)
return X[:n_samples] / 255, y[:n_samples]
我们对不同的精确/近似最近邻转换器进行基准测试。
import time
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsTransformer
from sklearn.pipeline import make_pipeline
datasets = [
("MNIST_10000", load_mnist(n_samples=10_000)),
("MNIST_20000", load_mnist(n_samples=20_000)),
]
n_iter = 500
perplexity = 30
metric = "euclidean"
# TSNE requires a certain number of neighbors which depends on the
# perplexity parameter.
# Add one since we include each sample as its own neighbor.
n_neighbors = int(3.0 * perplexity + 1) + 1
tsne_params = dict(
init="random", # pca not supported for sparse matrices
perplexity=perplexity,
method="barnes_hut",
random_state=42,
n_iter=n_iter,
learning_rate="auto",
)
transformers = [
(
"KNeighborsTransformer",
KNeighborsTransformer(n_neighbors=n_neighbors, mode="distance", metric=metric),
),
(
"NMSlibTransformer",
NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
),
(
"PyNNDescentTransformer",
PyNNDescentTransformer(
n_neighbors=n_neighbors, metric=metric, parallel_batch_queries=True
),
),
]
for dataset_name, (X, y) in datasets:
msg = f"Benchmarking on {dataset_name}:"
print(f"\n{msg}\n" + str("-" * len(msg)))
for transformer_name, transformer in transformers:
longest = np.max([len(name) for name, model in transformers])
start = time.time()
transformer.fit(X)
fit_duration = time.time() - start
print(f"{transformer_name:<{longest}} {fit_duration:.3f} sec (fit)")
start = time.time()
Xt = transformer.transform(X)
transform_duration = time.time() - start
print(f"{transformer_name:<{longest}} {transform_duration:.3f} sec (transform)")
if transformer_name == "PyNNDescentTransformer":
start = time.time()
Xt = transformer.transform(X)
transform_duration = time.time() - start
print(
f"{transformer_name:<{longest}} {transform_duration:.3f} sec"
" (transform)"
)
示例输出
Benchmarking on MNIST_10000:
----------------------------
KNeighborsTransformer 0.007 sec (fit)
KNeighborsTransformer 1.139 sec (transform)
NMSlibTransformer 0.208 sec (fit)
NMSlibTransformer 0.315 sec (transform)
PyNNDescentTransformer 4.823 sec (fit)
PyNNDescentTransformer 4.884 sec (transform)
PyNNDescentTransformer 0.744 sec (transform)
Benchmarking on MNIST_20000:
----------------------------
KNeighborsTransformer 0.011 sec (fit)
KNeighborsTransformer 5.769 sec (transform)
NMSlibTransformer 0.733 sec (fit)
NMSlibTransformer 1.077 sec (transform)
PyNNDescentTransformer 14.448 sec (fit)
PyNNDescentTransformer 7.103 sec (transform)
PyNNDescentTransformer 1.759 sec (transform)
请注意,PyNNDescentTransformer
在第一次 fit
和第一次 transform
期间需要更多时间,这是由于 numba 即时编译器的开销。但在第一次调用之后,编译后的 Python 代码将被 numba 保存在缓存中,后续调用不会受到此初始开销的影响。KNeighborsTransformer
和 NMSlibTransformer
在这里只运行一次,因为它们会显示更稳定的 fit
和 transform
时间(它们没有 PyNNDescentTransformer 的冷启动问题)。
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
transformers = [
("TSNE with internal NearestNeighbors", TSNE(metric=metric, **tsne_params)),
(
"TSNE with KNeighborsTransformer",
make_pipeline(
KNeighborsTransformer(
n_neighbors=n_neighbors, mode="distance", metric=metric
),
TSNE(metric="precomputed", **tsne_params),
),
),
(
"TSNE with NMSlibTransformer",
make_pipeline(
NMSlibTransformer(n_neighbors=n_neighbors, metric=metric),
TSNE(metric="precomputed", **tsne_params),
),
),
]
# init the plot
nrows = len(datasets)
ncols = np.sum([1 for name, model in transformers if "TSNE" in name])
fig, axes = plt.subplots(
nrows=nrows, ncols=ncols, squeeze=False, figsize=(5 * ncols, 4 * nrows)
)
axes = axes.ravel()
i_ax = 0
for dataset_name, (X, y) in datasets:
msg = f"Benchmarking on {dataset_name}:"
print(f"\n{msg}\n" + str("-" * len(msg)))
for transformer_name, transformer in transformers:
longest = np.max([len(name) for name, model in transformers])
start = time.time()
Xt = transformer.fit_transform(X)
transform_duration = time.time() - start
print(
f"{transformer_name:<{longest}} {transform_duration:.3f} sec"
" (fit_transform)"
)
# plot TSNE embedding which should be very similar across methods
axes[i_ax].set_title(transformer_name + "\non " + dataset_name)
axes[i_ax].scatter(
Xt[:, 0],
Xt[:, 1],
c=y.astype(np.int32),
alpha=0.2,
cmap=plt.cm.viridis,
)
axes[i_ax].xaxis.set_major_formatter(NullFormatter())
axes[i_ax].yaxis.set_major_formatter(NullFormatter())
axes[i_ax].axis("tight")
i_ax += 1
fig.tight_layout()
plt.show()
示例输出
Benchmarking on MNIST_10000:
----------------------------
TSNE with internal NearestNeighbors 24.828 sec (fit_transform)
TSNE with KNeighborsTransformer 20.111 sec (fit_transform)
TSNE with NMSlibTransformer 21.757 sec (fit_transform)
Benchmarking on MNIST_20000:
----------------------------
TSNE with internal NearestNeighbors 51.955 sec (fit_transform)
TSNE with KNeighborsTransformer 50.994 sec (fit_transform)
TSNE with NMSlibTransformer 43.536 sec (fit_transform)
我们可以观察到,默认的 TSNE
估计器及其内部 NearestNeighbors
实现,在性能方面与使用 TSNE
和 KNeighborsTransformer
的管道大致相同。这是预期的,因为这两个管道都在内部依赖于相同的 NearestNeighbors
实现,该实现执行精确的邻居搜索。近似的 NMSlibTransformer
已经比最小的数据集上的精确搜索略快,但预计这种速度差异在样本数量更大的数据集上会变得更加显著。
但是请注意,并非所有近似搜索方法都能保证提高默认精确搜索方法的速度:事实上,自 scikit-learn 1.1 以来,精确搜索实现已经显著改进。此外,蛮力精确搜索方法不需要在 fit
阶段构建索引。因此,为了在 TSNE
管道中获得整体性能提升,近似搜索在 transform
阶段的收益需要大于在 fit
阶段构建近似搜索索引所花费的额外时间。
最后,TSNE 算法本身也是计算密集型的,与最近邻搜索无关。因此,将最近邻搜索步骤的速度提高 5 倍,不会导致整个管道的速度提高 5 倍。
相关示例