В примере разобрана кластеризация документов по темам, где документ - мешок слов.1) Подключаем нужные библиотеки, импортируем данные. from __future__ import print_function from sklearn.datasets import fetch_20newsgroups from sklearn.decomposition import TruncatedSVD from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.feature_extraction.text import HashingVectorizer from sklearn.feature_extraction.text import TfidfTransformer from sklearn.pipeline import make_pipeline from sklearn.preprocessing import Normalizer from sklearn import metrics from sklearn.cluster import KMeans, MiniBatchKMeans import logging from optparse import OptionParser import sys from time import time import numpy as np logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)s %(message)s') Аргументы командной строки. op = OptionParser() op.add_option("--lsa", dest="n_components", type="int", help="Preprocess documents with latent semantic analysis.") op.add_option("--no-minibatch", action="store_false", dest="minibatch", default=True, help="Use ordinary k-means algorithm (in batch mode).") op.add_option("--no-idf", action="store_false", dest="use_idf", default=True, help="Disable Inverse Document Frequency feature weighting.") op.add_option("--use-hashing", action="store_true", default=False, help="Use a hashing feature vectorizer") op.add_option("--n-features", type=int, default=10000, help="Maximum number of features (dimensions)" " to extract from text.") op.add_option("--verbose", action="store_true", dest="verbose", default=False, help="Print progress reports inside k-means algorithm.") print(__doc__) op.print_help() Загрузим некоторые категории из обучающей выборки categories = [ 'alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space', ] print("Loading 20 newsgroups dataset for categories:") print(categories) dataset = fetch_20newsgroups(subset='all', categories=categories,shuffle=True, random_state=42) print("%d documents" % len(dataset.data)) print("%d categories" % len(dataset.target_names)) print() labels = dataset.target true_k = np.unique(labels).shape[0] print("Extracting features from the training dataset using a sparse vectorizer") if opts.use_hashing: if opts.use_idf: hasher = HashingVectorizer(n_features=opts.n_features, stop_words='english', non_negative=True, norm=None, binary=False) vectorizer = make_pipeline(hasher, TfidfTransformer()) else: vectorizer = HashingVectorizer(n_features=opts.n_features, stop_words='english', non_negative=False, norm='l2', binary=False) else: vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features, min_df=2, stop_words='english', use_idf=opts.use_idf) X = vectorizer.fit_transform(dataset.data) print("done in %fs" % (time() - t0)) print("n_samples: %d, n_features: %d" % X.shape) print() if opts.n_components: print("Performing dimensionality reduction using LSA") t0 = time() svd = TruncatedSVD(opts.n_components) lsa = make_pipeline(svd, Normalizer(copy=False)) X = lsa.fit_transform(X) print("done in %fs" % (time() - t0)) explained_variance = svd.explained_variance_ratio_.sum() print("Explained variance of the SVD step: {}%".format( int(explained_variance * 100))) print() ############################################################################### # Кластеризация if opts.minibatch: km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=opts.verbose) else: km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=opts.verbose) print("Clustering sparse data with %s" % km) km.fit(X) print() print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_)) print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_)) print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_)) print("Adjusted Rand-Index: %.3f" % metrics.adjusted_rand_score(labels, km.labels_)) print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels, sample_size=1000)) print() if not (opts.n_components or opts.use_hashing): print("Top terms per cluster:") order_centroids = km.cluster_centers_.argsort()[:, ::-1] terms = vectorizer.get_feature_names() for i in range(true_k): print("Cluster %d:" % i, end='') for ind in order_centroids[i, :10]: print(' %s' % terms[ind], end='') print() Extracting features from the training dataset using a sparse vectorizer n_samples: 3387, n_features: 10000 Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++', init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4, n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0, verbose=False) Homogeneity: 0.623 Completeness: 0.707 V-measure: 0.662 Adjusted Rand-Index: 0.661 Silhouette Coefficient: 0.007 Top terms per cluster: Cluster 0: graphics image university thanks com file 3d files posting ac Cluster 1: god com people don keith say article think sgi jesus Cluster 2: space nasa henry access digex toronto gov alaska pat com Cluster 3: sandvik kent apple newton com brian alink ksand jesus cookamunga