Признаки извлекаются двумя способами: 1)TfidfVectorizer; 2)HashingVectorizer. Используются два алгоритма: k-means и MiniBatchKMeans.
from __future__ import print_function
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np
logging.basicConfig(level=logging.INFO,format='%(asctime)s %(levelname)s %(message)s')
op = OptionParser()
op.add_option("--lsa",
dest="n_components", type="int",
help="Preprocess documents with latent semantic analysis.")
op.add_option("--no-minibatch",
action="store_false", dest="minibatch", default=True,
help="Use ordinary k-means algorithm (in batch mode).")
op.add_option("--no-idf",
action="store_false", dest="use_idf", default=True,
help="Disable Inverse Document Frequency feature weighting.")
op.add_option("--use-hashing",
action="store_true", default=False,
help="Use a hashing feature vectorizer")
op.add_option("--n-features", type=int, default=10000,
help="Maximum number of features (dimensions)"
" to extract from text.")
op.add_option("--verbose",
action="store_true", dest="verbose", default=False,
help="Print progress reports inside k-means algorithm.")
print(__doc__)
op.print_help()
Automatically created module for IPython interactive environment Usage: -c [options] Options: -h, --help show this help message and exit --lsa=N_COMPONENTS Preprocess documents with latent semantic analysis. --no-minibatch Use ordinary k-means algorithm (in batch mode). --no-idf Disable Inverse Document Frequency feature weighting. --use-hashing Use a hashing feature vectorizer --n-features=N_FEATURES Maximum number of features (dimensions) to extract from text. --verbose Print progress reports inside k-means algorithm.
categories = [
'alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space',
]
print("Loading 20 newsgroups dataset for categories:")
print(categories)
Loading 20 newsgroups dataset for categories: ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
dataset = fetch_20newsgroups(subset='all', categories=categories,shuffle=True, random_state=42)
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()
3387 documents 4 categories
labels = dataset.target
true_k = np.unique(labels).shape[0]
print("Extracting features from the training dataset using a sparse vectorizer")
if opts.use_hashing:
if opts.use_idf:
hasher = HashingVectorizer(n_features=opts.n_features,
stop_words='english', non_negative=True,
norm=None, binary=False)
vectorizer = make_pipeline(hasher, TfidfTransformer())
else:
vectorizer = HashingVectorizer(n_features=opts.n_features,
stop_words='english',
non_negative=False, norm='l2',
binary=False)
else:
vectorizer = TfidfVectorizer(max_df=0.5, max_features=opts.n_features,
min_df=2, stop_words='english',
use_idf=opts.use_idf)
X = vectorizer.fit_transform(dataset.data)
print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)
print()
if opts.n_components:
print("Performing dimensionality reduction using LSA")
t0 = time()
svd = TruncatedSVD(opts.n_components)
lsa = make_pipeline(svd, Normalizer(copy=False))
X = lsa.fit_transform(X)
print("done in %fs" % (time() - t0))
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(
int(explained_variance * 100)))
print()
###############################################################################
# Кластеризация
if opts.minibatch:
km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
verbose=opts.verbose)
print("Clustering sparse data with %s" % km)
km.fit(X)
print()
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
% metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
% metrics.silhouette_score(X, labels, sample_size=1000))
print()
if not (opts.n_components or opts.use_hashing):
print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(true_k):
print("Cluster %d:" % i, end='')
for ind in order_centroids[i, :10]:
print(' %s' % terms[ind], end='')
print()
Extracting features from the training dataset using a sparse vectorizer
n_samples: 3387, n_features: 10000
Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,
n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
verbose=False)
Homogeneity: 0.623
Completeness: 0.707
V-measure: 0.662
Adjusted Rand-Index: 0.661
Silhouette Coefficient: 0.007
Top terms per cluster:
Cluster 0: graphics image university thanks com file 3d files posting ac
Cluster 1: god com people don keith say article think sgi jesus
Cluster 2: space nasa henry access digex toronto gov alaska pat com
Cluster 3: sandvik kent apple newton com brian alink ksand jesus cookamunga