from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
print(vectorizer)

# Bag-of-words on a tiny two-sentence example
content = ["How to format my hard disk", "Hard disk format problems "]
X = vectorizer.fit_transform(content)
vectorizer.get_feature_names()
print(X.toarray())
print(X.toarray().transpose())
print(X)

import os
import sys
import scipy as sp

sys.path.append('/Users/masai/Documents/BuildingMachineLearningSystemsWithPython/ch03')
from utils import DATA_DIR

TOY_DIR = os.path.join(DATA_DIR, "toy")
posts = [open(os.path.join(TOY_DIR, f)).read() for f in os.listdir(TOY_DIR)]
print(posts)

X_train = vectorizer.fit_transform(posts)
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))
print(vectorizer.get_feature_names())

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)
print(new_post_vec.toarray())

import numpy as np
import scipy as sp


def dist_raw(v1, v2):
    # Euclidean distance between the raw count vectors
    delta = v1 - v2
    return sp.linalg.norm(delta.toarray())


import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_raw(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

print(X_train.shape)
print(X_train.toarray())
print(X_train)
print(X_train.getrow(0))
print(X_train.getrow(1))
print(X_train.getrow(1).toarray())


def dist_norm(v1, v2):
    # Euclidean distance between the length-normalized vectors
    v1_normalized = v1 / sp.linalg.norm(v1.toarray())
    v2_normalized = v2 / sp.linalg.norm(v2.toarray())
    delta = v1_normalized - v2_normalized
    return sp.linalg.norm(delta.toarray())


import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

# Same search, this time with English stop words removed
vectorizer = CountVectorizer(min_df=1, stop_words='english')
print(vectorizer.get_stop_words())
X_train = vectorizer.fit_transform(posts)
print(X_train.shape)

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

# Add stemming by wrapping the analyzer with NLTK's Snowball stemmer
import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')


class StemmedCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedCountVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')
X_train = vectorizer.fit_transform(posts)

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))
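# The nearest-post search above is repeated verbatim for every vectorizer variant.
# A small helper can factor it out; a minimal sketch (the name find_best_post is
# hypothetical, not from the book), assuming X_train rows align with the posts list:
def find_best_post(X_train, posts, new_post, new_post_vec, dist=dist_norm):
    best_dist, best_i = float('inf'), None
    for i in range(X_train.shape[0]):
        post = posts[i]
        if post == new_post:
            continue  # skip an exact duplicate of the query
        d = dist(X_train.getrow(i), new_post_vec)
        print("=== Post %i with dist=%.2f: %s" % (i, d, post))
        if d < best_dist:
            best_dist, best_i = d, i
    return best_i, best_dist

# Usage: best_i, best_dist = find_best_post(X_train, posts, new_post, new_post_vec)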
# A hand-rolled TF-IDF to illustrate the weighting before switching to TfidfVectorizer
import scipy as sp
import math


def tfidf(term, doc, docset):
    # term frequency: relative count of the term in this document
    tf = float(doc.count(term)) / sum(doc.count(w) for w in set(doc))
    # inverse document frequency: penalize terms that occur in many documents
    idf = math.log(float(len(docset)) / len([doc for doc in docset if term in doc]))
    return tf * idf


doc_a, doc_abb, doc_abc = ["a"], ["a", "b", "b"], ["a", "b", "c"]
D = [doc_a, doc_abb, doc_abc]
print(tfidf("a", doc_a, D))
print(tfidf("b", doc_abb, D))
print(tfidf("c", doc_a, D))
print(tfidf("c", doc_abc, D))
print(tfidf("b", doc_abb, D))
print(tfidf("b", doc_abc, D))

from sklearn.feature_extraction.text import TfidfVectorizer


class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


vectorizer = StemmedTfidfVectorizer(min_df=1, stop_words='english')
X_train = vectorizer.fit_transform(posts)
X_train.shape
print(X_train)

new_post = "imaging databases"
new_post_vec = vectorizer.transform([new_post])
print(new_post_vec)

import sys
best_doc = None
best_dist = sys.maxsize
best_i = None
for i in range(0, num_samples):
    post = posts[i]
    if post == new_post:
        continue
    post_vec = X_train.getrow(i)
    d = dist_norm(post_vec, new_post_vec)
    print("=== Post %i with dist=%.2f: %s" % (i, d, post))
    if d < best_dist:
        best_dist = d
        best_i = i
print("Best post is %i with dist=%.2f" % (best_i, best_dist))

# K-means on a 2D toy data set
import os
import scipy as sp
from scipy.stats import norm
from matplotlib import pylab
from sklearn.cluster import KMeans

seed = 2
sp.random.seed(seed)  # to reproduce the data later on

num_clusters = 3


def plot_clustering(x, y, title, mx=None, ymax=None, xmin=None, km=None):
    pylab.figure(num=None, figsize=(8, 6))
    if km:
        pylab.scatter(x, y, s=50, c=km.predict(list(zip(x, y))))
    else:
        pylab.scatter(x, y, s=50)
    pylab.title(title)
    pylab.xlabel("Occurrence word 1")
    pylab.ylabel("Occurrence word 2")
    pylab.autoscale(tight=True)
    pylab.ylim(ymin=0, ymax=1)
    pylab.xlim(xmin=0, xmax=1)
    pylab.grid(True, linestyle='-', color='0.75')
    return pylab


# Three Gaussian blobs of 20 points each, standing in for word-occurrence vectors
xw1 = norm(loc=0.3, scale=.15).rvs(20)
yw1 = norm(loc=0.3, scale=.15).rvs(20)
xw2 = norm(loc=0.7, scale=.15).rvs(20)
yw2 = norm(loc=0.7, scale=.15).rvs(20)
xw3 = norm(loc=0.2, scale=.15).rvs(20)
yw3 = norm(loc=0.8, scale=.15).rvs(20)
print(xw1)
print(yw1)

x = sp.append(sp.append(xw1, xw2), xw3)
y = sp.append(sp.append(yw1, yw2), yw3)
print(x)
print(y)

import matplotlib.pyplot as plt
%pylab inline

i = 1
plot_clustering(x, y, "Vectors")
i += 1

# 1 iteration ####################

mx, my = sp.meshgrid(sp.arange(0, 1, 0.001), sp.arange(0, 1, 0.001))

km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
            n_init=1, max_iter=1, random_state=seed)
km.fit(sp.array(list(zip(x, y))))
plot_clustering(x, y, "Clustering iteration 1", km=km)

c1a, c1b, c1c = km.cluster_centers_
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
              marker='x', linewidth=2, s=100, color='black')
i += 1

# 2 iterations ####################

km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
            n_init=1, max_iter=2, random_state=seed)
km.fit(sp.array(list(zip(x, y))))
Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)

plot_clustering(x, y, "Clustering iteration 2", km=km)
c2a, c2b, c2c = km.cluster_centers_
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
              marker='x', linewidth=2, s=100, color='black')
# Arrows show how each cluster center moved from iteration 1 to iteration 2
pylab.gca().add_patch(
    pylab.Arrow(c1a[0], c1a[1], c2a[0] - c1a[0], c2a[1] - c1a[1], width=0.1))
pylab.gca().add_patch(
    pylab.Arrow(c1b[0], c1b[1], c2b[0] - c1b[0], c2b[1] - c1b[1], width=0.1))
pylab.gca().add_patch(
    pylab.Arrow(c1c[0], c1c[1], c2c[0] - c1c[0], c2c[1] - c1c[1], width=0.1))
i += 1
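# Z above holds the predicted cluster for every point of the meshgrid but is never
# drawn. A minimal sketch (not part of the original listing) of overlaying the
# cluster regions on the scatter plot, assuming the same mx, my grid:
plot_clustering(x, y, "Clustering iteration 2 with cluster regions", km=km)
pylab.contourf(mx, my, Z, alpha=0.2)
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
              marker='x', linewidth=2, s=100, color='black')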
# 3rd stage: run to convergence (max_iter=10) ####################

km = KMeans(init='random', n_clusters=num_clusters, verbose=1,
            n_init=1, max_iter=10, random_state=seed)
km.fit(sp.array(list(zip(x, y))))
Z = km.predict(sp.c_[mx.ravel(), my.ravel()]).reshape(mx.shape)

plot_clustering(x, y, "Clustering iteration 10", km=km)
pylab.scatter(km.cluster_centers_[:, 0], km.cluster_centers_[:, 1],
              marker='x', linewidth=2, s=100, color='black')

# Cluster the 20 Newsgroups posts and use the clusters to answer a query
import sklearn.datasets
import scipy as sp

new_post = """Disk drive problems. Hi, I have a problem with my hard disk.
After 1 year it is working only sporadically now.
I tried to format it, but now it doesn't boot any more.
Any ideas? Thanks.
"""

all_data = sklearn.datasets.fetch_20newsgroups(subset="all")
print("Number of total posts: %i" % len(all_data.filenames))

groups = ['comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware',
          'comp.sys.mac.hardware', 'comp.windows.x', 'sci.space']
train_data = sklearn.datasets.fetch_20newsgroups(subset="train", categories=groups)
print("Number of training posts in tech groups:", len(train_data.filenames))

labels = train_data.target
num_clusters = 50

import nltk.stem

english_stemmer = nltk.stem.SnowballStemmer('english')

from sklearn.feature_extraction.text import TfidfVectorizer


class StemmedTfidfVectorizer(TfidfVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmedTfidfVectorizer, self).build_analyzer()
        return lambda doc: (english_stemmer.stem(w) for w in analyzer(doc))


vectorizer = StemmedTfidfVectorizer(min_df=10, max_df=0.5,
                                    stop_words='english', decode_error='ignore')
vectorized = vectorizer.fit_transform(train_data.data)
num_samples, num_features = vectorized.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

from sklearn.cluster import KMeans

km = KMeans(n_clusters=num_clusters, n_init=1, verbose=1, random_state=3)
clustered = km.fit(vectorized)
print("km.labels=%s" % km.labels_)
print("km.labels_.shape=%s" % km.labels_.shape)

from sklearn import metrics

# Homogeneity: each cluster in the predicted clustering contains only members of a
# single class from the ground-truth labeling.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
# Completeness: all members of a given true class end up in the same predicted
# cluster (how well elements that belong together are gathered into one cluster).
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
# Homogeneity and completeness together show how reliably elements that should share
# a cluster actually do; the V-measure is their harmonic mean.
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
# Adjusted Rand Index: similarity between the two partitions, adjusted so that an
# uncorrelated (random) labeling scores around 0.
print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels, km.labels_))
# Adjusted Mutual Information: how much knowing the cluster assignment tells us about
# the true category (and vice versa), adjusted for chance.
print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels, km.labels_))
# Silhouette Coefficient: balances the cohesion of the elements inside a cluster
# against their separation from the other clusters; needs no ground truth.
print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(vectorized, labels, sample_size=1000))

new_post

# Vectorize the query post
new_post_vec = vectorizer.transform([new_post])
# Predict which cluster the query falls into
new_post_label = km.predict(new_post_vec)[0]
# Indices of all training posts assigned to the same cluster
similar_indices = (km.labels_ == new_post_label).nonzero()[0]
print(similar_indices)

# nonzero() returns the indices of the True entries
np.array([True, False, True, False, False]).nonzero()
np.array([[True, False, True], [True, True, False]]).nonzero()
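# To build intuition for the evaluation metrics above, a small sketch on toy labels
# (hypothetical example, not from the book): true classes [0, 0, 1, 1] versus a
# clustering that keeps every cluster pure but splits the second class in two.
toy_true = [0, 0, 1, 1]
toy_pred = [0, 0, 1, 2]
print("toy homogeneity:  %0.3f" % metrics.homogeneity_score(toy_true, toy_pred))   # every cluster is pure
print("toy completeness: %0.3f" % metrics.completeness_score(toy_true, toy_pred))  # class 1 is split
print("toy V-measure:    %0.3f" % metrics.v_measure_score(toy_true, toy_pred))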
sp.linalg.norm((new_post_vec - vectorized[0]).toarray())

# Rank the posts of the query's cluster by their distance to the query
similar = []
similar_index = []
for i in similar_indices:
    dist = sp.linalg.norm((new_post_vec - vectorized[i]).toarray())
    similar.append((dist, train_data.data[i]))
    similar_index.append((dist, i))
similar_index
len(similar)

similar = sorted(similar)
print("Count similar: %i" % len(similar))

# Most similar post
show_at_1 = similar[0]
# Somewhere in between
show_at_2 = similar[int(len(similar) / 10)]
# Least similar of the ones shown
show_at_3 = similar[int(len(similar) / 2)]

print("== #1 most similar ==")
print(show_at_1)
print("== #2 ==")
print(show_at_2)
print("== #3 most different ==")
print(show_at_3)

# Inspect noisy comp.graphics posts to see what the vectorizer actually keeps
post_group = zip(train_data.data, train_data.target)
all_posts = [(len(post[0]), post[0], train_data.target_names[post[1]])
             for post in post_group]
graphics = sorted([post for post in all_posts if post[2] == 'comp.graphics'])
print(graphics[5:7])

noise_post = graphics[5][1]
print(noise_post)
analyzer = vectorizer.build_analyzer()
print(list(analyzer(noise_post)))

noise_post_2 = graphics[6][1]
print(noise_post_2)
analyzer = vectorizer.build_analyzer()
print(list(analyzer(noise_post_2)))

# Which analyzed tokens survived min_df/max_df filtering and made it into the vocabulary?
useful = set(analyzer(noise_post)).intersection(vectorizer.get_feature_names())
print(sorted(useful))
useful_2 = set(analyzer(noise_post_2)).intersection(vectorizer.get_feature_names())
print(sorted(useful_2))

for term in sorted(useful):
    print('IDF(%s)=%.2f' % (term, vectorizer._tfidf.idf_[vectorizer.vocabulary_[term]]))
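# A minimal sketch (not from the book) tying the steps above into one reusable
# retrieval function: vectorize a query, predict its cluster, and return that
# cluster's posts ranked by distance to the query.
def most_similar_posts(query, vectorizer, km, vectorized, data, top_n=3):
    query_vec = vectorizer.transform([query])
    cluster = km.predict(query_vec)[0]
    indices = (km.labels_ == cluster).nonzero()[0]
    ranked = sorted(
        (sp.linalg.norm((query_vec - vectorized[i]).toarray()), i) for i in indices)
    return [(dist, data[i]) for dist, i in ranked[:top_n]]

# Usage:
# for dist, text in most_similar_posts(new_post, vectorizer, km, vectorized, train_data.data):
#     print("dist=%.2f" % dist)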