from os import path
import os
import re
rootdir = 'SogouC.reduced/Reduced'
dirs = os.listdir(rootdir)
dirs = [path.join(rootdir,f) for f in dirs if f.startswith('C')]
dirs
['SogouC.reduced/Reduced/C000008', 'SogouC.reduced/Reduced/C000010', 'SogouC.reduced/Reduced/C000013', 'SogouC.reduced/Reduced/C000014', 'SogouC.reduced/Reduced/C000016', 'SogouC.reduced/Reduced/C000020', 'SogouC.reduced/Reduced/C000022', 'SogouC.reduced/Reduced/C000023', 'SogouC.reduced/Reduced/C000024']
def load_txt(x):
    with open(x) as f:
        res = [t.decode('gbk', 'ignore') for t in f]
        return ''.join(res)
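# Optional sanity check (an added step, not in the original run): decode one
# file from the first category and preview it, to confirm the GBK decoding
# above behaves as expected.
sample_files = [x for x in os.listdir(dirs[0]) if x.endswith('txt')]
print load_txt(path.join(dirs[0], sample_files[0]))[:100]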
text_t = {}
for i, d in enumerate(dirs):
    files = os.listdir(d)
    files = [path.join(d, x) for x in files if x.endswith('txt') and not x.startswith('.')]
    text_t[i] = [load_txt(f) for f in files]
# to dataframe
import pandas as pd
import numpy as np
flen = [len(t) for t in text_t.values()]
labels = np.repeat(text_t.keys(),flen)
# flatten nested list
import itertools
merged = list(itertools.chain.from_iterable(text_t.values()))
df = pd.DataFrame({'label': labels, 'txt': merged})
df.head()
| | label | txt |
|---|---|---|
| 0 | 0 | 本报记者陈雪频实习记者唐翔发自上海\r\n 一家刚刚成立两年的网络支付公司,它的目标是... |
| 1 | 0 | 证券通:百联股份未来5年有能力保持高速增长\r\n\r\n 深度报告 权威内参... |
| 2 | 0 | 5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www.... |
| 3 | 0 | 5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www.... |
| 4 | 0 | 5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www.... |
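# A quick look at class balance (an added check, not in the original
# notebook): each of the nine categories should hold roughly the same
# number of documents.
df.label.value_counts()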
# word segmentation with jieba
import jieba
jieba.enable_parallel(4)
def cutword_1(x):
    words = jieba.cut(x)
    return ' '.join(words)
Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt
dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache
loading model cost 3.71529507637 seconds.
Trie has been built succesfully.
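# A minimal illustration of the segmenter (the sentence is an assumed
# example, not taken from the corpus); jieba typically splits it as shown
# in the trailing comment.
print cutword_1(u'我来到北京清华大学')
# 我 来到 北京 清华大学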
df['seg_word'] = df.txt.map(cutword_1)
from cPickle import dump, load
# cache the segmented dataframe so the slow jieba step only runs once
#dump(df, open('df.pickle', 'wb'))
df = load(open('df.pickle','rb'))
# model: tf-idf features + Multinomial Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(ngram_range=(1,1), min_df = 2, max_features = 10000)
xvec = vect.fit_transform(df.seg_word)
xvec.shape
(17903, 10000)
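# Peek at a few of the learned vocabulary terms (an added check; the exact
# terms depend on the corpus and on the min_df/max_features settings above).
print vect.get_feature_names()[:10]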
y = df.label
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(xvec, y , train_size=0.7, random_state=1)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_X, train_y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
from sklearn import metrics
pre = clf.predict(test_X)
print metrics.classification_report(test_y, pre)
#print metrics.confusion_matrix(test_y, pre)
             precision    recall  f1-score   support

          0       0.91      0.88      0.89       576
          1       0.86      0.83      0.84       604
          2       0.88      0.83      0.86       616
          3       0.99      0.97      0.98       580
          4       0.87      0.88      0.88       597
          5       0.88      0.80      0.83       607
          6       0.78      0.89      0.83       599
          7       0.74      0.79      0.76       613
          8       0.92      0.93      0.92       579

avg / total       0.87      0.86      0.87      5371
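# The single 70/30 split above can be optimistic or pessimistic by chance.
# A sketch of 5-fold cross-validation with the same era's API, for a more
# stable estimate (an assumed addition, not part of the original run):
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(MultinomialNB(), xvec, y, cv=5)
print scores.mean()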
# word2vec
txt = df.seg_word.values
txtlist = []
for sent in txt:
    txtlist.append(sent.split())
num_features = 100     # word vector dimensionality
min_word_count = 10    # ignore words occurring fewer times than this
num_workers = 4        # parallel training threads
context = 20           # context window size
epoch = 20             # training iterations
sample = 1e-5          # downsampling threshold for frequent words
from gensim.models import word2vec
model = word2vec.Word2Vec(txtlist, workers=num_workers,
                          sample=sample,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          iter=epoch)
model.syn0.shape
(57675, 100)
for w in model.most_similar(u'互联网'):
    print w[0], w[1]
在线 0.809367001057
网络 0.792132735252
网民 0.789814114571
网站 0.766795158386
网络广告 0.763081729412
门户网站 0.757833242416
互联网内容 0.728336572647
访问量 0.703088879585
商业模式 0.701648652554
Web2 0.698530614376
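# Two more queries the same gensim model supports (illustrative, assuming
# all three words are in the vocabulary; the scores depend on the trained
# vectors): pairwise similarity and odd-one-out.
print model.similarity(u'互联网', u'网络')
print model.doesnt_match([u'互联网', u'网络', u'股票'])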
# cache the trained model so it need not be retrained on reruns
#model.save('sogo_wv')
model = word2vec.Word2Vec.load('sogo_wv')
# k-means clustering of the word vectors
from sklearn.cluster import KMeans
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] // 20  # roughly 20 words per cluster
kmeans_clustering = KMeans(n_clusters = num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)
word_centroid_map = dict(zip(model.index2word, idx))
word_centroid_df = pd.DataFrame(zip( model.index2word, idx ))
word_centroid_df.columns = ['word','cluster']
word_centroid_df.head()
# inspect the first ten clusters
for cluster in xrange(10):
    print "\nCluster %d" % cluster
    words = word_centroid_df.ix[word_centroid_df.cluster==cluster,'word'].values
    print ' '.join(words)
# find the big clusters that contain many words
big_cluster = word_centroid_df.groupby('cluster').apply(lambda x: len(x.word)).reset_index()
big_cluster.columns = ['cluster','word_num']
key_cluster = big_cluster.ix[big_cluster['word_num']>=10,'cluster'].values
def create_bag_of_centroids( wordlist, word_centroid_map ):
    # map words to cluster ids and count them per cluster;
    # wordlist holds the words of one document, word_centroid_map is a
    # dict from word to cluster id; relies on the global key_cluster
    # (big clusters) computed above
    num_centroids = max( word_centroid_map.values() ) + 1
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            if index in key_cluster:
                bag_of_centroids[index] += 1
    return bag_of_centroids
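# Tiny worked example (hypothetical input, added for clarity): words missing
# from word_centroid_map are skipped, and only words whose cluster is in
# key_cluster add to the count.
print create_bag_of_centroids([u'互联网', u'不存在的词'], word_centroid_map).sum()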
# map each raw document to a vector of cluster counts
train_centroids = np.zeros( (len(txtlist), num_clusters), dtype="float32" )
for i, review in enumerate(txtlist):
    train_centroids[i] = create_bag_of_centroids( review, word_centroid_map )
# binarize to 0/1 features
train_centroids = np.where(train_centroids>0,1,0)
train_centroids_df = pd.DataFrame(train_centroids)
train_centroids_df= train_centroids_df.ix[:,train_centroids.sum(axis=0)!=0]
train_centroids_df.shape
(17910, 1429)
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(train_centroids_df.values, y, train_size=0.7, random_state=1)
clf = SGDClassifier()
clf.fit(train_X, train_y)
from sklearn import metrics
pre = clf.predict(test_X)
print metrics.classification_report(test_y, pre)
print metrics.confusion_matrix(test_y, pre)
             precision    recall  f1-score   support

          0       0.88      0.91      0.90       577
          1       0.74      0.86      0.80       603
          2       0.83      0.86      0.84       619
          3       0.85      0.73      0.78       584
          4       0.82      0.75      0.78       570
          5       0.75      0.74      0.75       600
          6       0.82      0.82      0.82       600
          7       0.99      0.94      0.96       615
          8       0.71      0.75      0.73       605

avg / total       0.82      0.82      0.82      5373

[[525   1   1   1   9   4   3   0  33]
 [  0 519   9  13  22  18   2   1  19]
 [  3  14 533   2   4  47  10   0   6]
 [  1  52  30 425  13  22  11   0  30]
 [  8  53   6  21 429   5  10   1  37]
 [ 26  19  26  18  10 447  31   1  22]
 [  7   8  19   5   9  27 491   1  33]
 [  1   3  11   1   1   9   6 578   5]
 [ 23  29   8  14  26  14  37   1 453]]
# average word vectors into a document vector
def sentvec(sent, m=num_features, model=model):
    res = np.zeros(m)
    words = sent.split()
    num = 0
    for w in words:
        if w in model.vocab:  # dict lookup; much faster than scanning index2word
            res += model[w]
            num += 1.0
    if num == 0:
        return np.zeros(m)
    else:
        return res / num
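# Quick shape check (an added step): one segmented document maps to a single
# 100-dimensional vector.
print sentvec(df.seg_word.values[0]).shape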
n = df.shape[0]
sent_matrix = np.zeros([n, num_features], float)
for i, sent in enumerate(df.seg_word.values):
    sent_matrix[i,:] = sentvec(sent)
sent_matrix.shape
(17910, 100)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(sent_matrix, y , train_size=0.7, random_state=1)
clf = GradientBoostingClassifier()
clf.fit(train_X, train_y)
from sklearn import metrics
pre = clf.predict(test_X)
print metrics.classification_report(test_y, pre)
print metrics.confusion_matrix(test_y, pre)
             precision    recall  f1-score   support

          0       0.93      0.93      0.93       577
          1       0.83      0.84      0.84       603
          2       0.91      0.85      0.88       619
          3       0.85      0.87      0.86       584
          4       0.85      0.83      0.84       570
          5       0.83      0.80      0.81       600
          6       0.88      0.88      0.88       600
          7       0.97      0.96      0.97       615
          8       0.76      0.83      0.80       605

avg / total       0.87      0.87      0.87      5373

[[539   1   2   3   5   3   6   1  17]
 [  0 507   4  20  20   9   4   1  38]
 [  3  10 529  11   6  35  12   3  10]
 [  0  25   8 509  11  13   5   1  12]
 [  6  27   4  17 472   9   5   2  28]
 [ 15  12  22  15  12 477  19   4  24]
 [  6   7   8   9   3  13 530   3  21]
 [  0   1   0   2   5   4   5 592   6]
 [  9  20   6  10  22  12  19   2 505]]
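# Summary: on this split, tf-idf + MultinomialNB and averaged word vectors +
# GradientBoostingClassifier both reach about 0.87 average F1, while the
# binarized bag-of-centroids features with SGDClassifier trail at about 0.82.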