from os import path
import os
import re
rootdir = 'SogouC.reduced/Reduced'
dirs = os.listdir(rootdir)
dirs = [path.join(rootdir,f) for f in dirs if f.startswith('C')]
dirs
['SogouC.reduced/Reduced/C000008', 'SogouC.reduced/Reduced/C000010', 'SogouC.reduced/Reduced/C000013', 'SogouC.reduced/Reduced/C000014', 'SogouC.reduced/Reduced/C000016', 'SogouC.reduced/Reduced/C000020', 'SogouC.reduced/Reduced/C000022', 'SogouC.reduced/Reduced/C000023', 'SogouC.reduced/Reduced/C000024']
def load_txt(x):
    with open(x) as f:
        res = [t.decode('gbk', 'ignore') for t in f]
        return ''.join(res)
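# Optional sanity check (an added step, not in the original run): decode one
# file from the first category and preview it, to confirm the GBK decoding
# above behaves as expected.
sample_files = [x for x in os.listdir(dirs[0]) if x.endswith('txt')]
print load_txt(path.join(dirs[0], sample_files[0]))[:100]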
text_t = {}
for i, d in enumerate(dirs):
    files = os.listdir(d)
    files = [path.join(d, x) for x in files if x.endswith('txt') and not x.startswith('.')]
    text_t[i] = [load_txt(f) for f in files]
# to dataframe
import pandas as pd
import numpy as np
flen = [len(t) for t in text_t.values()]
labels = np.repeat(text_t.keys(),flen)
# flatten nested list
import itertools
merged = list(itertools.chain.from_iterable(text_t.values()))
df = pd.DataFrame({'label': labels, 'txt': merged})
df.head()
| | label | txt |
|---|---|---|
| 0 | 0 | 本报记者陈雪频实习记者唐翔发自上海\r\n 一家刚刚成立两年的网络支付公司,它的目标是... |
| 1 | 0 | 证券通:百联股份未来5年有能力保持高速增长\r\n\r\n 深度报告 权威内参... |
| 2 | 0 | 5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www.... |
| 3 | 0 | 5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www.... |
| 4 | 0 | 5月09日消息快评\r\n\r\n 深度报告 权威内参 来自“证券通”www.... |
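# A quick look at class balance (an added check, not in the original
# notebook): each of the nine categories should hold roughly the same
# number of documents.
df.label.value_counts()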
# word segmentation with jieba
import jieba
jieba.enable_parallel(4)
def cutword_1(x):
    words = jieba.cut(x)
    return ' '.join(words)
Building Trie..., from /Users/xiaokai/anaconda/lib/python2.7/site-packages/jieba/dict.txt
dumping model to file cache /var/folders/7t/2bxpnffn2r9fdg94r9fk6t4h0000gn/T/jieba.cache
loading model cost 3.71529507637 seconds.
Trie has been built succesfully.
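# A minimal illustration of the segmenter (the sentence is an assumed
# example, not taken from the corpus); jieba typically splits it as shown
# in the trailing comment.
print cutword_1(u'我来到北京清华大学')
# 我 来到 北京 清华大学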
df['seg_word'] = df.txt.map(cutword_1)
from cPickle import dump, load
# cache the segmented dataframe so the slow jieba step only runs once
#dump(df, open('df.pickle', 'wb'))
df = load(open('df.pickle','rb'))
# model: tf-idf features + Multinomial Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer(ngram_range=(1,1), min_df = 2, max_features = 10000)
xvec = vect.fit_transform(df.seg_word)
xvec.shape
(17903, 10000)
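# Peek at a few of the learned vocabulary terms (an added check; the exact
# terms depend on the corpus and on the min_df/max_features settings above).
print vect.get_feature_names()[:10]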
y = df.label
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(xvec, y , train_size=0.7, random_state=1)
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(train_X, train_y)
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
from sklearn import metrics
pre = clf.predict(test_X)
print metrics.classification_report(test_y, pre)
#print metrics.confusion_matrix(test_y, pre)
             precision    recall  f1-score   support

          0       0.91      0.88      0.89       576
          1       0.86      0.83      0.84       604
          2       0.88      0.83      0.86       616
          3       0.99      0.97      0.98       580
          4       0.87      0.88      0.88       597
          5       0.88      0.80      0.83       607
          6       0.78      0.89      0.83       599
          7       0.74      0.79      0.76       613
          8       0.92      0.93      0.92       579

avg / total       0.87      0.86      0.87      5371
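# The single 70/30 split above can be optimistic or pessimistic by chance.
# A sketch of 5-fold cross-validation with the same era's API, for a more
# stable estimate (an assumed addition, not part of the original run):
from sklearn.cross_validation import cross_val_score
scores = cross_val_score(MultinomialNB(), xvec, y, cv=5)
print scores.mean()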
# word2vec
txt = df.seg_word.values
txtlist = []
for sent in txt:
    txtlist.append(sent.split())
num_features = 100     # word vector dimensionality
min_word_count = 10    # ignore words occurring fewer times than this
num_workers = 4        # parallel training threads
context = 20           # context window size
epoch = 20             # training iterations
sample = 1e-5          # downsampling threshold for frequent words
from gensim.models import word2vec
model = word2vec.Word2Vec(txtlist, workers=num_workers,
                          sample=sample,
                          size=num_features,
                          min_count=min_word_count,
                          window=context,
                          iter=epoch)
model.syn0.shape
(57675, 100)
for w in model.most_similar(u'互联网'):
    print w[0], w[1]
在线 0.809367001057
网络 0.792132735252
网民 0.789814114571
网站 0.766795158386
网络广告 0.763081729412
门户网站 0.757833242416
互联网内容 0.728336572647
访问量 0.703088879585
商业模式 0.701648652554
Web2 0.698530614376
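# Two more queries the same gensim model supports (illustrative, assuming
# all three words are in the vocabulary; the scores depend on the trained
# vectors): pairwise similarity and odd-one-out.
print model.similarity(u'互联网', u'网络')
print model.doesnt_match([u'互联网', u'网络', u'股票'])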
# cache the trained model so it need not be retrained on reruns
#model.save('sogo_wv')
model = word2vec.Word2Vec.load('sogo_wv')
# k-means clustering of the word vectors
from sklearn.cluster import KMeans
word_vectors = model.syn0
num_clusters = word_vectors.shape[0] // 20  # roughly 20 words per cluster
kmeans_clustering = KMeans(n_clusters = num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)
word_centroid_map = dict(zip(model.index2word, idx))
word_centroid_df = pd.DataFrame(zip( model.index2word, idx ))
word_centroid_df.columns = ['word','cluster']
word_centroid_df.head()
# inspect the first ten clusters
for cluster in xrange(10):
    print "\nCluster %d" % cluster
    words = word_centroid_df.ix[word_centroid_df.cluster==cluster,'word'].values
    print ' '.join(words)
# find the big clusters that contain many words
big_cluster = word_centroid_df.groupby('cluster').apply(lambda x: len(x.word)).reset_index()
big_cluster.columns = ['cluster','word_num']
key_cluster = big_cluster.ix[big_cluster['word_num']>=10,'cluster'].values
def create_bag_of_centroids( wordlist, word_centroid_map ):
    # map words to cluster ids and count them per cluster;
    # wordlist holds the words of one document, word_centroid_map is a
    # dict from word to cluster id; relies on the global key_cluster
    # (big clusters) computed above
    num_centroids = max( word_centroid_map.values() ) + 1
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    for word in wordlist:
        if word in word_centroid_map:
            index = word_centroid_map[word]
            if index in key_cluster:
                bag_of_centroids[index] += 1
    return bag_of_centroids
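# Tiny worked example (hypothetical input, added for clarity): words missing
# from word_centroid_map are skipped, and only words whose cluster is in
# key_cluster add to the count.
print create_bag_of_centroids([u'互联网', u'不存在的词'], word_centroid_map).sum()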
# map each raw document to a vector of cluster counts
train_centroids = np.zeros( (len(txtlist), num_clusters), dtype="float32" )
for i, review in enumerate(txtlist):
    train_centroids[i] = create_bag_of_centroids( review, word_centroid_map )
# binarize to 0/1 features
train_centroids = np.where(train_centroids>0,1,0)
train_centroids_df = pd.DataFrame(train_centroids)
train_centroids_df= train_centroids_df.ix[:,train_centroids.sum(axis=0)!=0]
train_centroids_df.shape
(17910, 1429)
from sklearn.linear_model import SGDClassifier
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(train_centroids_df.values, y, train_size=0.7, random_state=1)
clf = SGDClassifier()
clf.fit(train_X, train_y)
from sklearn import metrics
pre = clf.predict(test_X)
print metrics.classification_report(test_y, pre)
print metrics.confusion_matrix(test_y, pre)
             precision    recall  f1-score   support

          0       0.88      0.91      0.90       577
          1       0.74      0.86      0.80       603
          2       0.83      0.86      0.84       619
          3       0.85      0.73      0.78       584
          4       0.82      0.75      0.78       570
          5       0.75      0.74      0.75       600
          6       0.82      0.82      0.82       600
          7       0.99      0.94      0.96       615
          8       0.71      0.75      0.73       605

avg / total       0.82      0.82      0.82      5373

[[525   1   1   1   9   4   3   0  33]
 [  0 519   9  13  22  18   2   1  19]
 [  3  14 533   2   4  47  10   0   6]
 [  1  52  30 425  13  22  11   0  30]
 [  8  53   6  21 429   5  10   1  37]
 [ 26  19  26  18  10 447  31   1  22]
 [  7   8  19   5   9  27 491   1  33]
 [  1   3  11   1   1   9   6 578   5]
 [ 23  29   8  14  26  14  37   1 453]]
# average word vectors into a document vector
def sentvec(sent, m=num_features, model=model):
    res = np.zeros(m)
    words = sent.split()
    num = 0
    for w in words:
        if w in model.vocab:  # dict lookup; much faster than scanning index2word
            res += model[w]
            num += 1.0
    if num == 0:
        return np.zeros(m)
    else:
        return res / num
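# Quick shape check (an added step): one segmented document maps to a single
# 100-dimensional vector.
print sentvec(df.seg_word.values[0]).shape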
n = df.shape[0]
sent_matrix = np.zeros([n, num_features], float)
for i, sent in enumerate(df.seg_word.values):
    sent_matrix[i,:] = sentvec(sent)
sent_matrix.shape
(17910, 100)
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.cross_validation import train_test_split
train_X, test_X, train_y, test_y = train_test_split(sent_matrix, y , train_size=0.7, random_state=1)
clf = GradientBoostingClassifier()
clf.fit(train_X, train_y)
from sklearn import metrics
pre = clf.predict(test_X)
print metrics.classification_report(test_y, pre)
print metrics.confusion_matrix(test_y, pre)
             precision    recall  f1-score   support

          0       0.93      0.93      0.93       577
          1       0.83      0.84      0.84       603
          2       0.91      0.85      0.88       619
          3       0.85      0.87      0.86       584
          4       0.85      0.83      0.84       570
          5       0.83      0.80      0.81       600
          6       0.88      0.88      0.88       600
          7       0.97      0.96      0.97       615
          8       0.76      0.83      0.80       605

avg / total       0.87      0.87      0.87      5373

[[539   1   2   3   5   3   6   1  17]
 [  0 507   4  20  20   9   4   1  38]
 [  3  10 529  11   6  35  12   3  10]
 [  0  25   8 509  11  13   5   1  12]
 [  6  27   4  17 472   9   5   2  28]
 [ 15  12  22  15  12 477  19   4  24]
 [  6   7   8   9   3  13 530   3  21]
 [  0   1   0   2   5   4   5 592   6]
 [  9  20   6  10  22  12  19   2 505]]
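# Summary: on this split, tf-idf + MultinomialNB and averaged word vectors +
# GradientBoostingClassifier both reach about 0.87 average F1, while the
# binarized bag-of-centroids features with SGDClassifier trail at about 0.82.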