# Import packages to extend the functionality of basic Python
# Let Python 2 behave like Python 3
from __future__ import division, unicode_literals, print_function
# Utility data structures
from collections import Counter
# Plotting
import matplotlib.pyplot as plt
import matplotlib.cm as cm
%matplotlib inline
# Natural language processing package
import nltk
import nltk.corpus
# Numerical
import numpy as np
# Operating system functions
import os
# Data analysis package
import pandas as pd
# Kenneth Reitz' module to download data from the Web. (instead of urllib)
import requests
# Specify a URL to download information from the Semantic MediaWiki 'WikiLit'
# Such a query can be constructed on the web site and the resulting URL copy-and-pasted here.
url = ("http://wikilit.referata.com/"
       "wiki/Special:Ask/"
       "-5B-5BCategory:Publications-5D-5D/"
       "-3FHas-20author%3DAuthor(s)/-3FYear/"
       "-3FPublished-20in/-3FAbstract/-3FHas-20topic%3DTopic(s)/"
       "-3FHas-20domain%3DDomain(s)/"
       "format%3D-20csv/limit%3D-20600/offset%3D0")
# Download and read data as comma-separated values (CSV) information into a Pandas DataFrame
documents = pd.read_csv(url)
filename = os.path.expanduser('~/data/dtu02819/wikilit.csv')
# Ensure the target directory exists before writing: DataFrame.to_csv does
# not create missing directories and would raise IOError otherwise
directory = os.path.dirname(filename)
if not os.path.isdir(directory):
    os.makedirs(directory)
# Write to a comma-separated values file at the local file system
documents.to_csv(filename)
# Read the comma-separated values file from the local file system
documents = pd.read_csv(filename, index_col=0)
documents.head()
Unnamed: 0.1 | Author(s) | Year | Published in | Abstract | Topic(s) | Domain(s) | |
---|---|---|---|---|---|---|---|
0 | 'Wikipedia, the free encyclopedia' as a role m... | Gordon Müller-Seitz,Guido Reger | 2010 | International Journal of Technology Management | Accounts of open source software (OSS) develop... | Contributor motivation,Policies and governance... | Information systems |
1 | A 'resource review' of Wikipedia | Cormac Lawler | 2006 | Counselling & Psychotherapy Research | The article offers information on Wikipedia, a... | Miscellaneous topics | Information systems |
2 | A Persian web page classifier applying a combi... | Mojgan Farhoodi,Alireza Yari,Maryam Mahmoudi | 2009 | International Journal of Information Studies | There are many automatic classification method... | Text classification | Computer science |
3 | A Wikipedia literature review | Owen S. Martin | 2010 | ArXiv | This paper was originally designed as a litera... | Literature review | Mathematics |
4 | A Wikipedia matching approach to contextual ad... | Alexander N. Pak,Chin-Wan Chung | 2010 | World Wide Web | Contextual advertising is an important part of... | Other information retrieval topics | Computer science |
# Example on word tokenization of the first sentence in the first abstract.
# DataFrame.ix was deprecated and removed from pandas; the index here is the
# integer labels written by to_csv, so label-based .loc is the replacement.
sentences = nltk.sent_tokenize(documents.loc[0, 'Abstract'])
# Show the first sentence as a Python list of token strings
print(nltk.word_tokenize(sentences[0]))
['Accounts', 'of', 'open', 'source', 'software', '(', 'OSS', ')', 'development', 'projects', 'frequently', 'stress', 'their', 'democratic', ',', 'sometimes', 'even', 'anarchic', 'nature', ',', 'in', 'contrast', 'to', 'for-profit', 'organisations', '.']
# Tokenize every abstract and accumulate corpus-wide token counts.
# An extra attribute on the documents object holds the per-document data.
documents.data = []
# Counts tokens across the whole corpus
token_counts = Counter()
# Iterate over every abstract, sentence-split it, then word-tokenize each
# sentence, lower-casing all tokens
for abstract in documents['Abstract']:
    sentence_list = nltk.sent_tokenize(abstract)
    token_list = [token.lower()
                  for sentence in sentence_list
                  for token in nltk.word_tokenize(sentence)]
    token_counts.update(token_list)
    documents.data.append({'sentences': sentence_list,
                           'tokenlist': token_list})
# The five most common tokens in the entire WikiLit corpus
token_counts.most_common(5)
[('the', 4866), ('of', 3826), (',', 3785), ('.', 3585), ('and', 2856)]
# Load the NLTK English stopword list ('the', 'a', 'for', ...)
stopwords = nltk.corpus.stopwords.words('english')


def _is_relevant(token, count):
    # Keep purely alphabetic, non-stopword tokens occurring more than twice
    return count > 2 and token.isalpha() and token not in stopwords


relevant_tokens = {token: count
                   for token, count in token_counts.items()
                   if _is_relevant(token, count)}
# Show the most common tokens in the reduced token set
Counter(relevant_tokens).most_common(5)
[('wikipedia', 1428), ('information', 486), ('knowledge', 341), ('articles', 279), ('online', 265)]
# Exclude the word 'wikipedia' (it dominates the counts, appearing in nearly
# every abstract); the default 0 makes pop safe if the key is already absent
relevant_tokens.pop('wikipedia', 0)
1428
# Construct a dense document-term matrix with word counts in the elements
# as a Numpy matrix.
# dict.keys() returns a view in Python 3; materialize it as a list so the
# column order is fixed and positional lookups (tokens.index) work later.
tokens = list(relevant_tokens.keys())
M = np.asmatrix(np.zeros([len(documents), len(tokens)]))
for n in range(len(documents)):
    # Count each document's tokens once with a Counter instead of one
    # O(len(tokenlist)) list.count call per vocabulary word
    counts = Counter(documents.data[n]['tokenlist'])
    for m, token in enumerate(tokens):
        M[n, m] = counts[token]
M.shape
(525, 2899)
# Value of the element in the first row in the column corresponding to the
# word 'software'. list() makes the positional .index lookup work even when
# tokens is a Python 3 dict view rather than a list.
M[0, list(tokens).index('software')]
1.0
# Display the top-left 100-by-100 corner of the document-term matrix as an
# image; dark cells mark high word counts (reversed gray colormap)
corner = M[:100, :100]
plt.imshow(corner, cmap=cm.gray_r, interpolation='nearest')
plt.ylabel('Documents')
plt.xlabel('Tokens')
plt.show()
def nmf(M, components=5, iterations=500, epsilon=0.001):
    """Factorize a non-negative matrix with multiplicative-update NMF.

    Applies Lee & Seung's multiplicative update rules so that M is
    approximated by the product W * H, starting from uniform-random
    non-negative factors.

    Parameters
    ----------
    M : numpy.matrix
        Non-negative matrix of shape (rows, columns) to factorize.
    components : int, optional
        Number of latent components (columns of W, rows of H).
    iterations : int, optional
        Number of multiplicative update sweeps.
    epsilon : float, optional
        Small constant added to the denominators to avoid division by
        zero (previously a hard-coded 0.001).

    Returns
    -------
    (W, H) : tuple of numpy.matrix
        Factors of shape (rows, components) and (components, columns).
    """
    # Random non-negative initialization; W is drawn before H so results
    # under a fixed seed match the original implementation
    W = np.asmatrix(np.random.random((M.shape[0], components)))
    H = np.asmatrix(np.random.random((components, M.shape[1])))
    for _ in range(iterations):
        # Multiplicative updates keep all entries non-negative
        H = np.multiply(H, (W.T * M) / (W.T * W * H + epsilon))
        W = np.multiply(W, (M * H.T) / (W * (H * H.T) + epsilon))
    return (W, H)
# Perform the actual computation
# Factorize the document-term matrix into 3 latent components; 50 iterations
# keeps this demo fast (the function's default is 500)
W, H = nmf(M, iterations=50, components=3)
# Inspect the scale of the fitted factors
W.max(), H.max()
(15.094951905539212, 1.8585767533145232)
# Show the results in some format - This could be written nicer, using Jinja2
# Materialize the vocabulary once so positional indexing works even when
# tokens is a Python 3 dict view
token_list = list(tokens)
for component in range(W.shape[1]):
    print("=" * 80)
    print("COMPONENT %d: " % (component + 1,))
    # Tokens with the largest weights in this component: argsort of the
    # negated row yields a descending ordering
    indices = (-H[component, :]).getA1().argsort()
    print(" - ".join([token_list[i] for i in indices[:6]]))
    print("-")
    # Titles (column 0) of the documents loading most strongly on this
    # component; .iloc replaces the removed DataFrame.ix positional access
    indices = (-W[:, component]).getA1().argsort()
    print("\n".join([documents.iloc[i, 0][:80] for i in indices[:5]]))
================================================================================ COMPONENT 1: authors - content - community - number - articles - analysis - Wikipedia - a quantitative analysis Open content and value creation Sharing knowledge and building communities: a narrative of the formation, develo Extracting content holes by comparing community-type content with Wikipedia An analysis of open content systems ================================================================================ COMPONENT 2: knowledge - document - clustering - linkage - topic - algorithm - Exploiting external/domain knowledge to enhance traditional text mining using gr Wikitology: a novel hybrid knowledge base derived from Wikipedia The WikiID: an alternative approach to the body of knowledge Breaking the knowledge acquisition bottleneck through conversational knowledge m Extracting lexical semantic knowledge from Wikipedia and Wiktionary ================================================================================ COMPONENT 3: information - web - use - students - search - results - Gender differences in information behavior concerning Wikipedia, an unorthodox i How and why do college students use Wikipedia? Where does the information come from? Information source use patterns in Wikiped What is the quality of surgery-related information on the Internet? Lessons lear Reliability of Wikipedia as a medication information source for pharmacy student