One-hot encoding represents a categorical explanatory variable, such as a city, using one binary feature for each of its possible values (here New York, San Francisco, and Chapel Hill).
# DictVectorizer class can be used to one-hot encode categorical features
from sklearn.feature_extraction import DictVectorizer
onehot_encoder = DictVectorizer()
instances = [
{'city': 'New York'},
{'city': 'San Francisco'},
{'city': 'Chapel Hill'},
]
print onehot_encoder.fit_transform(instances).toarray()
[[ 0.  1.  0.]
 [ 0.  0.  1.]
 [ 1.  0.  0.]]
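Each column corresponds to one city, in the alphabetical order that DictVectorizer assigns while fitting. A quick check (get_feature_names() is the method in this scikit-learn version; newer releases use get_feature_names_out()):
# Show which city each column of the one-hot matrix represents
print onehot_encoder.get_feature_names()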
# corpus: a collection of documents
corpus = [
'UNC played Duke in basketball',
'Duke lost the basketball game'
]
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
# Each document becomes a vector of word counts (0/1 here, since no word repeats within a document)
print vectorizer.fit_transform(corpus).todense()
# The vocabulary of unique words; indices are assigned in alphabetical order
vectorizer.vocabulary_
# Add a third document to the corpus
corpus = [
'UNC played Duke in basketball',
'Duke lost the basketball game',
'I ate a sandwich'
]
# Each document becomes a vector of word counts (0/1 here, since no word repeats within a document)
print vectorizer.fit_transform(corpus).todense()
[[0 1 1 0 1 0 1 0 0 1]
 [0 1 1 1 0 1 0 0 1 0]
 [1 0 0 0 0 0 0 1 0 0]]
# The vocabulary of unique words; indices are assigned in alphabetical order
print vectorizer.vocabulary_
{u'ate': 0, u'basketball': 1, u'duke': 2, u'game': 3, u'in': 4, u'lost': 5, u'played': 6, u'sandwich': 7, u'the': 8, u'unc': 9}
from sklearn.metrics.pairwise import euclidean_distances
counts = [
[0, 1, 1, 0, 0, 1, 0, 1],
[0, 1, 1, 1, 1, 0, 0, 0],
[1, 0, 0, 0, 0, 0, 1, 0]
]
print 'Distance between 1st and 2nd documents:', euclidean_distances([counts[0]], [counts[1]])
print 'Distance between 1st and 3rd documents:', euclidean_distances([counts[0]], [counts[2]])
print 'Distance between 2nd and 3rd documents:', euclidean_distances([counts[1]], [counts[2]])
Distance between 1st and 2nd documents: [[ 2.]]
Distance between 1st and 3rd documents: [[ 2.44948974]]
Distance between 2nd and 3rd documents: [[ 2.44948974]]
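As a sanity check, the Euclidean distance between the first two documents can be computed directly from their count vectors; a minimal NumPy sketch:
import numpy as np
# Square root of the summed squared differences between the two count vectors
a, b = np.array(counts[0]), np.array(counts[1])
print np.sqrt(np.sum((a - b) ** 2))
The two count vectors differ in four positions, so the distance is sqrt(4) = 2, matching the output above.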
High-dimensional feature vectors like these pose problems for many machine learning tasks, including those that do not involve text: as the number of dimensions grows (the curse of dimensionality), more training data is required to ensure that there are enough training instances with each combination of the feature's values. One simple way to reduce the dimensionality is to filter out stop words, common words such as 'the', 'in', and 'a' that carry little information about a document's content.
vectorizer = CountVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
print vectorizer.vocabulary_
[[0 1 1 0 0 1 0 1]
 [0 1 1 1 1 0 0 0]
 [1 0 0 0 0 0 1 0]]
{u'duke': 2, u'basketball': 1, u'lost': 4, u'played': 5, u'game': 3, u'sandwich': 6, u'unc': 7, u'ate': 0}
corpus = [
'He ate the sandwiches',
'Every sandwich was eaten by him'
]
vectorizer = CountVectorizer(binary=True, stop_words='english')
print vectorizer.fit_transform(corpus).todense()
[[1 0 0 1]
 [0 1 1 0]]
vectorizer.vocabulary_
{u'ate': 0, u'eaten': 1, u'sandwich': 2, u'sandwiches': 3}
The documents have similar meanings, but their feature vectors have no elements in common.
We will use the Natural Language Toolkit (NLTK) to stem and lemmatize the corpus.
import nltk
# nltk.download()  # run once to download the NLTK data used below (e.g. wordnet, punkt, and a POS tagger)
# i) Lemmatizer test
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
print lemmatizer.lemmatize('gathering', 'v')
print lemmatizer.lemmatize('gathering', 'n')
gather
gathering
# ii) Stemming test
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
print stemmer.stem('gathering')
gather
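A brief illustration of how stemming collapses related word forms onto a common stem (the extra word forms here are my own example, not from the original corpus):
# All three inflected forms of 'gather' reduce to the same stem
for word in ['gathering', 'gathered', 'gathers']:
    print stemmer.stem(word)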
# iii) Comparing lemmatization and stemming
from nltk import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import pos_tag
wordnet_tags = ['n', 'v']
corpus2 = [
'He ate the sandwiches',
'Every sandwich was eaten by him'
]
stemmer = PorterStemmer()
print 'Stemmed:', [[stemmer.stem(token) for token in word_tokenize(document)] for document in corpus2]
Stemmed: [[u'He', u'ate', u'the', u'sandwich'], [u'Everi', u'sandwich', u'wa', u'eaten', u'by', u'him']]
lemmatizer = WordNetLemmatizer()
def lemmatize(token, tag):
    # Lemmatize nouns and verbs; leave other tokens unchanged
    if tag[0].lower() in wordnet_tags:
        return lemmatizer.lemmatize(token, tag[0].lower())
    return token
tagged_corpus = [pos_tag(word_tokenize(document)) for document in corpus2]
tagged_corpus
[[('He', 'PRP'), ('ate', 'VBP'), ('the', 'DT'), ('sandwiches', 'NNS')], [('Every', 'DT'), ('sandwich', 'NN'), ('was', 'VBD'), ('eaten', 'VBN'), ('by', 'IN'), ('him', 'PRP')]]
print 'Lemmatized:', [[lemmatize(token, tag) for token, tag in document] for document in tagged_corpus]
Lemmatized: [['He', u'eat', 'the', u'sandwich'], ['Every', 'sandwich', u'be', u'eat', 'by', 'him']]
Through stemming and lemmatization, we reduced the dimensionality of our feature space.
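To see the effect, we can join the lemmas back into documents and vectorize them again; a minimal sketch reusing the binary, stop-word-filtered vectorizer settings from above, after which both documents should share the features eat and sandwich:
# Rebuild each document from its lemmas and re-vectorize
lemmatized_corpus = [' '.join([lemmatize(token, tag) for token, tag in document]) for document in tagged_corpus]
vectorizer = CountVectorizer(binary=True, stop_words='english')
print vectorizer.fit_transform(lemmatized_corpus).todense()
print vectorizer.vocabulary_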
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['The dog ate a sandwich, the wizard transfigured a sandwich, and I ate a sandwich']
vectorizer = CountVectorizer(stop_words='english')
# If binary=True is not passed to CountVectorizer, the matrix contains the raw
# word counts per document rather than 0/1 indicators (binary=False is the default).
print vectorizer.fit_transform(corpus).todense()
[[2 1 3 1 1]]
print vectorizer.vocabulary_
{u'sandwich': 2, u'wizard': 4, u'dog': 1, u'transfigured': 3, u'ate': 0}
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
'The dog ate a sandwich and I ate a sandwich',
'The wizard transfigured a sandwich'
]
vectorizer = TfidfVectorizer(stop_words='english')
print vectorizer.fit_transform(corpus).todense()
[[ 0.75458397  0.37729199  0.53689271  0.          0.        ]
 [ 0.          0.          0.44943642  0.6316672   0.6316672 ]]
vectorizer.vocabulary_
{u'ate': 0, u'dog': 1, u'sandwich': 2, u'transfigured': 3, u'wizard': 4}
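These weights can be reproduced by hand. With scikit-learn's default settings, the term frequency is the raw count, the smoothed inverse document frequency is ln((1 + n) / (1 + df(t))) + 1, and each row is L2-normalized; a minimal sketch for the first document under those assumptions:
import numpy as np
# Counts for the first document after stop-word removal, in vocabulary order
# (ate, dog, sandwich, transfigured, wizard): 'ate' x2, 'dog' x1, 'sandwich' x2
tf = np.array([2., 1., 2., 0., 0.])
df = np.array([1., 1., 2., 1., 1.])   # document frequencies across the two documents
n_docs = 2
idf = np.log((1. + n_docs) / (1. + df)) + 1.   # smoothed idf
tfidf = tf * idf
print tfidf / np.linalg.norm(tfidf)   # should match the first row above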
from sklearn.feature_extraction.text import HashingVectorizer
#corpus = ['the dog', 'ate a', 'bacon is delicious', 'cat is cute']
vectorizer = HashingVectorizer(n_features=6)
# n_features defaults to 2^20; it is set to 6 here to keep the matrix small
print vectorizer.transform(corpus2).todense()
[[-0.40824829  0.          0.          0.81649658  0.40824829  0.        ]
 [ 0.          0.5         0.         -0.5        -0.5         0.5       ]]
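Because the hashing trick replaces a fitted vocabulary with a fixed hash function, HashingVectorizer is stateless: it needs no fit step and has no vocabulary_ attribute, so unseen documents can be transformed directly. A small sketch (the sentence below is just an example of mine):
# No fitting required: any new document is hashed into the same 6-dimensional space
print vectorizer.transform(['the dog ate a sandwich']).todense()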
The digits dataset:
- included with scikit-learn, it contains grayscale images of more than 1,700 hand-written digits between zero and nine
- each image has eight pixels on a side
- each pixel is represented by an intensity value between zero and 16
from sklearn import datasets
digits = datasets.load_digits()
%matplotlib inline
# Visualize digits.images[0]
import pylab as pl
pl.matshow(digits.images[0])
pl.show()
print 'Digit:', digits.target[0]
print digits.images[0]
Digit: 0
[[  0.   0.   5.  13.   9.   1.   0.   0.]
 [  0.   0.  13.  15.  10.  15.   5.   0.]
 [  0.   3.  15.   2.   0.  11.   8.   0.]
 [  0.   4.  12.   0.   0.   8.   8.   0.]
 [  0.   5.   8.   0.   0.   9.   8.   0.]
 [  0.   4.  11.   0.   1.  12.   7.   0.]
 [  0.   2.  14.   5.  10.  12.   0.   0.]
 [  0.   0.   6.  13.  10.   0.   0.   0.]]
We can create a feature vector for the image by reshaping its 8 x 8 matrix into a 64-dimensional vector:
print 'Feature vector:\n', digits.images[0].reshape(-1, 64)
Feature vector: [[ 0. 0. 5. 13. 9. 1. 0. 0. 0. 0. 13. 15. 10. 15. 5. 0. 0. 3. 15. 2. 0. 11. 8. 0. 0. 4. 12. 0. 0. 8. 8. 0. 0. 5. 8. 0. 0. 9. 8. 0. 0. 4. 11. 0. 1. 12. 7. 0. 0. 2. 14. 5. 10. 12. 0. 0. 0. 0. 6. 13. 10. 0. 0. 0.]]
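scikit-learn also exposes this flattened representation directly as digits.data, so the reshape above is mainly illustrative; a quick check:
# digits.data already stores one 64-dimensional row per image
print digits.data.shape
print (digits.data[0] == digits.images[0].reshape(-1)).all()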
import numpy as np
from skimage.feature import corner_harris, corner_peaks
from skimage.color import rgb2gray
import matplotlib.pyplot as plt
import skimage.io as io
from skimage.exposure import equalize_hist
def show_corners(corners, image):
    # Plot the image in grayscale and mark the detected corners in red
    fig = plt.figure()
    plt.gray()
    plt.imshow(image)
    y_corner, x_corner = zip(*corners)
    plt.plot(x_corner, y_corner, 'or')
    plt.xlim(0, image.shape[1])
    plt.ylim(image.shape[0], 0)
    fig.set_size_inches(np.array(fig.get_size_inches()) * 1.5)
    plt.show()
mandrill = io.imread('test_img.jpg')
mandrill = equalize_hist(rgb2gray(mandrill))
corners = corner_peaks(corner_harris(mandrill), min_distance=10)
show_corners(corners, mandrill)
import mahotas as mh
from mahotas.features import surf
image = mh.imread('test_img.jpg', as_grey=True)
# Compute the SURF descriptors once and reuse the result
descriptors = surf.surf(image)
print 'The first SURF descriptor:\n', descriptors[0]
print 'Extracted %s SURF descriptors' % len(descriptors)
The first SURF descriptor:
[ 2.27196086e+02 4.07421737e+02 4.39201028e+00 1.41085734e+03 -1.00000000e+00 2.68581608e+00 1.54309084e-03 2.32795098e-03 1.54309084e-03 2.32795098e-03 6.52103807e-03 1.34513620e-02 7.10188076e-03 1.34513620e-02 9.44529178e-03 1.28432399e-02 9.44529178e-03 1.28432399e-02 7.57499694e-04 9.22517434e-04 1.16708006e-03 1.15417991e-03 3.14752523e-03 1.22539929e-02 6.86719267e-03 1.22539929e-02 -2.20079391e-01 1.38937612e-02 2.84843262e-01 7.82140690e-02 -1.45077621e-01 2.08699609e-01 1.87692817e-01 2.15863520e-01 8.36510747e-04 4.92314366e-03 6.64366047e-03 6.80813073e-03 8.49174783e-03 2.65391656e-02 5.83626875e-02 3.15371149e-02 3.57454611e-01 1.23822929e-01 3.83228327e-01 1.62797539e-01 2.48417933e-01 3.61213454e-01 2.63505608e-01 3.64319957e-01 3.02655536e-03 4.17239750e-03 3.57762929e-03 4.17239750e-03 3.41016608e-04 5.82642781e-04 5.98184774e-04 6.66943351e-04 -4.59779837e-03 2.12976559e-03 5.75517987e-03 3.59433882e-03 -4.18734512e-04 6.75004412e-03 3.18498790e-03 6.87710537e-03 2.18234902e-05 4.64987057e-04 2.12506186e-04 5.62256778e-04]
Extracted 610 SURF descriptors
from sklearn import preprocessing
import numpy as np
X = np.array([
[0., 0., 5., 13., 9., 1.],
[0., 0., 13., 15., 10., 15.],
[0., 3., 15., 2., 0., 11.]
])
print preprocessing.scale(X)
[[ 0.         -0.70710678 -1.38873015  0.52489066  0.59299945 -1.35873244]
 [ 0.         -0.70710678  0.46291005  0.87481777  0.81537425  1.01904933]
 [ 0.          1.41421356  0.9258201  -1.39970842 -1.4083737   0.33968311]]
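preprocessing.scale standardizes each feature by subtracting its mean and dividing by its standard deviation, so after scaling each column is centered at zero and every non-constant column has unit variance. A quick verification sketch:
import numpy as np
X_scaled = preprocessing.scale(X)
# Column means should be (numerically) zero; non-constant columns have standard deviation 1
print X_scaled.mean(axis=0)
print X_scaled.std(axis=0)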