import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
# Load the raw data files. header=None tells pandas the files carry no header
# row, so columns are labelled by position (0, 1, ...); the code below reads
# the review text from column 0.
reviews = pd.read_csv('./data/reviews.txt', header=None)
labels = pd.read_csv('./data/labels.txt', header=None)
from collections import Counter
# Count how often each space-separated token occurs across all reviews.
# Iterating column 0 directly avoids DataFrame.iterrows(), which builds a
# full Series object for every row just to read a single cell.
total = Counter()
for review_text in reviews[0]:
    # Split on a single space (not arbitrary whitespace) so that empty
    # tokens produced by consecutive spaces are counted as well.
    total.update(review_text.split(' '))
print("Total words in data set: ", len(total))
Total words in data set: 74074
# Keep the 10000 most frequent tokens, ordered from most to least common.
# Tokens with equal counts stay in first-encountered order.
vocab = [token for token, _count in total.most_common(10000)]
print(vocab[:60])
['', 'the', '.', 'and', 'a', 'of', 'to', 'is', 'br', 'it', 'in', 'i', 'this', 'that', 's', 'was', 'as', 'for', 'with', 'movie', 'but', 'film', 'you', 'on', 't', 'not', 'he', 'are', 'his', 'have', 'be', 'one', 'all', 'at', 'they', 'by', 'an', 'who', 'so', 'from', 'like', 'there', 'her', 'or', 'just', 'about', 'out', 'if', 'has', 'what', 'some', 'good', 'can', 'more', 'she', 'when', 'very', 'up', 'time', 'no']
# vocab is sorted by descending count, so its last entry is the least
# frequent token that still made the cut; show it together with its count.
print(vocab[-1], ': ', total[vocab[-1]])
fulfilled : 30
# Lookup table from each vocabulary token to its position in vocab.
word2idx = {token: position for position, token in enumerate(vocab)}
def text2vector(text):
    """Convert a text into a bag-of-words count vector.

    The text is split on single spaces and every token found in the
    module-level ``word2idx`` mapping increments the count at that token's
    index. Tokens missing from ``word2idx`` are ignored. Matching is exact:
    no lower-casing or other normalisation is applied.

    Args:
        text: The string to vectorise.

    Returns:
        A new 1-D NumPy array of dtype ``np.int_`` and length ``len(vocab)``
        holding the per-token occurrence counts.
    """
    counts = np.zeros(len(vocab), dtype=np.int_)
    for token in text.split(' '):
        position = word2idx.get(token)
        if position is not None:
            counts[position] += 1
    return counts
# Sanity check: show the first 65 counts for a sample sentence. The lookup is
# case-sensitive, so the leading 'The' is not counted under the vocabulary
# entry 'the'.
text2vector('The tea is for a party to celebrate '
'the movie so she has no time for a cake')[:65]
array([0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0])
# Build the feature matrix: one row per review, one column per vocabulary
# token, each cell holding that token's count in the review.
wordvec = np.zeros((len(reviews), len(vocab)), dtype=np.int_)
# Iterate column 0 directly instead of DataFrame.iterrows(), which would
# construct a Series per row only to read a single cell from it.
for row_idx, review_text in enumerate(reviews[0]):
    wordvec[row_idx] = text2vector(review_text)
# Preview the top-left corner of the matrix (first 5 reviews, 23 tokens).
wordvec[:5, :23]
array([[ 18, 9, 27, 1, 4, 4, 6, 4, 0, 2, 2, 5, 0, 4, 1, 0, 2, 0, 0, 0, 0, 0, 0], [ 5, 4, 8, 1, 7, 3, 1, 2, 0, 4, 0, 0, 0, 1, 2, 0, 0, 1, 3, 0, 0, 0, 1], [ 78, 24, 12, 4, 17, 5, 20, 2, 8, 8, 2, 1, 1, 2, 8, 0, 5, 5, 4, 0, 2, 1, 4], [167, 53, 23, 0, 22, 23, 13, 14, 8, 10, 8, 12, 9, 4, 11, 2, 11, 5, 11, 0, 5, 3, 0], [ 19, 10, 11, 4, 6, 2, 2, 5, 0, 1, 2, 3, 1, 0, 0, 0, 3, 1, 0, 1, 0, 0, 0]])