This notebook reads a set of plain text files and builds topic models with LSI (a.k.a. LSA) and LDA.
The texts are specified in the variable TEXTS below.
The models are written to the current directory in several files named model.*.
"""Construct LSI and LDA topic models of texts."""
from __future__ import print_function
import io, os, re, glob, logging, operator
from itertools import islice
from collections import Counter
import pandas, gensim
# Glob patterns that select the corpus.  Every file matched by any of the
# patterns is treated as exactly one document.
TEXTS = ['corpus/Adventure/FOLD1/*.txt', 'corpus/Fiction/FOLD1/*.txt']

# Character encoding used to decode the text files; 'latin1' is worth a
# try when this one gives decoding issues.
ENCODING = 'utf8'

# Words that are removed from the vocabulary.  They can also be read
# from a file holding one word per line:
#     STOPWORDS = frozenset(
#         open('data/dutch-stop-words.txt').read().splitlines())
# Here they are written out as one space-separated string.
# NOTE(review): these are Dutch words, while the sample output kept in
# this notebook looks English -- confirm the list fits the corpus.
_STOPWORD_TEXT = (
    'andere deze over zei zal ge niet als daar moet had wel te toch bij '
    'niets dan nog maar dat doch geen worden die een dit der en altijd '
    'haar ze mijn kunnen zonder naar er doen omdat we iemand wezen men '
    'met ja toen om tegen of kon voor iets hier geweest veel op wie zelf '
    'wil wij zo zijn ons het heeft van eens tot heb hem wat was door hun '
    'ook me dus ben zij uw aan hij je werd meer alles reeds af is al ik '
    'uit want in hoe na zou waren nu de kan mij zich hebben u')
STOPWORDS = frozenset(_STOPWORD_TEXT.split())

# A token is a run of three or more characters taken from A-Z, a-z and
# the hyphen, delimited by word boundaries.  Digits, punctuation and
# letters outside A-Z are never part of a token.
# More inclusive alternatives:
#     any sequence of alphanumeric characters:
#         TOKENRE = re.compile(r'\b\w+\b')
#     any sequence of non-whitespace characters (includes punctuation):
#         TOKENRE = re.compile(r'\b\S+\b')
TOKENRE = re.compile(r'\b[-A-Za-z]{3,}\b')

# Log messages of level INFO and above as "LEVEL : message".
logging.basicConfig(level=logging.INFO,
                    format='%(levelname)s : %(message)s')
"""Build vector space model."""
def _read_tokens(filename):
    """Return the list of lowercased tokens found in one text file.

    The file is decoded with ENCODING, lowercased as a whole and scanned
    with TOKENRE.  The file handle is closed before the function returns.
    """
    with io.open(filename, encoding=ENCODING) as inp:
        return TOKENRE.findall(inp.read().lower())


# Expand the patterns in TEXTS; each matching file is one document.
filenames = [a for pattern in TEXTS
             for a in glob.glob(pattern)]
if not filenames:
    # Fail here with a clear message instead of building empty models.
    raise ValueError('no files match the patterns in TEXTS: %r' % (TEXTS, ))
# Collect statistics about all tokens;
# extract lowercased tokens from plain text files.
dictionary = gensim.corpora.Dictionary(
    _read_tokens(filename) for filename in filenames)
# remove stop words and words that appear only once / too many times
stop_ids = [dictionary.token2id[stopword]
            for stopword in STOPWORDS
            if stopword in dictionary.token2id]
dictionary.filter_tokens(stop_ids)
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=40000)
# remove gaps in id sequence after words that were removed
dictionary.compactify()
dictionary.save('model.dict')
# Second pass over the files: turn each document into a bag-of-words
# vector.  The generator is consumed once, by serialize() below, so only
# one document needs to be tokenized at a time.
corpus = (dictionary.doc2bow(_read_tokens(filename))
          for filename in filenames)
gensim.corpora.MmCorpus.serialize('model.mm', corpus)
# Load the dictionary and the serialized bag-of-words corpus from disk.
dictionary = gensim.corpora.Dictionary.load('model.dict')
corpus = gensim.corpora.MmCorpus('model.mm')
# Make the LDA topic model.  Per the gensim documentation,
# update_every=0 selects batch learning rather than online updates.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus, id2word=dictionary, num_topics=50,
    update_every=0, chunksize=50, passes=10, eval_every=1, alpha='auto')
lda.save('model.lda')
# apply tf-idf to BOW counts
tfidf = gensim.models.TfidfModel(corpus)
corpus = tfidf[corpus]
# transform corpus to LSI space and index it
lsi = gensim.models.LsiModel(
    corpus, id2word=dictionary, num_topics=50, chunksize=50)
# The indexed vectors are LSI vectors, so their dimensionality is the
# number of LSI topics, not the size of the vocabulary.
# NOTE(review): with an output prefix of None gensim picks a temporary
# file name for the index shards -- confirm that 'model.index' remains
# loadable after the temporary files are gone.
index = gensim.similarities.Similarity(
    None, lsi[corpus], num_features=lsi.num_topics)
tfidf.save('model.tfidf')
lsi.save('model.lsi')
index.save('model.index')
# Re-create the list of corpus files.  Documents in the corpus and in
# the similarity index are looked up in this list by position, so the
# patterns have to expand to the same files, in the same order, as when
# the models were built.
filenames = [a for pattern in TEXTS
             for a in glob.glob(pattern)]
# load models
dictionary = gensim.corpora.Dictionary.load('model.dict')
# 'model.index' is saved from a gensim.similarities.Similarity object,
# so load it through that same class.
index = gensim.similarities.Similarity.load('model.index')
lsi = gensim.models.LsiModel.load('model.lsi')
lda = gensim.models.LdaModel.load('model.lda')
tfidf = gensim.models.TfidfModel.load('model.tfidf')
# bag-of-words corpus (raw counts, before tf-idf weighting)
corpus = gensim.corpora.MmCorpus('model.mm')
def summary(vec_bow):
    """Print how the LSI and LDA models describe one bag-of-words vector.

    Three sections are printed: the five indexed texts most similar to
    the vector in LSI space, the vector's five highest-scoring LSI
    topics, and its five highest-scoring LDA topics.

    Reads the module-level ``lda``, ``lsi``, ``tfidf``, ``index`` and
    ``filenames``; returns nothing.
    """
    by_score = operator.itemgetter(1)
    lda_topics = lda[vec_bow]
    # the LSI model expects tf-idf weights, so weight the query first
    lsi_topics = lsi[tfidf[vec_bow]]
    ranking = sorted(
        enumerate(index[lsi_topics]), key=by_score, reverse=True)

    print('LSI: most similar texts')
    for docno, similarity in ranking[:5]:
        # positions in the index correspond to positions in filenames
        print(similarity, filenames[docno])

    print()
    print('LSI: topics of this text (highest scoring first)')
    best_lsi = sorted(lsi_topics, key=by_score, reverse=True)[:5]
    for topicno, weight in best_lsi:
        print('%g: %s' % (weight, lsi.print_topic(topicno)))

    print()
    print('LDA: topics of this text (highest scoring first)')
    best_lda = sorted(lda_topics, key=by_score, reverse=True)[:5]
    for topicno, prob in best_lda:
        print('%g: %s' % (prob, lda.print_topic(topicno)))
# Print five topics of the LSI model, ten words each, numbered from 1.
print('5 topics from LSI model:')
lsi_topics = lsi.show_topics(num_topics=5, num_words=10, formatted=True)
for number, description in enumerate(lsi_topics, 1):
    print('Topic #%d: %s' % (number, description))
5 topics from LSI model: Topic #1: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" Topic #2: 0.657*"sapt" + 0.401*"rudolf" + 0.246*"rupert" + 0.243*"fritz" + 0.240*"strelsau" + 0.177*"rassendyll" + 0.153*"bernenstein" + 0.152*"zenda" + -0.136*"ivan" + 0.106*"flavia" Topic #3: -0.697*"ivan" + -0.176*"roubles" + 0.131*"dick" + -0.124*"peter" + -0.115*"eugene" + -0.112*"peasants" + -0.109*"peasant" + 0.107*"lionel" + 0.107*"stella" + -0.100*"simeon" Topic #4: -0.304*"lionel" + -0.282*"stella" + 0.230*"marquis" + -0.227*"macumazahn" + -0.202*"denis" + 0.196*"monsieur" + -0.175*"percy" + -0.163*"henry" + 0.155*"turner" + 0.148*"aline" Topic #5: -0.303*"stewart" + -0.275*"thet" + 0.220*"marquis" + 0.164*"monsieur" + 0.159*"turner" + -0.156*"bland" + -0.148*"fer" + -0.146*"lawson" + -0.146*"prim" + -0.143*"cowboys"
# Print 25 topics of the LDA model, ten words each, numbered from 1,
# followed by a blank line.
print('\ntopics from LDA model:')
lda_topics = lda.show_topics(num_topics=25, num_words=10, formatted=True)
for number, description in enumerate(lda_topics, 1):
    print('Topic #%d: %s' % (number, description))
print()
topics from LDA model: Topic #1: 0.000*oliver + 0.000*drake + 0.000*lionel + 0.000*billy + 0.000*dick + 0.000*henry + 0.000*harold + 0.000*virginia + 0.000*sapt + 0.000*steve Topic #2: 0.000*lionel + 0.000*denis + 0.000*sapt + 0.000*hendricks + 0.000*percy + 0.000*rupert + 0.000*harold + 0.000*rudolf + 0.000*virginia + 0.000*martin Topic #3: 0.000*drake + 0.000*steve + 0.000*dick + 0.000*billy + 0.000*lionel + 0.000*martin + 0.000*marcus + 0.000*rudolf + 0.000*ain + 0.000*rupert Topic #4: 0.000*stewart + 0.000*rudolf + 0.000*rupert + 0.000*steve + 0.000*lionel + 0.000*majesty + 0.000*sapt + 0.000*florence + 0.000*ain + 0.000*cowboys Topic #5: 0.000*drake + 0.000*dick + 0.000*rupert + 0.000*rudolf + 0.000*stewart + 0.000*marcus + 0.000*percy + 0.000*lionel + 0.000*ain + 0.000*colonel Topic #6: 0.000*billy + 0.000*steve + 0.000*byrne + 0.000*stewart + 0.000*virginia + 0.000*sapt + 0.000*ain + 0.000*rudolf + 0.000*marcus + 0.000*barbara Topic #7: 0.000*steve + 0.000*ralph + 0.000*cliff + 0.000*edna + 0.000*lionel + 0.000*horn + 0.000*drake + 0.000*paris + 0.000*percy + 0.000*france Topic #8: 0.000*billy + 0.000*dick + 0.000*drake + 0.000*byrne + 0.000*ain + 0.000*steve + 0.000*monsieur + 0.000*joan + 0.000*barbara + 0.000*stewart Topic #9: 0.000*lionel + 0.000*drake + 0.000*stewart + 0.000*denis + 0.000*martin + 0.000*percy + 0.000*dick + 0.000*monsieur + 0.000*indians + 0.000*kitty Topic #10: 0.000*billy + 0.000*stewart + 0.000*byrne + 0.000*kid + 0.000*dick + 0.000*ain + 0.000*harding + 0.000*barbara + 0.000*lionel + 0.000*commander Topic #11: 0.000*thet + 0.000*hagar + 0.000*bland + 0.000*billy + 0.000*ain + 0.000*buck + 0.000*outlaws + 0.000*lionel + 0.000*reckon + 0.000*outlaw Topic #12: 0.001*stewart + 0.000*majesty + 0.000*sapt + 0.000*billy + 0.000*thet + 0.000*rupert + 0.000*cowboys + 0.000*rudolf + 0.000*florence + 0.000*indians Topic #13: 0.017*france + 0.017*turner + 0.016*marquis + 0.016*loo + 0.011*monsieur + 0.010*miriam + 0.008*madame + 0.008*pierre + 
0.008*lawrence + 0.007*paris Topic #14: 0.035*dick + 0.016*button + 0.016*reef + 0.015*lagoon + 0.011*paddy + 0.010*lestrange + 0.008*dinghy + 0.008*coral + 0.005*deck + 0.005*cocoa-nut Topic #15: 0.063*billy + 0.029*byrne + 0.017*harding + 0.017*barbara + 0.012*mucker + 0.008*grayson + 0.007*eddie + 0.006*ain + 0.005*ward + 0.005*youse Topic #16: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm Topic #17: 0.028*commander + 0.011*bishop + 0.008*laura + 0.008*empire + 0.006*colonel + 0.006*aboard + 0.006*peter + 0.006*lordship + 0.006*spanish + 0.005*sword Topic #18: 0.026*adam + 0.015*marion + 0.014*robin + 0.013*allan + 0.013*hagar + 0.006*adelaide + 0.005*baron + 0.003*castle + 0.003*carleton + 0.003*corn Topic #19: 0.020*marcus + 0.014*dentist + 0.014*harold + 0.013*virginia + 0.010*maria + 0.007*mac + 0.006*dollars + 0.006*ain + 0.005*baker + 0.003*joe Topic #20: 0.119*oliver + 0.011*suzanne + 0.011*emma + 0.008*verdi + 0.006*myron + 0.005*beach + 0.004*hospital + 0.004*guy + 0.004*richard + 0.004*someone Topic #21: 0.009*hamilton + 0.005*leary + 0.004*paris + 0.004*jane + 0.004*guy + 0.003*harry + 0.003*colonel + 0.003*tom + 0.002*regiment + 0.002*lordship Topic #22: 0.048*martin + 0.031*barney + 0.010*hermit + 0.006*indians + 0.005*brazil + 0.005*rattler + 0.005*bob + 0.004*savages + 0.004*aunt + 0.004*canoe Topic #23: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma Topic #24: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess Topic #25: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery
# show similarities / topics of 20 texts the models were based on
for n, vec_bow in enumerate(islice(corpus, 20)):
    # the n-th corpus document was built from the n-th file name
    print('Text:', filenames[n])
    # Preview the first 500 characters of the text; the with-block makes
    # sure the file is closed again before moving on.
    with io.open(filenames[n], encoding=ENCODING) as inp:
        print(inp.read(500), '[...]')
    print('-' * 50)
    summary(vec_bow)
    print('=' * 50, '\n\n')
Text: corpus/Adventure/FOLD1/18857.txt Project Gutenberg's A Journey to the Centre of the Earth, by Jules Verne This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: A Journey to the Centre of the Earth Author: Jules Verne Release Date: July 18, 2006 [EBook #18857] Last updated: December 27, 2012 Language: English Character set [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/18857.txt 0.0665643 corpus/Adventure/FOLD1/2727.txt 0.0665143 corpus/Fiction/FOLD1/5240.txt 0.0619348 corpus/Adventure/FOLD1/21459.txt 0.0573205 corpus/Adventure/FOLD1/2166.txt LSI: topics of this text (highest scoring first) 0.478681: 0.378*"hans" + 0.342*"marshall" + -0.213*"oliver" + 0.184*"laura" + 0.181*"livingstone" + 0.179*"senator" + 0.167*"consul" + -0.156*"bishop" + 0.151*"commander" + -0.132*"lordship" 0.366615: 0.282*"hans" + -0.228*"joan" + 0.224*"adam" + 0.210*"kirkland" + -0.206*"bart" + -0.201*"willy" + -0.174*"thee" + 0.159*"marion" + -0.157*"marshall" + -0.153*"kate" 0.200951: 0.679*"oliver" + -0.383*"hagar" + -0.238*"adam" + 0.159*"hans" + -0.158*"hamilton" + 0.136*"emma" + -0.132*"robin" + -0.124*"marcus" + -0.111*"dentist" + -0.101*"thee" 0.198607: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.190409: -0.566*"hamilton" + 0.292*"marcus" + 0.266*"dentist" + -0.214*"willy" + -0.176*"oliver" + -0.174*"adam" + 0.144*"hans" + 0.139*"maria" + 0.127*"mac" + -0.113*"camel" LDA: topics of this text (highest scoring first) 0.99927: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery 
================================================== Text: corpus/Adventure/FOLD1/393.txt The Project Gutenberg EBook of The Blue Lagoon, by H. de Vere Stacpoole This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Blue Lagoon A Romance Author: H. de Vere Stacpoole Release Date: January 19, 2008 [EBook #393] Language: English *** START OF THIS PROJECT GUTENBERG EBO [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/393.txt 0.158348 corpus/Fiction/FOLD1/22961.txt 0.136583 corpus/Adventure/FOLD1/21459.txt 0.0699755 corpus/Fiction/FOLD1/525.txt 0.0324906 corpus/Adventure/FOLD1/1965.txt LSI: topics of this text (highest scoring first) 0.271972: 0.271*"drake" + 0.251*"dick" + 0.212*"marquis" + -0.168*"marshall" + 0.160*"turner" + 0.150*"kitty" + -0.136*"willy" + 0.134*"steve" + 0.127*"stella" + -0.124*"lionel" 0.193956: -0.697*"ivan" + -0.176*"roubles" + 0.131*"dick" + -0.124*"peter" + -0.115*"eugene" + -0.112*"peasants" + -0.109*"peasant" + 0.107*"lionel" + 0.107*"stella" + -0.100*"simeon" 0.182906: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.176242: -0.398*"drake" + -0.166*"hong" + -0.163*"kong" + 0.159*"laura" + -0.155*"bombay" + -0.155*"thee" + -0.150*"billy" + 0.140*"prim" + -0.124*"francis" + 0.123*"burton" 0.112625: 0.364*"kitty" + -0.337*"steve" + 0.238*"thee" + 0.229*"grandma" + -0.213*"drake" + 0.197*"marion" + -0.154*"macumazahn" + -0.137*"laura" + 0.128*"lionel" + -0.121*"emma" LDA: topics of this text (highest scoring first) 0.999863: 0.035*dick + 0.016*button + 0.016*reef + 0.015*lagoon + 0.011*paddy + 0.010*lestrange + 0.008*dinghy + 0.008*coral + 0.005*deck + 0.005*cocoa-nut 
================================================== Text: corpus/Adventure/FOLD1/24695.txt The Project Gutenberg EBook of The Snowshoe Trail, by Edison Marshall This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Snowshoe Trail Author: Edison Marshall Posting Date: March 8, 2009 [EBook #24695] Release Date: February 26, 2008 Language: English Character set encoding: ASCII [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/24695.txt 0.0496626 corpus/Adventure/FOLD1/21459.txt 0.0240968 corpus/Fiction/FOLD1/1897.txt 0.0235749 corpus/Adventure/FOLD1/23662.txt 0.0173788 corpus/Fiction/FOLD1/5240.txt LSI: topics of this text (highest scoring first) 0.443732: 0.325*"harold" + 0.277*"virginia" + -0.277*"hamilton" + -0.201*"joan" + -0.191*"marcus" + -0.190*"edna" + -0.181*"bart" + -0.174*"dentist" + 0.155*"commander" + -0.131*"kate" 0.247424: 0.267*"billy" + 0.219*"byrne" + 0.193*"harold" + -0.179*"joan" + 0.162*"virginia" + -0.161*"bart" + -0.155*"willy" + 0.152*"edna" + 0.146*"hans" + 0.141*"kitty" 0.232661: -0.411*"steve" + 0.246*"kitty" + 0.246*"drake" + -0.225*"marion" + -0.197*"billy" + 0.184*"prim" + 0.181*"harold" + 0.165*"stella" + 0.165*"burton" + 0.156*"grandma" 0.164141: 0.515*"steve" + 0.374*"kitty" + 0.237*"grandma" + -0.199*"laura" + -0.175*"thee" + -0.164*"drake" + -0.150*"stewart" + -0.126*"emma" + -0.118*"hagar" + -0.114*"dick" 0.163801: 0.279*"marcus" + 0.254*"dentist" + 0.209*"marshall" + -0.199*"hans" + -0.189*"hamilton" + 0.183*"oliver" + 0.177*"martin" + -0.163*"prim" + -0.156*"steve" + 0.154*"kirkland" LDA: topics of this text (highest scoring first) 0.981579: 0.020*marcus + 0.014*dentist + 0.014*harold + 0.013*virginia + 0.010*maria + 0.007*mac + 0.006*dollars + 0.006*ain + 0.005*baker + 
0.003*joe ================================================== Text: corpus/Adventure/FOLD1/95.txt The Project Gutenberg EBook of The Prisoner of Zenda, by Anthony Hope This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: The Prisoner of Zenda Author: Anthony Hope Release Date: January 10, 2006 [EBook #95] [This file last updated October 6, 2010] Language: English *** START OF THIS PR [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/95.txt 0.677422 corpus/Adventure/FOLD1/1145.txt 0.0266588 corpus/Fiction/FOLD1/5240.txt 0.0255197 corpus/Adventure/FOLD1/21393.txt 0.0176788 corpus/Adventure/FOLD1/10368.txt LSI: topics of this text (highest scoring first) 0.870599: 0.657*"sapt" + 0.401*"rudolf" + 0.246*"rupert" + 0.243*"fritz" + 0.240*"strelsau" + 0.177*"rassendyll" + 0.153*"bernenstein" + 0.152*"zenda" + -0.136*"ivan" + 0.106*"flavia" 0.401136: -0.600*"rudolf" + -0.349*"bernenstein" + 0.337*"fritz" + -0.279*"rassendyll" + 0.263*"sapt" + 0.200*"flavia" + 0.163*"duke" + 0.131*"johann" + 0.130*"zenda" + -0.125*"constable" 0.27622: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.0376122: -0.592*"lionel" + -0.274*"denis" + -0.234*"percy" + 0.197*"stella" + 0.171*"macumazahn" + -0.171*"hendricks" + 0.166*"henry" + 0.156*"thee" + -0.142*"moore" + -0.133*"dick" 0.0161147: -0.304*"lionel" + -0.282*"stella" + 0.230*"marquis" + -0.227*"macumazahn" + -0.202*"denis" + 0.196*"monsieur" + -0.175*"percy" + -0.163*"henry" + 0.155*"turner" + 0.148*"aline" LDA: topics of this text (highest scoring first) 0.995377: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 
0.007*zenda + 0.006*flavia + 0.006*princess ================================================== Text: corpus/Adventure/FOLD1/10368.txt The Project Gutenberg EBook of The Vizier of the Two-Horned Alexander by Frank R. Stockton This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Vizier of the Two-Horned Alexander Author: Frank R. Stockton Release Date: December 2, 2003 [EBook #10368] Language: English *** START OF THI [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/10368.txt 0.146627 corpus/Adventure/FOLD1/2166.txt 0.112932 corpus/Fiction/FOLD1/243.txt 0.0710879 corpus/Fiction/FOLD1/15181.txt 0.0575988 corpus/Fiction/FOLD1/5240.txt LSI: topics of this text (highest scoring first) 0.393863: 0.364*"kitty" + -0.337*"steve" + 0.238*"thee" + 0.229*"grandma" + -0.213*"drake" + 0.197*"marion" + -0.154*"macumazahn" + -0.137*"laura" + 0.128*"lionel" + -0.121*"emma" 0.294297: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.25737: -0.592*"lionel" + -0.274*"denis" + -0.234*"percy" + 0.197*"stella" + 0.171*"macumazahn" + -0.171*"hendricks" + 0.166*"henry" + 0.156*"thee" + -0.142*"moore" + -0.133*"dick" 0.150428: -0.279*"martin" + -0.225*"barney" + -0.214*"commander" + 0.213*"willy" + -0.184*"bishop" + -0.160*"lordship" + 0.151*"marcus" + -0.143*"hagar" + 0.140*"kirkland" + 0.138*"dentist" 0.144327: -0.234*"leary" + 0.233*"denis" + 0.204*"percy" + -0.202*"lionel" + 0.197*"bishop" + 0.179*"stewart" + 0.162*"hendricks" + -0.145*"henry" + -0.141*"jane" + -0.118*"harry" LDA: topics of this text (highest scoring first) 0.727034: 0.013*thee + 0.010*steamer + 0.007*detective + 0.007*hong + 0.006*kong + 
0.006*bombay + 0.006*passengers + 0.006*francis + 0.004*yokohama + 0.004*india 0.14788: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma 0.0518649: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery 0.0479786: 0.029*edna + 0.026*cliff + 0.025*horn + 0.025*ralph + 0.012*mound + 0.012*burke + 0.012*shirley + 0.011*bags + 0.009*cave + 0.009*banker 0.0116218: 0.054*marshall + 0.028*senator + 0.025*consul + 0.022*livingstone + 0.016*admiral + 0.012*porto + 0.009*president + 0.009*wireless + 0.008*las + 0.007*henry ================================================== Text: corpus/Adventure/FOLD1/21459.txt The Project Gutenberg EBook of Dick Onslow, by W.H.G. Kingston This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Dick Onslow Among the Redskins Author: W.H.G. Kingston Illustrator: George Soper Release Date: May 15, 2007 [EBook #21459] Language: English Character set encoding: A [...] 
-------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/21459.txt 0.136583 corpus/Adventure/FOLD1/393.txt 0.101856 corpus/Fiction/FOLD1/5240.txt 0.0715007 corpus/Adventure/FOLD1/2166.txt 0.0619348 corpus/Adventure/FOLD1/18857.txt LSI: topics of this text (highest scoring first) 0.377979: -0.238*"steve" + -0.218*"drake" + -0.189*"hans" + 0.189*"billy" + -0.165*"prim" + 0.148*"byrne" + -0.147*"burton" + 0.144*"kitty" + -0.135*"marion" + -0.134*"kid" 0.278085: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.237038: 0.365*"drake" + -0.276*"lagoon" + -0.195*"dick" + -0.193*"paddy" + -0.192*"lestrange" + 0.153*"leary" + 0.145*"stewart" + -0.143*"bishop" + -0.140*"reef" + 0.112*"marshall" 0.216762: -0.697*"ivan" + -0.176*"roubles" + 0.131*"dick" + -0.124*"peter" + -0.115*"eugene" + -0.112*"peasants" + -0.109*"peasant" + 0.107*"lionel" + 0.107*"stella" + -0.100*"simeon" 0.205202: -0.398*"drake" + -0.166*"hong" + -0.163*"kong" + 0.159*"laura" + -0.155*"bombay" + -0.155*"thee" + -0.150*"billy" + 0.140*"prim" + -0.124*"francis" + 0.123*"burton" LDA: topics of this text (highest scoring first) 0.984055: 0.014*indians + 0.013*rifle + 0.009*tent + 0.006*delaware + 0.005*wolves + 0.005*dick + 0.005*sam + 0.004*pole + 0.004*rifles + 0.004*wagon 0.0157949: 0.003*skulls + 0.003*flayed + 0.002*rabbits + 0.002*ramrod + 0.002*brag + 0.002*shrouded + 0.002*affording + 0.002*horde + 0.002*seventies + 0.002*equivalent ================================================== Text: corpus/Adventure/FOLD1/23662.txt The Project Gutenberg EBook of The Heart of Unaga, by Ridgwell Cullum This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. 
You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: The Heart of Unaga Author: Ridgwell Cullum Release Date: November 30, 2007 [EBook #23662] Last Updated: January 14, 2009 Language: English Character set encoding: ASCI [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/23662.txt 0.0921901 corpus/Adventure/FOLD1/15072.txt 0.0491475 corpus/Adventure/FOLD1/21459.txt 0.0277324 corpus/Fiction/FOLD1/363.txt 0.0238107 corpus/Fiction/FOLD1/5240.txt LSI: topics of this text (highest scoring first) 0.551729: 0.515*"steve" + 0.374*"kitty" + 0.237*"grandma" + -0.199*"laura" + -0.175*"thee" + -0.164*"drake" + -0.150*"stewart" + -0.126*"emma" + -0.118*"hagar" + -0.114*"dick" 0.139929: 0.271*"drake" + 0.251*"dick" + 0.212*"marquis" + -0.168*"marshall" + 0.160*"turner" + 0.150*"kitty" + -0.136*"willy" + 0.134*"steve" + 0.127*"stella" + -0.124*"lionel" 0.113773: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.0908025: 0.325*"harold" + 0.277*"virginia" + -0.277*"hamilton" + -0.201*"joan" + -0.191*"marcus" + -0.190*"edna" + -0.181*"bart" + -0.174*"dentist" + 0.155*"commander" + -0.131*"kate" 0.0846385: -0.697*"ivan" + -0.176*"roubles" + 0.131*"dick" + -0.124*"peter" + -0.115*"eugene" + -0.112*"peasants" + -0.109*"peasant" + 0.107*"lionel" + 0.107*"stella" + -0.100*"simeon" LDA: topics of this text (highest scoring first) 0.920782: 0.055*steve + 0.008*indians + 0.006*fort + 0.006*seal + 0.005*darn + 0.005*outfit + 0.005*ross + 0.005*folks + 0.005*feller + 0.005*squaw 0.0790382: 0.048*martin + 0.031*barney + 0.010*hermit + 0.006*indians + 0.005*brazil + 0.005*rattler + 0.005*bob + 0.004*savages + 0.004*aunt + 0.004*canoe ================================================== Text: corpus/Adventure/FOLD1/1965.txt The 
Project Gutenberg EBook of Captain Blood, by Rafael Sabatini This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Captain Blood Author: Rafael Sabatini Posting Date: September 26, 2008 [EBook #1965] Release Date: November, 1999 Language: English *** START OF THIS PROJECT GUTENBERG EB [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/1965.txt 0.184564 corpus/Fiction/FOLD1/5240.txt 0.130528 corpus/Fiction/FOLD1/243.txt 0.0799009 corpus/Adventure/FOLD1/1947.txt 0.0713367 corpus/Fiction/FOLD1/1762.txt LSI: topics of this text (highest scoring first) 0.368183: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.33753: -0.234*"leary" + 0.233*"denis" + 0.204*"percy" + -0.202*"lionel" + 0.197*"bishop" + 0.179*"stewart" + 0.162*"hendricks" + -0.145*"henry" + -0.141*"jane" + -0.118*"harry" 0.226162: 0.371*"lionel" + -0.327*"denis" + -0.242*"percy" + 0.215*"henry" + -0.197*"hendricks" + -0.145*"leary" + 0.137*"bishop" + 0.133*"moore" + 0.126*"maurice" + -0.126*"lagoon" 0.198903: -0.304*"lionel" + -0.282*"stella" + 0.230*"marquis" + -0.227*"macumazahn" + -0.202*"denis" + 0.196*"monsieur" + -0.175*"percy" + -0.163*"henry" + 0.155*"turner" + 0.148*"aline" 0.191072: 0.284*"marshall" + -0.266*"martin" + -0.257*"commander" + 0.212*"billy" + -0.208*"barney" + 0.173*"marion" + 0.172*"kirkland" + 0.165*"byrne" + 0.163*"drake" + 0.152*"livingstone" LDA: topics of this text (highest scoring first) 0.769824: 0.011*joan + 0.010*bart + 0.010*bishop + 0.009*kate + 0.007*satan + 0.007*colonel + 0.006*lordship + 0.006*buck + 0.006*haines + 0.005*sheriff 0.223622: 0.028*commander + 0.011*bishop + 0.008*laura + 0.008*empire + 
0.006*colonel + 0.006*aboard + 0.006*peter + 0.006*lordship + 0.006*spanish + 0.005*sword ================================================== Text: corpus/Adventure/FOLD1/13290.txt The Project Gutenberg EBook of Martin Rattler, by Robert Michael Ballantyne This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: Martin Rattler Author: Robert Michael Ballantyne Release Date: August 25, 2004 [EBook #13290] Language: English *** START OF THIS PROJECT GUTENBERG EBOOK MARTIN [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/13290.txt 0.0441942 corpus/Adventure/FOLD1/24091.txt 0.0437165 corpus/Fiction/FOLD1/5240.txt 0.0340096 corpus/Adventure/FOLD1/21459.txt 0.0157667 corpus/Adventure/FOLD1/1965.txt LSI: topics of this text (highest scoring first) 0.599935: -0.478*"commander" + 0.431*"martin" + 0.386*"barney" + -0.190*"edna" + -0.157*"empire" + -0.147*"kirkland" + -0.137*"ralph" + -0.124*"viceroy" + 0.105*"marshall" + -0.101*"steve" 0.304325: 0.309*"billy" + -0.262*"hamilton" + 0.235*"byrne" + 0.235*"marion" + 0.234*"commander" + 0.233*"martin" + -0.206*"marshall" + 0.182*"barney" + 0.164*"allan" + 0.157*"hagar" 0.256847: -0.457*"harold" + -0.387*"virginia" + 0.221*"commander" + 0.201*"martin" + 0.194*"kitty" + -0.192*"hans" + -0.160*"hagar" + 0.154*"barney" + -0.154*"marion" + -0.148*"joan" 0.234655: 0.279*"marcus" + 0.254*"dentist" + 0.209*"marshall" + -0.199*"hans" + -0.189*"hamilton" + 0.183*"oliver" + 0.177*"martin" + -0.163*"prim" + -0.156*"steve" + 0.154*"kirkland" 0.151521: 0.342*"edna" + 0.255*"ralph" + -0.248*"commander" + -0.202*"hamilton" + -0.180*"marcus" + 0.171*"kirkland" + -0.170*"marion" + -0.163*"dentist" + 0.160*"hagar" + -0.159*"joan" LDA: topics of this text (highest scoring first) 0.997661: 
0.048*martin + 0.031*barney + 0.010*hermit + 0.006*indians + 0.005*brazil + 0.005*rattler + 0.005*bob + 0.004*savages + 0.004*aunt + 0.004*canoe ================================================== Text: corpus/Adventure/FOLD1/12190.txt Project Gutenberg's The Adventures of Captain Horn, by Frank Richard Stockton This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Adventures of Captain Horn Author: Frank Richard Stockton Release Date: April 29, 2004 [EBook #12190] Language: English Character set encoding: ASCII *** [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/12190.txt 0.0553897 corpus/Adventure/FOLD1/103.txt 0.0436207 corpus/Fiction/FOLD1/5240.txt 0.0396242 corpus/Adventure/FOLD1/18399.txt 0.0360836 corpus/Fiction/FOLD1/525.txt LSI: topics of this text (highest scoring first) 0.517421: 0.342*"edna" + 0.255*"ralph" + -0.248*"commander" + -0.202*"hamilton" + -0.180*"marcus" + 0.171*"kirkland" + -0.170*"marion" + -0.163*"dentist" + 0.160*"hagar" + -0.159*"joan" 0.272126: -0.537*"willy" + 0.213*"joan" + 0.193*"bart" + 0.182*"edna" + -0.170*"hongkong" + 0.160*"ralph" + 0.144*"kate" + -0.143*"helmsman" + -0.139*"chinamen" + 0.118*"hong" 0.246315: -0.332*"kirkland" + 0.256*"prim" + 0.226*"burton" + -0.225*"mortimer" + 0.193*"kid" + -0.192*"hans" + 0.191*"marcus" + 0.174*"dentist" + 0.164*"edna" + -0.147*"joan" 0.23826: 0.288*"marion" + 0.221*"willy" + 0.221*"hagar" + -0.203*"billy" + 0.202*"allan" + -0.182*"kirkland" + -0.157*"commander" + -0.153*"byrne" + -0.148*"adam" + 0.147*"edna" 0.216545: 0.267*"billy" + 0.219*"byrne" + 0.193*"harold" + -0.179*"joan" + 0.162*"virginia" + -0.161*"bart" + -0.155*"willy" + 0.152*"edna" + 0.146*"hans" + 0.141*"kitty" LDA: topics of this text (highest scoring first) 
0.996734: 0.029*edna + 0.026*cliff + 0.025*horn + 0.025*ralph + 0.012*mound + 0.012*burke + 0.012*shirley + 0.011*bags + 0.009*cave + 0.009*banker ================================================== Text: corpus/Adventure/FOLD1/21393.txt The Project Gutenberg EBook of Hendricks the Hunter, by W.H.G. Kingston This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Hendricks the Hunter The Border Farm, a Tale of Zululand Author: W.H.G. Kingston Release Date: May 8, 2007 [EBook #21393] Language: English Character set enco [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/21393.txt 0.232275 corpus/Fiction/FOLD1/16217.txt 0.087352 corpus/Adventure/FOLD1/2727.txt 0.055515 corpus/Adventure/FOLD1/2166.txt 0.038622 corpus/Fiction/FOLD1/5240.txt LSI: topics of this text (highest scoring first) 0.332482: -0.234*"leary" + 0.233*"denis" + 0.204*"percy" + -0.202*"lionel" + 0.197*"bishop" + 0.179*"stewart" + 0.162*"hendricks" + -0.145*"henry" + -0.141*"jane" + -0.118*"harry" 0.209731: -0.432*"dick" + -0.347*"drake" + -0.207*"lagoon" + 0.185*"stewart" + 0.167*"lionel" + 0.154*"thet" + 0.147*"marquis" + -0.146*"paddy" + -0.127*"lestrange" + -0.115*"reef" 0.166803: -0.329*"stella" + 0.297*"henry" + -0.292*"turner" + 0.252*"aline" + 0.170*"andre" + -0.161*"miriam" + 0.147*"tour" + 0.146*"thou" + -0.142*"baboons" + 0.142*"kitty" 0.166365: -0.697*"ivan" + -0.176*"roubles" + 0.131*"dick" + -0.124*"peter" + -0.115*"eugene" + -0.112*"peasants" + -0.109*"peasant" + 0.107*"lionel" + 0.107*"stella" + -0.100*"simeon" 0.152659: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" LDA: topics of this text (highest scoring first) 
0.999759: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm ================================================== Text: corpus/Adventure/FOLD1/103.txt The Project Gutenberg EBook of Around the World in 80 Days, by Jules Verne This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: Around the World in 80 Days Author: Jules Verne Release Date: May 15, 2008 [EBook #103] Last updated: February 18, 2012 Last updated: May 5, 2012 Language: Englis [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/103.txt 0.0975255 corpus/Fiction/FOLD1/5240.txt 0.0947979 corpus/Fiction/FOLD1/525.txt 0.0724676 corpus/Adventure/FOLD1/1947.txt 0.0593174 corpus/Adventure/FOLD1/1965.txt LSI: topics of this text (highest scoring first) 0.345129: -0.537*"willy" + 0.213*"joan" + 0.193*"bart" + 0.182*"edna" + -0.170*"hongkong" + 0.160*"ralph" + 0.144*"kate" + -0.143*"helmsman" + -0.139*"chinamen" + 0.118*"hong" 0.314399: -0.238*"steve" + -0.218*"drake" + -0.189*"hans" + 0.189*"billy" + -0.165*"prim" + 0.148*"byrne" + -0.147*"burton" + 0.144*"kitty" + -0.135*"marion" + -0.134*"kid" 0.274888: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.220694: 0.325*"harold" + 0.277*"virginia" + -0.277*"hamilton" + -0.201*"joan" + -0.191*"marcus" + -0.190*"edna" + -0.181*"bart" + -0.174*"dentist" + 0.155*"commander" + -0.131*"kate" 0.190412: -0.304*"lionel" + -0.282*"stella" + 0.230*"marquis" + -0.227*"macumazahn" + -0.202*"denis" + 0.196*"monsieur" + -0.175*"percy" + -0.163*"henry" + 0.155*"turner" + 0.148*"aline" LDA: topics of this text (highest 
scoring first) 0.80322: 0.013*thee + 0.010*steamer + 0.007*detective + 0.007*hong + 0.006*kong + 0.006*bombay + 0.006*passengers + 0.006*francis + 0.004*yokohama + 0.004*india 0.0552566: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery 0.0481754: 0.007*deck + 0.007*steamer + 0.006*boats + 0.005*marcus + 0.004*coal + 0.004*dollars + 0.004*dentist + 0.004*cargo + 0.003*skipper + 0.003*aft 0.0355613: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma 0.0252097: 0.038*kid + 0.026*prim + 0.016*burton + 0.009*car + 0.009*charlie + 0.009*ain + 0.008*dopey + 0.007*detective + 0.007*beppo + 0.007*pilot ================================================== Text: corpus/Adventure/FOLD1/15072.txt The Project Gutenberg eBook, Marjorie's Maytime, by Carolyn Wells This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: Marjorie's Maytime Author: Carolyn Wells Release Date: February 15, 2005 [eBook #15072] Language: English Character set encoding: ISO-646-US (US-ASCII) ***START OF [...] 
-------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/15072.txt 0.0970604 corpus/Adventure/FOLD1/2727.txt 0.0921901 corpus/Adventure/FOLD1/23662.txt 0.0231356 corpus/Fiction/FOLD1/1897.txt 0.0213549 corpus/Adventure/FOLD1/21459.txt LSI: topics of this text (highest scoring first) 0.543896: 0.515*"steve" + 0.374*"kitty" + 0.237*"grandma" + -0.199*"laura" + -0.175*"thee" + -0.164*"drake" + -0.150*"stewart" + -0.126*"emma" + -0.118*"hagar" + -0.114*"dick" 0.419478: 0.364*"kitty" + -0.337*"steve" + 0.238*"thee" + 0.229*"grandma" + -0.213*"drake" + 0.197*"marion" + -0.154*"macumazahn" + -0.137*"laura" + 0.128*"lionel" + -0.121*"emma" 0.302999: -0.411*"steve" + 0.246*"kitty" + 0.246*"drake" + -0.225*"marion" + -0.197*"billy" + 0.184*"prim" + 0.181*"harold" + 0.165*"stella" + 0.165*"burton" + 0.156*"grandma" 0.261049: -0.457*"harold" + -0.387*"virginia" + 0.221*"commander" + 0.201*"martin" + 0.194*"kitty" + -0.192*"hans" + -0.160*"hagar" + 0.154*"barney" + -0.154*"marion" + -0.148*"joan" 0.224654: 0.271*"drake" + 0.251*"dick" + 0.212*"marquis" + -0.168*"marshall" + 0.160*"turner" + 0.150*"kitty" + -0.136*"willy" + 0.134*"steve" + 0.127*"stella" + -0.124*"lionel" LDA: topics of this text (highest scoring first) 0.973059: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess 0.0149565: 0.014*indians + 0.013*rifle + 0.009*tent + 0.006*delaware + 0.005*wolves + 0.005*dick + 0.005*sam + 0.004*pole + 0.004*rifles + 0.004*wagon 0.0117751: 0.119*oliver + 0.011*suzanne + 0.011*emma + 0.008*verdi + 0.006*myron + 0.005*beach + 0.004*hospital + 0.004*guy + 0.004*richard + 0.004*someone ================================================== Text: corpus/Adventure/FOLD1/18399.txt The Project Gutenberg eBook, The Shipwreck, by Joseph Spillman, Translated by Mary Richards Gray This eBook is for the use of anyone anywhere at no cost and with almost no 
restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: The Shipwreck A Story for the Young Author: Joseph Spillman Release Date: May 16, 2006 [eBook #18399] Language: English Chara [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/18399.txt 0.0484134 corpus/Fiction/FOLD1/525.txt 0.0396242 corpus/Adventure/FOLD1/12190.txt 0.0321986 corpus/Adventure/FOLD1/393.txt 0.0255324 corpus/Adventure/FOLD1/103.txt LSI: topics of this text (highest scoring first) 0.283363: 0.288*"marion" + 0.221*"willy" + 0.221*"hagar" + -0.203*"billy" + 0.202*"allan" + -0.182*"kirkland" + -0.157*"commander" + -0.153*"byrne" + -0.148*"adam" + 0.147*"edna" 0.276646: -0.279*"martin" + -0.225*"barney" + -0.214*"commander" + 0.213*"willy" + -0.184*"bishop" + -0.160*"lordship" + 0.151*"marcus" + -0.143*"hagar" + 0.140*"kirkland" + 0.138*"dentist" 0.137868: 0.309*"billy" + -0.262*"hamilton" + 0.235*"byrne" + 0.235*"marion" + 0.234*"commander" + 0.233*"martin" + -0.206*"marshall" + 0.182*"barney" + 0.164*"allan" + 0.157*"hagar" 0.126233: -0.361*"laura" + -0.321*"oliver" + -0.297*"hagar" + -0.266*"emma" + -0.248*"steve" + -0.187*"mamma" + -0.163*"marion" + -0.147*"governess" + -0.146*"kitty" + -0.129*"abbey" 0.111108: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" LDA: topics of this text (highest scoring first) 0.527788: 0.020*lionel + 0.007*moore + 0.006*maurice + 0.005*theatre + 0.005*hans + 0.004*leo + 0.003*raft + 0.003*robert + 0.002*harry + 0.002*gallery 0.368154: 0.015*willy + 0.008*peter + 0.007*roubles + 0.006*ivan + 0.006*peasant + 0.006*holy + 0.006*peasants + 0.006*priest + 0.005*cell + 0.005*eugene 0.100204: 0.035*dick + 0.016*button + 0.016*reef + 0.015*lagoon + 0.011*paddy + 0.010*lestrange + 
0.008*dinghy + 0.008*coral + 0.005*deck + 0.005*cocoa-nut ================================================== Text: corpus/Adventure/FOLD1/1145.txt The Project Gutenberg EBook of Rupert of Hentzau, by Anthony Hope This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Rupert of Hentzau From The Memoirs of Fritz Von Tarlenheim: The Sequel to The Prisoner of Zenda Author: Anthony Hope Posting Date: August 3, 2008 [EBook #1145 [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/1145.txt 0.677422 corpus/Adventure/FOLD1/95.txt 0.0341073 corpus/Adventure/FOLD1/21393.txt 0.0176074 corpus/Fiction/FOLD1/5240.txt 0.0173433 corpus/Adventure/FOLD1/1965.txt LSI: topics of this text (highest scoring first) 0.874067: 0.657*"sapt" + 0.401*"rudolf" + 0.246*"rupert" + 0.243*"fritz" + 0.240*"strelsau" + 0.177*"rassendyll" + 0.153*"bernenstein" + 0.152*"zenda" + -0.136*"ivan" + 0.106*"flavia" 0.265833: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.0289141: -0.592*"lionel" + -0.274*"denis" + -0.234*"percy" + 0.197*"stella" + 0.171*"macumazahn" + -0.171*"hendricks" + 0.166*"henry" + 0.156*"thee" + -0.142*"moore" + -0.133*"dick" 0.0128141: 0.371*"lionel" + -0.327*"denis" + -0.242*"percy" + 0.215*"henry" + -0.197*"hendricks" + -0.145*"leary" + 0.137*"bishop" + 0.133*"moore" + 0.126*"maurice" + -0.126*"lagoon" 0.0102128: -0.361*"laura" + -0.321*"oliver" + -0.297*"hagar" + -0.266*"emma" + -0.248*"steve" + -0.187*"mamma" + -0.163*"marion" + -0.147*"governess" + -0.146*"kitty" + -0.129*"abbey" LDA: topics of this text (highest scoring first) 0.9054: 0.059*rudolf + 0.047*sapt + 0.040*rupert + 0.027*bernenstein + 
0.026*rassendyll + 0.017*strelsau + 0.015*james + 0.013*constable + 0.010*hentzau + 0.009*fritz 0.0667251: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess 0.0234222: 0.013*kirkland + 0.009*mortimer + 0.008*farm + 0.007*egypt + 0.006*lone + 0.006*messengers + 0.004*cable + 0.004*nile + 0.004*pond + 0.004*terrace ================================================== Text: corpus/Adventure/FOLD1/24091.txt The Project Gutenberg EBook of Despoilers of the Golden Empire, by Gordon Randall Garrett This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Despoilers of the Golden Empire Author: Gordon Randall Garrett Illustrator: Kelly Freas Release Date: December 31, 2007 [EBook #24091] Language: E [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/24091.txt 0.0441942 corpus/Adventure/FOLD1/13290.txt 0.0326124 corpus/Adventure/FOLD1/10368.txt 0.0295066 corpus/Adventure/FOLD1/1965.txt 0.0269335 corpus/Adventure/FOLD1/21459.txt LSI: topics of this text (highest scoring first) 0.297073: 0.309*"billy" + -0.262*"hamilton" + 0.235*"byrne" + 0.235*"marion" + 0.234*"commander" + 0.233*"martin" + -0.206*"marshall" + 0.182*"barney" + 0.164*"allan" + 0.157*"hagar" 0.277774: -0.457*"harold" + -0.387*"virginia" + 0.221*"commander" + 0.201*"martin" + 0.194*"kitty" + -0.192*"hans" + -0.160*"hagar" + 0.154*"barney" + -0.154*"marion" + -0.148*"joan" 0.188337: 0.378*"hans" + 0.342*"marshall" + -0.213*"oliver" + 0.184*"laura" + 0.181*"livingstone" + 0.179*"senator" + 0.167*"consul" + -0.156*"bishop" + 0.151*"commander" + -0.132*"lordship" 0.188018: 0.325*"harold" + 0.277*"virginia" + -0.277*"hamilton" + -0.201*"joan" + -0.191*"marcus" + 
-0.190*"edna" + -0.181*"bart" + -0.174*"dentist" + 0.155*"commander" + -0.131*"kate" 0.117855: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" LDA: topics of this text (highest scoring first) 0.98727: 0.028*commander + 0.011*bishop + 0.008*laura + 0.008*empire + 0.006*colonel + 0.006*aboard + 0.006*peter + 0.006*lordship + 0.006*spanish + 0.005*sword 0.0101011: 0.026*sapt + 0.020*kitty + 0.015*fritz + 0.011*grandma + 0.011*rupert + 0.010*duke + 0.009*strelsau + 0.007*zenda + 0.006*flavia + 0.006*princess ================================================== Text: corpus/Adventure/FOLD1/8493.txt The Project Gutenberg EBook of The Last Hope, by Henry Seton Merriman This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: The Last Hope Author: Henry Seton Merriman Release Date: July, 2005 [EBook #8493] Posting Date: July 27, 2009 Language: English Character set encoding: ASCII *** STA [...] 
-------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/8493.txt 0.273577 corpus/Adventure/FOLD1/1947.txt 0.0887184 corpus/Fiction/FOLD1/5240.txt 0.0610417 corpus/Adventure/FOLD1/2166.txt 0.0448207 corpus/Adventure/FOLD1/103.txt LSI: topics of this text (highest scoring first) 0.38461: -0.303*"stewart" + -0.275*"thet" + 0.220*"marquis" + 0.164*"monsieur" + 0.159*"turner" + -0.156*"bland" + -0.148*"fer" + -0.146*"lawson" + -0.146*"prim" + -0.143*"cowboys" 0.380421: -0.304*"lionel" + -0.282*"stella" + 0.230*"marquis" + -0.227*"macumazahn" + -0.202*"denis" + 0.196*"monsieur" + -0.175*"percy" + -0.163*"henry" + 0.155*"turner" + 0.148*"aline" 0.34475: 0.271*"drake" + 0.251*"dick" + 0.212*"marquis" + -0.168*"marshall" + 0.160*"turner" + 0.150*"kitty" + -0.136*"willy" + 0.134*"steve" + 0.127*"stella" + -0.124*"lionel" 0.277414: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.243673: -0.432*"dick" + -0.347*"drake" + -0.207*"lagoon" + 0.185*"stewart" + 0.167*"lionel" + 0.154*"thet" + 0.147*"marquis" + -0.146*"paddy" + -0.127*"lestrange" + -0.115*"reef" LDA: topics of this text (highest scoring first) 0.998851: 0.017*france + 0.017*turner + 0.016*marquis + 0.016*loo + 0.011*monsieur + 0.010*miriam + 0.008*madame + 0.008*pierre + 0.008*lawrence + 0.007*paris ================================================== Text: corpus/Adventure/FOLD1/2166.txt The Project Gutenberg EBook of King Solomon's Mines, by H. Rider Haggard This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: King Solomon's Mines Author: H. 
Rider Haggard Posting Date: January 15, 2009 [EBook #2166] Release Date: October 11, 2005 Last updated: August 18, 2011 Last updated: [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/2166.txt 0.20818 corpus/Adventure/FOLD1/2727.txt 0.146627 corpus/Adventure/FOLD1/10368.txt 0.0715007 corpus/Adventure/FOLD1/21459.txt 0.064237 corpus/Fiction/FOLD1/5240.txt LSI: topics of this text (highest scoring first) 0.374357: -0.329*"stella" + 0.297*"henry" + -0.292*"turner" + 0.252*"aline" + 0.170*"andre" + -0.161*"miriam" + 0.147*"tour" + 0.146*"thou" + -0.142*"baboons" + 0.142*"kitty" 0.347208: -0.592*"lionel" + -0.274*"denis" + -0.234*"percy" + 0.197*"stella" + 0.171*"macumazahn" + -0.171*"hendricks" + 0.166*"henry" + 0.156*"thee" + -0.142*"moore" + -0.133*"dick" 0.30836: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.299665: 0.371*"lionel" + -0.327*"denis" + -0.242*"percy" + 0.215*"henry" + -0.197*"hendricks" + -0.145*"leary" + 0.137*"bishop" + 0.133*"moore" + 0.126*"maurice" + -0.126*"lagoon" 0.210767: -0.361*"laura" + -0.321*"oliver" + -0.297*"hagar" + -0.266*"emma" + -0.248*"steve" + -0.187*"mamma" + -0.163*"marion" + -0.147*"governess" + -0.146*"kitty" + -0.129*"abbey" LDA: topics of this text (highest scoring first) 0.551928: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm 0.447945: 0.042*drake + 0.021*dick + 0.010*falconer + 0.009*vernon + 0.007*henry + 0.006*earl + 0.006*mills + 0.006*countess + 0.005*thou + 0.005*mamma ================================================== Text: corpus/Adventure/FOLD1/2727.txt The Project Gutenberg EBook of Allan's Wife, by H. Rider Haggard This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. 
You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Allan's Wife Author: H. Rider Haggard Release Date: March 28, 2006 [EBook #2727] Language: English *** START OF THIS PROJECT GUTENBERG EBOOK ALLAN'S WIFE *** Produced [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/2727.txt 0.20818 corpus/Adventure/FOLD1/2166.txt 0.0970604 corpus/Adventure/FOLD1/15072.txt 0.087352 corpus/Adventure/FOLD1/21393.txt 0.0835832 corpus/Fiction/FOLD1/15182.txt LSI: topics of this text (highest scoring first) 0.330356: -0.293*"turner" + 0.290*"aline" + 0.276*"stella" + -0.248*"loo" + -0.205*"denis" + 0.196*"andre" + -0.171*"henry" + 0.167*"lionel" + 0.162*"tour" + -0.159*"percy" 0.316777: -0.592*"lionel" + -0.274*"denis" + -0.234*"percy" + 0.197*"stella" + 0.171*"macumazahn" + -0.171*"hendricks" + 0.166*"henry" + 0.156*"thee" + -0.142*"moore" + -0.133*"dick" 0.207412: -0.697*"ivan" + -0.176*"roubles" + 0.131*"dick" + -0.124*"peter" + -0.115*"eugene" + -0.112*"peasants" + -0.109*"peasant" + 0.107*"lionel" + 0.107*"stella" + -0.100*"simeon" 0.18471: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" 0.182816: 0.284*"marshall" + -0.266*"martin" + -0.257*"commander" + 0.212*"billy" + -0.208*"barney" + 0.173*"marion" + 0.172*"kirkland" + 0.165*"byrne" + 0.163*"drake" + 0.152*"livingstone" LDA: topics of this text (highest scoring first) 0.528421: 0.026*denis + 0.025*percy + 0.016*hendricks + 0.015*zulus + 0.011*lionel + 0.010*waggon + 0.010*crawford + 0.008*rupert + 0.007*lion + 0.006*farm 0.468855: 0.052*stella + 0.028*macumazahn + 0.023*baboons + 0.015*carson + 0.013*allan + 0.013*waggons + 0.011*kraals + 0.010*baboon + 0.008*peak + 0.008*marble ================================================== Text: 
corpus/Adventure/FOLD1/1947.txt The Project Gutenberg EBook of Scaramouche, by Rafael Sabatini This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org Title: Scaramouche A Romance of the French Revolution Author: Rafael Sabatini Release Date: November, 1999 [Etext #1947] Posting Date: August 13, 2009 Language: English ** [...] -------------------------------------------------- LSI: most similar texts 1.0 corpus/Adventure/FOLD1/1947.txt 0.273577 corpus/Adventure/FOLD1/8493.txt 0.129544 corpus/Fiction/FOLD1/5240.txt 0.0799009 corpus/Adventure/FOLD1/1965.txt 0.0724676 corpus/Adventure/FOLD1/103.txt LSI: topics of this text (highest scoring first) 0.462423: -0.293*"turner" + 0.290*"aline" + 0.276*"stella" + -0.248*"loo" + -0.205*"denis" + 0.196*"andre" + -0.171*"henry" + 0.167*"lionel" + 0.162*"tour" + -0.159*"percy" 0.417472: -0.304*"lionel" + -0.282*"stella" + 0.230*"marquis" + -0.227*"macumazahn" + -0.202*"denis" + 0.196*"monsieur" + -0.175*"percy" + -0.163*"henry" + 0.155*"turner" + 0.148*"aline" 0.383253: -0.329*"stella" + 0.297*"henry" + -0.292*"turner" + 0.252*"aline" + 0.170*"andre" + -0.161*"miriam" + 0.147*"tour" + 0.146*"thou" + -0.142*"baboons" + 0.142*"kitty" 0.367033: -0.303*"stewart" + -0.275*"thet" + 0.220*"marquis" + 0.164*"monsieur" + 0.159*"turner" + -0.156*"bland" + -0.148*"fer" + -0.146*"lawson" + -0.146*"prim" + -0.143*"cowboys" 0.333359: 0.319*"ivan" + 0.182*"sapt" + 0.149*"peter" + 0.133*"monsieur" + 0.123*"marquis" + 0.122*"thee" + 0.115*"colonel" + 0.113*"bishop" + 0.112*"leary" + 0.109*"rudolf" LDA: topics of this text (highest scoring first) 0.94682: 0.014*tour + 0.012*monsieur + 0.011*marquis + 0.010*aline + 0.009*madame + 0.007*andre + 0.006*paris + 0.005*nantes + 0.005*mademoiselle + 0.005*philippe 0.0102775: 0.009*hamilton + 0.005*leary + 
0.004*paris + 0.004*jane + 0.004*guy + 0.003*harry + 0.003*colonel + 0.003*tom + 0.002*regiment + 0.002*lordship ==================================================