mrjob is a software package developed by the restaurant recommendation company Yelp. Its goal is to simplify the deployment of map-reduce jobs, based on streaming and Python, onto different frameworks such as Hadoop on a private cluster or Hadoop on AWS (called EMR).
In this notebook we run a simple word-count example, add to it some logging commands, and look at two modes of running the job.
import os
# Home directory of the current user, read from the HOME environment variable.
home_dir=os.environ['HOME']
# Hard-coded root of the directory tree holding the examples and README.rst; adjust for your machine.
root_dir = '/Users/yoavfreund/BigData/mrjob'
# Directory with the example jobs (note the trailing slash, relied on when building file paths).
examples_dir=root_dir+'/examples/'
# List the contents of the examples directory.
!ls -l $examples_dir
total 152 -rw-r--r-- 1 yoavfreund staff 0 Apr 30 17:08 __init__.py drwxr-xr-x 5 yoavfreund staff 170 Apr 30 17:08 bash_wrap drwxr-xr-x 3 yoavfreund staff 102 Apr 30 17:08 contrib -rw-r--r-- 1 yoavfreund staff 3176 Apr 30 17:08 mr_cmd.py -rw-r--r-- 1 yoavfreund staff 1198 Apr 30 17:08 mr_grep.py -rw-r--r-- 1 yoavfreund staff 2125 Apr 30 17:08 mr_jar_step_example.py -rw-r--r-- 1 yoavfreund staff 4108 Apr 30 17:08 mr_log_sampler.py -rwxr-xr-x 1 yoavfreund staff 1972 Apr 30 17:08 mr_most_used_word.py -rw-r--r-- 1 yoavfreund staff 3400 Apr 30 17:08 mr_next_word_stats.py -rw-r--r-- 1 yoavfreund staff 3501 Apr 30 17:08 mr_page_rank.py drwxr-xr-x 6 yoavfreund staff 204 Apr 30 17:08 mr_postfix_bounce -rw-r--r-- 1 yoavfreund staff 21954 Apr 30 17:08 mr_text_classifier.py drwxr-xr-x 6 yoavfreund staff 204 Apr 30 17:08 mr_travelling_salesman -rw-r--r-- 1 yoavfreund staff 1552 Apr 30 17:08 mr_wc.py -rwxr-xr-x 1 yoavfreund staff 1977 Apr 30 17:08 mr_wc.rb -rwxr-xr-x 1 yoavfreund staff 1065 Apr 30 17:08 mr_word_freq_count.py -rw-r--r-- 1 yoavfreund staff 4887 Apr 30 17:08 py3k_word_freq_count.py
filename=examples_dir+'mr_word_freq_count.py'
print filename
!ls $filaname
# load example code from mr jobs as a starting point
%load $filename
/Users/yoavfreund/BigData/mrjob/examples/mr_word_freq_count.py Simple use of mrjob.ipynb Weather Analysis.ipynb counts
#!/usr/bin/python
# Copyright 2009-2010 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The classic MapReduce job: count the frequency of words.
"""
from mrjob.job import MRJob
import re
# A "word" is a maximal run of word characters (\w) and apostrophes.
WORD_RE = re.compile(r"[\w']+")
class MRWordFreqCount(MRJob):
    """Word-frequency job: emit a count of 1 per word, then add up the counts per word."""

    def mapper(self, _, line):
        """Yield (lower-cased word, 1) for every WORD_RE match in ``line``.

        The input key is ignored.
        """
        for match in WORD_RE.finditer(line):
            yield (match.group().lower(), 1)

    def combiner(self, word, counts):
        """Yield (word, total) where total is the sum of ``counts``."""
        total = sum(counts)
        yield (word, total)

    def reducer(self, word, counts):
        """Yield (word, total) where total is the sum of ``counts``."""
        total = sum(counts)
        yield (word, total)


# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    MRWordFreqCount.run()
%%writefile mr_word_freq_count.py
#!/usr/bin/python
# Copyright 2009-2010 Yelp
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""The classic MapReduce job: count the frequency of words.
"""
from mrjob.job import MRJob
import re
from sys import stderr
# A "word" is a maximal run of word characters (\w) and apostrophes.
WORD_RE = re.compile(r"[\w']+")
# Alternative destination for the trace: a local file named 'log' (disabled).
#logfile=open('log','w')
# Trace lines written by the mapper/combiner/reducer go to standard error.
logfile=stderr
class MRWordFreqCount(MRJob):
    """Word-frequency job that also writes a trace line to ``logfile`` at every stage."""

    def mapper(self, _, line):
        """Yield (lower-cased word, 1) for every WORD_RE match in ``line``.

        The input key is ignored. One 'mapper <word>' line is logged per word.
        """
        for raw_word in WORD_RE.findall(line):
            token = raw_word.lower()
            logfile.write('mapper '+token+'\n')
            yield (token, 1)

    def combiner(self, word, counts):
        """Yield (word, sum of counts) and log the individual counts and their sum."""
        # The iterator can be walked only once, so materialize it in order to
        # both print the individual values and add them up.
        received = list(counts)
        total = sum(received)
        listing = ','.join(str(n) for n in received)
        logfile.write('combiner '+word+' ['+listing+']='+str(total)+'\n')
        yield (word, total)

    def reducer(self, word, counts):
        """Yield (word, sum of counts) and log the individual counts and their sum."""
        # Same reasoning as in the combiner: keep the values so they can be
        # logged as well as summed.
        received = list(counts)
        total = sum(received)
        listing = ','.join(str(n) for n in received)
        logfile.write('reducer '+word+' ['+listing+']='+str(total)+'\n')
        yield (word, total)


# Launch the job only when this file is executed as a script.
if __name__ == '__main__':
    MRWordFreqCount.run()
Overwriting mr_word_freq_count.py
# Run the job on README.rst with the default runner; standard output (the word counts) is redirected to the file 'counts'.
!python mr_word_freq_count.py $root_dir/README.rst > counts
# The job sends its trace to stderr, so a 'log' file only exists if the commented-out open('log','w') line in the job is enabled.
!cat log
cat: log: No such file or directory
# Show the (word, count) pairs that the job wrote to 'counts'.
!cat counts
"'__main__'" 1 "04" 1 "05" 1 "08" 1 "1" 1 "2" 2 "2009" 1 "2010" 1 "2011" 4 "2012" 1 "4" 1 "4898987" 1 "5" 1 "_" 18 "__name__" 1 "a" 3 "access" 1 "accordingly" 1 "account" 3 "advanced" 2 "aimotion" 1 "allows" 1 "also" 1 "amazon" 6 "amazon's" 1 "an" 2 "analysis" 2 "and" 12 "apache" 1 "automatically" 1 "aws" 5 "aws_access_key_id" 2 "aws_secret_access_key" 2 "basics" 1 "basis" 1 "blind" 3 "blip" 1 "blogspot" 1 "buy" 1 "by" 1 "ci" 2 "class" 1 "classic" 1 "click" 1 "cluster" 5 "code" 4 "com" 10 "combiner" 1 "compile" 1 "computing" 1 "conf" 6 "config" 1 "configs" 1 "configuration" 1 "contents" 1 "count" 1 "counts" 7 "create" 1 "credentials" 1 "def" 3 "development" 1 "discussion" 1 "distributed" 1 "docs" 2 "documentation" 5 "duplicate" 1 "e" 1 "easily" 1 "elastic" 5 "elasticmapreduce" 2 "emr" 9 "en" 1 "environment" 3 "error" 1 "etc" 1 "everyone" 1 "example" 1 "examples" 4 "features" 2 "feeds" 1 "file" 2 "findall" 1 "for" 8 "frequency" 1 "from" 5 "fully" 1 "g" 1 "get" 1 "github" 3 "google" 1 "graph" 2 "greg" 2 "group" 2 "groups" 1 "guides" 1 "hadoop" 11 "hadoop_home" 1 "handled" 1 "helps" 1 "hourly" 1 "html" 3 "http" 16 "https" 3 "if" 1 "image" 2 "import" 2 "important" 1 "in" 5 "information" 2 "inside" 1 "install" 4 "installation" 1 "interpret" 1 "into" 1 "introduction" 2 "is" 2 "it" 3 "its" 1 "job" 4 "job's" 1 "jobs" 3 "keys" 1 "killion" 1 "latest" 1 "line" 2 "links" 1 "live" 1 "locally" 2 "logo" 1 "logo_medium" 1 "logos" 1 "logs" 1 "looks" 1 "lower" 1 "mailto" 1 "make" 3 "map" 2 "mapper" 1 "mapreduce" 8 "marcelcaraciolo" 1 "master" 1 "minimal" 1 "more" 3 "mr_word_freq_count" 3 "mrjob" 31 "mrjob_conf" 1 "mrwordfreqcount" 2 "multi" 1 "need" 1 "net" 3 "next" 1 "of" 2 "on" 10 "one" 1 "only" 1 "or" 1 "org" 7 "other" 3 "out" 1 "overview" 1 "own" 2 "package" 1 "packages" 4 "page" 1 "pip" 1 "png" 2 "postneo" 1 "production" 1 "project" 1 "put" 1 "py" 4 "pycon" 3 "pypi" 1 "pypy" 2 "python" 10 "pythonpath" 1 "r" 3 "raw" 1 "re" 2 "readme" 3 "readthedocs" 1 "recommendations" 2 
"recsys" 1 "reduce" 2 "reducer" 1 "reference" 1 "regions" 1 "rst" 3 "run" 8 "scripts" 1 "secret" 1 "security" 1 "see" 1 "self" 3 "service" 1 "services" 1 "set" 5 "setting" 1 "setup" 4 "sign" 1 "simple" 1 "simplejson" 1 "social" 2 "some" 1 "source" 5 "ssh" 1 "stable" 1 "stable1" 1 "step" 2 "streaming" 3 "sum" 2 "supports" 1 "sure" 1 "tarballs" 1 "target" 1 "testing" 1 "thanks" 1 "that" 1 "the" 7 "this" 1 "time" 1 "to" 9 "tracker" 1 "transparently" 1 "travis" 2 "tree" 2 "try" 1 "tunnel" 1 "tv" 1 "tz" 1 "up" 3 "upload" 2 "us" 1 "use" 1 "using" 2 "v0" 1 "variables" 2 "version" 2 "videos" 1 "w'" 1 "web" 1 "which" 1 "with" 3 "word" 6 "word_re" 2 "words" 1 "works" 4 "write" 2 "www" 1 "yelp" 4 "yield" 3 "you" 2 "you'll" 1 "your" 10
The keyword yield is somewhat similar to return. However, while return terminates the function and returns the result, a function that contains yield returns, when it is first called, an object called a generator, without executing any of the function body. Each time a value is then requested from the generator, the function is executed until the next yield command is encountered, that value is returned, and the function halts (but does not terminate) until the next value is requested.
Here is a simple example:
def myrange(start,stop,step):
    """Generate start, start+step, start+2*step, ... for as long as the value is <= stop.

    Unlike the built-in range, the upper bound is inclusive and the
    arguments may be floats. The value is advanced by repeated addition.
    """
    current = start
    while True:
        if not current <= stop:
            # Past the (inclusive) upper bound: end the generator.
            return
        yield current
        current += step
# Iterating over the generator in a list comprehension collects every value it yields.
print [x for x in myrange(1.0,3.0,0.3)]
[1.0, 1.3, 1.6, 1.9000000000000001, 2.2, 2.5, 2.8]
# Calling the function only creates a generator object; printing it shows the object, not the values.
print myrange(1.0,3.0,0.3)
<generator object myrange at 0x217e870>
# Two independent generators created from the same generator function.
gen1=myrange(1.0,3.0,0.3)
gen2=myrange(2.0,5.0,0.7)
# First pass over gen1 consumes all of its values.
print 'gen1:',[x for x in gen1]
print 'gen1:',[x for x in gen1] # after the generator terminated, it does not yield any more values.
# gen2 has not been iterated over yet, so it still produces its full sequence.
print 'gen2:',[x for x in gen2]
gen1: [1.0, 1.3, 1.6, 1.9000000000000001, 2.2, 2.5, 2.8] gen1: [] gen2: [2.0, 2.7, 3.4000000000000004, 4.1000000000000005, 4.800000000000001]
A generator is similar to an array or a list: all of these are iterable objects. However, while a list stores all of its values in memory and can be read in any order, a generator creates its values on the fly and can only be traversed once, in order.
It is the fact that values are generated on the fly and then discarded that makes generators attractive when processing large amounts of data - only a small amount of intermediate results, the outputs of the mapper which are inputs to the reducer, needs to be stored in memory. How much depends on the communication speed between mappers and reducers.
It is instructive to see how generators can be cascaded by passing a generator as a parameter to another generator.
def mycumul(values): # values can be a list or a generator.
    """Generate the running (cumulative) sum of ``values``, one total per input item."""
    running_total = 0
    for item in values:
        running_total += item
        yield running_total
# Here we pass a generator as an input to another generator:
# mycumul pulls values from myrange one at a time and yields their running sum.
gen3=mycumul(myrange(1.0,3.0,0.3))
print 'gen3:',[x for x in gen3]
gen3: [1.0, 2.3, 3.9, 5.8, 8.0, 10.5, 13.3]
Once the mapper, combiner and reducer have been written and tested, you can run the job on different types of infrastructure:
Below we run the same process we ran at the top using local instead of the default inline. Observe that in this case the reducers have some non-trivial work to do even when combiners are used.
# Same job and input as before, but with --runner=local; the word counts again go to 'counts', the trace appears on stderr.
!python mr_word_freq_count.py --runner=local $root_dir/README.rst > counts
using configs in /Users/yoavfreund/.mrjob.conf creating tmp directory /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860 writing to /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-mapper_part-00000 > //anaconda/bin/python mr_word_freq_count.py --step-num=0 --mapper /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/input_part-00000 | sort | //anaconda/bin/python mr_word_freq_count.py --step-num=0 --combiner > /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-mapper_part-00000 writing to /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-mapper_part-00001 > //anaconda/bin/python mr_word_freq_count.py --step-num=0 --mapper /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/input_part-00001 | sort | //anaconda/bin/python mr_word_freq_count.py --step-num=0 --combiner > /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-mapper_part-00001 STDERR: mapper mrjob STDERR: mapper image STDERR: mapper http STDERR: mapper github STDERR: mapper com STDERR: mapper yelp STDERR: mapper mrjob STDERR: mapper raw STDERR: mapper master STDERR: mapper docs STDERR: mapper logos STDERR: mapper logo_medium STDERR: mapper png STDERR: mapper mrjob STDERR: mapper is STDERR: mapper a STDERR: mapper python STDERR: mapper 2 STDERR: mapper 5 STDERR: mapper package STDERR: mapper that STDERR: mapper helps STDERR: mapper you STDERR: mapper write STDERR: mapper and STDERR: mapper run STDERR: mapper hadoop STDERR: mapper streaming STDERR: mapper jobs STDERR: mapper stable STDERR: mapper version STDERR: mapper v0 STDERR: mapper 4 STDERR: mapper 2 STDERR: mapper documentation STDERR: mapper http STDERR: mapper packages 
STDERR: mapper python STDERR: mapper org STDERR: mapper mrjob STDERR: mapper _ STDERR: mapper development STDERR: mapper version STDERR: mapper documentation STDERR: mapper http STDERR: mapper mrjob STDERR: mapper readthedocs STDERR: mapper org STDERR: mapper en STDERR: mapper latest STDERR: mapper _ STDERR: mapper image STDERR: mapper https STDERR: mapper travis STDERR: mapper ci STDERR: mapper org STDERR: mapper yelp STDERR: mapper mrjob STDERR: mapper png STDERR: mapper target STDERR: mapper https STDERR: mapper travis STDERR: mapper ci STDERR: mapper org STDERR: mapper yelp STDERR: mapper mrjob STDERR: mapper mrjob STDERR: mapper fully STDERR: mapper supports STDERR: mapper amazon's STDERR: mapper elastic STDERR: mapper mapreduce STDERR: mapper emr STDERR: mapper service STDERR: mapper which STDERR: mapper allows STDERR: mapper you STDERR: mapper to STDERR: mapper buy STDERR: mapper time STDERR: mapper on STDERR: mapper a STDERR: mapper hadoop STDERR: mapper cluster STDERR: mapper on STDERR: mapper an STDERR: mapper hourly STDERR: mapper basis STDERR: mapper it STDERR: mapper also STDERR: mapper works STDERR: mapper with STDERR: mapper your STDERR: mapper own STDERR: mapper hadoop STDERR: mapper cluster STDERR: mapper some STDERR: mapper important STDERR: mapper features STDERR: mapper run STDERR: mapper jobs STDERR: mapper on STDERR: mapper emr STDERR: mapper your STDERR: mapper own STDERR: mapper hadoop STDERR: mapper cluster STDERR: mapper or STDERR: mapper locally STDERR: mapper for STDERR: mapper testing STDERR: mapper write STDERR: mapper multi STDERR: mapper step STDERR: mapper jobs STDERR: mapper one STDERR: mapper map STDERR: mapper reduce STDERR: mapper step STDERR: mapper feeds STDERR: mapper into STDERR: mapper the STDERR: mapper next STDERR: mapper duplicate STDERR: mapper your STDERR: mapper production STDERR: mapper environment STDERR: mapper inside STDERR: mapper hadoop STDERR: mapper upload STDERR: mapper your STDERR: mapper source STDERR: 
mapper tree STDERR: mapper and STDERR: mapper put STDERR: mapper it STDERR: mapper in STDERR: mapper your STDERR: mapper job's STDERR: mapper pythonpath STDERR: mapper run STDERR: mapper make STDERR: mapper and STDERR: mapper other STDERR: mapper setup STDERR: mapper scripts STDERR: mapper set STDERR: mapper environment STDERR: mapper variables STDERR: mapper e STDERR: mapper g STDERR: mapper tz STDERR: mapper easily STDERR: mapper install STDERR: mapper python STDERR: mapper packages STDERR: mapper from STDERR: mapper tarballs STDERR: mapper emr STDERR: mapper only STDERR: mapper setup STDERR: mapper handled STDERR: mapper transparently STDERR: mapper by STDERR: mapper mrjob STDERR: mapper conf STDERR: mapper config STDERR: mapper file STDERR: mapper automatically STDERR: mapper interpret STDERR: mapper error STDERR: mapper logs STDERR: mapper from STDERR: mapper emr STDERR: mapper ssh STDERR: mapper tunnel STDERR: mapper to STDERR: mapper hadoop STDERR: mapper job STDERR: mapper tracker STDERR: mapper on STDERR: mapper emr STDERR: mapper minimal STDERR: mapper setup STDERR: mapper to STDERR: mapper run STDERR: mapper on STDERR: mapper emr STDERR: mapper set STDERR: mapper aws_access_key_id STDERR: mapper and STDERR: mapper aws_secret_access_key STDERR: mapper to STDERR: mapper run STDERR: mapper on STDERR: mapper your STDERR: mapper hadoop STDERR: mapper cluster STDERR: mapper install STDERR: mapper simplejson STDERR: mapper and STDERR: mapper make STDERR: mapper sure STDERR: mapper hadoop_home STDERR: mapper is STDERR: mapper set STDERR: mapper installation STDERR: mapper from STDERR: mapper pypi STDERR: mapper pip STDERR: mapper install STDERR: mapper mrjob STDERR: mapper from STDERR: mapper source STDERR: mapper python STDERR: mapper setup STDERR: mapper py STDERR: mapper install STDERR: mapper a STDERR: mapper simple STDERR: mapper map STDERR: mapper reduce STDERR: mapper job STDERR: mapper code STDERR: mapper for STDERR: mapper this STDERR: mapper example 
STDERR: mapper and STDERR: mapper more STDERR: mapper live STDERR: mapper in STDERR: mapper mrjob STDERR: mapper examples STDERR: mapper code STDERR: mapper python STDERR: mapper the STDERR: mapper classic STDERR: mapper mapreduce STDERR: mapper job STDERR: mapper count STDERR: mapper the STDERR: mapper frequency STDERR: mapper of STDERR: mapper words STDERR: mapper from STDERR: mapper mrjob STDERR: mapper job STDERR: mapper import STDERR: mapper mrjob STDERR: mapper import STDERR: mapper re STDERR: mapper word_re STDERR: mapper re STDERR: mapper compile STDERR: mapper r STDERR: mapper w' STDERR: mapper class STDERR: mapper mrwordfreqcount STDERR: mapper mrjob STDERR: mapper def STDERR: mapper mapper STDERR: mapper self STDERR: mapper _ STDERR: mapper line STDERR: mapper for STDERR: mapper word STDERR: mapper in STDERR: mapper word_re STDERR: mapper findall STDERR: mapper line STDERR: mapper yield STDERR: mapper word STDERR: mapper lower STDERR: mapper 1 STDERR: mapper def STDERR: mapper combiner STDERR: mapper self STDERR: mapper word STDERR: mapper counts STDERR: mapper yield STDERR: mapper word STDERR: mapper sum STDERR: mapper counts STDERR: mapper def STDERR: mapper reducer STDERR: mapper self STDERR: mapper word STDERR: mapper counts STDERR: mapper yield STDERR: mapper word STDERR: mapper sum STDERR: mapper counts STDERR: mapper if STDERR: mapper __name__ STDERR: mapper '__main__' STDERR: mapper mrwordfreqcount STDERR: mapper run STDERR: mapper try STDERR: mapper it STDERR: mapper out STDERR: combiner '__main__' [1]=1 STDERR: combiner 1 [1]=1 STDERR: combiner 2 [1,1]=2 STDERR: combiner 4 [1]=1 STDERR: combiner 5 [1]=1 STDERR: combiner _ [1,1,1]=3 STDERR: combiner __name__ [1]=1 STDERR: combiner a [1,1,1]=3 STDERR: combiner allows [1]=1 STDERR: combiner also [1]=1 STDERR: combiner amazon's [1]=1 STDERR: combiner an [1]=1 STDERR: combiner and [1,1,1,1,1,1]=6 STDERR: combiner automatically [1]=1 STDERR: combiner aws_access_key_id [1]=1 STDERR: combiner 
aws_secret_access_key [1]=1 STDERR: combiner basis [1]=1 STDERR: combiner buy [1]=1 STDERR: combiner by [1]=1 STDERR: combiner ci [1,1]=2 STDERR: combiner class [1]=1 STDERR: combiner classic [1]=1 STDERR: combiner cluster [1,1,1,1]=4 STDERR: combiner code [1,1]=2 STDERR: combiner com [1]=1 STDERR: combiner combiner [1]=1 STDERR: combiner compile [1]=1 STDERR: combiner conf [1]=1 STDERR: combiner config [1]=1 STDERR: combiner count [1]=1 STDERR: combiner counts [1,1,1,1]=4 STDERR: combiner def [1,1,1]=3 STDERR: combiner development [1]=1 STDERR: combiner docs [1]=1 STDERR: combiner documentation [1,1]=2 STDERR: combiner duplicate [1]=1 STDERR: combiner e [1]=1 STDERR: combiner easily [1]=1 STDERR: combiner elastic [1]=1 STDERR: combiner emr [1,1,1,1,1,1]=6 STDERR: combiner en [1]=1 STDERR: combiner environment [1,1]=2 STDERR: combiner error [1]=1 STDERR: combiner example [1]=1 STDERR: combiner examples [1]=1 STDERR: combiner features [1]=1 STDERR: combiner feeds [1]=1 STDERR: combiner file [1]=1 STDERR: combiner findall [1]=1 STDERR: combiner for [1,1,1]=3 STDERR: combiner frequency [1]=1 STDERR: combiner from [1,1,1,1,1]=5 STDERR: combiner fully [1]=1 STDERR: combiner g [1]=1 STDERR: combiner github [1]=1 STDERR: combiner hadoop [1,1,1,1,1,1,1]=7 STDERR: combiner hadoop_home [1]=1 STDERR: combiner handled [1]=1 STDERR: combiner helps [1]=1 STDERR: combiner hourly [1]=1 STDERR: combiner http [1,1,1]=3 STDERR: combiner https [1,1]=2 STDERR: combiner if [1]=1 STDERR: combiner image [1,1]=2 STDERR: combiner import [1,1]=2 STDERR: combiner important [1]=1 STDERR: combiner in [1,1,1]=3 STDERR: combiner inside [1]=1 STDERR: combiner install [1,1,1,1]=4 STDERR: combiner installation [1]=1 STDERR: combiner interpret [1]=1 STDERR: combiner into [1]=1 STDERR: combiner is [1,1]=2 STDERR: combiner it [1,1,1]=3 STDERR: combiner job [1,1,1,1]=4 STDERR: combiner job's [1]=1 STDERR: combiner jobs [1,1,1]=3 STDERR: combiner latest [1]=1 STDERR: combiner line [1,1]=2 STDERR: 
combiner live [1]=1 STDERR: combiner locally [1]=1 STDERR: combiner logo_medium [1]=1 STDERR: combiner logos [1]=1 STDERR: combiner logs [1]=1 STDERR: combiner lower [1]=1 STDERR: combiner make [1,1]=2 STDERR: combiner map [1,1]=2 STDERR: combiner mapper [1]=1 STDERR: combiner mapreduce [1,1]=2 STDERR: combiner master [1]=1 STDERR: combiner minimal [1]=1 STDERR: combiner more [1]=1 STDERR: combiner mrjob [1,1,1,1,1,1,1,1,1,1,1,1,1,1]=14 STDERR: combiner mrwordfreqcount [1,1]=2 STDERR: combiner multi [1]=1 STDERR: combiner next [1]=1 STDERR: combiner of [1]=1 STDERR: combiner on [1,1,1,1,1,1]=6 STDERR: combiner one [1]=1 STDERR: combiner only [1]=1 STDERR: combiner or [1]=1 STDERR: combiner org [1,1,1,1]=4 STDERR: combiner other [1]=1 STDERR: combiner out [1]=1 STDERR: combiner own [1,1]=2 STDERR: combiner package [1]=1 STDERR: combiner packages [1,1]=2 STDERR: combiner pip [1]=1 STDERR: combiner png [1,1]=2 STDERR: combiner production [1]=1 STDERR: combiner put [1]=1 STDERR: combiner py [1]=1 STDERR: combiner pypi [1]=1 STDERR: combiner python [1,1,1,1,1]=5 STDERR: combiner pythonpath [1]=1 STDERR: combiner r [1]=1 STDERR: combiner raw [1]=1 STDERR: combiner re [1,1]=2 STDERR: combiner readthedocs [1]=1 STDERR: combiner reduce [1,1]=2 STDERR: combiner reducer [1]=1 STDERR: combiner run [1,1,1,1,1,1]=6 STDERR: combiner scripts [1]=1 STDERR: combiner self [1,1,1]=3 STDERR: combiner service [1]=1 STDERR: combiner set [1,1,1]=3 STDERR: combiner setup [1,1,1,1]=4 STDERR: combiner simple [1]=1 STDERR: combiner simplejson [1]=1 STDERR: combiner some [1]=1 STDERR: combiner source [1,1]=2 STDERR: combiner ssh [1]=1 STDERR: combiner stable [1]=1 STDERR: combiner step [1,1]=2 STDERR: combiner streaming [1]=1 STDERR: combiner sum [1,1]=2 STDERR: combiner supports [1]=1 STDERR: combiner sure [1]=1 STDERR: combiner tarballs [1]=1 STDERR: combiner target [1]=1 STDERR: combiner testing [1]=1 STDERR: combiner that [1]=1 STDERR: combiner the [1,1,1]=3 STDERR: combiner this [1]=1 
STDERR: combiner time [1]=1 STDERR: combiner to [1,1,1,1]=4 STDERR: combiner tracker [1]=1 STDERR: combiner transparently [1]=1 STDERR: combiner travis [1,1]=2 STDERR: combiner tree [1]=1 STDERR: combiner try [1]=1 STDERR: combiner tunnel [1]=1 STDERR: combiner tz [1]=1 STDERR: combiner upload [1]=1 STDERR: combiner v0 [1]=1 STDERR: combiner variables [1]=1 STDERR: combiner version [1,1]=2 STDERR: combiner w' [1]=1 STDERR: combiner which [1]=1 STDERR: combiner with [1]=1 STDERR: combiner word [1,1,1,1,1,1]=6 STDERR: combiner word_re [1,1]=2 STDERR: combiner words [1]=1 STDERR: combiner works [1]=1 STDERR: combiner write [1,1]=2 STDERR: combiner yelp [1,1,1]=3 STDERR: combiner yield [1,1,1]=3 STDERR: combiner you [1,1]=2 STDERR: combiner your [1,1,1,1,1,1]=6 STDERR: mapper locally STDERR: mapper python STDERR: mapper mrjob STDERR: mapper examples STDERR: mapper mr_word_freq_count STDERR: mapper py STDERR: mapper readme STDERR: mapper rst STDERR: mapper counts STDERR: mapper on STDERR: mapper emr STDERR: mapper python STDERR: mapper mrjob STDERR: mapper examples STDERR: mapper mr_word_freq_count STDERR: mapper py STDERR: mapper readme STDERR: mapper rst STDERR: mapper r STDERR: mapper emr STDERR: mapper counts STDERR: mapper on STDERR: mapper your STDERR: mapper hadoop STDERR: mapper cluster STDERR: mapper python STDERR: mapper mrjob STDERR: mapper examples STDERR: mapper mr_word_freq_count STDERR: mapper py STDERR: mapper readme STDERR: mapper rst STDERR: mapper r STDERR: mapper hadoop STDERR: mapper counts STDERR: mapper setting STDERR: mapper up STDERR: mapper emr STDERR: mapper on STDERR: mapper amazon STDERR: mapper create STDERR: mapper an STDERR: mapper amazon STDERR: mapper web STDERR: mapper services STDERR: mapper account STDERR: mapper http STDERR: mapper aws STDERR: mapper amazon STDERR: mapper com STDERR: mapper _ STDERR: mapper sign STDERR: mapper up STDERR: mapper for STDERR: mapper elastic STDERR: mapper mapreduce STDERR: mapper http STDERR: mapper 
aws STDERR: mapper amazon STDERR: mapper com STDERR: mapper elasticmapreduce STDERR: mapper _ STDERR: mapper get STDERR: mapper your STDERR: mapper access STDERR: mapper and STDERR: mapper secret STDERR: mapper keys STDERR: mapper click STDERR: mapper security STDERR: mapper credentials STDERR: mapper on STDERR: mapper your STDERR: mapper account STDERR: mapper page STDERR: mapper http STDERR: mapper aws STDERR: mapper amazon STDERR: mapper com STDERR: mapper account STDERR: mapper _ STDERR: mapper set STDERR: mapper the STDERR: mapper environment STDERR: mapper variables STDERR: mapper aws_access_key_id STDERR: mapper and STDERR: mapper aws_secret_access_key STDERR: mapper accordingly STDERR: mapper advanced STDERR: mapper configuration STDERR: mapper to STDERR: mapper run STDERR: mapper in STDERR: mapper other STDERR: mapper aws STDERR: mapper regions STDERR: mapper upload STDERR: mapper your STDERR: mapper source STDERR: mapper tree STDERR: mapper run STDERR: mapper make STDERR: mapper and STDERR: mapper use STDERR: mapper other STDERR: mapper advanced STDERR: mapper mrjob STDERR: mapper features STDERR: mapper you'll STDERR: mapper need STDERR: mapper to STDERR: mapper set STDERR: mapper up STDERR: mapper mrjob STDERR: mapper conf STDERR: mapper mrjob STDERR: mapper looks STDERR: mapper for STDERR: mapper its STDERR: mapper conf STDERR: mapper file STDERR: mapper in STDERR: mapper the STDERR: mapper contents STDERR: mapper of STDERR: mapper mrjob_conf STDERR: mapper mrjob STDERR: mapper conf STDERR: mapper etc STDERR: mapper mrjob STDERR: mapper conf STDERR: mapper see STDERR: mapper the STDERR: mapper mrjob STDERR: mapper conf STDERR: mapper documentation STDERR: mapper http STDERR: mapper packages STDERR: mapper python STDERR: mapper org STDERR: mapper mrjob STDERR: mapper guides STDERR: mapper configs STDERR: mapper basics STDERR: mapper html STDERR: mapper _ STDERR: mapper for STDERR: mapper more STDERR: mapper information STDERR: mapper project STDERR: 
mapper links STDERR: mapper source STDERR: mapper code STDERR: mapper http STDERR: mapper github STDERR: mapper com STDERR: mapper yelp STDERR: mapper mrjob STDERR: mapper _ STDERR: mapper documentation STDERR: mapper http STDERR: mapper packages STDERR: mapper python STDERR: mapper org STDERR: mapper mrjob STDERR: mapper _ STDERR: mapper discussion STDERR: mapper group STDERR: mapper http STDERR: mapper groups STDERR: mapper google STDERR: mapper com STDERR: mapper group STDERR: mapper mrjob STDERR: mapper _ STDERR: mapper reference STDERR: mapper hadoop STDERR: mapper streaming STDERR: mapper http STDERR: mapper hadoop STDERR: mapper apache STDERR: mapper org STDERR: mapper docs STDERR: mapper stable1 STDERR: mapper streaming STDERR: mapper html STDERR: mapper _ STDERR: mapper elastic STDERR: mapper mapreduce STDERR: mapper http STDERR: mapper aws STDERR: mapper amazon STDERR: mapper com STDERR: mapper documentation STDERR: mapper elasticmapreduce STDERR: mapper _ STDERR: mapper more STDERR: mapper information STDERR: mapper pycon STDERR: mapper 2011 STDERR: mapper mrjob STDERR: mapper overview STDERR: mapper http STDERR: mapper blip STDERR: mapper tv STDERR: mapper pycon STDERR: mapper us STDERR: mapper videos STDERR: mapper 2009 STDERR: mapper 2010 STDERR: mapper 2011 STDERR: mapper pycon STDERR: mapper 2011 STDERR: mapper mrjob STDERR: mapper distributed STDERR: mapper computing STDERR: mapper for STDERR: mapper everyone STDERR: mapper 4898987 STDERR: mapper _ STDERR: mapper introduction STDERR: mapper to STDERR: mapper recommendations STDERR: mapper and STDERR: mapper mapreduce STDERR: mapper with STDERR: mapper mrjob STDERR: mapper http STDERR: mapper aimotion STDERR: mapper blogspot STDERR: mapper com STDERR: mapper 2012 STDERR: mapper 08 STDERR: mapper introduction STDERR: mapper to STDERR: mapper recommendations STDERR: mapper with STDERR: mapper html STDERR: mapper _ STDERR: mapper source STDERR: mapper code STDERR: mapper https STDERR: mapper github 
STDERR: mapper com STDERR: mapper marcelcaraciolo STDERR: mapper recsys STDERR: mapper mapreduce STDERR: mapper mrjob STDERR: mapper _ STDERR: mapper social STDERR: mapper graph STDERR: mapper analysis STDERR: mapper using STDERR: mapper elastic STDERR: mapper mapreduce STDERR: mapper and STDERR: mapper pypy STDERR: mapper http STDERR: mapper postneo STDERR: mapper com STDERR: mapper 2011 STDERR: mapper 05 STDERR: mapper 04 STDERR: mapper social STDERR: mapper graph STDERR: mapper analysis STDERR: mapper using STDERR: mapper elastic STDERR: mapper mapreduce STDERR: mapper and STDERR: mapper pypy STDERR: mapper _ STDERR: mapper thanks STDERR: mapper to STDERR: mapper greg STDERR: mapper killion STDERR: mapper mailto STDERR: mapper greg STDERR: mapper blind STDERR: mapper works STDERR: mapper net STDERR: mapper _ STDERR: mapper blind STDERR: mapper works STDERR: mapper net STDERR: mapper http STDERR: mapper www STDERR: mapper blind STDERR: mapper works STDERR: mapper net STDERR: mapper _ STDERR: mapper for STDERR: mapper the STDERR: mapper logo STDERR: combiner 04 [1]=1 STDERR: combiner 05 [1]=1 STDERR: combiner 08 [1]=1 STDERR: combiner 2009 [1]=1 STDERR: combiner 2010 [1]=1 STDERR: combiner 2011 [1,1,1,1]=4 STDERR: combiner 2012 [1]=1 STDERR: combiner 4898987 [1]=1 STDERR: combiner _ [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]=15 STDERR: combiner access [1]=1 STDERR: combiner accordingly [1]=1 STDERR: combiner account [1,1,1]=3 STDERR: combiner advanced [1,1]=2 STDERR: combiner aimotion [1]=1 STDERR: combiner amazon [1,1,1,1,1,1]=6 STDERR: combiner an [1]=1 STDERR: combiner analysis [1,1]=2 STDERR: combiner and [1,1,1,1,1,1]=6 STDERR: combiner apache [1]=1 STDERR: combiner aws [1,1,1,1,1]=5 STDERR: combiner aws_access_key_id [1]=1 STDERR: combiner aws_secret_access_key [1]=1 STDERR: combiner basics [1]=1 STDERR: combiner blind [1,1,1]=3 STDERR: combiner blip [1]=1 STDERR: combiner blogspot [1]=1 STDERR: combiner click [1]=1 STDERR: combiner cluster [1]=1 STDERR: combiner code 
[1,1]=2 STDERR: combiner com [1,1,1,1,1,1,1,1,1]=9 STDERR: combiner computing [1]=1 STDERR: combiner conf [1,1,1,1,1]=5 STDERR: combiner configs [1]=1 STDERR: combiner configuration [1]=1 STDERR: combiner contents [1]=1 STDERR: combiner counts [1,1,1]=3 STDERR: combiner create [1]=1 STDERR: combiner credentials [1]=1 STDERR: combiner discussion [1]=1 STDERR: combiner distributed [1]=1 STDERR: combiner docs [1]=1 STDERR: combiner documentation [1,1,1]=3 STDERR: combiner elastic [1,1,1,1]=4 STDERR: combiner elasticmapreduce [1,1]=2 STDERR: combiner emr [1,1,1]=3 STDERR: combiner environment [1]=1 STDERR: combiner etc [1]=1 STDERR: combiner everyone [1]=1 STDERR: combiner examples [1,1,1]=3 STDERR: combiner features [1]=1 STDERR: combiner file [1]=1 STDERR: combiner for [1,1,1,1,1]=5 STDERR: combiner get [1]=1 STDERR: combiner github [1,1]=2 STDERR: combiner google [1]=1 STDERR: combiner graph [1,1]=2 STDERR: combiner greg [1,1]=2 STDERR: combiner group [1,1]=2 STDERR: combiner groups [1]=1 STDERR: combiner guides [1]=1 STDERR: combiner hadoop [1,1,1,1]=4 STDERR: combiner html [1,1,1]=3 STDERR: combiner http [1,1,1,1,1,1,1,1,1,1,1,1,1]=13 STDERR: combiner https [1]=1 STDERR: combiner in [1,1]=2 STDERR: combiner information [1,1]=2 STDERR: combiner introduction [1,1]=2 STDERR: combiner its [1]=1 STDERR: combiner keys [1]=1 STDERR: combiner killion [1]=1 STDERR: combiner links [1]=1 STDERR: combiner locally [1]=1 STDERR: combiner logo [1]=1 STDERR: combiner looks [1]=1 STDERR: combiner mailto [1]=1 STDERR: combiner make [1]=1 STDERR: combiner mapreduce [1,1,1,1,1,1]=6 STDERR: combiner marcelcaraciolo [1]=1 STDERR: combiner more [1,1]=2 STDERR: combiner mr_word_freq_count [1,1,1]=3 STDERR: combiner mrjob [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]=17 STDERR: combiner mrjob_conf [1]=1 STDERR: combiner need [1]=1 STDERR: combiner net [1,1,1]=3 STDERR: combiner of [1]=1 STDERR: combiner on [1,1,1,1]=4 STDERR: combiner org [1,1,1]=3 STDERR: combiner other [1,1]=2 STDERR: combiner 
overview [1]=1 STDERR: combiner packages [1,1]=2 STDERR: combiner page [1]=1 STDERR: combiner postneo [1]=1 STDERR: combiner project [1]=1 STDERR: combiner py [1,1,1]=3 STDERR: combiner pycon [1,1,1]=3 STDERR: combiner pypy [1,1]=2 STDERR: combiner python [1,1,1,1,1]=5 STDERR: combiner r [1,1]=2 STDERR: combiner readme [1,1,1]=3 STDERR: combiner recommendations [1,1]=2 STDERR: combiner recsys [1]=1 STDERR: combiner reference [1]=1 STDERR: combiner regions [1]=1 STDERR: combiner rst [1,1,1]=3 STDERR: combiner run [1,1]=2 STDERR: combiner secret [1]=1 STDERR: combiner security [1]=1 STDERR: combiner see [1]=1 STDERR: combiner services [1]=1 STDERR: combiner set [1,1]=2 STDERR: combiner setting [1]=1 STDERR: combiner sign [1]=1 STDERR: combiner social [1,1]=2 STDERR: combiner source [1,1,1]=3 STDERR: combiner stable1 [1]=1 STDERR: combiner streaming [1,1]=2 STDERR: combiner thanks [1]=1 STDERR: combiner the [1,1,1,1]=4 STDERR: combiner to [1,1,1,1,1]=5 STDERR: combiner tree [1]=1 STDERR: combiner tv [1]=1 STDERR: combiner up [1,1,1]=3 STDERR: combiner upload [1]=1 STDERR: combiner us [1]=1 STDERR: combiner use [1]=1 STDERR: combiner using [1,1]=2 STDERR: combiner variables [1]=1 STDERR: combiner videos [1]=1 STDERR: combiner web [1]=1 STDERR: combiner with [1,1]=2 STDERR: combiner works [1,1,1]=3 STDERR: combiner www [1]=1 STDERR: combiner yelp [1]=1 STDERR: combiner you'll [1]=1 STDERR: combiner your [1,1,1,1]=4 Counters from step 1: (no counters found) writing to /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-mapper-sorted > sort /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-mapper_part-00000 /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-mapper_part-00001 writing to 
/var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-reducer_part-00000 > //anaconda/bin/python mr_word_freq_count.py --step-num=0 --reducer /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/input_part-00000 > /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-reducer_part-00000 writing to /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-reducer_part-00001 > //anaconda/bin/python mr_word_freq_count.py --step-num=0 --reducer /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/input_part-00001 > /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-reducer_part-00001 STDERR: reducer '__main__' [1]=1 STDERR: reducer 04 [1]=1 STDERR: reducer 05 [1]=1 STDERR: reducer 08 [1]=1 STDERR: reducer 1 [1]=1 STDERR: reducer 2 [2]=2 STDERR: reducer 2009 [1]=1 STDERR: reducer 2010 [1]=1 STDERR: reducer 2011 [4]=4 STDERR: reducer 2012 [1]=1 STDERR: reducer 4 [1]=1 STDERR: reducer 4898987 [1]=1 STDERR: reducer 5 [1]=1 STDERR: reducer _ [15,3]=18 STDERR: reducer __name__ [1]=1 STDERR: reducer a [3]=3 STDERR: reducer access [1]=1 STDERR: reducer accordingly [1]=1 STDERR: reducer account [3]=3 STDERR: reducer advanced [2]=2 STDERR: reducer aimotion [1]=1 STDERR: reducer allows [1]=1 STDERR: reducer also [1]=1 STDERR: reducer amazon [6]=6 STDERR: reducer amazon's [1]=1 STDERR: reducer an [1,1]=2 STDERR: reducer analysis [2]=2 STDERR: reducer and [6,6]=12 STDERR: reducer apache [1]=1 STDERR: reducer automatically [1]=1 STDERR: reducer aws [5]=5 STDERR: reducer aws_access_key_id [1,1]=2 STDERR: reducer aws_secret_access_key [1,1]=2 STDERR: reducer basics [1]=1 STDERR: reducer basis [1]=1 STDERR: reducer blind [3]=3 STDERR: reducer blip [1]=1 STDERR: reducer 
blogspot [1]=1 STDERR: reducer buy [1]=1 STDERR: reducer by [1]=1 STDERR: reducer ci [2]=2 STDERR: reducer class [1]=1 STDERR: reducer classic [1]=1 STDERR: reducer click [1]=1 STDERR: reducer cluster [1,4]=5 STDERR: reducer code [2,2]=4 STDERR: reducer com [1,9]=10 STDERR: reducer combiner [1]=1 STDERR: reducer compile [1]=1 STDERR: reducer computing [1]=1 STDERR: reducer conf [1,5]=6 STDERR: reducer config [1]=1 STDERR: reducer configs [1]=1 STDERR: reducer configuration [1]=1 STDERR: reducer contents [1]=1 STDERR: reducer count [1]=1 STDERR: reducer counts [3,4]=7 STDERR: reducer create [1]=1 STDERR: reducer credentials [1]=1 STDERR: reducer def [3]=3 STDERR: reducer development [1]=1 STDERR: reducer discussion [1]=1 STDERR: reducer distributed [1]=1 STDERR: reducer docs [1,1]=2 STDERR: reducer documentation [2,3]=5 STDERR: reducer duplicate [1]=1 STDERR: reducer e [1]=1 STDERR: reducer easily [1]=1 STDERR: reducer elastic [1,4]=5 STDERR: reducer elasticmapreduce [2]=2 STDERR: reducer emr [3,6]=9 STDERR: reducer en [1]=1 STDERR: reducer environment [1,2]=3 STDERR: reducer error [1]=1 STDERR: reducer etc [1]=1 STDERR: reducer everyone [1]=1 STDERR: reducer example [1]=1 STDERR: reducer examples [1,3]=4 STDERR: reducer features [1,1]=2 STDERR: reducer feeds [1]=1 STDERR: reducer file [1,1]=2 STDERR: reducer findall [1]=1 STDERR: reducer for [3,5]=8 STDERR: reducer frequency [1]=1 STDERR: reducer from [5]=5 STDERR: reducer fully [1]=1 STDERR: reducer g [1]=1 STDERR: reducer get [1]=1 STDERR: reducer github [1,2]=3 STDERR: reducer google [1]=1 STDERR: reducer graph [2]=2 STDERR: reducer greg [2]=2 STDERR: reducer group [2]=2 STDERR: reducer groups [1]=1 STDERR: reducer guides [1]=1 STDERR: reducer hadoop [4,7]=11 STDERR: reducer hadoop_home [1]=1 STDERR: reducer handled [1]=1 STDERR: reducer helps [1]=1 STDERR: reducer hourly [1]=1 STDERR: reducer html [3]=3 STDERR: reducer http [13,3]=16 STDERR: reducer https [1,2]=3 STDERR: reducer if [1]=1 STDERR: reducer image 
[2]=2 STDERR: reducer import [2]=2 STDERR: reducer important [1]=1 STDERR: reducer in [2,3]=5 STDERR: reducer information [2]=2 STDERR: reducer inside [1]=1 STDERR: reducer install [4]=4 STDERR: reducer installation [1]=1 STDERR: reducer interpret [1]=1 STDERR: reducer into [1]=1 STDERR: reducer introduction [2]=2 STDERR: reducer is [2]=2 STDERR: reducer it [3]=3 STDERR: reducer its [1]=1 STDERR: reducer job [4]=4 STDERR: reducer job's [1]=1 STDERR: reducer jobs [3]=3 STDERR: reducer keys [1]=1 STDERR: reducer killion [1]=1 STDERR: reducer latest [1]=1 STDERR: reducer line [2]=2 STDERR: reducer links [1]=1 STDERR: reducer live [1]=1 STDERR: reducer locally [1,1]=2 STDERR: reducer logo [1]=1 STDERR: reducer logo_medium [1]=1 STDERR: reducer logos [1]=1 STDERR: reducer logs [1]=1 STDERR: reducer looks [1]=1 STDERR: reducer lower [1]=1 STDERR: reducer mailto [1]=1 STDERR: reducer make [1,2]=3 STDERR: reducer map [2]=2 STDERR: reducer mapper [1]=1 STDERR: reducer mapreduce [2,6]=8 STDERR: reducer marcelcaraciolo [1]=1 STDERR: reducer master [1]=1 STDERR: reducer minimal [1]=1 STDERR: reducer more [1,2]=3 STDERR: reducer mr_word_freq_count [3]=3 STDERR: reducer mrjob [14,17]=31 STDERR: reducer mrjob_conf [1]=1 STDERR: reducer mrwordfreqcount [2]=2 STDERR: reducer multi [1]=1 STDERR: reducer need [1]=1 STDERR: reducer net [3]=3 STDERR: reducer next [1]=1 STDERR: reducer of [1,1]=2 STDERR: reducer on [4,6]=10 STDERR: reducer one [1]=1 STDERR: reducer only [1]=1 STDERR: reducer or [1]=1 STDERR: reducer org [3,4]=7 STDERR: reducer other [1,2]=3 STDERR: reducer out [1]=1 STDERR: reducer overview [1]=1 STDERR: reducer own [2]=2 STDERR: reducer package [1]=1 STDERR: reducer packages [2,2]=4 STDERR: reducer page [1]=1 STDERR: reducer pip [1]=1 STDERR: reducer png [2]=2 STDERR: reducer postneo [1]=1 STDERR: reducer production [1]=1 STDERR: reducer project [1]=1 STDERR: reducer put [1]=1 STDERR: reducer py [1,3]=4 STDERR: reducer pycon [3]=3 STDERR: reducer pypi [1]=1 STDERR: 
reducer pypy [2]=2 STDERR: reducer python [5,5]=10 STDERR: reducer pythonpath [1]=1 STDERR: reducer r [1,2]=3 STDERR: reducer raw [1]=1 STDERR: reducer re [2]=2 STDERR: reducer readme [3]=3 STDERR: reducer readthedocs [1]=1 STDERR: reducer recommendations [2]=2 STDERR: reducer recsys [1]=1 STDERR: reducer reduce [2]=2 STDERR: reducer reducer [1]=1 STDERR: reducer reference [1]=1 STDERR: reducer regions [1]=1 STDERR: reducer rst [3]=3 STDERR: reducer run [2,6]=8 STDERR: reducer scripts [1]=1 STDERR: reducer secret [1]=1 STDERR: reducer security [1]=1 STDERR: reducer see [1]=1 STDERR: reducer self [3]=3 STDERR: reducer service [1]=1 STDERR: reducer services [1]=1 STDERR: reducer set [2,3]=5 STDERR: reducer setting [1]=1 STDERR: reducer setup [4]=4 STDERR: reducer sign [1]=1 STDERR: reducer simple [1]=1 STDERR: reducer simplejson [1]=1 STDERR: reducer social [2]=2 STDERR: reducer some [1]=1 STDERR: reducer source [2,3]=5 STDERR: reducer ssh [1]=1 STDERR: reducer stable [1]=1 STDERR: reducer stable1 [1]=1 STDERR: reducer step [2]=2 STDERR: reducer streaming [1,2]=3 STDERR: reducer sum [2]=2 STDERR: reducer supports [1]=1 STDERR: reducer sure [1]=1 STDERR: reducer tarballs [1]=1 STDERR: reducer target [1]=1 STDERR: reducer testing [1]=1 STDERR: reducer thanks [1]=1 STDERR: reducer that [1]=1 STDERR: reducer the [3,4]=7 STDERR: reducer this [1]=1 STDERR: reducer time [1]=1 STDERR: reducer to [4,5]=9 STDERR: reducer tracker [1]=1 STDERR: reducer transparently [1]=1 STDERR: reducer travis [2]=2 STDERR: reducer tree [1,1]=2 STDERR: reducer try [1]=1 STDERR: reducer tunnel [1]=1 STDERR: reducer tv [1]=1 STDERR: reducer tz [1]=1 STDERR: reducer up [3]=3 STDERR: reducer upload [1,1]=2 STDERR: reducer us [1]=1 STDERR: reducer use [1]=1 STDERR: reducer using [2]=2 STDERR: reducer v0 [1]=1 STDERR: reducer variables [1,1]=2 STDERR: reducer version [2]=2 STDERR: reducer videos [1]=1 STDERR: reducer w' [1]=1 STDERR: reducer web [1]=1 STDERR: reducer which [1]=1 STDERR: reducer with 
[1,2]=3 STDERR: reducer word [6]=6 STDERR: reducer word_re [2]=2 STDERR: reducer words [1]=1 STDERR: reducer works [1,3]=4 STDERR: reducer write [2]=2 STDERR: reducer www [1]=1 STDERR: reducer yelp [1,3]=4 STDERR: reducer yield [3]=3 STDERR: reducer you [2]=2 STDERR: reducer you'll [1]=1 STDERR: reducer your [4,6]=10 Counters from step 1: (no counters found) Moving /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-reducer_part-00000 -> /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/output/part-00000 Moving /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/step-0-reducer_part-00001 -> /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/output/part-00001 Streaming final output from /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860/output removing tmp directory /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195136.371860
!python mr_word_freq_count.py -r emr $root_dir/README.rst > counts
python: can't open file 'mr_word_freq_count.py': [Errno 2] No such file or directory
job_flow_id='j-35O01QLMRUFED'
!python mr_word_freq_count.py -r emr --emr-job-flow-id=j-35O01QLMRUFED $root_dir/README.rst > counts
using configs in /Users/yoavfreund/.mrjob.conf using existing scratch bucket mrjob-71c4e33417a2cde8 using s3://mrjob-71c4e33417a2cde8/tmp/ as our scratch dir on S3 creating tmp directory /var/folders/80/c2kfvdvx5cx570r4vlzqgb840000gq/T/mr_word_freq_count.yoavfreund.20140507.195205.656489 Copying non-input files into s3://mrjob-71c4e33417a2cde8/tmp/mr_word_freq_count.yoavfreund.20140507.195205.656489/files/ Adding our job to existing job flow j-3MMGSXIO3FQR3 Job launched 30.4s ago, status RUNNING: Running step (mr_word_freq_count.yoavfreund.20140507.195205.656489: Step 1 of 1) Job launched 60.9s ago, status RUNNING: Running step (mr_word_freq_count.yoavfreund.20140507.195205.656489: Step 1 of 1) Job launched 91.4s ago, status RUNNING: Running step (mr_word_freq_count.yoavfreund.20140507.195205.656489: Step 1 of 1) Job on job flow j-3MMGSXIO3FQR3 failed with status WAITING: Waiting after step failed Logs are in s3://yoav.hadoop/j-3MMGSXIO3FQR3/ ec2_key_pair_file not specified, going to S3 Scanning S3 logs for probable cause of failure Waiting 5.0s for S3 eventual consistency Attempting to terminate job... 
Traceback (most recent call last): File "mr_word_freq_count.py", line 48, in <module> MRWordFreqCount.run() File "//anaconda/lib/python2.7/site-packages/mrjob/job.py", line 494, in run mr_job.execute() File "//anaconda/lib/python2.7/site-packages/mrjob/job.py", line 512, in execute super(MRJob, self).execute() File "//anaconda/lib/python2.7/site-packages/mrjob/launch.py", line 147, in execute self.run_job() File "//anaconda/lib/python2.7/site-packages/mrjob/launch.py", line 213, in run_job self.stdout.flush() File "//anaconda/lib/python2.7/site-packages/mrjob/runner.py", line 614, in __exit__ self.cleanup() File "//anaconda/lib/python2.7/site-packages/mrjob/emr.py", line 1010, in cleanup super(EMRJobRunner, self).cleanup(mode=mode) File "//anaconda/lib/python2.7/site-packages/mrjob/runner.py", line 560, in cleanup self._cleanup_job() File "//anaconda/lib/python2.7/site-packages/mrjob/emr.py", line 1084, in _cleanup_job self._opts['ec2_key_pair_file']) File "//anaconda/lib/python2.7/site-packages/mrjob/ssh.py", line 200, in ssh_terminate_single_job ssh_bin, address, ec2_key_pair_file, ['hadoop', 'job', '-list'])) File "//anaconda/lib/python2.7/site-packages/mrjob/ssh.py", line 82, in ssh_run p = Popen(args, stdout=PIPE, stderr=PIPE, stdin=PIPE) File "//anaconda/lib/python2.7/subprocess.py", line 709, in __init__ errread, errwrite) File "//anaconda/lib/python2.7/subprocess.py", line 1326, in _execute_child raise child_exception TypeError: execv() arg 2 must contain only strings
!ls mrjob/examples/
__init__.py mr_jar_step_example.py mr_postfix_bounce mr_word_freq_count.py bash_wrap mr_log_sampler.py mr_text_classifier.py py3k_word_freq_count.py contrib mr_most_used_word.py mr_travelling_salesman mr_cmd.py mr_next_word_stats.py mr_wc.py mr_grep.py mr_page_rank.py mr_wc.rb
%load $root_dir/examples/mr_travelling_salesman/README.rst
Hint: One map-reduce job is enough. You might think that you first need to compute the means $\mu_i=E(X_i)$ and then, in a second pass, compute $$cov(X_i,X_j) = E((X_i-\mu_i)(X_j-\mu_j))$$ However, recall the formula $$ var(X) \doteq E((X-\mu)^2) = E(X^2) - E(X)^2 $$ This formula can be generalized to the covariance matrix.
!wc /home/ubuntu/data/weather/SAMPLE_TMAX.csv
20000 20000 26114979 /home/ubuntu/data/weather/SAMPLE_TMAX.csv