# Word count over a text file stored in HDFS.
#
# NOTE(review): `sc` is not defined in this file; presumably it is the
# SparkContext supplied by the interactive pyspark shell -- confirm.
# NOTE(review): earlier commented-out code downloaded
# http://www.gutenberg.org/files/30760/30760-0.txt with urllib2, so
# book.txt is presumably that text uploaded to HDFS -- confirm.

# Despite the name, each element of `words` is one line of the input file.
words = sc.textFile("hdfs://localhost:9000/Python/book.txt")

# Peek at up to five lines that begin with a space. The result is not
# assigned, so it is only visible when run in an interactive shell.
words.filter(lambda w: w.startswith(" ")).take(5)

# split() with no argument splits on any run of whitespace and never
# yields empty strings, so blank lines and repeated spaces no longer
# produce a bogus "" key the way split(" ") did.
counts = (
    words.flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

# Persist the (word, count) pairs to HDFS.
counts.saveAsTextFile("hdfs://localhost:9000/Python/spark_output1")

# Pull every (word, count) pair back to the driver; as with take() above,
# the result is not assigned.
counts.collect()