# Download the KDD Cup 1999 data set and estimate the share of "normal."
# interactions in three ways, timing each one:
#   1. on a 10% RDD sample obtained with the `sample` transformation,
#   2. on the complete RDD,
#   3. on a fixed-size local sample obtained with the `takeSample` action.
#
# The script expects a SparkContext bound to the name `sc` to exist already;
# it is not created here.

import urllib
from time import time

try:
    # Python 3 location of urlretrieve.
    from urllib.request import urlretrieve
except ImportError:
    # Python 2 location of urlretrieve.
    from urllib import urlretrieve

# Number of rows requested from `takeSample`; also the divisor of the ratio
# computed from that sample.
TAKE_SAMPLE_SIZE = 400000

# The download target and `data_file` name the same file in the current
# working directory.
f = urlretrieve(
    "http://kdd.ics.uci.edu/databases/kddcup/kddcup.data.gz".replace(
        "/kddcup/", "/kddcup99/"
    ),
    "kddcup.data.gz",
)

data_file = "./kddcup.data.gz"
raw_data = sc.textFile(data_file)

# --- 1. Ratio on a sampled RDD ---------------------------------------------
# Arguments: withReplacement=False, fraction=0.1, seed=1234.
raw_data_sample = raw_data.sample(False, 0.1, 1234)
sample_size = raw_data_sample.count()
total_size = raw_data.count()
print("Sample size is {} of {}".format(sample_size, total_size))

# transformations to be applied
raw_data_sample_items = raw_data_sample.map(lambda x: x.split(","))
# Here `x` is the list of fields, so `in` tests for a field that is exactly
# "normal.".
sample_normal_tags = raw_data_sample_items.filter(lambda x: "normal." in x)

# actions + time
t0 = time()
sample_normal_tags_count = sample_normal_tags.count()
tt = time() - t0

sample_normal_ratio = sample_normal_tags_count / float(sample_size)
print("The ratio of 'normal' interactions is {}".format(round(sample_normal_ratio,3)))
print("Count done in {} seconds".format(round(tt,3)))

# --- 2. Ratio on the complete RDD ------------------------------------------
# transformations to be applied
raw_data_items = raw_data.map(lambda x: x.split(","))
normal_tags = raw_data_items.filter(lambda x: "normal." in x)

# actions + time
t0 = time()
normal_tags_count = normal_tags.count()
tt = time() - t0

normal_ratio = normal_tags_count / float(total_size)
print("The ratio of 'normal' interactions is {}".format(round(normal_ratio,3)))
print("Count done in {} seconds".format(round(tt,3)))

# --- 3. Ratio on a local sample taken with takeSample -----------------------
t0 = time()
# NOTE: this rebinds `raw_data_sample` from the sampled RDD above to the
# collection returned by `takeSample`; the name is kept because code outside
# this section may rely on it.
raw_data_sample = raw_data.takeSample(False, TAKE_SAMPLE_SIZE, 1234)
# Here `x` is the raw line (not yet split), so `in` is a substring test.
normal_data_sample = [x.split(",") for x in raw_data_sample if "normal." in x]
tt = time() - t0

normal_sample_size = len(normal_data_sample)
normal_ratio = normal_sample_size / float(TAKE_SAMPLE_SIZE)
print("The ratio of 'normal' interactions is {}".format(normal_ratio))
print("Count done in {} seconds".format(round(tt,3)))