# We've setup the notebook so that the hostname of the master is saved # as CLUSTER_URL. master_ui_address = "".join(CLUSTER_URL.split("//")[1].split(":")[0]) print "Master UI located at http://%s:8080" % master_ui_address application_ui_address = "http://" + sc.appName + ":4040" print "Application UI located at %s" % application_ui_address ratings = # YOUR CODE HERE movies = # YOUR CODE HERE ratings_count = ratings.count() movies_count = movies.count() print "%s ratings and %s movies in the datasets" % (ratings_count, movies_count) ### YOUR CODE HERE training = # YOUR CODE HERE validation = # YOUR CODE HERE test = # YOUR CODE HERE print "Training: %s, validation: %s, test: %s" % (training.count(), validation.count(), test.count()) import matplotlib.pyplot as plot # Magic command to make matplotlib and ipython play nicely together. %matplotlib inline ### YOUR CODE HERE highest_existing_user_id = # YOUR CODE HERE my_user_id = highest_existing_user_id + 1 print "My user ID is %s" % my_user_id # Create a dataset of (movieID, number of ratings) pairs. ratings_per_movie = ratings.map(lambda x: (x[1], 1)).reduceByKey(lambda x,y: x+y) # Join average_ratings with movies to get a dataset with movie names and average ratings. ratings_with_names = movies.map(lambda x: (x[0], x[1])).join(ratings_per_movie) # map transforms ratings_with_names into an RDD where the key is the number of ratings # and the value is a 2-item tuple with (movie name, number of ratings). We reformat the # dataset in this way so that we can use sortByKey to get the most-rated movies. sorted_ratings = ratings_with_names.map(lambda x: (x[1][1], (x[1][0], x[0]))).sortByKey(False) print "Most rated movies:" for ratings_tuple in sorted_ratings.take(50): print ratings_tuple my_ratings_RDD = # YOUR CODE HERE # Remember that in general, you shouldn't use collect()! # We use collect here because we know that the RDD with your # ratings is small. print "My movie ratings: ", my_ratings_RDD.collect() training_with_my_ratings = # YOUR CODE HERE print ("The training dataset now has %s more entries than the original training dataset" % (training_with_my_ratings.count() - training.count())) ### YOUR CODE HERE from pyspark.mllib.recommendation import ALS best_rank = # YOUR CODE HERE print "The best model was trained with rank %s" % best_rank test_rmse = # YOUR CODE HERE print "The model had a RMSE on the test set of %s" % test_rmse predicted_10_highest_rated_movies = # YOUR CODE HERE # This should print a list of 10 movie names and the associated # predicted ratings. print "My highest rated movies: ", predicted_10_highest_rated_movies