# We've setup the notebook so that the hostname of the master is saved
# as CLUSTER_URL.
master_ui_address = "".join(CLUSTER_URL.split("//")[1].split(":")[0])
print "Master UI located at http://%s:8080" % master_ui_address

application_ui_address = "http://" + sc.appName + ":4040"
print "Application UI located at %s" % application_ui_address

# Exercise placeholders: bind `ratings` and `movies` to the two datasets.
# Both must support .count(). Later cells call ratings.map(...) with x[1]
# used as the movie ID, and movies.map(...) with x[0] used as the movie ID
# (the join key) and x[1] as the movie name.
ratings = # YOUR CODE HERE
movies = # YOUR CODE HERE

# Count each dataset once and report both sizes.
ratings_count = ratings.count()
movies_count = movies.count()
print "%s ratings and %s movies in the datasets" % (ratings_count, movies_count)

### YOUR CODE HERE
# Exercise placeholders: bind `training`, `validation` and `test`. Each must
# support .count(), which the summary line below calls.
# NOTE(review): presumably these are disjoint splits of `ratings` -- confirm
# against the exercise text.
training = # YOUR CODE HERE 
validation = # YOUR CODE HERE
test = # YOUR CODE HERE

# Report the size of each of the three datasets.
print "Training: %s, validation: %s, test: %s" % (training.count(), validation.count(), test.count())

import matplotlib.pyplot as plot
# Magic command to make matplotlib and ipython play nicely together.
%matplotlib inline

### YOUR CODE HERE

# Exercise placeholder: the largest user ID already in use.
# NOTE(review): presumably derived from `ratings` -- confirm against the
# exercise text.
highest_existing_user_id = # YOUR CODE HERE

# The new user gets the next ID after the highest existing one.
my_user_id = highest_existing_user_id + 1
print "My user ID is %s" % my_user_id

# Create a dataset of (movieID, number of ratings) pairs.
ratings_per_movie = ratings.map(lambda x: (x[1], 1)).reduceByKey(lambda x,y: x+y)

# Join average_ratings with movies to get a dataset with movie names and average ratings.
ratings_with_names = movies.map(lambda x: (x[0], x[1])).join(ratings_per_movie)

# map transforms ratings_with_names into an RDD where the key is the number of ratings
# and the value is a 2-item tuple with (movie name, number of ratings). We reformat the
# dataset in this way so that we can use sortByKey to get the most-rated movies.
sorted_ratings = ratings_with_names.map(lambda x: (x[1][1], (x[1][0], x[0]))).sortByKey(False)
print "Most rated movies:"
for ratings_tuple in sorted_ratings.take(50):
    print ratings_tuple

# Exercise placeholder: an RDD holding your own movie ratings. It must
# support .collect(), which the print below calls.
# NOTE(review): presumably every entry uses my_user_id as its user -- confirm
# against the exercise text.
my_ratings_RDD = # YOUR CODE HERE

# Remember that in general, you shouldn't use collect()!
# We use collect here because we know that the RDD with your
# ratings is small.
print "My movie ratings: ", my_ratings_RDD.collect()

# Exercise placeholder: a dataset that supports .count().
# NOTE(review): presumably `training` combined with my_ratings_RDD -- confirm
# against the exercise text.
training_with_my_ratings = # YOUR CODE HERE

# Sanity check: report how many entries the new dataset has beyond the
# original training set.
print ("The training dataset now has %s more entries than the original training dataset" % 
       (training_with_my_ratings.count() - training.count()))

### YOUR CODE HERE

from pyspark.mllib.recommendation import ALS

# Exercise placeholder: the rank with which the best model was trained.
# NOTE(review): presumably chosen by comparing ALS models on `validation` --
# confirm against the exercise text.
best_rank = # YOUR CODE HERE

print "The best model was trained with rank %s" % best_rank

# Exercise placeholder: the RMSE (root-mean-square error) of the model on
# the test set.
test_rmse = # YOUR CODE HERE
print "The model had a RMSE on the test set of %s" % test_rmse

# Exercise placeholder: the 10 movies with the highest predicted ratings.
# NOTE(review): presumably predictions for my_user_id from the best model --
# confirm against the exercise text.
predicted_10_highest_rated_movies = # YOUR CODE HERE

# This should print a list of 10 movie names and the associated
# predicted ratings.
print "My highest rated movies: ", predicted_10_highest_rated_movies