""" Import modules """ import urllib import zipfile import pandas as pd import numpy as np import warnings warnings.filterwarnings('ignore') """ Download """ urllib.urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "ml-1m.zip") urllib.urlretrieve("https://s3-eu-west-1.amazonaws.com/recsysrules2015/train-index.zip", "train-index.zip") urllib.urlretrieve("https://s3-eu-west-1.amazonaws.com/recsysrules2015/RuleMLChallenge2015.zip", "RuleMLChallenge2015.zip") """ Extract """ with zipfile.ZipFile("ml-1m.zip", "r") as z: z.extractall("./") with zipfile.ZipFile("train-index.zip", "r") as z: z.extractall("./") with zipfile.ZipFile("RuleMLChallenge2015.zip", "r") as z: z.extractall("./") """ Settings """ movieLensUsers = "./ml-1m/users.dat" movieLensMovies = "./ml-1m/movies.dat" movieLensRatings = "./ml-1m/ratings.dat" recSysRulesDatatypes = "./MappingDBpedia2Movielens_datatypes.csv" recSysRulesCategories = "./MappingDBpedia2Movielens_categories.csv" recSysRulesTrainIndex = "./train-index.csv" """ Import MovieLens 1M """ unames = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code'] users = pd.read_table(movieLensUsers, sep='::', header=None, names=unames) rnames = ['UserID', 'MovieID', 'Rating', 'Timestamp'] ratings = pd.read_table(movieLensRatings, sep='::', header=None, names=rnames) mnames = ['MovieID', 'Title', 'Genres'] movies = pd.read_table(movieLensMovies, sep='::', header=None, names=mnames) """ MovieLens 1M Details """ print "Users: {0}".format(users.shape) print users.head(3) print "Movies: {0}".format(movies.shape) print movies.head(3) print "Ratings: {0}".format(ratings.shape) print ratings.head(3) """ Import RecSysRules Mappings """ datatypes=pd.read_csv(recSysRulesDatatypes, sep=';',) categories=pd.read_csv(recSysRulesCategories, sep=';') datatypes.rename(columns={'title':'Title', 'id':'MovieID'},inplace=True) categories.rename(columns={'title':'Title', 'id':'MovieID'},inplace=True) """ RecSysRules Mappings Details """ print "Datatypes: {0}".format(datatypes.shape) print datatypes[:3] print "Categories: {0}".format(categories.shape) print categories[:3] """ Load train split """ trainIndex = np.loadtxt(recSysRulesTrainIndex, dtype=int) trainRatings = ratings.iloc[trainIndex] print "RecSysRules Train Ratings: {}".format(trainRatings.shape) print trainRatings.head(3) """ Merge movielens """ print "Users {} + Movies {} + Ratings {} ".format(users.shape, movies.shape, trainRatings.shape) trainMovielens = pd.merge(pd.merge(trainRatings, users), movies) print "RecSysRules Train Merged: {} \n ...".format(trainMovielens.shape) print trainMovielens.head(3) """ Merge mappings """ print "Datatypes {} + Categories {}".format(datatypes.shape, categories.shape) mappings = pd.merge(datatypes, categories) print "RecSysRules Mappings Merged: {} \n ...".format(mappings.shape) print mappings[:3] """ Merge all """ mappings.drop('Title', axis=1, inplace=True) print "Train {} + Mappings {}".format(trainMovielens.shape, mappings.shape) recsysrulesTrain = pd.merge(trainMovielens, mappings, left_on="MovieID", right_on="MovieID", how="inner") print "RecSysRules All Merged: {} \n ...".format(recsysrulesTrain.shape) print recsysrulesTrain[:3] print "Number of unique users in test set: {}".format(len(recsysrulesTrain['UserID'].unique())) """ Export csv """ recsysrulesTrain.to_csv("recsysrules-train.csv",float_format="%d")