Script to generate train dataset for RecSysRules 2015 Challenge (Rule-based Recommender Systems for the Web of Data - http://www.csw.inf.fu-berlin.de/ruleml2015/recsysrules-2015.html)
Jaroslav Kuchař (Czech Technical University, Prague)
The train dataset for the challenge is a subset of the MovieLens dataset, a collection of movie ratings from the MovieLens website provided by GroupLens Research. MovieLens 1M, with 1 million ratings from 6000 users on 4000 movies, is used for this challenge. The dataset is enriched with additional semantic information using DBpedia mappings provided by SisInf Lab.
"""
Import modules
"""
import urllib
import zipfile
import pandas as pd
import numpy as np
import warnings
# Install a blanket 'ignore' filter: every warning raised after this point
# is suppressed for the rest of the script.
warnings.filterwarnings('ignore')
"""
Download
"""
# ``urlretrieve`` lives in ``urllib.request`` on Python 3 and directly in
# ``urllib`` on Python 2; resolve it here so the downloads run on both.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

# (source URL, local file name) pairs. Each archive is saved under the given
# name in the current working directory.
downloads = [
    ("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "ml-1m.zip"),
    ("https://s3-eu-west-1.amazonaws.com/recsysrules2015/train-index.zip", "train-index.zip"),
    ("https://s3-eu-west-1.amazonaws.com/recsysrules2015/RuleMLChallenge2015.zip", "RuleMLChallenge2015.zip"),
]
for url, fileName in downloads:
    urlretrieve(url, fileName)
('RuleMLChallenge2015.zip', <httplib.HTTPMessage instance at 0x1048f56c8>)
"""
Extract
"""
# Unpack every downloaded archive into the current working directory.
# The ``with`` block closes each archive as soon as it has been extracted.
for archiveName in ("ml-1m.zip", "train-index.zip", "RuleMLChallenge2015.zip"):
    with zipfile.ZipFile(archiveName, "r") as archive:
        archive.extractall("./")
"""
Settings
"""
# Locations of the extracted input files, relative to the working directory.

# MovieLens 1M tables.
movieLensUsers, movieLensMovies, movieLensRatings = (
    "./ml-1m/users.dat",
    "./ml-1m/movies.dat",
    "./ml-1m/ratings.dat",
)

# DBpedia mapping tables shipped with the challenge.
recSysRulesDatatypes, recSysRulesCategories = (
    "./MappingDBpedia2Movielens_datatypes.csv",
    "./MappingDBpedia2Movielens_categories.csv",
)

# Row positions (into the ratings table) that make up the train split.
recSysRulesTrainIndex = "./train-index.csv"
"""
Import MovieLens 1M
"""
# The MovieLens .dat files are read without a header row, using "::" as the
# field separator and the column names supplied below.
# A separator longer than one character is only supported by pandas' Python
# parsing engine, so it is requested explicitly instead of relying on the
# automatic fallback (which emits a ParserWarning).
# NOTE(review): when run on Python 3 the files may additionally need an
# explicit ``encoding=`` argument -- confirm against the actual data files.
unames = ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
users = pd.read_table(movieLensUsers, sep='::', header=None, names=unames, engine='python')

rnames = ['UserID', 'MovieID', 'Rating', 'Timestamp']
ratings = pd.read_table(movieLensRatings, sep='::', header=None, names=rnames, engine='python')

mnames = ['MovieID', 'Title', 'Genres']
movies = pd.read_table(movieLensMovies, sep='::', header=None, names=mnames, engine='python')
"""
MovieLens 1M Details
"""
# Show the shape and the first three rows of each MovieLens table.
# Single-argument ``print(...)`` behaves identically as a Python 2 print
# statement and as a Python 3 function call.
print("Users: {0}".format(users.shape))
print(users.head(3))
print("Movies: {0}".format(movies.shape))
print(movies.head(3))
print("Ratings: {0}".format(ratings.shape))
print(ratings.head(3))
Users: (6040, 5) UserID Gender Age Occupation Zip-code 0 1 F 1 10 48067 1 2 M 56 16 70072 2 3 M 25 15 55117 Movies: (3883, 3) MovieID Title Genres 0 1 Toy Story (1995) Animation|Children's|Comedy 1 2 Jumanji (1995) Adventure|Children's|Fantasy 2 3 Grumpier Old Men (1995) Comedy|Romance Ratings: (1000209, 4) UserID MovieID Rating Timestamp 0 1 1193 5 978300760 1 1 661 3 978302109 2 1 914 3 978301968
"""
Import RecSysRules Mappings
"""
# Both mapping files are semicolon-separated CSVs with a header row.
datatypes = pd.read_csv(recSysRulesDatatypes, sep=';')
categories = pd.read_csv(recSysRulesCategories, sep=';')

# Rename the mapping columns to the names used by the MovieLens tables so the
# frames can later be joined on them.
columnAliases = {'title': 'Title', 'id': 'MovieID'}
datatypes = datatypes.rename(columns=columnAliases)
categories = categories.rename(columns=columnAliases)
"""
RecSysRules Mappings Details
"""
# Show the shape and the first three rows of each mapping table.
# Single-argument ``print(...)`` behaves identically as a Python 2 print
# statement and as a Python 3 function call.
print("Datatypes: {0}".format(datatypes.shape))
print(datatypes[:3])
print("Categories: {0}".format(categories.shape))
print(categories[:3])
Datatypes: (3156, 10) Title \ 0 Baby, The (1973) 1 Different for Girls (1996) 2 They Might Be Giants (1971) uri MovieID \ 0 http://dbpedia.org/resource/The_Baby_(film) 3280 1 http://dbpedia.org/resource/Different_for_Girls 3282 2 http://dbpedia.org/resource/They_Might_Be_Gian... 3284 uri_data_http://dbpedia.org/ontology/releaseDate \ 0 NaN 1 9/12/97 12:00 AM 2 NaN uri_data_http://dbpedia.org/ontology/Work/runtime \ 0 85 1 96 2 98 uri_data_http://dbpedia.org/ontology/wikiPageRevisionID \ 0 601064785 1 596577620 2 600714850 uri_data_http://dbpedia.org/ontology/runtime \ 0 5100 1 5760 2 5880 uri_data_http://dbpedia.org/ontology/budget \ 0 NaN 1 NaN 2 NaN uri_data_http://dbpedia.org/ontology/wikiPageID \ 0 25529530 1 3570614 2 1615582 uri_data_http://dbpedia.org/ontology/gross 0 NaN 1 300645 2 NaN Categories: (3156, 5006) Title \ 0 Baby, The (1973) 1 Different for Girls (1996) 2 They Might Be Giants (1971) uri MovieID \ 0 http://dbpedia.org/resource/The_Baby_(film) 3280 1 http://dbpedia.org/resource/Different_for_Girls 3282 2 http://dbpedia.org/resource/They_Might_Be_Gian... 3284 uri_relation_http://dbpedia.org/resource/Category:Golden_Bear_winners \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:British_horror_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Universal_Pictures_films \ 0 0 1 0 2 1 uri_relation_http://dbpedia.org/resource/Category:Documentary_films_about_business \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_Jake_Kasdan \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_Tom_McLoughlin \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Animated_feature_films \ 0 0 1 0 2 0 ... \ 0 ... 1 ... 2 ... 
uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_David_Miller \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Martha's_Vineyard \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:The_Avengers_(TV_series) \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_about_pigs \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Medical-themed_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Gambling_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:American_historical_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Adaptations_of_works_by_Edmond_Rostand \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Self-censorship \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_Robert_Rodriguez 0 0 1 0 2 0 [3 rows x 5006 columns]
"""
Load train split
"""
# The train index file holds integer row positions; ``iloc`` uses them to
# select the matching rows of the full ratings table by position.
trainIndex = np.loadtxt(recSysRulesTrainIndex, dtype=int)
trainRatings = ratings.iloc[trainIndex]

# Single-argument ``print(...)`` works on both Python 2 and Python 3.
print("RecSysRules Train Ratings: {}".format(trainRatings.shape))
print(trainRatings.head(3))
RecSysRules Train Ratings: (148407, 4) UserID MovieID Rating Timestamp 0 1 1193 5 978300760 1 1 661 3 978302109 2 1 914 3 978301968
"""
Merge movielens
"""
# Attach user attributes and movie attributes to every train rating.
print("Users {} + Movies {} + Ratings {} ".format(users.shape, movies.shape, trainRatings.shape))

# Join keys are spelled out rather than inferred from whatever column names
# the frames happen to share: ratings <-> users on UserID, then the result
# <-> movies on MovieID.
ratingsWithUsers = pd.merge(trainRatings, users, on="UserID")
trainMovielens = pd.merge(ratingsWithUsers, movies, on="MovieID")

print("RecSysRules Train Merged: {} \n ...".format(trainMovielens.shape))
print(trainMovielens.head(3))
Users (6040, 5) + Movies (3883, 3) + Ratings (148407, 4) RecSysRules Train Merged: (148407, 10) ... UserID MovieID Rating Timestamp Gender Age Occupation Zip-code \ 0 1 1193 5 978300760 F 1 10 48067 1 59 1193 4 977934292 F 50 1 55413 2 81 1193 5 977785864 F 25 0 60640 Title Genres 0 One Flew Over the Cuckoo's Nest (1975) Drama 1 One Flew Over the Cuckoo's Nest (1975) Drama 2 One Flew Over the Cuckoo's Nest (1975) Drama
"""
Merge mappings
"""
# Combine the datatype and category mappings into one row per movie.
print("Datatypes {} + Categories {}".format(datatypes.shape, categories.shape))

# The two mapping tables share exactly the Title, uri and MovieID columns;
# naming them explicitly keeps the join from silently changing if another
# common column ever appears in the input files.
mappings = pd.merge(datatypes, categories, on=["Title", "uri", "MovieID"])

print("RecSysRules Mappings Merged: {} \n ...".format(mappings.shape))
print(mappings[:3])
Datatypes (3156, 10) + Categories (3156, 5006) RecSysRules Mappings Merged: (3156, 5013) ... Title \ 0 Baby, The (1973) 1 Different for Girls (1996) 2 They Might Be Giants (1971) uri MovieID \ 0 http://dbpedia.org/resource/The_Baby_(film) 3280 1 http://dbpedia.org/resource/Different_for_Girls 3282 2 http://dbpedia.org/resource/They_Might_Be_Gian... 3284 uri_data_http://dbpedia.org/ontology/releaseDate \ 0 NaN 1 9/12/97 12:00 AM 2 NaN uri_data_http://dbpedia.org/ontology/Work/runtime \ 0 85 1 96 2 98 uri_data_http://dbpedia.org/ontology/wikiPageRevisionID \ 0 601064785 1 596577620 2 600714850 uri_data_http://dbpedia.org/ontology/runtime \ 0 5100 1 5760 2 5880 uri_data_http://dbpedia.org/ontology/budget \ 0 NaN 1 NaN 2 NaN uri_data_http://dbpedia.org/ontology/wikiPageID \ 0 25529530 1 3570614 2 1615582 uri_data_http://dbpedia.org/ontology/gross \ 0 NaN 1 300645 2 NaN ... \ 0 ... 1 ... 2 ... uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_David_Miller \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Martha's_Vineyard \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:The_Avengers_(TV_series) \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_about_pigs \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Medical-themed_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Gambling_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:American_historical_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Adaptations_of_works_by_Edmond_Rostand \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Self-censorship \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_Robert_Rodriguez 0 0 1 0 2 0 [3 rows x 5013 columns]
"""
Merge all
"""
# The mappings carry their own 'Title' column; drop it so the merged frame
# keeps only the title coming from the MovieLens side.
mappings.drop('Title', axis=1, inplace=True)
print("Train {} + Mappings {}".format(trainMovielens.shape, mappings.shape))

# Inner join on MovieID: ratings for movies that have no mapping row are
# left out of the result.
recsysrulesTrain = pd.merge(trainMovielens, mappings, on="MovieID", how="inner")

print("RecSysRules All Merged: {} \n ...".format(recsysrulesTrain.shape))
print(recsysrulesTrain[:3])
Train (148407, 10) + Mappings (3156, 5012) RecSysRules All Merged: (135674, 5021) ... UserID MovieID Rating Timestamp Gender Age Occupation Zip-code \ 0 1 1193 5 978300760 F 1 10 48067 1 59 1193 4 977934292 F 50 1 55413 2 81 1193 5 977785864 F 25 0 60640 Title Genres ... \ 0 One Flew Over the Cuckoo's Nest (1975) Drama ... 1 One Flew Over the Cuckoo's Nest (1975) Drama ... 2 One Flew Over the Cuckoo's Nest (1975) Drama ... uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_David_Miller \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Martha's_Vineyard \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:The_Avengers_(TV_series) \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_about_pigs \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Medical-themed_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Gambling_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:American_historical_films \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Adaptations_of_works_by_Edmond_Rostand \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Self-censorship \ 0 0 1 0 2 0 uri_relation_http://dbpedia.org/resource/Category:Films_directed_by_Robert_Rodriguez 0 0 1 0 2 0 [3 rows x 5021 columns]
# Count the distinct users present in the merged frame.
# NOTE(review): the label says "test set" although the frame being counted is
# the train data -- confirm whether the wording is intentional.
uniqueUserCount = len(recsysrulesTrain['UserID'].unique())
print("Number of unique users in test set: {}".format(uniqueUserCount))
Number of unique users in test set: 1000
"""
Export csv
"""
# Write the merged train frame to disk in the working directory.
# ``float_format="%d"`` formats floating-point values with an integer format
# string, so they are written without a fractional part.
recsysrulesTrain.to_csv(
    "recsysrules-train.csv",
    float_format="%d",
)