"""Cluster the hosts of bookmarked URLs stored in SQLite probe databases.

Every entry of ``directory`` is opened as a SQLite database.  The JSON
payloads stored in its ``data`` table under the name
``BrowserBookmarksProbe`` are decoded, the network location of each
bookmarked URL is extracted, and the first 100 locations are grouped by
string similarity with hierarchical clustering.  One cluster is printed per
line.
"""
import json
import os
import pprint
import sqlite3
from contextlib import closing
from difflib import SequenceMatcher

import numpy as np
import pandas as pd

try:  # Python 2 module layout (what this script was written for).
    from urllib import urlencode
    from urlparse import parse_qs, urlunparse
    # NOTE: ``urlparse`` is deliberately bound to the *module*.  The original
    # imported the function and then the module under the same name, which
    # left the module bound and made ``urlparse(x)`` raise TypeError.
    import urlparse
except ImportError:  # Python 3 moved these helpers into urllib.parse.
    from urllib.parse import urlencode, parse_qs, urlunparse
    import urllib.parse as urlparse

# Folder whose entries are each treated as one SQLite database file.
directory = "D:/DATA/example"

# Selects the JSON payloads written by the browser-bookmarks probe.
BOOKMARK_QUERY = "SELECT value FROM data WHERE name='BrowserBookmarksProbe'"


def parsed(x):
    """Return the network-location (``netloc``) component of URL *x*.

    A string without a scheme separator (e.g. ``"example.com/path"``) has no
    network location according to ``urlparse`` and yields ``""``.
    """
    return urlparse.urlparse(x).netloc


def distance(url1, url2):
    """Return the dissimilarity of two strings as ``1 - similarity ratio``.

    The ratio comes from ``difflib.SequenceMatcher``; identical strings give
    ``0.0`` and strings with nothing in common give ``1.0``.
    """
    ratio = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - ratio


def _list_databases(folder):
    """Return the joined path of every entry found directly in *folder*."""
    return [os.path.join(folder, name) for name in os.listdir(folder)]


def _load_bookmarks(paths):
    """Decode every bookmark payload found in the databases at *paths*.

    A database that cannot be opened or queried is reported with an
    ``[INFO]`` line and skipped, so one bad file does not abort the run.
    """
    bookmarks = []
    for path in paths:
        try:
            # ``closing`` is required: a sqlite3 connection used as a context
            # manager only commits/rolls back, it does not close itself.
            with closing(sqlite3.connect(path)) as conn:
                # Query through sqlite3 directly so failures surface as
                # sqlite3.Error; pandas.read_sql re-raises them as its own
                # DatabaseError, which the handler below would not catch.
                values = conn.execute(BOOKMARK_QUERY).fetchall()
            # Decode the whole file first so a database contributes either
            # all of its rows or none.
            records = [json.loads(value) for (value,) in values]
            bookmarks.extend(records)
        except sqlite3.Error as err:
            print("[INFO] %s" % err)
    return bookmarks


def _host_strings(bookmarks):
    """Return a Series with the hosts of the first 100 bookmarks as ``str``."""
    urls = [entry['url'] for entry in bookmarks]
    visit_counts = [entry['visits'] for entry in bookmarks]
    # list(...) keeps this working where zip() returns a lazy iterator.
    frame = pd.DataFrame(list(zip(urls, visit_counts)),
                         columns=['url', 'visits'])
    hosts = frame['url'].apply(parsed)
    return hosts.head(100).apply(str)


def main():
    """Load the bookmarks, cluster their hosts and print each cluster."""
    # Imported here, at its only point of use, so that the URL helpers above
    # stay importable on machines without the third-party ``cluster`` package.
    from cluster import HierarchicalClustering

    bookmarks = _load_bookmarks(_list_databases(directory))
    strdata = _host_strings(bookmarks)

    # Perform clustering
    hc = HierarchicalClustering(strdata, distance)
    clusters = hc.getlevel(0.2)
    for row in clusters:
        print(row)


if __name__ == "__main__":
    main()