# coded by Agung, PhD
"""Cluster bookmark URLs collected from a directory of SQLite files.

Every file found in ``directory`` is opened as an SQLite database, the
``value`` column of the ``data`` rows named ``BrowserBookmarksProbe`` is
decoded as JSON, and the ``url`` fields of the decoded records are grouped
by string similarity with hierarchical clustering.

The syntax used here is valid on both Python 2.6+ and Python 3.
"""
import json
import os
import pprint
import sqlite3
from contextlib import closing
from difflib import SequenceMatcher

import numpy as np
import pandas as pd

# Folder whose entries are each tried as an SQLite database.
directory = "D:/DATA/example"

# Query selecting the JSON payloads of the bookmark probe rows.
BOOKMARK_QUERY = "SELECT value FROM data WHERE name='BrowserBookmarksProbe'"

# Distance threshold handed to ``HierarchicalClustering.getlevel``.
CLUSTER_LEVEL = 0.2


def list_databases(folder):
    """Return the path of every entry in *folder*, joined with *folder*."""
    return [os.path.join(folder, name) for name in os.listdir(folder)]


def load_bookmark_records(databases):
    """Return the decoded bookmark records stored in *databases*.

    Args:
        databases: iterable of SQLite file paths.

    Returns:
        A list holding one ``json.loads`` result per selected row, in the
        order the databases were given.

    A database that cannot be queried (``sqlite3.Error``) is reported on
    stdout as ``[INFO] <error>`` and skipped. A value that is not valid
    JSON is not handled here, so ``json.loads`` raises to the caller.
    """
    records = []
    for database in databases:
        try:
            # The connection's own context manager only commits/rolls back;
            # ``closing`` guarantees the file handle is actually released.
            with closing(sqlite3.connect(database)) as conn:
                # Query through sqlite3 directly so that driver failures
                # surface as sqlite3.Error and reach the handler below.
                rows = conn.execute(BOOKMARK_QUERY).fetchall()
        except sqlite3.Error as err:
            print("[INFO] %s" % err)
            continue
        records.extend(json.loads(value) for (value,) in rows)
    return records


def build_frame(records):
    """Return a DataFrame with the ``url`` and ``visits`` of each record.

    Every record must provide both keys; a missing key raises ``KeyError``.
    """
    urls = [record['url'] for record in records]
    visits = [record['visits'] for record in records]
    # ``zip`` is a one-shot iterator on Python 3, so materialise it first.
    return pd.DataFrame(data=list(zip(urls, visits)),
                        columns=['url', 'visits'])


def distance(url1, url2):
    """Return the dissimilarity of two strings as ``1 - ratio``.

    ``ratio`` is ``difflib.SequenceMatcher(None, url1, url2).ratio()``, so
    the result is 0.0 for identical strings and 1.0 for strings that have
    no matching characters.
    """
    ratio = SequenceMatcher(None, url1, url2).ratio()
    return 1.0 - ratio


def main():
    """Load the bookmark records, cluster their URLs and print the result."""
    # Third-party package: http://python-cluster.sourceforge.net/
    # Imported here so the helpers above stay importable without it.
    from cluster import HierarchicalClustering

    records = load_bookmark_records(list_databases(directory))
    frame = build_frame(records)

    # Perform clustering on a plain list of URL strings.
    hc = HierarchicalClustering(frame['url'].tolist(), distance)
    clusters = hc.getlevel(CLUSTER_LEVEL)
    pprint.pprint(clusters)


if __name__ == "__main__":
    main()