import pandas as pd
import numpy as np
import os, sqlite3
import json
from urllib import urlencode
from urlparse import urlparse, parse_qs, urlunparse
import urlparse
import pprint
from difflib import SequenceMatcher
def parsed(x):
parse = urlparse(x)
return parse.netloc
lines = []
databases = []
dataku = []
directory = "D:/DATA/example"
for filename in os.listdir(directory):
flname = os.path.join(directory, filename)
databases.append(flname)
for database in databases:
try:
with sqlite3.connect(database) as conn:
cur = conn.cursor()
sqlqry = pd.read_sql("SELECT value FROM data WHERE name='BrowserBookmarksProbe'",conn)
a = sqlqry['value']
#b = sqlqry['timestamp']
records = [json.loads(line) for line in a]
for row in records:
dataku.append(row)
except sqlite3.Error, err:
print "[INFO] %s" % err
url = [url['url'] for url in dataku]
visits = [visits['visits'] for visits in dataku]
datazip = zip(url,visits)
df = pd.DataFrame(datazip, columns=['url','visits'])
dfnew= df['url'].apply(parsed)
clusterdata = dfnew.head(100)
clusterdata.head(10)
0 m.blog.naver.com 1 m.facebook.com 2 details 3 story-api.kakao.com 4 m.cafe.daum.net 5 details 6 cafe.daum.net 7 m.facebook.com 8 j.exit-ad.com 9 asked.kr Name: url, dtype: object
strdata = clusterdata.apply(str)
strdata.head(10)
0 m.blog.naver.com 1 m.facebook.com 2 details 3 story-api.kakao.com 4 m.cafe.daum.net 5 details 6 cafe.daum.net 7 m.facebook.com 8 j.exit-ad.com 9 asked.kr Name: url, dtype: object
import pprint
from difflib import SequenceMatcher
from cluster import HierarchicalClustering
def distance(url1, url2):
ratio = SequenceMatcher(None, url1, url2).ratio()
return 1.0 - ratio
# Perform clustering
hc = HierarchicalClustering(strdata, distance)
clusters = hc.getlevel(0.2)
clusters
[['details', 'details'], ['auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at'], ['bit.ly'], ['asked.kr', 'asked.kr', 'asked.kr', 'asked.kr'], ['cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr'], ['jesuside.com.ne.kr'], ['t.co', 't.co'], ['j.exit-ad.com'], ['m.event.toast.com'], ['twitpic.com'], ['i1.media.daumcdn.net'], ['cafe.daum.net', 'm.cafe.daum.net', 'm.cafe.daum.net'], ['eugenejulia.tistory.com'], ['m.imbc.com'], ['m.winixcorp.com'], ['m.humoruniv.com', 'web.humoruniv.com'], ['ggstory.com'], ['m.gsshop.com', 'm.gsshop.com'], ['m.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com'], ['instagram.com'], ['m.universaltoy.co.kr'], ['m.insight.co.kr'], ['www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr'], ['www.lotteconf.co.kr'], ['www.skt-lte.co.kr'], ['www.ebs.co.kr'], ['m.11am.co.kr'], ['www.annpiona.co.kr', 'm.hani.co.kr', 'www.hani.co.kr'], ['www.pulmuoneamio.com'], ['www.threemusic.com'], ['www.youtube.com'], ['www.gobalnews.com'], ['www.google.com'], ['www.100tap.com'], ['story-api.kakao.com'], ['www.korea-ps.com'], ['m.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com'], ['m.cafe.naver.com', 'm.cafe.naver.com', 'm.cafe.naver.com', 'nid.naver.com', 'nid.naver.com', 'www.naver.com', 'm.news.naver.com', 'news.naver.com'], ['plus.kakao.com'], ['www.dalkomm.com'], ['www.kakao.com']]
for row in clusters:
print row
['details', 'details'] ['auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at', 'auth.waffle.at'] ['bit.ly'] ['asked.kr', 'asked.kr', 'asked.kr', 'asked.kr'] ['cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr', 'cyber.kepco.co.kr'] ['jesuside.com.ne.kr'] ['t.co', 't.co'] ['j.exit-ad.com'] ['m.event.toast.com'] ['twitpic.com'] ['i1.media.daumcdn.net'] ['cafe.daum.net', 'm.cafe.daum.net', 'm.cafe.daum.net'] ['eugenejulia.tistory.com'] ['m.imbc.com'] ['m.winixcorp.com'] ['m.humoruniv.com', 'web.humoruniv.com'] ['ggstory.com'] ['m.gsshop.com', 'm.gsshop.com'] ['m.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com', 'm.facebook.com'] ['instagram.com'] ['m.universaltoy.co.kr'] ['m.insight.co.kr'] ['www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr', 'www.shescoming.co.kr'] ['www.lotteconf.co.kr'] ['www.skt-lte.co.kr'] ['www.ebs.co.kr'] ['m.11am.co.kr'] ['www.annpiona.co.kr', 'm.hani.co.kr', 'www.hani.co.kr'] ['www.pulmuoneamio.com'] ['www.threemusic.com'] ['www.youtube.com'] ['www.gobalnews.com'] ['www.google.com'] ['www.100tap.com'] ['story-api.kakao.com'] ['www.korea-ps.com'] ['m.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com', 'm.blog.naver.com'] ['m.cafe.naver.com', 'm.cafe.naver.com', 'm.cafe.naver.com', 'nid.naver.com', 'nid.naver.com', 'www.naver.com', 'm.news.naver.com', 'news.naver.com'] ['plus.kakao.com'] ['www.dalkomm.com'] ['www.kakao.com']