This notebook analyses the domains that link to Windeln.de and aims to help understand the relationships between those websites.
import os
import re
from pandas import *
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from mpld3 import display
%matplotlib inline
Data produced by backlinks.py
# Load the backlink export; the file is tab-separated despite its .csv extension.
df = read_csv('C:/workspace/analysis_reporting/joe/milk_china/anchor_urls.csv', sep='\t')
# df.head()
# One row per distinct (source_domain, alexa_cc_rank) combination.
# `.loc` is used instead of `.ix`: `.ix` was deprecated in pandas 0.20 and
# removed in 1.0, and `.loc` performs the same label-based column selection.
df_domain = df.loc[:, ['source_domain', 'alexa_cc_rank']].drop_duplicates()
Find relationships between source domains
# Build the (source domain -> linked domain) edge list with a link count.
key = ['source_domain', 'links']

# Count how many rows share each (source_domain, links) combination.
# Rows with a missing value in either key column are dropped before counting.
# `.loc` replaces the `.ix` indexer that was removed in pandas 1.0.
df2 = (
    df.loc[:, key]
    .assign(num_links=1)
    .dropna()
    .groupby(key, as_index=False)['num_links']
    .sum()
    .reset_index(drop=True)
)

# `links` holds several target domains joined by '|'; emit one
# (source, target, num_links) triple per target domain.
relation_pairs = [
    (source_domain, link_domain, num_links)
    for source_domain, links, num_links in zip(
        df2['source_domain'], df2['links'], df2['num_links']
    )
    for link_domain in links.split('|')
]

# Drop exact duplicate triples. `drop_duplicates` keeps first-occurrence order,
# so the resulting frame is deterministic (a `set` round-trip is not).
# NOTE(review): the same (src, target) pair can still appear with different
# num_links values when it comes from different `links` strings - confirm
# whether those counts should be summed instead.
df_pair = (
    DataFrame(relation_pairs, columns=['src', 'target', 'num_links'])
    .drop_duplicates()
    .reset_index(drop=True)
)
# Plot the domain relationship graph.
G = nx.Graph()

# Each row of df_pair is (src, target, num_links); the count becomes the
# edge's 'weight' attribute. `.values.tolist()` yields plain Python rows.
G.add_weighted_edges_from(df_pair.values.tolist())

# Compute the layout only after the graph is populated: running
# spring_layout on the still-empty graph yields no positions at all.
pos = nx.spring_layout(G)

plt.figure(figsize=(10, 10))
nx.draw_networkx(G, pos=pos, with_labels=True)
# Render the current matplotlib figure through mpld3.
display()