This script fetches and cleans backlink data for Windeln so that we can further analyze the referral sites behind Windeln.de.
from urlparse import urlparse
from lxml import etree
import json
import re
import os
import requests
from bs4 import BeautifulSoup
from pandas import *
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
The data is downloaded from http://webmeup.com/.
# Backlink export for windeln.de, downloaded from http://webmeup.com/
fpath = 'C:/workspace/analysis_reporting/joe/milk_china/windeln.de-1430237039144.csv'
df = read_csv(fpath, sep='\t', encoding='utf-16')

# Replace the export's own header with short snake_case column names.
df.columns = [
    'source_url',
    'source_title',
    'anchor_url',
    'anchor_text',
    'is_text',
    'nofollow',
    'linking_domain_country',
    'linking_domain_ip',
    'check_date',
    'first_seen',
]

# Split the linking page URL and the link target URL into host and path.
df['source_domain'] = df['source_url'].map(lambda link: urlparse(link).netloc)
df['source_path'] = df['source_url'].map(lambda link: urlparse(link).path)
df['anchor_path'] = df['anchor_url'].map(lambda link: urlparse(link).path)
df['anchor_domain'] = df['anchor_url'].map(lambda link: urlparse(link).netloc)

# Keep rows whose linking domain country is 'cn' and whose target host is
# www.windeln.de.
from_cn = df['linking_domain_country'] == 'cn'
to_windeln = df['anchor_domain'] == 'www.windeln.de'
df_cn = df[from_cn & to_windeln]

print(df.shape)
(214827, 14)
As we know, the number of backlinks doesn't necessarily correlate with the actual referral traffic.
Therefore, we need to evaluate the overall ranking / traffic / influence of each website.
We can query Alexa for each source domain: http://data.alexa.com/data?cli=10&url={domain}.
def get_alexa_rank(domain):
    """Look up Alexa ranking data for *domain*.

    Requests ``http://data.alexa.com/data?cli=10&url=<domain>``, parses the
    XML reply and reads the country rank/code (``SD/COUNTRY``), the global
    rank (``SD/REACH``) and the DMOZ title/description (``DMOZ/SITE``).
    When the title or description is missing and the country rank is below
    10000, the missing field(s) are filled from the site's own HTML via
    ``fetch_meta``. Title and description are additionally translated with
    ``translate``.

    The lookup is attempted at most twice; every failure is printed.

    Args:
        domain: host name to look up.

    Returns:
        A ``(result, response)`` tuple. ``result`` is a dict with the
        ``alexa_*`` keys; when every attempt fails it is a default dict with
        ranks and country code set to -1 and no title/description keys.
        ``response`` is the last ``requests`` response obtained, or ``None``
        when no request succeeded.
    """
    def get_attr(path, attr, default):
        # Attribute of the first element matching *path*; *default* when the
        # element or the attribute is absent.
        node = tree.find(path)
        if node is None:
            return default
        return node.attrib.get(attr, default)

    res_default = {
        'alexa_domain': domain,
        'alexa_gl_rank': -1,
        'alexa_cc_rank': -1,
        'alexa_cc': -1
    }
    try:
        url = 'http://data.alexa.com/data?cli=10&url={0}'.format(domain)
    except UnicodeError:
        # Formatting can fail for a non-ASCII domain under Python 2.
        return res_default, None

    # Bound up front so the final return cannot hit an unbound name when
    # every request raises.
    r = None
    err_count = 0
    while err_count < 2:
        try:
            r = requests.get(url)
            tree = etree.XML(r.content)
            cc_rank = int(get_attr('SD/COUNTRY', 'RANK', -1))
            gl_rank = int(get_attr('SD/REACH', 'RANK', -1))
            cc = get_attr('SD/COUNTRY', 'CODE', '')
            title = get_attr('DMOZ/SITE', 'TITLE', '')
            desc = get_attr('DMOZ/SITE', 'DESC', '')
            if ((title == '') or (desc == '')) and (0 < cc_rank < 10000):
                try:
                    # fetch_meta returns (description, title) in that order.
                    meta_desc, meta_title = fetch_meta(domain)
                except ValueError:
                    # fetch_meta can hand back a bare '' when its request
                    # fails; do not let that discard the ranks found above.
                    meta_desc, meta_title = '', ''
                # Only fill the gaps; keep whatever Alexa already supplied.
                title = title or meta_title
                desc = desc or meta_desc
            return {
                'alexa_domain': domain,
                'alexa_cc_rank': cc_rank,
                'alexa_gl_rank': gl_rank,
                'alexa_cc': cc,
                'alexa_title': title,
                'alexa_title_en': translate(title) if title != '' else '',
                'alexa_desc': desc,
                'alexa_desc_en': translate(desc) if desc != '' else '',
            }, r
        except Exception as err:
            # Best-effort crawl: report the problem and try once more.
            print(err)
            err_count += 1
    return res_default, r
def fetch_meta(url):
    """Fetch a page and read its meta description and ``<title>`` text.

    ``http://`` is prepended when *url* does not start with an http(s)
    scheme.

    Args:
        url: page URL or bare host name.

    Returns:
        A ``(description, title)`` tuple -- note the order. ``('', '')`` is
        returned when the request fails, the status is not 200, or either
        element cannot be found.
    """
    # Anchor the check at the start: a host that merely contains "http"
    # still needs a scheme.
    if re.match(r'https?://', url, re.I) is None:
        url = 'http://' + url
    try:
        r = requests.get(url)
    except Exception as err:
        # Deliberately broad: any fetch problem just means "no metadata".
        print(err)
        # Same shape as every other return so callers can always unpack.
        return '', ''
    if r.status_code != 200:
        return '', ''
    soup = BeautifulSoup(r.content)
    try:
        description = soup.select('meta[name="description"]')[0]['content']
        title = soup.select('title')[0].text
    except Exception:
        return '', ''
    return description, title
def translate(t, lang='zh-en'):
    """Translate *t* through the Yandex translate HTTP API.

    Args:
        t: text to translate.
        lang: translation direction passed as the ``lang`` query parameter;
            defaults to ``'zh-en'``.

    Returns:
        The first entry of the ``text`` list in the JSON reply, or ``''``
        when the request fails, the status is not 200, or the reply carries
        no text. Exceptions are printed, not raised.
    """
    # NOTE(review): this API key is committed in source; it should be moved
    # to configuration and rotated.
    api_key = 'trnsl.1.1.20150429T194039Z.49a0abddd64a0760.b4a75adeabde25bb28d0fe58ad10f3ee994fcf6b'
    endpoint = 'https://translate.yandex.net/api/v1.5/tr.json/translate'
    # Let requests build the query string so '&', '#', spaces and non-ASCII
    # characters in the text are encoded instead of corrupting the URL.
    query = {'key': api_key, 'lang': lang, 'text': t}
    try:
        r = requests.get(endpoint, params=query)
        if r.status_code != 200:
            return ''
        texts = json.loads(r.content).get('text')
        return texts[0] if texts else ''
    except Exception as err:
        print(err)
        return ''
def is_thread(url):
    """Return True when *url* contains 'thread', 'forum' or 'community'."""
    hit = re.search('thread|forum|community', url)
    if hit is None:
        return False
    return True
def get_thread_stats(url):
    """Scrape view and reply counters from a forum-like page.

    Only URLs accepted by ``is_thread`` are fetched. The page text is
    searched for a view counter and a reply counter.

    Returns:
        ``None`` when *url* is not a thread URL, the request raises, or the
        status is not 200. Otherwise a dict with ``'views'`` and
        ``'replies'`` (int, or ``None`` when the counter is not found) and
        ``'soup'`` (the parsed page).
    """
    # The response is published as the module-level name ``r``.
    # NOTE(review): looks like a debugging leftover -- confirm nothing reads
    # it before removing.
    global r

    views_pattern = u'(?:查看|浏览)(?:: ?)(\d+)'
    replies_pattern = u'(?:回复)(?:: ?)(\d+)'

    def extract_count(pattern, text):
        # First captured group of *pattern* in *text*, as an int if it
        # converts; None when the pattern does not occur.
        found = re.search(pattern, text)
        if found is None:
            return None
        digits = found.group(1)
        try:
            return int(digits)
        except Exception:
            return digits

    if not is_thread(url):
        return None
    try:
        r = requests.get(url)
    except Exception:
        return None
    if r.status_code != 200:
        return None

    soup = BeautifulSoup(r.content)
    page_text = soup.text
    return {
        'views': extract_count(views_pattern, page_text),
        'replies': extract_count(replies_pattern, page_text),
        'soup': soup,
    }
def get_connection_links(soup, domains):
    """Collect the hrefs on a page that point at one of *domains*.

    Args:
        soup: parsed page; must provide ``select('a')`` yielding objects
            with a ``get('href')`` method.
        domains: iterable of host names to match against each link's netloc.

    Returns:
        List of href strings, in document order, whose netloc is in
        *domains*. Anchors without an href and hrefs that cannot be parsed
        are skipped.
    """
    wanted = set(domains)
    links = []
    for a in soup.select('a'):
        href = a.get('href')
        if not href:
            # Anchors without an href carry no link.
            continue
        try:
            netloc = urlparse(href).netloc
        except (ValueError, TypeError, AttributeError):
            # One malformed href must not end the scan of the whole page.
            continue
        if netloc in wanted:
            links.append(href)
    return links
def has_promotion(soup):
    """Return True when the page text contains the string '优惠码'.

    Args:
        soup: parsed page exposing its text as ``soup.text``.
    """
    # Unicode literal, like the other Chinese patterns in this file: under
    # Python 2 a plain byte-string pattern cannot match unicode page text.
    return u'优惠码' in soup.text
def output(df,
           urls_path='C:/workspace/analysis_reporting/joe/milk_china/anchor_urls.csv',
           domains_path='C:/workspace/analysis_reporting/joe/milk_china/anchor_domain.csv'):
    """Write the backlink frame and a per-domain summary as tab-separated files.

    Line breaks inside text cells are replaced by spaces first. The work is
    done on a copy, so the caller's frame is left untouched.

    Args:
        df: frame holding at least the columns ``alexa_cc_rank``,
            ``source_domain``, ``alexa_title``, ``alexa_desc``,
            ``alexa_title_en`` and ``alexa_desc_en``.
        urls_path: destination of the full frame.
        domains_path: destination of the summary, i.e. the six columns
            above sorted by ``alexa_cc_rank`` ascending with duplicate rows
            dropped.
    """
    def _flatten(value):
        # Clean cell by cell: NaN/None/numbers raise TypeError in re.sub and
        # are passed through, so one missing value no longer exempts the
        # whole column from cleaning.
        try:
            return re.sub('[\n\r]', ' ', value)
        except TypeError:
            return value

    cleaned = df.copy()
    for column in cleaned.columns:
        cleaned[column] = cleaned[column].apply(_flatten)

    cleaned.to_csv(urls_path, sep='\t', encoding='utf-8')

    summary_columns = [
        'alexa_cc_rank', 'source_domain',
        'alexa_title', 'alexa_desc',
        'alexa_title_en', 'alexa_desc_en',
    ]
    # DataFrame.sort and .ix no longer exist in pandas; sort_values and
    # plain column selection are the replacements.
    summary = cleaned.sort_values('alexa_cc_rank', ascending=True)
    summary = summary[summary_columns].drop_duplicates()
    summary.to_csv(domains_path, sep='\t', encoding='utf-8')
# Alexa data per source domain is cached in a CSV: the lookups only run when
# the cache file does not exist yet.
fpath = 'df_rank_cn.csv'
if not os.path.exists(fpath):
    domain_rank_cn = []
    for i, sd in enumerate(df_cn.source_domain.unique()):
        res, resp = get_alexa_rank(sd)
        domain_rank_cn.append(res)
        # Progress line every 10 domains.
        if i > 0 and not (i % 10):
            try:
                print('{1} processed {0}. res: {2}'.format(sd, i, json.dumps(res)))
            except UnicodeError:
                # Non-ASCII domains can fail to format under Python 2; fall
                # back to a line without the domain and result. Only this
                # error is swallowed so Ctrl-C still stops the crawl.
                print('{1} processed {0}. res:'.format('', i, json.dumps(res)))
    df_rank_cn = DataFrame(domain_rank_cn)
    df_rank_cn.to_csv(fpath, index=0, encoding='utf-8')
else:
    # Decode with the encoding the cache was written in. Without it Python 2
    # yields byte strings, which the unicode patterns applied to alexa_desc
    # later (banned_str) cannot match.
    df_rank_cn = read_csv(fpath, encoding='utf-8')
# Attach the Alexa data to every backlink row by matching source_domain
# against alexa_domain. The left join keeps backlinks without an Alexa row.
df_cn_ = df_cn.merge(
    df_rank_cn,
    how='left',
    left_on='source_domain',
    right_on='alexa_domain',
)
We don't care about the source domains with no China rank (which means they are not influential).
Let's filter them out.
# Keep only backlinks whose source domain has a country rank. Both markers of
# "no rank" are dropped: -1 (default written by the Alexa lookup) and NaN
# (no Alexa row for the domain); the previous `!= -1` test let NaN through.
df_cn2 = df_cn_[df_cn_['alexa_cc_rank'] > 0]

# Per-domain overview, best ranked first (displayed when run in a notebook).
# DataFrame.sort and .ix no longer exist in pandas; sort_values and plain
# column selection are the replacements.
df_cn2.sort_values('alexa_cc_rank', ascending=True)[[
    'alexa_cc_rank', 'source_domain',
    'alexa_title', 'alexa_desc',
    'alexa_title_en', 'alexa_desc_en',
]].drop_duplicates()
alexa_cc_rank | source_domain | alexa_title | alexa_desc | alexa_title_en | alexa_desc_en | |
---|---|---|---|---|---|---|
70992 | 21 | g.alipay.com | Alipay.com | Launched in 2004, Alipay is China's leading th... | Alipay.com | Launched in 2004, Alipay is China's leading th... |
70223 | 26 | best.pconline.com.cn | 太平洋电脑网 | 提供电脑产品介绍、价格信息、企业名录及电脑知识和新闻。 | Pacific computer network | To provide a computer product profile, pricing... |
21903 | 28 | dongxi.douban.com | 豆瓣 | Douban.com is a multi-products web platform wh... | Bean | Douban.com is a multi-products web platform wh... |
166 | 50 | www.babytree.com | 宝宝树 | 中国最大的育儿网站社区,提供育儿博客、10G超大空间免费电子相册、在线交流育儿论坛等服务,为... | The baby tree | China's largest childcare website communities,... |
70514 | 82 | bbs.haiwainet.cn | 海外网 | 海外网是人民日报海外版数字化转型建设的核心内容和重要支撑平台。海外版通过数字化转型,将逐步构... | Overseas network | The overseas network of the People's daily ove... |
79029 | 178 | www.huihui.cn | 惠惠网 | 网易旗下购物搜索和网购推荐平台,提供主流购物网站的商品比价和购物返现服务。 | Benefits benefits network | Web easy banner shopping search and online sho... |
70377 | 204 | www.boc.cn | 中国银行 | 中国银行股份有限公司,国有控股金融机构,业务范围涵盖商业银行、投资银行和保险领域,旗下有中银... | China Bank | China Bank, Inc., the holding financial instit... |
299 | 213 | home.qqbaobao.com | 育儿社区 ,亲宝网育儿论坛 | 育儿社区 - 亲宝网育儿论坛 - | Childcare community ,my precious Internet pare... | Parenting community - my precious Internet par... |
72969 | 232 | www.ebrun.com | 亿邦动力 | 中国最大的电子商务网络媒体,为做电子商务的企业、电商经理人,每日提供电商行业的公司、人物、政... | Million Federal power | China's largest e-Commerce Network Media, to e... |
400 | 236 | www.mgpyh.com | 买个便宜货、美国便宜货致力于提供最靠谱的海淘、国内购物优惠资讯、汇集数十万网友的智慧,分享真... | \r\n \r\n\t 买个便宜货:买得到的美好生活 | Buy cheap United States, cheap committed to pr... | \r\n\r\nBuy cheap: buy the good life |
75569 | 360 | www.letsebuy.com | 门户 | 海淘网站_海外代购网站_海淘攻略_海淘_网上代购_尽在海外E购-letsebuy官网 - ... | Portal | Sea phase network Station-overseas purchased o... |
78692 | 405 | www.55haitao.com | 55海淘网是目前最值得信赖的专业海淘站,包括美国主流的海淘网站,详尽的海淘网址大全,更是目前... | 海淘网站-海淘返利-海淘族最值得信赖的海淘返利网站|55海淘网 | 55 sea phase network is by far the most trustw... | Sea-going website-the phase to return the-sea ... |
70432 | 508 | bbs.55bbs.com | 我爱打折 | 购物、美食、娱乐、丽人……最具人气的生活消费资讯分享社区。 | I love a discount | Shopping, food, entertainment, beautiful peopl... |
71679 | 513 | bbs.bozhong.com | 播种网 | 播种网(bozhong.com)创立于2003年,创始人非非妈妈是一位有着多年从医经验的妇科... | The seed network | The seed network(bozhong. com)was created in 2... |
70933 | 513 | riji.bozhong.com | 播种网 | 播种网(bozhong.com)创立于2003年,创始人非非妈妈是一位有着多年从医经验的妇科... | The seed network | The seed network(bozhong. com)was created in 2... |
251 | 529 | bbs.seedit.com | 播种网 | 播种网(bozhong.com)创立于2003年,创始人非非妈妈是一位有着多年从医经验的妇科... | The seed network | The seed network(bozhong. com)was created in 2... |
78953 | 605 | guangdiu.com | 最全的实时折扣聚合,汇聚了什么值得买、美国便宜货、折800、惠惠网、北美省钱快报、360特惠... | 逛丢 | 实时同步全网折扣 | Best real-time discounts cohesion, bringing to... | Around lose | real-time synchronization of the... |
70214 | 622 | forum.51nb.com | 专门网 | 专业的笔记本电脑技术交流网站 | Dedicated web | Professional laptop technology exchange website |
79952 | 656 | www.deyi.com | 得意生活(deyi.com)是武汉最大的本地生活交流与服务平台,提供最新最全的武汉打折、武汉... | 得意生活_武汉生活消费社区_武汉论坛_武汉打折 | Get a life(deyi. com)is Wuhan's largest local ... | That life of _ Wuhan consumer communities _ Wu... |
157 | 753 | bbs.8080.net | 极速网 | 华东地区最具商业价值IT门户网站,立足南京放眼全球,贴近市场彰显专业。 | Speed network | East China region the most commercial value of... |
70759 | 758 | www.haitaobei.com | 海淘贝-海淘购物分享社区 | 海淘贝是一家基于SNS海淘购物分享交流网站。在海淘贝你可以展示达人风采,分享海淘宝贝,交流最... | Sea phase.-sea-going shopping sharing community | The sea phase is a home-based SNS sea-going sh... |
36362 | 803 | bbs.fobshanghai.com | FOB Business Forum | Business Forum with information on Chinese and... | FOB Business Forum | Business Forum with information on Chinese and... |
71450 | 806 | www.123haitao.com | 极客海淘(123haitao.com),是以品质生活为主题的购物分享开放社区,鼓励每一个网友... | 极客海淘,属于你的购物分享社区 | The geek, the sea phase (the 123haitao. com), ... | Geek-sea phase of your shopping sharing community |
413 | 1003 | www.ebama.net | 首页 | 爱孩子,爱上爸妈网-幼儿英语|少儿英语|儿童英语|儿童教育 - | On the first page | Love the kid,love mom and dad web-kindergarten... |
70637 | 1032 | we.walatao.com | 瓦拉淘是一款致力于让你海淘更便捷的浏览器插件,简化海淘购物流程,提供诸多实用小工具,如:全文... | 海淘百科_瓦拉淘 | The phase is a commitment to make your sea pha... | Sea phase ENCYCLOPAEDIA of _ Vala phase |
22638 | 1065 | bbs.ahlife.com | 合肥论坛网站中,TOGO论坛是合肥成立较早的,2003年便成立了。是合肥论坛网站中最大的生活... | 合肥论坛_TOGO论坛_安徽生活网论坛_精致生活 尽在安徽生活网 - | Hefei Forum website, TOGO Forum is the fat est... | Hefei Forum _TOGO Forum of _ of Anhui, live we... |
16554 | 1436 | www.020.com | 广州网 | 广州网社区提供婚嫁、家居、美食、房产等讨论交流。 | Guangzhou network | Guangzhou web community to provide marriage, f... |
70537 | 1487 | www.gzmama.com | 广州妈妈网:广州妈妈首选的备孕、怀孕、育儿、早教等育儿交流平台,汇集广州美食、购物、情感、生... | 广州妈妈网_官方网站,广州妈妈首选育儿、生活等交流互动社区 - | Guangzhou mother network: Guangzhou, mom's fir... | Guangzhou mother network _ official website, G... |
70481 | 1570 | bbs.rebatesme.com | RebatesMe海淘论坛是一个内容丰富的海淘交流平台,为您提供海淘教程、海淘攻略、海淘商家... | RebatesMe海淘论坛|海淘交流-海淘返利-海淘优惠-海淘攻略助您轻松海淘 - | RebatesMe sea phase Forum is a content-rich se... | RebatesMe sea-going Forum|sea-phase exchange-t... |
77751 | 1580 | www.buytong.cn | NaN | NaN | NaN | NaN |
... | ... | ... | ... | ... | ... | ... |
70247 | 22732 | woliwowai.com.cn | NaN | NaN | NaN | NaN |
79951 | 22785 | bbs.wfits.com | 潍坊论坛 | 本地论坛,设有潍坊生活、吃遍潍坊、原创摄影、潍坊商业街、交友天地、情感天空等板块。 | 潍 mill Forum | Local forum, there are 潍 mill life, eat 潍 mill... |
70751 | 23119 | www.buluanmai.com | 不乱买名品购物 | 不乱买(www.buluanmai.com)是一个全中文的时尚海淘导购网站。这里汇集了100... | Don't buy name products, shopping | Don't buy(www. buluanmai. com)is one of the Ch... |
71808 | 23251 | bbs.shunderen.com | 顺德人网站 | 一个顺德人的综合社区。 | By German website | One way of integrated communities. |
77426 | 23650 | superhaitao.com | NaN | NaN | NaN | NaN |
71732 | 23650 | www.superhaitao.com | NaN | NaN | NaN | NaN |
71323 | 23904 | bbs.3uol.com | 三优之家 | “三优”即优生、优育、优教。三优之家网,是面向全国孕产育儿人群提供专业优生优育服务的大型社区... | Three home | “The Big Three”namely, students, nurturing and... |
71428 | 23975 | www.90792.com | NaN | NaN | NaN | NaN |
8031 | 24435 | ebuybible.com | NaN | NaN | NaN | NaN |
70667 | 24435 | www.ebuybible.com | NaN | NaN | NaN | NaN |
70286 | 25932 | zhengpinle.com | NaN | NaN | NaN | NaN |
78594 | 26597 | www.zhbaby.com.cn | NaN | NaN | NaN | NaN |
79620 | 26597 | zhbaby.com.cn | NaN | NaN | NaN | NaN |
79961 | 26993 | www.9iyouhui.com | NaN | NaN | NaN | NaN |
18562 | 27738 | www.symama.com | NaN | NaN | NaN | NaN |
70981 | 29926 | www.ht51.com | NaN | NaN | NaN | NaN |
70891 | 29965 | www.easyhaitao.com | NaN | NaN | NaN | NaN |
71425 | 30976 | www.axoeurope.com | NaN | NaN | NaN | NaN |
22341 | 31313 | www.wxmama.com | NaN | NaN | NaN | NaN |
70574 | 34972 | www.178good.com | NaN | NaN | NaN | NaN |
270 | 37583 | www.jmmama.com.cn | 江门妈妈网 | 江门妈妈网是关于家庭生活、亲子育儿、婚姻家庭、工作情感、怀孕咨询的互助互动社区,集聚会活动、... | Jiang door mom network | Jiang door, mom network, on family life, paren... |
70262 | 38707 | hao.yqie.com | NaN | NaN | NaN | NaN |
44218 | 41147 | shopping.ladymax.cn | Ladymax女性网 | 专业女性时尚网站。最爱分享 时尚购物入口 包括美容 服饰搭配等资讯。 | Ladymax female network | Professional female fashion website. Most love... |
70784 | 43437 | www.csmama.net | NaN | NaN | NaN | NaN |
70844 | 43606 | www.herebbs.com | NaN | NaN | NaN | NaN |
116 | 43606 | herebbs.com | NaN | NaN | NaN | NaN |
71080 | 49894 | www.hack6.com | NaN | NaN | NaN | NaN |
71479 | 767521 | www.shmama.net | NaN | NaN | NaN | NaN |
8936 | NaN | www.dd288.com | NaN | NaN | NaN | NaN |
70489 | NaN | www.bank-of-china.com | NaN | NaN | NaN | NaN |
147 rows × 6 columns
There are payment websites like g.alipay.com. Filter them out as well.
# Drop source domains whose Alexa description mentions bank (银行), freight
# forwarding (转运), mailing (邮寄), media (媒体), news (新闻) or Alipay.
banned_str = u'银行|转运|邮寄|媒体|新闻|Alipay'
is_banned = df_cn2['alexa_desc'].fillna(u'').str.contains(banned_str)
# .copy(): df_cn3 gets new columns assigned later on, which must not happen
# on a slice of df_cn2.
df_cn3 = df_cn2[~is_banned].copy()

# Per-domain overview, best ranked first (displayed when run in a notebook).
# DataFrame.sort and .ix no longer exist in pandas; sort_values and plain
# column selection are the replacements.
df_cn3.sort_values('alexa_cc_rank', ascending=True)[[
    'alexa_cc_rank', 'source_domain',
    'alexa_title', 'alexa_desc',
]].drop_duplicates()
alexa_cc_rank | source_domain | alexa_title | alexa_desc | |
---|---|---|---|---|
70223 | 26 | best.pconline.com.cn | 太平洋电脑网 | 提供电脑产品介绍、价格信息、企业名录及电脑知识和新闻。 |
21903 | 28 | dongxi.douban.com | 豆瓣 | Douban.com is a multi-products web platform wh... |
166 | 50 | www.babytree.com | 宝宝树 | 中国最大的育儿网站社区,提供育儿博客、10G超大空间免费电子相册、在线交流育儿论坛等服务,为... |
70514 | 82 | bbs.haiwainet.cn | 海外网 | 海外网是人民日报海外版数字化转型建设的核心内容和重要支撑平台。海外版通过数字化转型,将逐步构... |
77102 | 178 | www.huihui.cn | 惠惠网 | 网易旗下购物搜索和网购推荐平台,提供主流购物网站的商品比价和购物返现服务。 |
70377 | 204 | www.boc.cn | 中国银行 | 中国银行股份有限公司,国有控股金融机构,业务范围涵盖商业银行、投资银行和保险领域,旗下有中银... |
299 | 213 | home.qqbaobao.com | 育儿社区 ,亲宝网育儿论坛 | 育儿社区 - 亲宝网育儿论坛 - |
72969 | 232 | www.ebrun.com | 亿邦动力 | 中国最大的电子商务网络媒体,为做电子商务的企业、电商经理人,每日提供电商行业的公司、人物、政... |
16 | 236 | www.mgpyh.com | 买个便宜货、美国便宜货致力于提供最靠谱的海淘、国内购物优惠资讯、汇集数十万网友的智慧,分享真... | \r\n \r\n\t 买个便宜货:买得到的美好生活 |
76535 | 360 | www.letsebuy.com | 门户 | 海淘网站_海外代购网站_海淘攻略_海淘_网上代购_尽在海外E购-letsebuy官网 - ... |
35610 | 405 | www.55haitao.com | 55海淘网是目前最值得信赖的专业海淘站,包括美国主流的海淘网站,详尽的海淘网址大全,更是目前... | 海淘网站-海淘返利-海淘族最值得信赖的海淘返利网站|55海淘网 |
70432 | 508 | bbs.55bbs.com | 我爱打折 | 购物、美食、娱乐、丽人……最具人气的生活消费资讯分享社区。 |
71679 | 513 | bbs.bozhong.com | 播种网 | 播种网(bozhong.com)创立于2003年,创始人非非妈妈是一位有着多年从医经验的妇科... |
70933 | 513 | riji.bozhong.com | 播种网 | 播种网(bozhong.com)创立于2003年,创始人非非妈妈是一位有着多年从医经验的妇科... |
251 | 529 | bbs.seedit.com | 播种网 | 播种网(bozhong.com)创立于2003年,创始人非非妈妈是一位有着多年从医经验的妇科... |
76214 | 605 | guangdiu.com | 最全的实时折扣聚合,汇聚了什么值得买、美国便宜货、折800、惠惠网、北美省钱快报、360特惠... | 逛丢 | 实时同步全网折扣 |
70216 | 622 | forum.51nb.com | 专门网 | 专业的笔记本电脑技术交流网站 |
79952 | 656 | www.deyi.com | 得意生活(deyi.com)是武汉最大的本地生活交流与服务平台,提供最新最全的武汉打折、武汉... | 得意生活_武汉生活消费社区_武汉论坛_武汉打折 |
157 | 753 | bbs.8080.net | 极速网 | 华东地区最具商业价值IT门户网站,立足南京放眼全球,贴近市场彰显专业。 |
70759 | 758 | www.haitaobei.com | 海淘贝-海淘购物分享社区 | 海淘贝是一家基于SNS海淘购物分享交流网站。在海淘贝你可以展示达人风采,分享海淘宝贝,交流最... |
36078 | 803 | bbs.fobshanghai.com | FOB Business Forum | Business Forum with information on Chinese and... |
71450 | 806 | www.123haitao.com | 极客海淘(123haitao.com),是以品质生活为主题的购物分享开放社区,鼓励每一个网友... | 极客海淘,属于你的购物分享社区 |
413 | 1003 | www.ebama.net | 首页 | 爱孩子,爱上爸妈网-幼儿英语|少儿英语|儿童英语|儿童教育 - |
70637 | 1032 | we.walatao.com | 瓦拉淘是一款致力于让你海淘更便捷的浏览器插件,简化海淘购物流程,提供诸多实用小工具,如:全文... | 海淘百科_瓦拉淘 |
23466 | 1065 | bbs.ahlife.com | 合肥论坛网站中,TOGO论坛是合肥成立较早的,2003年便成立了。是合肥论坛网站中最大的生活... | 合肥论坛_TOGO论坛_安徽生活网论坛_精致生活 尽在安徽生活网 - |
16554 | 1436 | www.020.com | 广州网 | 广州网社区提供婚嫁、家居、美食、房产等讨论交流。 |
20329 | 1487 | www.gzmama.com | 广州妈妈网:广州妈妈首选的备孕、怀孕、育儿、早教等育儿交流平台,汇集广州美食、购物、情感、生... | 广州妈妈网_官方网站,广州妈妈首选育儿、生活等交流互动社区 - |
70481 | 1570 | bbs.rebatesme.com | RebatesMe海淘论坛是一个内容丰富的海淘交流平台,为您提供海淘教程、海淘攻略、海淘商家... | RebatesMe海淘论坛|海淘交流-海淘返利-海淘优惠-海淘攻略助您轻松海淘 - |
77751 | 1580 | www.buytong.cn | NaN | NaN |
71843 | 1735 | www.tompda.com | TomPda | TOMPDA隶属于北京兴华晨辉数码科技有限公司创办于2002年12月,是中国知名的智能手机专... |
... | ... | ... | ... | ... |
70247 | 22732 | woliwowai.com.cn | NaN | NaN |
79951 | 22785 | bbs.wfits.com | 潍坊论坛 | 本地论坛,设有潍坊生活、吃遍潍坊、原创摄影、潍坊商业街、交友天地、情感天空等板块。 |
70751 | 23119 | www.buluanmai.com | 不乱买名品购物 | 不乱买(www.buluanmai.com)是一个全中文的时尚海淘导购网站。这里汇集了100... |
71808 | 23251 | bbs.shunderen.com | 顺德人网站 | 一个顺德人的综合社区。 |
77426 | 23650 | superhaitao.com | NaN | NaN |
71732 | 23650 | www.superhaitao.com | NaN | NaN |
71323 | 23904 | bbs.3uol.com | 三优之家 | “三优”即优生、优育、优教。三优之家网,是面向全国孕产育儿人群提供专业优生优育服务的大型社区... |
71428 | 23975 | www.90792.com | NaN | NaN |
8031 | 24435 | ebuybible.com | NaN | NaN |
70667 | 24435 | www.ebuybible.com | NaN | NaN |
70286 | 25932 | zhengpinle.com | NaN | NaN |
78594 | 26597 | www.zhbaby.com.cn | NaN | NaN |
79620 | 26597 | zhbaby.com.cn | NaN | NaN |
79961 | 26993 | www.9iyouhui.com | NaN | NaN |
18562 | 27738 | www.symama.com | NaN | NaN |
70981 | 29926 | www.ht51.com | NaN | NaN |
70891 | 29965 | www.easyhaitao.com | NaN | NaN |
71425 | 30976 | www.axoeurope.com | NaN | NaN |
22341 | 31313 | www.wxmama.com | NaN | NaN |
70574 | 34972 | www.178good.com | NaN | NaN |
270 | 37583 | www.jmmama.com.cn | 江门妈妈网 | 江门妈妈网是关于家庭生活、亲子育儿、婚姻家庭、工作情感、怀孕咨询的互助互动社区,集聚会活动、... |
70262 | 38707 | hao.yqie.com | NaN | NaN |
44218 | 41147 | shopping.ladymax.cn | Ladymax女性网 | 专业女性时尚网站。最爱分享 时尚购物入口 包括美容 服饰搭配等资讯。 |
70784 | 43437 | www.csmama.net | NaN | NaN |
70844 | 43606 | www.herebbs.com | NaN | NaN |
116 | 43606 | herebbs.com | NaN | NaN |
71080 | 49894 | www.hack6.com | NaN | NaN |
71479 | 767521 | www.shmama.net | NaN | NaN |
8936 | NaN | www.dd288.com | NaN | NaN |
70489 | NaN | www.bank-of-china.com | NaN | NaN |
146 rows × 4 columns
We find that quite a lot of the sources are bbs/forum-like websites, such as www.bjmama.com and home.qqbaobao.com, which attract quite significant views. We cannot rely on the Alexa ranking alone, because it only reflects a site's overall traffic and not the actual pageviews of a particular page.
So let's scrape their views and replies.
# Work on an independent frame so the column assignments below never write
# into a slice of another frame.
df_cn3 = df_cn3.copy()

source_domains = df_cn3.source_domain.unique().tolist()
# Index labels of the rows whose source URL looks like a forum thread.
idx_threads = df_cn3[df_cn3.source_url.str.contains('thread|forum|community')].index.tolist()
for c, i in enumerate(idx_threads):
    # Reset per row so the progress line never shows another row's links.
    links = []
    res = get_thread_stats(df_cn3.loc[i, 'source_url'])
    if res is not None:
        df_cn3.loc[i, 'views'] = res.get('views')
        df_cn3.loc[i, 'replies'] = res.get('replies')
        soup = res.get('soup')
        # Links on this page that lead to one of the other source domains.
        if soup is not None:
            other_domains = [d for d in source_domains if d != df_cn3.loc[i, 'source_domain']]
            links = get_connection_links(soup, other_domains)
            df_cn3.loc[i, 'links'] = '|'.join(links)
            has_promo = has_promotion(soup)
            df_cn3.loc[i, 'has_promotion'] = has_promo
    # Progress line every 50 threads; res is None when the fetch failed, so
    # it must not be dereferenced unguarded here.
    if (c > 0) and (not c % 50):
        stats = res if res is not None else {}
        print('%s %s %s %s %s' % (
            c, df_cn3.loc[i, 'source_url'],
            stats.get('views'), stats.get('replies'), links))
150 http://www.gzmama.com/thread-3846190-1-1.html 8433 54 ['http://www.bjmama.com/', 'http://www.shmama.net/', 'http://www.szmama.com/', 'http://www.symama.com/', 'http://www.xamama.net/', 'http://www.jnmama.com/', 'http://www.qdmama.net/', 'http://www.csmama.net/', 'http://www.wxmama.com/'] 200 http://www.55haitao.com/bbs/thread-7828-5-1.html None None [] 250 http://www.letsebuy.com/thread-1539361-1-1.html 362 7 ['http://www.boc.cn/sourcedb/whpj/', 'http://www.eur-go.com/', 'http://www.eur-go.com', 'http://www.eur-go.com/price-service-134-eurgo.aspx'] 300 http://www.letsebuy.com/thread-2003552-1-1.html 1087 37 ['http://www.boc.cn/sourcedb/whpj/'] 350
def output(df,
           urls_path='C:/workspace/analysis_reporting/joe/milk_china/anchor_urls.csv',
           domains_path='C:/workspace/analysis_reporting/joe/milk_china/anchor_domain.csv'):
    """Write the backlink frame and a per-domain summary as tab-separated files.

    Args:
        df: frame holding at least the columns ``alexa_cc_rank``,
            ``source_domain``, ``alexa_title``, ``alexa_desc``,
            ``alexa_title_en`` and ``alexa_desc_en``.
        urls_path: destination of the full frame. The previous body wrote to
            an undefined name ``k``; the default is the anchor_urls.csv path
            used by the earlier definition of this function.
        domains_path: destination of the summary, i.e. the six columns
            above sorted by ``alexa_cc_rank`` ascending with duplicate rows
            dropped.
    """
    df.to_csv(urls_path, sep='\t', encoding='utf-8')

    summary_columns = [
        'alexa_cc_rank', 'source_domain',
        'alexa_title', 'alexa_desc',
        'alexa_title_en', 'alexa_desc_en',
    ]
    # DataFrame.sort and .ix no longer exist in pandas; sort_values and
    # plain column selection are the replacements.
    summary = df.sort_values('alexa_cc_rank', ascending=True)
    summary = summary[summary_columns].drop_duplicates()
    summary.to_csv(domains_path, sep='\t', encoding='utf-8')
# Export the filtered backlinks (df_cn3) to CSV through output().
output(df_cn3)