%matplotlib inline
import requests
import pandas as pd
import numpy as np
# Yahoo's major-traders page: per-broker buy/sell volumes for one stock.
stock_id = "2451"
res = requests.get("http://tw.stock.yahoo.com/d/s/major_%s.html" % stock_id)
tables = pd.read_html(res.text)
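# pd.read_html turns every <table> on the page into its own DataFrame;
# poke at a few of them below, then pick out the one we want by its shape.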
#for one_table in tables:
# print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
# print one_table.values.shape
# print one_table
#tables[4]
tables[3]
tables[7]
data_table = [one_table for one_table in tables if one_table.values.shape==(16,8)][0]
data_table
date_table = [one_table for one_table in tables if one_table.values.shape==(1,3)][0]
date_table
import datetime
# The page reports an ROC (Minguo) date such as 103/01/13; adding 1911 to
# the year converts it to the Gregorian calendar.
date_tuple = map(int, date_table[0].values.tolist()[0].split(u":")[-1].split("/"))
print "date_tuple = ", date_tuple
date_tuple[0] = date_tuple[0] + 1911
print "date_tuple = ", date_tuple
date_obj = datetime.date(*date_tuple)
print "date_obj = ",date_obj
print date_obj.strftime("%Y-%m-%d")
date_str = date_obj.strftime("%Y-%m-%d")
# The 8-column table is really two 4-column blocks (broker, buy, sell, net)
# side by side; drop the header row and stack the halves vertically.
pre_df = np.r_[data_table.values[1:, 0:4], data_table.values[1:, 4:]]
print pre_df
df = pd.DataFrame(pre_df)
df.columns = ["Name","Buy","Sell","Net"]
df
df["Date"] = df["Name"].map(lambda xx: date_obj)
df["Stock_id"] = df["Name"].map(lambda xx: stock_id)
df.reindex(columns=["Date","Stock_id","Name","Buy","Sell","Net"])
def get_stock_major_info(stock_id):
    # Original df
    res = requests.get("http://tw.stock.yahoo.com/d/s/major_%s.html" % str(stock_id))
    tables = pd.read_html(res.text)
    data_table = [one_table for one_table in tables if one_table.values.shape == (16, 8)][0]
    pre_df = np.r_[data_table.values[1:, 0:4], data_table.values[1:, 4:]]
    df = pd.DataFrame(pre_df)
    df.columns = ["Name", "Buy", "Sell", "Net"]
    # df with Date and Stock_id
    date_table = [one_table for one_table in tables if one_table.values.shape == (1, 3)][0]
    date_tuple = map(int, date_table[0].values.tolist()[0].split(u":")[-1].split("/"))
    date_tuple[0] = date_tuple[0] + 1911  # ROC year -> Gregorian
    date_obj = datetime.date(*date_tuple)
    df["Date"] = df["Name"].map(lambda xx: date_obj)
    df["Stock_id"] = df["Name"].map(lambda xx: stock_id)
    df = df.reindex(columns=["Date", "Stock_id", "Name", "Buy", "Sell", "Net"])
    return df
stock_major_df = get_stock_major_info(2330)
stock_major_df
import requests
import pandas as pd
import numpy as np
req_data = {
    "encodeURIComponent": 1,
    "step": 1,
    "firstin": 1,
    "TYPEK": "sii",
    "code": ""
}
res = requests.post("http://mops.twse.com.tw/mops/web/ajax_t51sb01",data = req_data)
tables = pd.read_html(res.text)
len(tables)
dirty_df = [ tab_df for tab_df in tables if len(tab_df.values[:,0])>1][0]
# dirty_df
print "dirty_df.values.shape = ",dirty_df.values.shape
print "dirty_df.values[:,0] = ",dirty_df.values[:,0].tolist()
Stock_IDs = [ str(int(xx)) for xx in dirty_df.values[:,0].tolist() if xx>0]
print len(Stock_IDs)
print Stock_IDs
total_df = get_stock_major_info(Stock_IDs[0])
for one_stock_id in Stock_IDs[1:50]:
    try:
        new_df = get_stock_major_info(one_stock_id)
        total_df = total_df.append(new_df)
    except Exception as e:
        print "~~~~~~~~~~~~~~~~~~~~"
        print "Error in %s" % one_stock_id
        print e
        print "~~~~~~~~~~~~~~~~~~~~"
print total_df.shape
total_df.head()
total_df.to_csv("stock_major_info.csv",encoding="utf8",index=None)
# Top 20 brokers by total buy volume across all crawled stocks.
# (.sort() is the old pandas API; newer versions call it sort_values().)
total_df.groupby("Name")[["Buy","Sell","Net"]].sum().sort("Buy",ascending=False).head(20)
import sqlite3
conn = sqlite3.connect('nccu_mldm_course.db')
total_df.to_sql(name = "stock_major", con=conn, if_exists="replace")
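# A quick way to confirm the write succeeded is to read the table back;
# a minimal check against the "stock_major" table created above:
pd.read_sql("SELECT * FROM stock_major LIMIT 5", conn)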
import requests
import pandas as pd
import numpy as np
import datetime
from pyquery import PyQuery
now_time = datetime.datetime.now()
# A first guess at the export filename based on the current time; the
# authoritative name is scraped from the page right below.
dw_filename = now_time.strftime("t51sb01_%Y%m%d_%H%M%S565.csv")
req_data = {
    "encodeURIComponent": 1,
    "step": 1,
    "firstin": 1,
    "TYPEK": "sii",
    "code": ""
}
res1 = requests.post("http://mops.twse.com.tw/mops/web/ajax_t51sb01",data = req_data)
S = PyQuery(res1.text)
# The actual download filename sits in a hidden <input name="filename">.
dw_filename = S('input[name="filename"]').map(lambda :PyQuery(this).val())[0]
print "dw_filename = ", dw_filename
req_data = {
    "firstin": "true",
    "step": "10",
    "filename": dw_filename  # e.g. "t51sb01_20140113_040331565.csv"
}
r = requests.post("http://mops.twse.com.tw/server-java/t105sb02",data = req_data)
r.encoding = "big5"
print r.encoding
print r.text
# Stream the response body to disk, skipping keep-alive chunks.
with open("stock_ids.csv", 'wb') as f:
    for chunk in r.iter_content(chunk_size=512 * 1024):
        if chunk:  # filter out keep-alive new chunks
            f.write(chunk)
r.text.split("\n")[0]
csv_data_list = map(lambda xx:xx.split(","),r.text.split("\n"))
len(csv_data_list)
#map(len,csv_data_list)
print csv_data_list[13][12]
print csv_data_list[14][12]
from StringIO import StringIO
test_df = pd.read_csv(StringIO(r.text))
import requests
r = requests.get("http://pi.isuphoto.org/api/user/uei?page=0")
print r.text
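# The response is JSONP: a JavaScript callback wrapping a JSON array.
# The regex below peels off the callback name and captures the bare array.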
import re
import json
re_search_result = re.search(r"(?P<jsonp_fn>[^(]*)\((?P<json_data>\[[^]]+\])\)",r.text)
print dir(re_search_result)
re_search_result.groupdict()
json_data_str = re_search_result.groupdict()["json_data"]
json_data = json.loads(json_data_str)  # json.loads handles null/true/false and is safer than eval()
json_data[:3]
from pymongo import MongoClient
mongo_client = MongoClient()
nccu_mldm_course_db = mongo_client.nccu_mldm_course_db
isuphoto_col = nccu_mldm_course_db.isuphoto_col
isuphoto_col.drop()
isuphoto_col.insert(json_data)
list(isuphoto_col.find())[:3]
import ujson
json_data_df = pd.read_json(ujson.dumps(json_data))
json_data_df
from pyquery import PyQuery
json_data_df["pic"] = json_data_df["pic"].apply(lambda xx:PyQuery(xx)("img").attr("src"))
json_data_df["coll"] = json_data_df["coll"].apply(lambda xx:PyQuery(xx).text())
json_data_df["loc"] = json_data_df["loc"].apply(lambda xx:PyQuery(xx)("a").attr("href") if len(xx)>0 else xx)
#json_data_df["cc_class"] = json_data_df["cc"].apply(lambda xx:PyQuery(xx)("i").attr("class"))
json_data_df
isuphoto_col.drop()
isuphoto_api_url_gen = lambda user_id, page_n: "http://pi.isuphoto.org/api/user/%s?page=%s" % (str(user_id),int(page_n))
def crawl_isuphoto_user(user_id):
    # Page through the JSONP API, inserting each page's records into
    # MongoDB, until a page comes back with no data.
    page_count = 0
    while True:
        web_url = isuphoto_api_url_gen(user_id, page_count)
        try:
            r = requests.get(web_url)
        except Exception:
            break
        re_search_result = re.search(r"(?P<jsonp_fn>[^(]*)\((?P<json_data>\[[^]]+\])\)", r.text)
        if re_search_result is not None:
            json_data = json.loads(re_search_result.groupdict()["json_data"])
            isuphoto_col.insert(json_data)
            page_count = page_count + 1
            print "[HAS_DATA]", web_url
        else:
            print "[No_DATA]", web_url
            break

crawl_isuphoto_user("uei")
crawl_isuphoto_user("toomore")
import requests
r = requests.get("http://www.taifex.com.tw/eng/eng3/eng3_2dl.asp?COMMODITY_ID=all&DATA_DATE=2013/03/01&DATA_DATE1=2013/03/05")
r.history
f = open("taifex_future_data.csv", 'wb')
for chunk in r.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.close()
r.text
from StringIO import StringIO
taifex_future_data = pd.read_csv(StringIO(r.text))
taifex_future_data.head()
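# The export URL takes a commodity id plus start and end dates, so it wraps
# naturally in a helper. A minimal sketch reusing the imports above (the
# parameter names come from the URL itself; the helper name and defaults
# are my own):
def get_taifex_future_data(start_date, end_date, commodity_id="all"):
    url = ("http://www.taifex.com.tw/eng/eng3/eng3_2dl.asp"
           "?COMMODITY_ID=%s&DATA_DATE=%s&DATA_DATE1=%s") % (
               commodity_id,
               start_date.strftime("%Y/%m/%d"),
               end_date.strftime("%Y/%m/%d"))
    r = requests.get(url)
    return pd.read_csv(StringIO(r.text))

get_taifex_future_data(datetime.date(2013, 3, 1), datetime.date(2013, 3, 5)).head()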
import requests
from pyquery import PyQuery
s = requests.Session()
res1 = s.get("http://lvr.land.moi.gov.tw/N11/homePage.action")
res2 = s.get("http://lvr.land.moi.gov.tw/N11/ImageNumberN13?")
res2.text
f = open("login_img.jpg", 'wb')
for chunk in res2.iter_content(chunk_size=512 * 1024):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
f.close()
# import matplotlib.pyplot as plt
# img = plt.imread("login_img.jpg")
# plt.imshow(img)
from IPython.core.display import Image
Image("login_img.jpg")
# Read the digits off login_img.jpg above and type them in here; the
# captcha changes on every session.
login_password = 9945
S = PyQuery(res1.text)
# Collect every form input's name/value pair as the login payload.
post_data = dict(S("form input").map(lambda :(PyQuery(this).attr("name"),PyQuery(this).attr("value"))))
post_data
post_data.update({"command": "login",
                  "rand_code": login_password})
post_data
res3 = s.post("http://lvr.land.moi.gov.tw/N11/login.action",data = post_data)
res3.request.headers
res4 = s.post("http://lvr.land.moi.gov.tw/N11/pro/setToken.jsp")
SS = PyQuery(res4.text)
# setToken.jsp embeds a one-time Struts token in a hidden <input>.
web_token = SS("input[name='token']").attr("value")
web_token
query_post_data = {"type": "UXJ5ZGF0YQ==",
                   "Qry_city": "QQ==",
                   "Qry_area_office": "QTAy",
                   "Qry_paytype": "MQ==",
                   "Qry_build": "",
                   "Qry_price_s": "",
                   "Qry_price_e": "",
                   "Qry_unit_price_s": "",
                   "Qry_unit_price_e": "",
                   "Qry_p_yyy_s": "MTAx",
                   "Qry_p_yyy_e": "MTAy",
                   "Qry_season_s": "MQ==",
                   "Qry_season_e": "MTE=",
                   "Qry_doorno": "",
                   "Qry_area_s": "",
                   "Qry_area_e": "",
                   "Qry_order": "UUEwOCZkZXNj",
                   "Qry_unit": "Mg==",
                   "Qry_area_srh": "",
                   "Qry_buildyear_s": "",
                   "Qry_buildyear_e": "",
                   "Qry_origin": "P",
                   "Qry_avg": "off",
                   "struts.token.name": "token",
                   "token": web_token}
res5 = s.post("http://lvr.land.moi.gov.tw/N11/QryClass_land.action",data = query_post_data)
print res5.text
SSS = PyQuery(res5.text)
for one_row_data in PyQuery(SSS("tr.nowrap[id]")[5])("td").map(lambda :PyQuery(this).text() if len(PyQuery(this)("img")) == 0 else PyQuery(this)("img").attr("title")):
    print one_row_data
import pandas as pd
Raw_Data_List = [list(PyQuery(xx)("td").map(lambda :PyQuery(this).text() if len(PyQuery(this)("img")) == 0 else PyQuery(this)("img").attr("title"))) for xx in SSS("tr.nowrap[id]")]
Raw_Data_df = pd.DataFrame(Raw_Data_List)
Raw_Data_df.columns = ["Address","Date","TotalPrice","Area","UnitPrice","TradingNumber","Type","Layout"]
Raw_Data_df.head()
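# Mirroring the stock_major pattern earlier, the cleaned rows can be saved
# off as well; a small sketch (the file and table names here are just
# placeholders, and conn is the sqlite3 connection opened above):
Raw_Data_df.to_csv("lvr_land_data.csv", encoding="utf8", index=None)
Raw_Data_df.to_sql(name="lvr_land", con=conn, if_exists="replace")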
import requests
from pyquery import PyQuery
gen_ptt_board_page_url = lambda b_name: "http://www.ptt.cc/bbs/%s/index.html" % b_name
board_name = "R_Language"
r_lang_url = gen_ptt_board_page_url(board_name)
print r_lang_url
res = requests.get(r_lang_url)
S = PyQuery(res.text)
_article_urls = S(".title a").map(lambda :PyQuery(this).attr("href"))
article_urls = ["http://www.ptt.cc%s" % one_url for one_url in _article_urls if one_url.startswith("/bbs")]
article_urls
def get_ptt_article_url_lists(one_ptt_url):
    res = requests.get(one_ptt_url)
    S = PyQuery(res.text)
    _article_urls = S(".title a").map(lambda :PyQuery(this).attr("href"))
    article_urls = ["http://www.ptt.cc%s" % one_url for one_url in _article_urls if one_url.startswith("/bbs")]
    return article_urls
get_ptt_article_url_lists("http://www.ptt.cc/bbs/BuyTogether/index.html")
PyQuery(S("div.btn-group.pull-right > a")[1]).attr("href").split("index")[1].split(".")[0]
def get_max_pages(one_ptt_board_url):
    # Read the page number out of the "previous page" link in the top-right
    # button group of the newest index page.
    res = requests.get(one_ptt_board_url)
    S = PyQuery(res.text)
    return int(PyQuery(S("div.btn-group.pull-right > a")[1]).attr("href").split("index")[1].split(".")[0])
get_max_pages("http://www.ptt.cc/bbs/BuyTogether/index.html")
def get_all_pages_url(one_ptt_board_url):
    max_n = get_max_pages(one_ptt_board_url)
    url_head = one_ptt_board_url.split("index")[0]
    # index.html is the newest page; the numbered pages start at index1.html
    # (there is no index0.html), so run from 1 through max_n.
    all_urls = [url_head + "index%s.html" % ii for ii in [""] + range(1, max_n + 1)]
    return all_urls
get_all_pages_url("http://www.ptt.cc/bbs/R_Language/index.html")
one_article_url = article_urls[2]
one_article_res = requests.get(one_article_url)
SS = PyQuery(one_article_res.text)
one_article_data = {}
one_article_data["Board"] = SS(".article-metaline-right > .article-meta-value").text()
one_article_data.update(dict(zip(["Author","Title","Time"],SS(".article-metaline > .article-meta-value").map(lambda :PyQuery(this).text()))))
one_article_data
SS("#main-content > div.article-metaline").remove()
SS("#main-content > div.article-metaline-right").remove()
SS("#main-content").contents()
import lxml
one_article_data["text"] = "".join(map(lambda xx:PyQuery(xx).text() if isinstance(xx,lxml.html.HtmlElement) else xx,
SS("#main-content").contents()))
one_article_data["url"] = one_article_res.url
print one_article_data["text"]
one_article_data
#one_article_url = article_urls[2]
def get_one_article_meta_data(one_article_url):
    one_article_res = requests.get(one_article_url)
    SS = PyQuery(one_article_res.text)
    one_article_data = {}
    one_article_data["Board"] = SS(".article-metaline-right > .article-meta-value").text()
    one_article_data.update(dict(zip(["Author","Title","Time"],
                                     SS(".article-metaline > .article-meta-value").map(lambda :PyQuery(this).text()))))
    SS("#main-content > div.article-metaline").remove()
    SS("#main-content > div.article-metaline-right").remove()
    one_article_data["text"] = "".join(map(lambda xx: PyQuery(xx).text() if isinstance(xx, lxml.html.HtmlElement) else xx,
                                           SS("#main-content").contents()))
    one_article_data["url"] = one_article_res.url
    return one_article_data
get_one_article_meta_data(article_urls[3])
from pymongo import MongoClient
mongo_client = MongoClient()
nccu_mldm_course_db = mongo_client.nccu_mldm_course_db
ptt_article_col = nccu_mldm_course_db.ptt_article_col
ptt_article_col.drop()
ptt_article_col.insert(one_article_data)
list(ptt_article_col.find())
def save_all_articles_to_db(board_name, Limit=None):
    # Crawl every index page of a board and store its articles in MongoDB.
    board_url = gen_ptt_board_page_url(board_name)
    all_article_pages_list = get_all_pages_url(board_url)
    if isinstance(Limit, int):
        if Limit < len(all_article_pages_list):
            all_article_pages_list = all_article_pages_list[:Limit]
    for one_page in all_article_pages_list:
        article_urls_list = get_ptt_article_url_lists(one_page)
        results = map(get_one_article_meta_data, article_urls_list)
        if len(results) > 0:
            ptt_article_col.insert(results)
# ptt_article_col.drop()
# save_all_articles_to_db("R_Language")
# save_all_articles_to_db("BuyTogether", Limit=30)
list(ptt_article_col.find())
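# Once the articles are in MongoDB, simple queries come for free; a small
# sketch (the Board/Title fields were set in get_one_article_meta_data above):
print ptt_article_col.count()
for doc in ptt_article_col.find({"Board": board_name}).limit(3):
    print doc["Title"]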
!mongoexport --db nccu_mldm_course_db --collection ptt_article_col --out ptt_articles.json