# Show the image file 'noti98.png', embedding the image data in the notebook.
from IPython.display import Image, display
display(Image(filename='noti98.png', embed=True))
# Request the query-result page with a plain urlopen (no cookies, no login)
# and print the raw response body.
import urllib2
print urllib2.urlopen('http://www.cc98.org/queryresult.asp?stype=3').read()
<html> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> <link rel="alternate" type="application/rss+xml" title="察看新帖" href="boardrss.asp" /> <link rel="alternate" type="application/rss+xml" title="热门话题" href="rss.asp" /> <title>搜索结果 » CC98论坛</title> <link rel="shortcut icon" href="favicon.ico"> <link rel="icon" href="favicon.ico"> <link rel="stylesheet" href="inc/style.css" type="text/css"> <script type="text/javascript"> var currentUserID = 0; var currentBoardID = 0; var bannerPath = 'banner/'; var bannerInfo = {"t":[{"p":"5172104394.gif","i":"651"},{"p":"5172113093.gif","i":"652"},{"p":"5172115259.gif","i":"653"}],"b":[{"p":"10172365833.jpg","i":"632"},{"p":"32010553856.gif","i":"646"},{"p":"32023391310.jpg","i":"648"},{"p":"32023405085.gif","i":"649"}]}; </script> <script src="js/mootools-core-1.4.5-full-compat.js"></script> <script type="text/javascript" src="js/common.js"></script> <script type="text/javascript" src="js/ccdialog.js"></script> <script type="text/javascript" src="js/boardquery.js"></script> </head> <body topmargin=0 leftmargin=0> <!-- 菜单 div --> <div id="menuDiv" style='Z-INDEX: 2; display: none; WIDTH: 1px; POSITION: absolute; HEIGHT: 1px; BACKGROUND-COLOR: #9cc5f8'></div> <!-- 头部table --> <table cellspacing="0" cellpadding="0" align="center" style="border: 1px #6595D6 solid; border-top-width: 0px; border-right-width: 1px; border-bottom-width: 0px; border-left-width: 1px; width: 97%;"> <tr> <td width="100%"> <!-- 真正的开始表格 外面那个是用来限制的 --> <table width="100%" align="center" border="0" cellspacing="0" cellpadding="0"> <!-- ==============第一行横杠====================== --> <tr> <td class="TopDarkNav" height="9"></td> </tr> <!-- ==============logo和帮助====================== --> <tr> <td height="70" class="TopLighNav2"> <table border="0" width="100%" align="center"> <tr> <!-- 小logo --> <td align="left" width="25%"> <a href="http://www.cc98.org"> <img border="0" src="images/logo/logo.gif"></a> </td> <!-- 大logo --> <td align="center" 
width="65%"> <div class="advImg" id="advImg"><a id="advLink" href="#" target="_blank"></a></div> <script language="javascript"> showadvs(1); </script> </td> <!-- 帮助 --> <td align="right" style="line-height: 15pt" width="10%"> <a href="#"><span style="CURSOR: hand" onclick="window.external.AddFavorite('http://www.cc98.org', 'www.cc98.org')">加入收藏</span></a><br> <a href="#"><span style="cursor: pointer" onclick="this.style.behavior='url(#default#homepage)';this.setHomePage('http://www.cc98.org');">设为首页</span></a><br> <a href="#"><span style="cursor: pointer" class="boardQueryLink" id="boardQueryLink">搜索版面</span></a><br> <!--<a href="boardhelp.asp">论坛帮助</a>--> </td> </tr> </table> </td> </tr> <!-- ==============第二行横杠====================== --> <tr> <td class="TopLighNav" height="9"></td> </tr> <!-- ============== 菜单 ====================== --> <tr> <td class="TopLighNav1" height="22" valign="top"> <div style="position: relative; margin-top: 2px"> <div style="position: absolute; width: 960px; top: 0px;"> <b>未登录</b> <a href="login.asp" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,manage1,100);'>注册登录</a> <!--<img src=pic/navspacer.gif align=absmiddle> <a href="guidboard.asp" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,hotboard,100)'>热门版面</a>--> <img src="pic/navspacer.gif" align="absmiddle"> <a href="query.asp?boardid=0" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,Search,100);'>论坛搜索</a> <img src="pic/navspacer.gif" align="absmiddle"> <!--<a href="#" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,stylelist,100)'>自选风格</a> <img src=pic/navspacer.gif align=absmiddle>--> <a href="boardstat.asp?boardid=0" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,boardstat,100);'>论坛状态</a> <!-- <img src=pic/navspacer.gif align=absmiddle> <a href="show.asp?boardid=0" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,downlist,100)'>论坛展区</a> --> <!--<img src=pic/navspacer.gif align=absmiddle> <a href="http://10.71.45.100/newblog" 
onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,blog,100)' target="_blank">98blog</a> <img src=pic/navspacer.gif align=absmiddle> <a href="music.asp" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,music,100)'>点歌</a>--> <img src="pic/navspacer.gif" align="absmiddle"> <a onmouseout='DelayHideMenu();this.style.cursor="auto";' onmouseover='ShowMenu(this,addon,100);this.style.cursor="pointer";' target="_blank">论坛附件</a> <img src="pic/navspacer.gif" align="absmiddle"> <a href="boardhelp.asp" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,help,100);'>论坛帮助</a> <img src="pic/navspacer.gif" align="absmiddle"> <a href="javascript:;" onclick="changeSimple(1)">Simple版</a> <!-- <img src="pic/navspacer.gif" align="absmiddle"> <a href="http://bbs.cc98.org/" title="Alpha 就是比 Beta 更烂的...">2.0 Alpha</a> --> </div> <div id="multiLogin"></div> </div> </td> </tr> </table> <!-- 最外层 --> </td> </tr> </table> <table cellspacing="1" cellpadding="3" align="center" border="0" width="97%"> <tr> <td height="25"> <br> >> 欢迎光临 <b>www.cc98.org</b> </td> </tr> </table> <table cellspacing="1" cellpadding="3" align="center" class="tableBorder2"> <tr> <td height="25" valign="middle"> <img src="pic/Forum_nav.gif" align="absmiddle"> <a href="index.asp">www.cc98.org</a> → <a href=query.asp?boardid=0>论坛搜索</a> → 搜索结果 <a name="top"></a> </td> </tr> </table> <br> <br> <script type="text/javascript" src="js/md5.js"></script> <script type="text/javascript" src="js/jquery-1.8.2.min.js"></script> <script type="text/javascript"> function getHash(){ document.getElementById("hashedPassword").value = hex_md5(document.getElementById("password").value); document.getElementById("password").value = "tech.cc98.org"; return true; } </script> <table cellpadding="3" cellspacing="1" align="center" class="tableborder1" style="width: 75%"> <tr align="center"> <th width="100%" height="25" colspan="2"> 论坛错误信息</td> </tr> <tr> <td width="100%" class="tablebody1" colspan="2"> <b>产生错误的原因:</b> 
<br><li>您没有在本论坛搜索的权限,请<a href=login.asp>登录</a>或者同管理员联系。 <br> <br> <font color="#FF0000"><b>请您仔细阅读了 <a href="boardhelp.asp?boardid=0">帮助文件</a>, 可能您还没有登录。如果有任何问题请联系站务组。</b> </font> </td> </tr> <form id="loginForm" action="login.asp?action=chk" method="post" onsubmit="return sign.In()"> <tr> <th valign="middle" colspan="2" align="center" height="25"> 请输入您的用户名、密码登录</td> </tr> <tr> <td valign="middle" class="tablebody1">请输入您的用户名</td> <td valign="middle" class="tablebody1"> <input id="userName" name="username" type="text"> <a href="https://secure.cc98.org/Register">按这儿注册,加入我们吧</a> </td> </tr> <tr> <td valign="middle" class="tablebody1">请输入您的密码</font></td> <td valign="middle" class="tablebody1"> <input name="password" id="password" type="password"><!-- <a href=lostpass.asp>忘记密码的话就按这儿吧</a>--> </td> </tr> <tr> <td valign="top" width="30%" class="tablebody1"> <b>隐身登录</b><br> 您可以选择隐身登录,论坛会员将在用户列表看不到您的信息。 </td> <td valign="middle" class="tablebody1"> <input type="radio" name="userhidden" value="2" checked>正常登录<br> <input type="radio" name="userhidden" value="1">隐身登录<br> <input type="hidden" id="hashedPassword" name="hashedPassword"> </td> </tr> <input type="hidden" id="comeUrl" name="comeurl" value=""> <tr> <td class="tablebody2" valign="middle" align="center"><span id="loginTip" style="height: 26px; line-height: 24px; color: red;"></span></td> <td class="tablebody2" valign="middle" align="left"> <input type="submit" id="submit" name="submit" value="登 录" style="height: 40px; width: 120px;"> <input type="button" name="back" value="返 回" onclick="location.href=''" style="height: 40px; width: 120px;"> </td> </tr> </form> </table> <div id="copyright">Copyright © 2003-2013 CC98 Network Association. 
Email: contact@cc98.org</div> </body> </html> <!--end:10.986毫秒--> <script type="text/javascript"> <!-- //快捷键 翻页 function _shortcut(evt) { var fileName="queryResult.asp"; var star=parseInt(""); var total_star=parseInt(""); //stype=request("stype") //pSearch=request("pSearch") //nSearch=request("nSearch") //keyword=trim(checkStr(request("keyword"))) //stable=checkstr(request("stable")) //sertype=checkstr(request("sertype")) //searchDate=request("SearchDate") //<a href=""?page="&Pcount&"&stype="&stype&"&pSearch="&pSearch&"&nSearch="&nSearch&"&keyword="&server.urlencode(keyword)&"&SearchDate="&server.urlencode(searchDate)&"&boardid="&boardid&"&stable="&stable&"&sertype="&sertype&""">["&Pcount&"]</a>" var origin_search= "?stype=" + "3" + "&psearch=" + "" + "nsearch" + "" + "&keyword=" + "" + "&searchDate=" + "" + "&boardid=" + "0" + "&stable=" + "bbs7" + "&sertype=" + "1"; var evt=evt?evt:window.event; if(star==""&star==0) star=1; var nextStar=1; if(evt.altKey&&evt.keyCode==33){ if(star<=1) return; nextStar=star-1; window.location.href=fileName+origin_search+"&page="+nextStar; }else if (evt.altKey && evt.keyCode == 34){ if(star>=total_star) return; nextStar=star+1; window.location.href=fileName+origin_search+"&page="+nextStar; } } if(window.document.addEventListener){ window.document.addEventListener("keydown",_shortcut,false); }else{ window.document.attachEvent("onkeydown",_shortcut,false); } // --> </script>
出错了, 因为我们还没有登录. 如果用浏览器查看, 会返回如下结果:
# Show the image file 'login.thumbnail.jpg', embedding the image data in the notebook.
display(Image(filename='login.thumbnail.jpg', embed=True))
Cookie是HTTP协议头中的一小段数据, 浏览器通常将其保存在本地, 用来存储用户与网站交互过程中的状态.
下面是一个来自维基百科Cookie应用的典型例子:
因为HTTP协议是无状态的,即服务器不知道用户上一次做了什么,这严重阻碍了交互式Web应用程序的实现。在典型的网上购物场景中,用户浏览了几个页面,买了一盒饼干和两瓶饮料。最后结帐时,由于HTTP的无状态性,不通过额外的手段,服务器并不知道用户到底买了什么。 所以Cookie就是用来绕开HTTP的无状态性的“额外手段”之一。服务器可以设置或读取Cookies中包含信息,借此维护用户跟服务器会话中的状态。
在刚才的购物场景中,当用户选购了第一项商品,服务器在向用户发送网页的同时,还发送了一段Cookie,记录着那项商品的信息。当用户访问另一个页面,浏览器会把Cookie发送给服务器,于是服务器知道他之前选购了什么。用户继续选购饮料,服务器就在原来那段Cookie里追加新的商品信息。结帐时,服务器读取发送来的Cookie就行了。
当用户登录网站时勾选了"下次自动登录", 用户名和密码就会被加密保存在Cookie中. 用户下次登录时, 浏览器会将该Cookie发送给网站, 网站验证加密信息, 从而识别该用户.
urllib2本身并不具备保存Cookie的功能, 需要借助cookielib:
import urllib
import urllib2
from cookielib import CookieJar
login_url = 'http://www.cc98.org/login.asp'
posts_url = 'http://www.cc98.org/queryresult.asp?stype=3'
# 保存了Cookie的对象
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(CookieJar()))
# 用opener登录, Cookie被保存下来
opener.open(login_url, urllib.urlencode(dict(
username='answeror',
password='42',
action='chk'
)))
# 使用保存了用户凭据的opener访问页面
data = opener.open(posts_url).read()
print data
<html> <head> <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> <link rel="alternate" type="application/rss+xml" title="察看新帖" href="boardrss.asp" /> <link rel="alternate" type="application/rss+xml" title="热门话题" href="rss.asp" /> <title>搜索结果 » CC98论坛</title> <link rel="shortcut icon" href="favicon.ico"> <link rel="icon" href="favicon.ico"> <link rel="stylesheet" href="inc/style.css" type="text/css"> <script type="text/javascript"> var currentUserID = 401578; var currentBoardID = 0; var bannerPath = 'banner/'; var bannerInfo = {"t":[{"p":"5172104394.gif","i":"651"},{"p":"5172113093.gif","i":"652"},{"p":"5172115259.gif","i":"653"}],"b":[{"p":"10172365833.jpg","i":"632"},{"p":"32010553856.gif","i":"646"},{"p":"32023391310.jpg","i":"648"},{"p":"32023405085.gif","i":"649"}]}; </script> <script src="js/mootools-core-1.4.5-full-compat.js"></script> <script type="text/javascript" src="js/common.js"></script> <script type="text/javascript" src="js/ccdialog.js"></script> <script type="text/javascript" src="js/boardquery.js"></script> </head> <body topmargin=0 leftmargin=0> <!-- 菜单 div --> <div id="menuDiv" style='Z-INDEX: 2; display: none; WIDTH: 1px; POSITION: absolute; HEIGHT: 1px; BACKGROUND-COLOR: #9cc5f8'></div> <!-- 头部table --> <table cellspacing="0" cellpadding="0" align="center" style="border: 1px #6595D6 solid; border-top-width: 0px; border-right-width: 1px; border-bottom-width: 0px; border-left-width: 1px; width: 97%;"> <tr> <td width="100%"> <!-- 真正的开始表格 外面那个是用来限制的 --> <table width="100%" align="center" border="0" cellspacing="0" cellpadding="0"> <!-- ==============第一行横杠====================== --> <tr> <td class="TopDarkNav" height="9"></td> </tr> <!-- ==============logo和帮助====================== --> <tr> <td height="70" class="TopLighNav2"> <table border="0" width="100%" align="center"> <tr> <!-- 小logo --> <td align="left" width="25%"> <a href="http://www.cc98.org"> <img border="0" src="images/logo/logo.gif"></a> </td> <!-- 大logo --> <td 
align="center" width="65%"> <div class="advImg" id="advImg"><a id="advLink" href="#" target="_blank"></a></div> <script language="javascript"> showadvs(1); </script> </td> <!-- 帮助 --> <td align="right" style="line-height: 15pt" width="10%"> <a href="#"><span style="CURSOR: hand" onclick="window.external.AddFavorite('http://www.cc98.org', 'www.cc98.org')">加入收藏</span></a><br> <a href="#"><span style="cursor: pointer" onclick="this.style.behavior='url(#default#homepage)';this.setHomePage('http://www.cc98.org');">设为首页</span></a><br> <a href="#"><span style="cursor: pointer" class="boardQueryLink" id="boardQueryLink">搜索版面</span></a><br> <!--<a href="boardhelp.asp">论坛帮助</a>--> </td> </tr> </table> </td> </tr> <!-- ==============第二行横杠====================== --> <tr> <td class="TopLighNav" height="9"></td> </tr> <!-- ============== 菜单 ====================== --> <tr> <td class="TopLighNav1" height="22" valign="top"> <div style="position: relative; margin-top: 2px"> <div style="position: absolute; width: 960px; top: 0px;"> <b>answeror</b> <a href="#" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,logon,100);'>账号选项</a> <img src="pic/navspacer.gif" align="absmiddle"> <a href="usermanager.asp" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,manage2,100);'>控制面板</a> <!--<img src=pic/navspacer.gif align=absmiddle> <a href="guidboard.asp" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,hotboard,100)'>热门版面</a>--> <img src="pic/navspacer.gif" align="absmiddle"> <a href="query.asp?boardid=0" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,Search,100);'>论坛搜索</a> <img src="pic/navspacer.gif" align="absmiddle"> <!--<a href="#" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,stylelist,100)'>自选风格</a> <img src=pic/navspacer.gif align=absmiddle>--> <a href="boardstat.asp?boardid=0" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,boardstat,100);'>论坛状态</a> <!-- <img src=pic/navspacer.gif align=absmiddle> <a href="show.asp?boardid=0" 
onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,downlist,100)'>论坛展区</a> --> <!--<img src=pic/navspacer.gif align=absmiddle> <a href="http://10.71.45.100/newblog" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,blog,100)' target="_blank">98blog</a> <img src=pic/navspacer.gif align=absmiddle> <a href="music.asp" onmouseout='DelayHideMenu()' onMouseOver='ShowMenu(this,music,100)'>点歌</a>--> <img src="pic/navspacer.gif" align="absmiddle"> <a onmouseout='DelayHideMenu();this.style.cursor="auto";' onmouseover='ShowMenu(this,addon,100);this.style.cursor="pointer";' target="_blank">论坛附件</a> <img src="pic/navspacer.gif" align="absmiddle"> <a href="boardhelp.asp" onmouseout='DelayHideMenu();' onmouseover='ShowMenu(this,help,100);'>论坛帮助</a> <img src="pic/navspacer.gif" align="absmiddle"> <a href="javascript:;" onclick="changeSimple(1)">Simple版</a> <!-- <img src="pic/navspacer.gif" align="absmiddle"> <a href="http://bbs.cc98.org/" title="Alpha 就是比 Beta 更烂的...">2.0 Alpha</a> --> </div> <div id="multiLogin"></div> </div> </td> </tr> </table> <!-- 最外层 --> </td> </tr> </table> <table cellspacing="1" cellpadding="3" align="center" border="0" width="97%"> <tr> <td width="65%"></td> <td width="35%" align="right"> <img src="pic/msg_no_new_bar.gif"> <a href="usersms.asp?action=inbox">我的收件箱</a> (<span style="color: gray;">0 新</span>) </td> </tr> </table> <table cellspacing="1" cellpadding="3" align="center" class="tableBorder2"> <tr> <td height="25" valign="middle"> <img src="pic/Forum_nav.gif" align="absmiddle"> <a href="index.asp">www.cc98.org</a> → <a href=query.asp?boardid=0>论坛搜索</a> → 搜索结果 <a name="top"></a> </td> </tr> </table> <br> <table cellpadding="0" cellspacing="0" border="0" width="97%" align="center"> <tr> <td>查询天内有回复的帖子,查看新贴共查询到 <font color="#FF0000">100</font>个结果 </td> </tr> </table> <table cellpadding="3" cellspacing="1" class="tableborder1" align="center"> <tr valign="middle"> <th width="*">主 题</th> <th width="100">版 面</th> <th width="100">作 者</th> <th 
width="125">发贴日期</th> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=16&ID=4223274' target="_blank"> <img src='face/face20.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=16&ID=4223274' target="_blank"> 愚蠢的Logo设计案例(为什么我一个也看不懂,一定是我太纯洁了) </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=16" target="_blank">自由贴图</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=361909" target="_blank">穿山乙</a> </td> <td class="tablebody2" width="195"> 7/15/2013 00:21 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=361909" target=_blank>穿山乙</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=562&ID=4223273' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=562&ID=4223273' target="_blank"> [求][笔记本]急求 笔记本 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=562" target="_blank">电脑数码</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=381654" target="_blank">无天</a> </td> <td class="tablebody2" width="195"> 7/15/2013 00:15 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=381654" target=_blank>无天</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=617&ID=4223272' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=617&ID=4223272' target="_blank"> IGSS项目 非理工科的同学可以看看 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=617" target="_blank">浙江大学出国留学学生圈SOSA</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=464966" target="_blank">chenkun</a> </td> <td class="tablebody2" width="195"> 7/15/2013 00:10 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=464966" target=_blank>chenkun</a> 
--> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=180&ID=4223271' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=180&ID=4223271' target="_blank"> 求问 我的htc onex(g23) 手机卡插进去没反应 一直都搜到sim卡 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=180" target="_blank">数码世界</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=434774" target="_blank">fuwen0202</a> </td> <td class="tablebody2" width="195"> 7/15/2013 00:07 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=434774" target=_blank>fuwen0202</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=74&ID=4223270' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=74&ID=4223270' target="_blank"> 突然意识到,大四交流带来一个严峻的问题 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=74" target="_blank">软件工程</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=377896" target="_blank">cgwy</a> </td> <td class="tablebody2" width="195"> 7/15/2013 00:04 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=377896" target=_blank>cgwy</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=376&ID=4223269' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=376&ID=4223269' target="_blank"> [LIVE] 水樹奈々初の海外公演決定 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=376" target="_blank">ACG音乐区</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=189626" target="_blank">伊茶</a> </td> <td class="tablebody2" width="195"> 7/15/2013 00:01 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=189626" target=_blank>伊茶</a> --> 
</td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=537&ID=4223268' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=537&ID=4223268' target="_blank"> 临时找来的图片,庆祝电脑重装发一下某场 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=537" target="_blank">暑假·2013</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=284542" target="_blank">endless07</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:54 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=284542" target=_blank>endless07</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=537&ID=4223267' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=537&ID=4223267' target="_blank"> surface rt 来啦 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=537" target="_blank">暑假·2013</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=420924" target="_blank">君之风范</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:49 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=420924" target=_blank>君之风范</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=459&ID=4223266' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=459&ID=4223266' target="_blank"> [征][兼职]电话销售以及销售主管(做的是电台和电视节目业务) </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=459" target="_blank">实习兼职</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=354315" target="_blank">竹居贤</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:43 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=354315" target=_blank>竹居贤</a> --> </td> 
</tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=141&ID=4223265' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=141&ID=4223265' target="_blank"> 史上最淡定选手DD </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=141" target="_blank">魔兽RPG专区</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=395779" target="_blank">淡墨青衫</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:34 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=395779" target=_blank>淡墨青衫</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=537&ID=4223264' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=537&ID=4223264' target="_blank"> (玉泉)打牌,桌游,有人组织么? </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=537" target="_blank">暑假·2013</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=464847" target="_blank">西湖甜醋鱼</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:32 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=464847" target=_blank>西湖甜醋鱼</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=229&ID=4223263' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=229&ID=4223263' target="_blank"> 出绿茶100元代金券3张 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=229" target="_blank">美食天地</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=273755" target="_blank">wang7721</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:27 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=273755" target=_blank>wang7721</a> --> </td> </tr> <tr> <td 
class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=81&ID=4223262' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=81&ID=4223262' target="_blank"> 跟不傲气的牛人在一起的时候,特别有安全感 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=81" target="_blank">情感空气</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=460854" target="_blank">findbeauty</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:26 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=460854" target=_blank>findbeauty</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=80&ID=4223261' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=80&ID=4223261' target="_blank"> 【出】绿茶餐厅100元代金券3张 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=80" target="_blank">生活点滴</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=273755" target="_blank">wang7721</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:24 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=273755" target=_blank>wang7721</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=562&ID=4223260' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=562&ID=4223260' target="_blank"> [换][平板]ipadmini求换ipad4可以吗?补差价 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=562" target="_blank">电脑数码</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=449388" target="_blank">当冬夜渐暖</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:24 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=449388" target=_blank>当冬夜渐暖</a> --> </td> </tr> <tr> <td 
class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=152&ID=4223259' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=152&ID=4223259' target="_blank"> 缘来是你 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=152" target="_blank">缘分天空</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=426565" target="_blank">带鱼</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:22 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=426565" target=_blank>带鱼</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=402&ID=4223258' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=402&ID=4223258' target="_blank"> 建个楼记录实习的见闻吧 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=402" target="_blank">气象万千</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=354785" target="_blank">水木清见</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:21 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=354785" target=_blank>水木清见</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=537&ID=4223257' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=537&ID=4223257' target="_blank"> 求个玉泉夜宵外卖电话 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=537" target="_blank">暑假·2013</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=353290" target="_blank">xuedine</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:19 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=353290" target=_blank>xuedine</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a 
href='dispbbs.asp?boardID=135&ID=4223256' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=135&ID=4223256' target="_blank"> 等天凉了我就出柜 </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=135" target="_blank">开怀一笑</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=410207" target="_blank">sixfancy</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:18 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=410207" target=_blank>sixfancy</a> --> </td> </tr> <tr> <td class="tablebody1" width="*"> <a href='dispbbs.asp?boardID=57&ID=4223255' target="_blank"> <img src='face/face7.gif' border="0" alt="开新窗口浏览此主题"></a> <a href='dispbbs.asp?boardID=57&ID=4223255' target="_blank"> 【来一斤凉拌猥琐!不要节操!】WS专楼——献给我大数学系众多WSN(含LZ) </a> </td> <td align="middle" class="tablebody2" width="80"> <a href="http://www.cc98.org/list.asp?boardid=57" target="_blank">数学之韵</a> </td> <td align="middle" class="tablebody1" width="80"> <a href="dispuser.asp?id=150363" target="_blank">★→B52</a> </td> <td class="tablebody2" width="195"> 7/14/2013 23:15 <!--<font color="#FF0000">|</font> <a href="dispuser.asp?id=150363" target=_blank>★→B52</a> --> </td> </tr> </table> <table border=0 cellpadding=0 cellspacing=3 width="97%" align=center><tr><td valign=middle nowrap><font color=>页次:<b>1</b>/<b>5</b>页每页<b>20</b> 帖子数<b>100</b></font></td><td valign=middle nowrap><font color=><div align=right><p>分页: <font color=#FF0000>[1]</font> <a href="?page=2&stype=3&pSearch=&nSearch=&keyword=&SearchDate=&boardid=0&stable=bbs7&sertype=1">[2]</a> <a href="?page=3&stype=3&pSearch=&nSearch=&keyword=&SearchDate=&boardid=0&stable=bbs7&sertype=1">[3]</a> <a href="?page=4&stype=3&pSearch=&nSearch=&keyword=&SearchDate=&boardid=0&stable=bbs7&sertype=1">[4]</a>... 
<a href="?page=5&stype=3&pSearch=&nSearch=&keyword=&SearchDate=&boardid=0&stable=bbs7&sertype=1">[5]</a></p></div></font></td></tr></table> <div id="copyright">Copyright © 2003-2013 CC98 Network Association. Email: contact@cc98.org</div> </body> </html> <!--end:125.000毫秒--> <script type="text/javascript"> <!-- //快捷键 翻页 function _shortcut(evt) { var fileName="queryResult.asp"; var star=parseInt("1"); var total_star=parseInt("5"); //stype=request("stype") //pSearch=request("pSearch") //nSearch=request("nSearch") //keyword=trim(checkStr(request("keyword"))) //stable=checkstr(request("stable")) //sertype=checkstr(request("sertype")) //searchDate=request("SearchDate") //<a href=""?page="&Pcount&"&stype="&stype&"&pSearch="&pSearch&"&nSearch="&nSearch&"&keyword="&server.urlencode(keyword)&"&SearchDate="&server.urlencode(searchDate)&"&boardid="&boardid&"&stable="&stable&"&sertype="&sertype&""">["&Pcount&"]</a>" var origin_search= "?stype=" + "3" + "&psearch=" + "" + "nsearch" + "" + "&keyword=" + "" + "&searchDate=" + "" + "&boardid=" + "0" + "&stable=" + "bbs7" + "&sertype=" + "1"; var evt=evt?evt:window.event; if(star==""&star==0) star=1; var nextStar=1; if(evt.altKey&&evt.keyCode==33){ if(star<=1) return; nextStar=star-1; window.location.href=fileName+origin_search+"&page="+nextStar; }else if (evt.altKey && evt.keyCode == 34){ if(star>=total_star) return; nextStar=star+1; window.location.href=fileName+origin_search+"&page="+nextStar; } } if(window.document.addEventListener){ window.document.addEventListener("keydown",_shortcut,false); }else{ window.document.attachEvent("onkeydown",_shortcut,false); } // --> </script>
我们需要提取帖子的标题和链接. 仅仅使用re是比较困难的. 我们需要更加"结构化"的文本分析工具:
import bs4
import re
# Parse the fetched page (the string held in `data`) into a BeautifulSoup tree.
soup = bs4.BeautifulSoup(data)
def ispost(t):
    """Return True if tag ``t`` is a link to a forum post.

    Meant to be passed as the filter function to ``soup.find_all``. A tag
    qualifies when it is an ``<a>`` element whose only child is a plain text
    node and whose ``href`` attribute starts with ``dispbbs``.

    :param t: a bs4 tag (any object offering ``name``, ``contents``,
        ``has_attr`` and item access works).
    :return: ``True`` or ``False``.
    """
    return bool(
        t.name == 'a'
        and len(t.contents) == 1
        # Exact type check: subclasses of NavigableString (e.g. comments)
        # are rejected, only ordinary text counts as a title.
        and type(t.contents[0]) is bs4.NavigableString
        and t.has_attr('href')
        # A prefix test is all the former r'dispbbs.*' match expressed.
        and t['href'].startswith('dispbbs')
    )
# Collect every tag for which ispost() is true, then print the resulting list.
posts = soup.find_all(ispost)
print posts
[<a href="dispbbs.asp?boardID=16&ID=4223274" target="_blank"> 愚蠢的Logo设计案例(为什么我一个也看不懂,一定是我太纯洁了) </a>, <a href="dispbbs.asp?boardID=562&ID=4223273" target="_blank"> [求][笔记本]急求 笔记本 </a>, <a href="dispbbs.asp?boardID=617&ID=4223272" target="_blank"> IGSS项目 非理工科的同学可以看看 </a>, <a href="dispbbs.asp?boardID=180&ID=4223271" target="_blank"> 求问 我的htc onex(g23) 手机卡插进去没反应 一直都搜到sim卡 </a>, <a href="dispbbs.asp?boardID=74&ID=4223270" target="_blank"> 突然意识到,大四交流带来一个严峻的问题 </a>, <a href="dispbbs.asp?boardID=376&ID=4223269" target="_blank"> [LIVE] 水樹奈々初の海外公演決定 </a>, <a href="dispbbs.asp?boardID=537&ID=4223268" target="_blank"> 临时找来的图片,庆祝电脑重装发一下某场 </a>, <a href="dispbbs.asp?boardID=537&ID=4223267" target="_blank"> surface rt 来啦 </a>, <a href="dispbbs.asp?boardID=459&ID=4223266" target="_blank"> [征][兼职]电话销售以及销售主管(做的是电台和电视节目业务) </a>, <a href="dispbbs.asp?boardID=141&ID=4223265" target="_blank"> 史上最淡定选手DD </a>, <a href="dispbbs.asp?boardID=537&ID=4223264" target="_blank"> (玉泉)打牌,桌游,有人组织么? </a>, <a href="dispbbs.asp?boardID=229&ID=4223263" target="_blank"> 出绿茶100元代金券3张 </a>, <a href="dispbbs.asp?boardID=81&ID=4223262" target="_blank"> 跟不傲气的牛人在一起的时候,特别有安全感 </a>, <a href="dispbbs.asp?boardID=80&ID=4223261" target="_blank"> 【出】绿茶餐厅100元代金券3张 </a>, <a href="dispbbs.asp?boardID=562&ID=4223260" target="_blank"> [换][平板]ipadmini求换ipad4可以吗?补差价 </a>, <a href="dispbbs.asp?boardID=152&ID=4223259" target="_blank"> 缘来是你 </a>, <a href="dispbbs.asp?boardID=402&ID=4223258" target="_blank"> 建个楼记录实习的见闻吧 </a>, <a href="dispbbs.asp?boardID=537&ID=4223257" target="_blank"> 求个玉泉夜宵外卖电话 </a>, <a href="dispbbs.asp?boardID=135&ID=4223256" target="_blank"> 等天凉了我就出柜 </a>, <a href="dispbbs.asp?boardID=57&ID=4223255" target="_blank"> 【来一斤凉拌猥琐!不要节操!】WS专楼——献给我大数学系众多WSN(含LZ) </a>]
# Print the first post's link target, then its title text with the
# surrounding whitespace stripped.
print posts[0]['href']
print posts[0].string.strip()
dispbbs.asp?boardID=16&ID=4223274 愚蠢的Logo设计案例(为什么我一个也看不懂,一定是我太纯洁了)
更多用法参见BeautifulSoup文档.
如果对JQuery熟悉, 推荐使用pyquery.
from PyQt4.QtGui import (
QApplication,
QSystemTrayIcon,
QIcon,
QMenu,
QDesktopServices,
QWidget
)
from PyQt4.QtCore import (
QTimer,
QUrl
)
from urlparse import urljoin
from urllib import urlencode
import re
import bs4
import urllib2
from cookielib import CookieJar
import sys
# Forum root; the endpoint URLs below are resolved against it.
base_url = 'http://www.cc98.org'
# Endpoint the login form is posted to.
login_url = urljoin(base_url, 'login.asp')
# Query-result page that is fetched to look for post links.
posts_url = urljoin(base_url, 'queryresult.asp?stype=3')
def ispost(t):
    """Return True if tag ``t`` is a link to a forum post.

    Meant to be passed as the filter function to ``soup.find_all`` or
    ``soup.find``. A tag qualifies when it is an ``<a>`` element whose only
    child is a plain text node and whose ``href`` attribute starts with
    ``dispbbs``.

    :param t: a bs4 tag (any object offering ``name``, ``contents``,
        ``has_attr`` and item access works).
    :return: ``True`` or ``False``.
    """
    return bool(
        t.name == 'a'
        and len(t.contents) == 1
        # Exact type check: subclasses of NavigableString (e.g. comments)
        # are rejected, only ordinary text counts as a title.
        and type(t.contents[0]) is bs4.NavigableString
        and t.has_attr('href')
        # A prefix test is all the former r'dispbbs.*' match expressed.
        and t['href'].startswith('dispbbs')
    )
class Tray(QSystemTrayIcon):
    """System tray icon that announces the first post on the query page.

    On construction it builds a cookie-aware URL opener and logs in. Each
    call to ``check`` fetches the post list and, when the first post differs
    from the one announced last time, shows its title in a tray message.
    Clicking the message opens that post through ``QDesktopServices``.
    """

    def __init__(self, icon, parent=None):
        """Create the tray icon, its context menu and the logged-in opener.

        :param icon: ``QIcon`` shown in the system tray.
        :param parent: optional parent object; also used as the menu's parent.
        """
        super(Tray, self).__init__(icon, parent)
        menu = QMenu(parent)
        # Named exit_action so the builtin exit() is not shadowed.
        exit_action = menu.addAction('&Exit')
        exit_action.triggered.connect(QApplication.instance().quit)
        self.setContextMenu(menu)
        # (title, url) of the post announced most recently; None before the
        # first announcement.
        self.cached_post = None
        # URL opened when the message is clicked; None until a post is shown.
        self.url = None
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(CookieJar()))
        self.messageClicked.connect(self.openurl)
        self.login()

    def last_post(self):
        """Fetch the post list and return its first post.

        :return: ``(title, url)`` of the first link accepted by ``ispost``,
            with ``url`` made absolute against ``base_url``; ``None`` when
            the page contains no such link.
        """
        response = self.opener.open(posts_url)
        try:
            data = response.read()
        finally:
            response.close()
        soup = bs4.BeautifulSoup(data)
        # find() yields None when nothing matches, where find_all(...)[0]
        # would raise IndexError.
        post = soup.find(ispost)
        if post is None:
            return None
        title = post.string.strip()
        url = urljoin(base_url, post['href'])
        return title, url

    def login(self):
        """POST the login form through ``self.opener`` so it keeps the cookies."""
        form = urlencode(dict(
            username='answeror',
            password='42',
            action='chk'
        ))
        # Only the cookies matter here; the cookie processor records them
        # while the response is opened, so the body is discarded unread.
        self.opener.open(login_url, form).close()

    def openurl(self):
        """Open the URL of the most recently announced post, if there is one."""
        if self.url is not None:
            QDesktopServices.openUrl(QUrl(self.url))

    def check(self):
        """Poll for the first post and show a tray message when it changed."""
        post = self.last_post()
        if post is None:
            # The page had no post links (for instance an error page was
            # returned instead of the list): nothing to announce.
            return
        if post != self.cached_post:
            title, url = post
            self.url = url
            self.showMessage('', title)
            self.cached_post = post
# A QApplication has to exist before any widget is created.
app = QApplication(sys.argv)
# Parent object handed to the tray icon and its menu; it is never shown here.
w = QWidget()
tray = Tray(QIcon('smile.png'), w)
tray.show()
# Call tray.check every 5000 ms.
timer = QTimer()
timer.timeout.connect(tray.check)
timer.start(5000)
# Enter the Qt event loop.
app.exec_()
0