# pip install requests
# pip install beautifulsoup4
import requests as rs
import bs4
import time
# Front page of NAVER, the page that is downloaded and parsed below.
naver_url = 'http://www.naver.com'
# A timeout keeps the session from blocking forever on an unresponsive server.
response = rs.get(naver_url, timeout=10)
# Use the raw response bytes directly. Re-encoding response.text with
# response.encoding only reproduces these bytes, and it raises TypeError when
# the server sends no usable charset (response.encoding is then None).
html_content = response.content
type(html_content)
str
html_content[:1000]
'<!doctype html>\n<html lang="ko">\n<head>\n<meta charset="utf-8">\n<meta http-equiv="Content-Script-Type" content="text/javascript">\n<meta http-equiv="Content-Style-Type" content="text/css">\n<meta http-equiv="X-UA-Compatible" content="IE=edge">\n<meta name="viewport" content="width=960">\n<meta name="apple-mobile-web-app-title" content="NAVER" />\n<link rel="shortcut icon" type="image/x-icon" href="/favicon.ico?1" />\n\n<title>NAVER</title>\n\n\n\n\n\n\n\n<link rel="stylesheet" type="text/css" href="http://s.pm.naver.net/css/main_v20150302.css" />\n\n<link rel="stylesheet" type="text/css" id="css" href="http://sstatic.naver.net/search/css/2014/api_atcmp_0415.css" />\n\n<script type="text/javascript">\n//<![CDATA[\ndocument.domain = "naver.com";\nhistory.navigationMode = \'compatible\';\nvar nsc = "navertop.v3";\nvar jindoAll = "http://s.pm.naver.net/js/c/jindo.all.20140327.min.js";\n</script>\n<script type="text/javascript" src="http://s.pm.naver.net/js/c/nlog_20140205.min.js"></script>\n<script type="text/javascrip'
# Name the parser explicitly so the resulting tree does not depend on which
# optional parser libraries happen to be installed on the machine.
navigator = bs4.BeautifulSoup(html_content, 'html.parser')
type(navigator)
bs4.BeautifulSoup
# Collect every element whose id attribute is 'realrank' (returns a ResultSet).
realRankTag = navigator.find_all(id='realrank')
type(realRankTag)
bs4.element.ResultSet
len(realRankTag)
1
realRankTag
[<ol id="realrank" style="margin-top:0px;"> <li class="up" value="1"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8" title="백의종군">백의종군<span class="tx">상승</span><span class="ic"></span><span class="rk">960</span></a></li> <li class="up" value="2"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EC%95%A0%ED%94%8C%EC%9B%8C%EC%B9%98&sm=top_lve&ie=utf8" title="애플워치">애플워치<span class="tx">상승</span><span class="ic"></span><span class="rk">120</span></a></li> <li class="up" value="3"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EA%B9%80%EC%83%81%EA%B2%BD&sm=top_lve&ie=utf8" title="김상경">김상경<span class="tx">상승</span><span class="ic"></span><span class="rk">240</span></a></li> <li class="up" value="4"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%95%EC%A3%BC%EC%98%81&sm=top_lve&ie=utf8" title="박주영">박주영<span class="tx">상승</span><span class="ic"></span><span class="rk">177</span></a></li> <li class="up" value="5"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EC%95%A0%ED%94%8C&sm=top_lve&ie=utf8" title="애플">애플<span class="tx">상승</span><span class="ic"></span><span class="rk">60</span></a></li> <li class="up" value="6"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%A7%A5%EB%B6%81&sm=top_lve&ie=utf8" title="맥북">맥북<span class="tx">상승</span><span class="ic"></span><span class="rk">144</span></a></li> <li class="up" value="7"><a href="http://search.naver.com/search.naver?where=nexearch&query=ios8.2&sm=top_lve&ie=utf8" title="ios8.2">ios8.2<span class="tx">상승</span><span class="ic"></span><span class="rk">279</span></a></li> <li class="up" value="8"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%A7%A8%EC%9C%A0+%EC%95%84%EC%8A%A4%EB%82%A0&sm=top_lve&ie=utf8" title="맨유 아스날">맨유 아스날<span class="tx">상승</span><span class="ic"></span><span class="rk">198</span></a></li> <li 
class="up" value="9"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B9%84%EC%A0%95%EC%83%81%ED%9A%8C%EB%8B%B4&sm=top_lve&ie=utf8" title="비정상회담">비정상회담<span class="tx">상승</span><span class="ic"></span><span class="rk">54</span></a></li> <li class="up" value="10"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EA%B9%80%EC%82%AC%EC%9D%80&sm=top_lve&ie=utf8" title="김사은">김사은<span class="tx">상승</span><span class="ic"></span><span class="rk">312</span></a></li> <li class="up" id="lastrank" value="1"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8" title="백의종군">백의종군<span class="tx">상승</span><span class="ic"></span><span class="rk">960</span></a></li> </ol>]
type(realRankTag[0])
bs4.element.Tag
realRankTag[0]
<ol id="realrank" style="margin-top:0px;"> <li class="up" value="1"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8" title="백의종군">백의종군<span class="tx">상승</span><span class="ic"></span><span class="rk">960</span></a></li> <li class="up" value="2"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EC%95%A0%ED%94%8C%EC%9B%8C%EC%B9%98&sm=top_lve&ie=utf8" title="애플워치">애플워치<span class="tx">상승</span><span class="ic"></span><span class="rk">120</span></a></li> <li class="up" value="3"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EA%B9%80%EC%83%81%EA%B2%BD&sm=top_lve&ie=utf8" title="김상경">김상경<span class="tx">상승</span><span class="ic"></span><span class="rk">240</span></a></li> <li class="up" value="4"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%95%EC%A3%BC%EC%98%81&sm=top_lve&ie=utf8" title="박주영">박주영<span class="tx">상승</span><span class="ic"></span><span class="rk">177</span></a></li> <li class="up" value="5"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EC%95%A0%ED%94%8C&sm=top_lve&ie=utf8" title="애플">애플<span class="tx">상승</span><span class="ic"></span><span class="rk">60</span></a></li> <li class="up" value="6"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%A7%A5%EB%B6%81&sm=top_lve&ie=utf8" title="맥북">맥북<span class="tx">상승</span><span class="ic"></span><span class="rk">144</span></a></li> <li class="up" value="7"><a href="http://search.naver.com/search.naver?where=nexearch&query=ios8.2&sm=top_lve&ie=utf8" title="ios8.2">ios8.2<span class="tx">상승</span><span class="ic"></span><span class="rk">279</span></a></li> <li class="up" value="8"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%A7%A8%EC%9C%A0+%EC%95%84%EC%8A%A4%EB%82%A0&sm=top_lve&ie=utf8" title="맨유 아스날">맨유 아스날<span class="tx">상승</span><span class="ic"></span><span class="rk">198</span></a></li> <li 
class="up" value="9"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B9%84%EC%A0%95%EC%83%81%ED%9A%8C%EB%8B%B4&sm=top_lve&ie=utf8" title="비정상회담">비정상회담<span class="tx">상승</span><span class="ic"></span><span class="rk">54</span></a></li> <li class="up" value="10"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EA%B9%80%EC%82%AC%EC%9D%80&sm=top_lve&ie=utf8" title="김사은">김사은<span class="tx">상승</span><span class="ic"></span><span class="rk">312</span></a></li> <li class="up" id="lastrank" value="1"><a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8" title="백의종군">백의종군<span class="tx">상승</span><span class="ic"></span><span class="rk">960</span></a></li> </ol>
# Extract every <a> element nested inside the first 'realrank' match.
resultList = realRankTag[0].find_all('a')
type(resultList)
bs4.element.ResultSet
len(resultList)
11
resultList
[<a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8" title="백의종군">백의종군<span class="tx">상승</span><span class="ic"></span><span class="rk">960</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EC%95%A0%ED%94%8C%EC%9B%8C%EC%B9%98&sm=top_lve&ie=utf8" title="애플워치">애플워치<span class="tx">상승</span><span class="ic"></span><span class="rk">120</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EA%B9%80%EC%83%81%EA%B2%BD&sm=top_lve&ie=utf8" title="김상경">김상경<span class="tx">상승</span><span class="ic"></span><span class="rk">240</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%95%EC%A3%BC%EC%98%81&sm=top_lve&ie=utf8" title="박주영">박주영<span class="tx">상승</span><span class="ic"></span><span class="rk">177</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EC%95%A0%ED%94%8C&sm=top_lve&ie=utf8" title="애플">애플<span class="tx">상승</span><span class="ic"></span><span class="rk">60</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%A7%A5%EB%B6%81&sm=top_lve&ie=utf8" title="맥북">맥북<span class="tx">상승</span><span class="ic"></span><span class="rk">144</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=ios8.2&sm=top_lve&ie=utf8" title="ios8.2">ios8.2<span class="tx">상승</span><span class="ic"></span><span class="rk">279</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%A7%A8%EC%9C%A0+%EC%95%84%EC%8A%A4%EB%82%A0&sm=top_lve&ie=utf8" title="맨유 아스날">맨유 아스날<span class="tx">상승</span><span class="ic"></span><span class="rk">198</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B9%84%EC%A0%95%EC%83%81%ED%9A%8C%EB%8B%B4&sm=top_lve&ie=utf8" title="비정상회담">비정상회담<span class="tx">상승</span><span class="ic"></span><span class="rk">54</span></a>, <a 
href="http://search.naver.com/search.naver?where=nexearch&query=%EA%B9%80%EC%82%AC%EC%9D%80&sm=top_lve&ie=utf8" title="김사은">김사은<span class="tx">상승</span><span class="ic"></span><span class="rk">312</span></a>, <a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8" title="백의종군">백의종군<span class="tx">상승</span><span class="ic"></span><span class="rk">960</span></a>]
# Take the first anchor so a single entry can be inspected.
item = resultList[0]
type(item)
bs4.element.Tag
len(item)
4
item
<a href="http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8" title="백의종군">백의종군<span class="tx">상승</span><span class="ic"></span><span class="rk">960</span></a>
item['href']
u'http://search.naver.com/search.naver?where=nexearch&query=%EB%B0%B1%EC%9D%98%EC%A2%85%EA%B5%B0&sm=top_lve&ie=utf8'
item['title']
u'\ubc31\uc758\uc885\uad70'
# Gather the title attribute of every anchor; these are the keyword strings.
keywords = [item['title'] for item in resultList]
keywords[:3]
[u'\ubc31\uc758\uc885\uad70', u'\uc560\ud50c\uc6cc\uce58', u'\uae40\uc0c1\uacbd']
# Print one "[<rank>위] <keyword>" line per keyword. Ranks are counted from 1,
# so the top keyword is shown as rank 1 rather than rank 0.
for index, keyword in enumerate(keywords, start=1):
    # Keep the text as unicode and let print encode it once at the output
    # boundary; the single-argument call form runs on Python 2 and Python 3.
    print(u'[%d위] %s' % (index, keyword))
[0위] 백의종군 [1위] 애플워치 [2위] 김상경 [3위] 박주영 [4위] 애플 [5위] 맥북 [6위] ios8.2 [7위] 맨유 아스날 [8위] 비정상회담 [9위] 김사은 [10위] 백의종군
위의 단계들을 모두 합쳐서 함수로 만든다.
#-*- encoding: utf-8 -*-
#실시간 검색어
import requests as rs
import bs4
import time
def getTopRank():
    """Print NAVER's real-time trending search keywords with their ranks.

    Downloads the NAVER front page, locates the element whose id is
    ``realrank``, collects the ``title`` attribute of each anchor inside it,
    and prints a separator line, the current local time, a blank line and
    then one ``[<rank>위] <keyword>`` line per keyword. Ranks start at 1.

    Returns:
        None. The result is only printed.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
        IndexError: If the page has no element with id ``realrank``.
        KeyError: If an anchor inside the list has no ``title`` attribute.
    """
    naver_url = 'http://www.naver.com'

    # 1) Request the page. The timeout prevents hanging forever on a dead
    #    connection, and raise_for_status stops an error page from being
    #    parsed as if it were the real front page.
    response = rs.get(naver_url, timeout=10)
    response.raise_for_status()

    # 2) Take the HTML as raw bytes. Re-encoding response.text with
    #    response.encoding only reproduces these bytes and fails with
    #    TypeError when response.encoding is None.
    html_content = response.content

    # 3) Parse the HTML with an explicitly named parser so the result does
    #    not depend on which optional parsers are installed.
    navigator = bs4.BeautifulSoup(html_content, 'html.parser')

    # 4) Use the navigator to fetch the ranking list and its anchors.
    rank_lists = navigator.find_all(id='realrank')
    if not rank_lists:
        # Same exception type the bare [0] lookup raised, with a usable message.
        raise IndexError(
            "no element with id 'realrank' found at %s" % naver_url)
    anchors = rank_lists[0].find_all('a')

    # 5) Extract the keywords. The trailing <li id="lastrank"> entry repeats
    #    the first keyword (it carries value="1" again), so it is skipped to
    #    avoid printing a duplicate as an extra rank.
    keywords = [
        anchor['title']
        for anchor in anchors
        if anchor.parent.get('id') != 'lastrank'
    ]

    print('=============')
    print(time.ctime())
    print('')

    # 6) Print the keywords, counting ranks from 1. The text stays unicode
    #    and is encoded once by print at the output boundary.
    for rank, keyword in enumerate(keywords, start=1):
        print(u'[%d위] %s' % (rank, keyword))
# Run: fetch the page and print the current ranking once.
getTopRank()
============= Tue Mar 10 10:44:19 2015 [0위] 백의종군 [1위] 애플워치 [2위] 김상경 [3위] 박주영 [4위] 애플 [5위] 맥북 [6위] ios8.2 [7위] 비정상회담 [8위] 김사은 [9위] 맨유 아스날 [10위] 백의종군