from IPython.core.display import HTML

# Load the site's Creative Commons footer snippet (appended to every post).
with open('creative_commons.txt', 'r') as f:
    html = f.read()

# Notebook slug in the form "YYYY-MM-DD-title"; used for the download and
# nbviewer links here and for the post metadata generated further down.
name = '2015-07-06-podcasts'

# Prepend the standard "this post is a notebook" banner to the license HTML.
html = '''
<small>
<p> This post was written as an IPython notebook.
It is available for <a href='https://ocefpaf.github.com/python4oceanographers/downloads/notebooks/%s.ipynb'>download</a>
or as a static <a href='https://nbviewer.ipython.org/url/ocefpaf.github.com/python4oceanographers/downloads/notebooks/%s.ipynb'>html</a>.</p>
<p></p>
%s''' % (name, name, html)
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

from matplotlib import style

# Use ggplot-like styling for any figures in this post.
style.use('ggplot')
# Build the Pelican front matter for this post and write the Markdown file
# that embeds the notebook starting at its third cell.
import os
from datetime import datetime

post_title = "Web scraping 101 (or how to get ready for a long trip)"
post_time = datetime.utcnow().strftime('%H:%M')
allow_comments = "true"

# The slug "YYYY-MM-DD-title" splits into a date prefix and the title part.
parts = name.split('-')
post_date = '-'.join(parts[:3])
post_slug = '-'.join(parts[3:])

markdown = (
    "Title: {}\n"
    "date: {} {}\n"
    "comments: {}\n"
    "slug: {}\n"
    "{{% notebook {}.ipynb cells[2:] %}}\n"
).format(post_title, post_date, post_time, allow_comments, post_slug, name)

# The .md file lives two directories above the notebook's folder.
content = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, os.pardir, name + '.md'))
with open(content, 'w') as f:
    f.write(markdown)
In this post I will show how to write a simple script to scrape a webpage with a list of podcasts links. I did that while preparing to go to Austin. It is a nice way to use my extra "airport time" to study a little bit.
The first step is to list all the links in the podcast webpage,
import requests
from bs4 import BeautifulSoup, SoupStrainer


def urllister(url):
    """Return a list of every ``href`` found in the page at `url`.

    Only ``<a>`` tags are parsed (via ``SoupStrainer``) to keep the parse
    cheap; anchors without an ``href`` attribute are skipped.

    Parameters
    ----------
    url : str
        Address of the page to scan for links.

    Returns
    -------
    list of str
        The href values, in document order.
    """
    r = requests.get(url)
    # Name the parser explicitly: without it bs4 warns and the result can
    # differ depending on which parsers happen to be installed.
    soup = BeautifulSoup(r.content, 'html.parser',
                         parse_only=SoupStrainer('a'))
    # `href=True` keeps only anchors that actually carry a link, replacing
    # the original try/except around `has_attr`.
    return [link['href'] for link in soup.find_all('a', href=True)]
and filter it by the file extension you want to download:
import fnmatch


def filter_url(urls, filetype="*.mp3"):
    """Return an iterator over the entries of `urls` matching `filetype`.

    Parameters
    ----------
    urls : iterable of str
        Candidate URLs/file names.
    filetype : str
        Shell-style wildcard pattern (see `fnmatch`); the default keeps
        only MP3 links.
    """
    # fnmatch.filter already returns the matching subset; wrapping it in
    # iter() preserves the lazy, single-use contract of the original
    # (redundant) generator expression.
    return iter(fnmatch.filter(urls, filetype))
Now we need to create a download function. I do not remember where I got the function below. It is probably a mixture of StackOverflow and some customizations. The beauty of this function is that it can resume a partial download and displays a nice progress bar.
import os
import sys
try:
from urllib.error import HTTPError
from urllib.request import FancyURLopener
except ImportError:
from urllib2 import HTTPError
from urllib import FancyURLopener
from progressbar import ProgressBar
class URLOpener(FancyURLopener):
    """Subclass to override error 206 (partial file being sent)."""

    def http_error_206(self, url, fp, errcode, errmsg, headers, data=None):
        # 206 "Partial Content" is the server honoring our Range header
        # when resuming a download, so it must not be treated as an error.
        pass  # Ignore the expected "non-error" code.
def download(fname, url, verbose=False):
    """Download `url` into `fname`, resuming a partial file if present.

    If `fname` already exists, its current size is sent as an HTTP Range
    header so only the remaining bytes are fetched and appended; otherwise
    the file is created from scratch.  A progress bar is displayed while
    bytes are being transferred.

    Parameters
    ----------
    fname : str
        Local file to write (or append) to.
    url : str
        Remote resource to fetch.
    verbose : bool
        If True, print the response headers and byte counts.
    """
    current_size = 0
    url_obj = URLOpener()
    if os.path.exists(fname):
        output = open(fname, "ab")
        current_size = os.path.getsize(fname)
        # If the file exists, then download only the remainder.
        url_obj.addheader("Range", "bytes=%s-" % (current_size))
    else:
        output = open(fname, "wb")
    web_page = url_obj.open(url)
    if verbose:
        for key, value in web_page.headers.items():
            sys.stdout.write("{} = {}\n".format(key, value))
    # If we already have the whole file, there is no need to download it again.
    num_bytes = 0
    # NOTE(review): after a Range request, servers typically report only the
    # *remaining* bytes in Content-Length, so comparing it against the full
    # on-disk size may not detect an already-complete file -- confirm
    # against the servers being scraped.
    full_size = int(web_page.headers['Content-Length'])
    if full_size == current_size:
        msg = "File ({}) was already downloaded from URL ({})".format
        sys.stdout.write(msg(fname, url))
    elif full_size == 0:
        sys.stdout.write("Full file size equal zero!"
                         "Try again later or check the file")
    else:
        if verbose:
            msg = "Downloading {:d} more bytes".format
            sys.stdout.write(msg(full_size - current_size))
        pbar = ProgressBar(maxval=full_size)
        pbar.start()
        # Stream in 8 KiB chunks; an empty read (or a ValueError from a
        # closed response) marks the end of the transfer.
        while True:
            try:
                data = web_page.read(8192)
            except ValueError:
                break
            if not data:
                break
            output.write(data)
            num_bytes = num_bytes + len(data)
            pbar.update(num_bytes)
        pbar.finish()
    web_page.close()
    output.close()
    if verbose:
        msg = "Downloaded {} bytes from {}".format
        sys.stdout.write(msg(num_bytes, web_page.url))
Now find a URL with the podcasts you want and start scraping. Be nice and sleep a little bit before each download!
from time import sleep

# Fetch the first hundred-and-one episodes, pausing politely between
# requests so we do not hammer the server.
episode_url = "http://some-url-with-podcasts/podcast-{}.mp3".format

for episode in range(0, 101):
    link = episode_url(episode)
    print(link + '\n')
    try:
        local_name = link.split('/')[-1]
        download(local_name, link, verbose=True)
    except HTTPError:
        print('Cannot download {}\n'.format(link))
    print('\n')
    sleep(2)
Be sure to read the page's terms of use. Some podcast providers do not like scraping!
I will be listening to some Spanish classes. Nope, just lost my phone at
the airport... I won't be listening to anything :-(
# Render the banner + Creative Commons footer assembled in the first cell.
HTML(html)
This post was written as an IPython notebook. It is available for download or as a static html.