#!/usr/bin/env python
# coding: utf-8
"""Scrape fullerene isomer .xyz structure files from nanotube.msu.edu.

Walks the isomer index page, follows every ``fullerene.php?C...`` link,
downloads each ``.xyz`` structure file once, then moves the downloaded
files into one folder per formula (the part of the filename before the
first ``-``).

Based on: https://towardsdatascience.com/how-to-web-scrape-with-python-in-4-minutes-bc49186a8460
"""

import os
import shutil
import time
import urllib.request
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

BASE_URL = "http://www.nanotube.msu.edu/fullerene/"
INDEX_URL = BASE_URL + "fullerene-isomers.html"


def fetch_soup(url):
    """Fetch *url* and return its parsed HTML.

    Raises requests.HTTPError on a non-2xx response instead of silently
    parsing an error page.
    """
    response = requests.get(url)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")


def download_structures():
    """Follow every isomer link on the index page and download its .xyz files.

    Prints each link it visits and each file it downloads; sleeps between
    requests to stay polite to the server.
    """
    index_soup = fetch_soup(INDEX_URL)
    for link in index_soup.find_all("a"):
        time.sleep(1)  # throttle: one index link per second
        # .get() returns None for anchors without an href attribute,
        # where link["href"] would raise KeyError.
        href = link.get("href")
        print("\n--->", href)
        if href is None or not href.startswith("fullerene.php?C"):
            continue
        page_soup = fetch_soup(BASE_URL + href)
        seen = set()  # filenames already fetched from this page
        for structure in page_soup.find_all("a"):
            s_href = structure.get("href")
            if not s_href or not s_href.endswith(".xyz"):
                continue
            filename = s_href.split("/")[-1]
            if filename in seen:
                continue  # same file linked more than once on the page
            seen.add(filename)
            print("Download: ", filename)
            # urljoin resolves "./..."-style relative links correctly;
            # the original `baseurl + s_href[1:]` assumed a leading ".".
            urllib.request.urlretrieve(urljoin(BASE_URL, s_href), filename)
            time.sleep(1)  # throttle downloads


def sort_into_folders():
    """Move each downloaded ``<formula>-*.xyz`` file into a ``<formula>/`` folder."""
    for name in os.listdir():
        if name.endswith(".xyz"):
            folder = name.split("-")[0]
            # exist_ok avoids the check-then-create race of exists()+mkdir().
            os.makedirs(folder, exist_ok=True)
            shutil.move(name, folder)


if __name__ == "__main__":
    download_structures()
    sort_into_folders()