#!/usr/bin/env python
# coding: utf-8
"""Notebook export: download one Yelp search page and inspect its HTML.

The script fetches a Yelp search-results page, parses it with Beautiful
Soup, and looks up the phone-number and review-snippet elements.

The ``# In[N]:`` markers are the original notebook cell boundaries.  Bare
expressions (e.g. ``type(soup)``) are cell outputs: they are displayed when
the file is run cell-by-cell and are silently discarded when it is run as a
plain script.
"""

# In[4]:

# pip install beautifulsoup4
from bs4 import BeautifulSoup

# In[5]:

# urllib.request is part of Python's standard library, so nothing needs to be
# installed for it (it is not the third-party "urllib3" package).
# This library helps in downloading data.
import urllib.request

# # Request Data from URL

# In[6]:

URL = 'http://www.yelp.ca/search?find_loc=Calgary,+AB&cflt=homeservices'

# Use the response as a context manager so the HTTP connection is closed as
# soon as the body has been read; `r` holds the raw page bytes.
with urllib.request.urlopen(URL) as response:
    r = response.read()

# In[28]:

# Use the Beautiful Soup library to parse the data.
# The "lxml" parser requires the lxml package to be installed.
soup = BeautifulSoup(r, "lxml")
type(soup)

# In[52]:

# Convert the parsed page to a string using str.
# Note: in R, str() shows the structure of an object, but in Python str()
# converts to a character string (as as.character() or paste() would in R).
# The text is rendered once here and reused below instead of calling
# prettify() a second time.
a = str(soup.prettify())

# In[53]:

# Number of characters in the downloaded (prettified) page.
len(a)

# In[57]:

# Find the location of a particular tag we are interested in.
# Triple quotes are used so the double quotes inside the search text need no
# escaping.  str.find returns -1 when the text is not present.
a.find('''class="snippet"''')

# In[58]:

# NOTE(review): these offsets are hard-coded, presumably chosen around the
# position reported by the find() call above for one particular download of
# the page -- confirm they still point at the snippet markup.
a[352000:358000]

# In[21]:

# Find the list of phone numbers.  We note both the HTML tag and its class,
# and use the find_all function.
letters = soup.find_all("span", class_="biz-phone")
# NOTE(review): Python slices are 0-based, so [1:100] skips the first match
# (index 0) -- confirm that this is intended.
letters[1:100]

# In[22]:

# See the feedback given by users.
letters2 = soup.find_all("p", class_="snippet")
# NOTE(review): as above, [1:100] skips the first match (index 0).
letters2[1:100]

# In[23]:

type(letters2)

# In[24]:

# NOTE(review): [1:1000] drops the first character of the string form of the
# result list -- confirm that this is intended.
str(letters2)[1:1000]

# In[25]:

# Count how many times the text "service" occurs in the string form of the
# snippet results (this includes occurrences inside the HTML markup, not only
# in the visible review text).
str(letters2).count("service")