# pip install beautifulsoup4 lxml  (lxml is the parser that BeautifulSoup is asked to use below)
from bs4 import BeautifulSoup
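# urllib.request is part of the Python standard library, so no pip install is needed
# (urllib3 is a separate third-party package that this script does not use).
# This module helps in downloading data.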
import urllib.request
# Download the raw HTML (bytes) of the Yelp search results page.
# The context manager closes the HTTP response as soon as the body has been
# read, instead of leaving the connection open until garbage collection.
with urllib.request.urlopen('http://www.yelp.ca/search?find_loc=Calgary,+AB&cflt=homeservices') as response:
    r = response.read()
# Use the Beautiful Soup library to parse the downloaded data with the lxml parser.
soup = BeautifulSoup(r, "lxml")
# Check what kind of object the parser returned.
type(soup)
bs4.BeautifulSoup
# Find the number of characters in the pretty-printed form of the downloaded page.
len(str(soup.prettify()))
440689
# Convert the pretty-printed page to a string using str().
# Note: in R, str() shows the structure of an object, but in Python str() converts
# to a character string (as as.character or the paste command would do in R).
# NOTE(review): prettify() likely already returns a str, which would make this
# str() call redundant - confirm against the Beautiful Soup documentation.
a=str(soup.prettify())
# Find the position of the first occurrence of a particular attribute we are interested in.
# Note: the search text is wrapped in triple quotes so that the double quotes
# inside it can be written as-is, without escaping.
a.find('''class="snippet"''')
352138
# Display the part of the page text around the position reported by find() above (352138).
a[352000:358000]
'dth="30"/>\n </a>\n </div>\n </div>\n <div class="media-story">\n <p class="snippet">\n We\'re the best of bank and broker. We have locations so that you know where we are. We\'re connected with all banks, not just one. And we pass along our volume discount to get your mortgage…\n </p>\n </div>\n </div>\n </div>\n </div>\n </li>\n <li class="regular-search-result">\n <div class="search-result natural-search-result" data-key="1">\n <div class="biz-listing-large">\n <div class="main-attributes">\n <div class="media-block media-block--12">\n <div class="media-avatar">\n <div class="photo-box pb-90s">\n <a href="/biz/always-affordable-always-available-locksmiths-calgary?search_key=36031">\n <img alt="Always Affordable Always Available Locksmiths" class="photo-box-img" height="90" src="//s3-media2.fl.yelpcdn.com/bphoto/8DBH3BpLINfTAK_Up5BtUQ/90s.jpg" width="90"/>\n </a>\n </div>\n </div>\n <div class="media-story">\n <h3 class="search-result-title">\n <span class="indexed-biz-name">\n 1.\n <a class="biz-name" data-hovercard-id="8QwuvWymqegNxbMgegZ1kg" href="/biz/always-affordable-always-available-locksmiths-calgary?search_key=36031">\n <span>\n Always Affordable Always Available Locksmiths\n </span>\n </a>\n </span>\n </h3>\n <div class="biz-rating biz-rating-large clearfix">\n <div class="rating-large">\n <i class="star-img stars_5" title="5.0 star rating">\n <img alt="5.0 star rating" class="offscreen" height="303" src="//s3-media4.fl.yelpcdn.com/assets/srv0/yelp_styleguide/c2252a4cd43e/assets/img/stars/stars_map.png" width="84"/>\n </i>\n </div>\n <span class="review-count rating-qualifier">\n 7 reviews\n </span>\n </div>\n <div class="price-category">\n <span class="category-str-list">\n <a href="/search?find_loc=Calgary%2C+AB&cflt=locksmiths">\n Keys & Locksmiths\n </a>\n </span>\n </div>\n <ul class="search-result_tags">\n </ul>\n </div>\n </div>\n </div>\n <div class="secondary-attributes">\n <address>\n 1437 Kensington Road NW\n <br/>\n Calgary, AB T2N 3R1\n 
</address>\n <span class="offscreen">\n Phone number\n </span>\n <span class="biz-phone">\n (403) 272-8923\n </span>\n </div>\n </div>\n <div class="snippet-block review-snippet">\n <div class="media-block">\n <div class="media-avatar">\n <div class="photo-box pb-30s" data-hovercard-id="6G17PcLIXZHTsRUqLgo44A">\n <a href="/user_details?userid=iPZyJg1jY9iUEuwCiAoQ4w">\n <img alt="Brian P." class="photo-box-img" height="30" src="//s3-media1.fl.yelpcdn.com/photo/bHq_rRLGej4oD-ck-5NQ6A/30s.jpg" width="30"/>\n </a>\n </div>\n </div>\n <div class="media-story">\n <p class="snippet">\n We were very pleased with the quick, professional, quality service we got from this company. \xa0When booking the appointment, the person on the phone was efficient and helpful, and although I…\n </p>\n </div>\n </div>\n </div>\n </div>\n </li>\n <li class="regular-search-result">\n <div class="search-result natural-search-result" data-key="2">\n <div class="biz-listing-large">\n <div class="main-attributes">\n <div class="media-block media-block--12">\n <div class="media-avatar">\n <div class="photo-box pb-90s">\n <a href="/biz/golden-acre-garden-sentres-calgary?search_key=36031">\n <img alt="Golden Acre Garden Sentres" class="photo-box-img" height="90" src="//s3-media1.fl.yelpcdn.com/bphoto/6T8npInLwEQx-cx-Emm6yA/90s.jpg" width="90"/>\n </a>\n </div>\n </div>\n <div class="media-story">\n <h3 class="search-result-title">\n <span class="indexed-biz-name">\n 2.\n <a class="biz-name" data-hovercard-id="DG-pdTKaegi87Df9xQvp2A" href="/biz/golden-acre-garden-sentres-calgary?search_key=36031">\n <span>\n Golden Acre Garden Sentres\n </span>\n </a>\n </span>\n </h3>\n <div class="biz-rating biz-rating-large clearfix">\n <div class="rating-large">\n <i class="star-img stars_4" title="4.0 star rating">\n <img alt="4.0 star rating" class="offscreen" height="303" src="//s3-media4.fl.yelpcdn.com/assets/srv0/yelp_styleguide/c2252a4cd43e/assets/img/stars/stars_map.png" width="84"/>\n </i>\n </div>\n 
<span class="review-count rating-qualifier">\n 13 reviews\n </span>\n </div>\n <div class="price-category">\n <span class="bullet-after">\n <span class="business-attribute price-range">\n '
# Let's try to find the list of phone numbers. We note both the HTML tag and its class.
# We use the find_all function.
letters = soup.find_all("span", class_="biz-phone")
# The slice starts at index 1, so the first match (index 0) is not displayed.
# NOTE(review): Python indexing is zero-based; use letters[0:100] if the first
# match should be included - confirm the intent.
letters[1:100]
[<span class="biz-phone"> (403) 272-8923 </span>, <span class="biz-phone"> (403) 274-4286 </span>, <span class="biz-phone"> (403) 918-4475 </span>, <span class="biz-phone"> (403) 681-4376 </span>, <span class="biz-phone"> (403) 454-0243 </span>, <span class="biz-phone"> (403) 457-6333 </span>, <span class="biz-phone"> (403) 899-0599 </span>, <span class="biz-phone"> (403) 452-2881 </span>, <span class="biz-phone"> (587) 229-0673 </span>, <span class="biz-phone"> (403) 770-4700 </span>]
# Let's look at the feedback snippets given by users.
letters2 = soup.find_all("p", class_="snippet")
# The slice starts at index 1, so the first snippet (index 0) is not displayed.
# NOTE(review): Python indexing is zero-based - confirm that skipping it is intended.
letters2[1:100]
[<p class="snippet"> We were very pleased with the quick, professional, quality service we got from this company. When booking the appointment, the person on the phone was efficient and helpful, and although I… </p>, <p class="snippet"> Yesterday I was at Golden Acres and carelessly had let myself become dehydrated, but hadn't realized what was going on. An employee, Rachel, recognized I was in trouble, made suggestions,… </p>, <p class="snippet"> Holy crap, I believe I have died and gone to heaven... I can't believe that I just discovered that there is actually a store that sells mid century modern furniture and accessories in town. I… </p>, <p class="snippet"> Really appreciate the help I've received from Mark at Mortgage Alliance. On two occasions he sent me back to my bank with some advice to get what I was looking for and saved me a lot of grief… </p>, <p class="snippet"> Such a wicked venue, place, space, I'm not even sure what the term is. I've been here on a couple occasions, the first time was a random Saturday in Inglewood and popped in. We got to meet the… </p>, <p class="snippet"> I called Carol mid-afternoon on Monday for a move-out clean. She showed up bright and early the next morning with her supplies, and (dare I say) insanely beautiful and outgoing colleague, Liz.… </p>, <p class="snippet"> ...Did not think I'd be writing a review on a furnace company but here I am. Right now in the middle of troubleshooting a heating issue. Thanks to Flash Furnace I am identifying the issue… </p>, <p class="snippet"> F2 Furnishings is a great place to shop for furniture and other home decor. The company really supports local artists and designers. A lot of their pieces are originals from local crafts… </p>, <p class="snippet"> Brandon was prompt in answering any questions we had prior to the move. On the day of the move they were on time, efficient, and professional. 
Brandon and Jesse took especial care of our… </p>, <p class="snippet"> I am a huge fan of what the Niklas Group has done to my community. I live just a block away from the Casel Marche building on 17th ave and I'm really impressed with the sense of community this… </p>]
# Check what kind of object find_all returned.
type(letters2)
bs4.element.ResultSet
# Convert the whole result set to a single string and show characters 1 to 999.
# Index 0 is skipped - presumably the opening bracket of the list representation; confirm.
str(letters2)[1:1000]
'<p class="snippet">We\'re the best of bank and broker. We have locations so that you know where we are. We\'re connected with all banks, not just one. And we pass along our volume discount to get your mortgage…</p>, <p class="snippet">\n We were very pleased with the quick, professional, quality service we got from this company. \xa0When booking the appointment, the person on the phone was efficient and helpful, and although I…\n </p>, <p class="snippet">\n Yesterday I was at Golden Acres and carelessly had let myself become dehydrated, but hadn\'t realized what was going on. \xa0An employee, Rachel, recognized I was in trouble, made suggestions,…\n </p>, <p class="snippet">\n Holy crap, I believe I have died and gone to heaven... I can\'t believe that I just discovered that there is actually a store that sells mid century modern furniture and accessories in town. I…\n </p>, <p class="snippet">\n '
# Count how many times the text "service" occurs in the string form of the snippets.
# This is a case-sensitive substring count, so it would also match inside longer words.
str(letters2).count("service")
1