#!/usr/bin/env python
# coding: utf-8

# The purpose of this five-part IPython Notebook series is to gather, clean, visualize, and model data for my final project in General Assembly's data science course.
#
# Using listings on craigslist, the code below is a set of functions that programmatically specifies markets of interest and extracts listing data. All defined functions are used in the main loop near the bottom of the notebook, which runs all functions to collect the data.
#
# The project data consists of listing data - text descriptions, geolocation, image attributes, and listing metadata - for rentals in Northern Virginia, Maryland, and Washington, DC.

# In[12]:

import requests                # constructs and sends HTTP requests
from bs4 import BeautifulSoup  # HTML parsing/scraping library
import pandas as pd
import time


# In[13]:

# Specify markets to search for housing ads.
# Each market has two levels - e.g. ['washingtondc', 'nva'] for listings in Northern VA creates the start of the
# following URL: http://washingtondc.craigslist.org/search/nva/
def define_markets_to_search():
    # to get listings for new locations, add the new location to the markets list
    markets = [['washingtondc', 'nva'], ['washingtondc', 'mld'], ['washingtondc', 'doc']]
    return markets


# In[14]:

# Count how many listings exist in a market. Results are split into pages of 100 listings each, so a
# market (e.g. Northern Virginia) with 2,500 listings returns 2500, which the pagination loop below
# steps through in increments of 100.
def count_number_of_listings(market):
    # create the url for the specified market
    url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'  # apa == apartments/housing for rent

    # make the request
    response = requests.get(url)

    # place the data in a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')

    # the 'totalcount' span holds the total number of listings in the market
    pages_of_listings = int(soup.find_all('span', class_='totalcount')[0].text)
    return pages_of_listings


# In[15]:

# Get a list of all listing ids - a market with 100 listings would return 100 distinct ids
def get_listing_url_list(market, pages_of_listings):
    listing_ids = []

    # create a list of all the listing ids - these are the unique ids that make each listing URL distinct
    for page in range(0, pages_of_listings, 100):
        # create the url for each page of results in the specified market
        if page == 0:
            url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'
        else:
            # all pages of listings after the first append an offset (s) to the end of the URL
            url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa?s=' + str(page)

        # delay the script each time it gets a new page of listing urls (100 per page) to avoid burdening the server
        time.sleep(0.5)  # seconds

        # make the request
        response = requests.get(url)

        # place the data in a BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # collect the unique id of each listing on the page
        data_pid = soup.find_all('p', class_='row')
        for listing in data_pid:
            listing_ids.append(listing['data-pid'])

        print(page, url)  # prints the page offset and URL for each page of listings used

    return listing_ids[0:5]  # currently limited to five listings for testing - remove the slicing to increase results
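
# In[ ]:

# A minimal offline sketch of the pagination logic above - no network calls are made.
# The market and total below are hypothetical examples; craigslist serves 100 listings
# per page, so the search URLs step through offsets s=0, 100, 200, ...
example_market = ['washingtondc', 'nva']  # example entry from define_markets_to_search()
example_total = 250                       # hypothetical 'totalcount' value
for offset in range(0, example_total, 100):
    if offset == 0:
        example_url = 'http://' + example_market[0] + '.craigslist.org/search/' + example_market[1] + '/apa'
    else:
        example_url = 'http://' + example_market[0] + '.craigslist.org/search/' + example_market[1] + '/apa?s=' + str(offset)
    print(example_url)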

# In[16]:

# Create a URL and get the data from craigslist for a single listing id.
# This function is rerun in a for loop to create a distinct url and get the data for each listing of interest
def get_craigslist_listing(listing_id, market):
    url = 'http://' + market[0] + '.craigslist.org/' + market[1] + '/apa/' + listing_id + '.html'

    # make the request
    response = requests.get(url)

    # confirm the response worked
    response.raise_for_status()

    # place the data in a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup, url


# In[17]:

# Get the property attributes from the BeautifulSoup object for each listing, including # of bedrooms, bathrooms, and
# square feet; housing type, pets allowed, laundry availability, parking access, smoking permissions, and availability
def get_property_attributes(property):
    # create an empty dict to collect the property attributes
    attribute_dict = {}
    try:
        attributes_data = property.find('p', class_='attrgroup').find_all('span')
        for attribute in attributes_data:
            if 'BR' in attribute.text:
                attribute_dict['bedroom'] = attribute.text.split('/')[0].replace('BR', '')  # only keep the number
            if 'Ba' in attribute.text:
                attribute_dict['bathroom'] = attribute.text.split('/')[1].replace('Ba', '')  # only keep the number
            if 'ft' in attribute.text:
                attribute_dict['square_footage'] = attribute.text.replace('ft2', '')  # only keep the number
            if attribute.text in ['apartment', 'condo', 'cottage/cabin', 'duplex', 'flat', 'house', 'in-law',
                                  'loft', 'townhouse', 'manufactured', 'assisted living', 'land']:
                attribute_dict['housing_type'] = attribute.text
            if 'cat' in attribute.text:
                attribute_dict['cat'] = attribute.text
            if 'dog' in attribute.text:
                attribute_dict['dog'] = attribute.text
            if attribute.text in ['w/d in unit', 'laundry in bldg', 'laundry on site', 'w/d hookups']:
                attribute_dict['laundry'] = attribute.text
            if attribute.text in ['carport', 'attached garage', 'detached garage', 'off-street parking',
                                  'street parking', 'valet parking']:
                attribute_dict['parking'] = attribute.text
            if 'smoking' in attribute.text:
                attribute_dict['smoking'] = attribute.text
        attribute_dict['availability'] = property.find('span', class_='housing_movein_now property_date')['today_msg']
        attribute_dict['date_available'] = property.find('span', class_='housing_movein_now property_date')['date']
        return attribute_dict
    except Exception:
        # listings missing the expected markup return None and are skipped by the caller
        return None


# In[18]:

# Get the listed price - listings without pricing data are not relevant and will be discarded in later code
def get_property_price(property):
    price = property.find('h2')
    price = price.contents[3].text  # the price is the fourth element of the title's contents
    price = price.split()
    return price[0]
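
# In[ ]:

# A quick offline test of get_property_attributes above. The HTML fragment below is a
# hypothetical, simplified version of craigslist's attrgroup markup, not a real listing.
example_html = """
<p class="attrgroup">
  <span>2BR / 1Ba</span>
  <span>850ft2</span>
  <span>apartment</span>
  <span>cats are OK - purrr</span>
  <span>w/d in unit</span>
</p>
<span class="housing_movein_now property_date" today_msg="available now" date="2015-05-03"></span>
"""
example_soup = BeautifulSoup(example_html, 'html.parser')
print(get_property_attributes(example_soup))  # prints the parsed attribute dict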

# In[19]:

# Get the date and time of the posting (e.g. 2015-01-01 12:00pm)
def get_posting_date_and_time(property):
    posting_time = property.find('p', class_='postinginfo').find('time').text
    return posting_time


# In[20]:

# Get the user-created text description of the listing
def get_property_description(property):
    listing_description = property.find('section', class_='userbody').find('section')
    return listing_description.text


# In[21]:

# Get image attributes, such as the number of images and the average image size
def get_image_data(property):
    average_image_size = 0
    image_size_sum = 0
    image_number = 0
    try:
        images = property.find('figure').find_all('div')
        for pic in images[-1]:
            image_number = int(pic['title'])  # the image count - only the last (max) number is kept
            # image hrefs end in e.g. '600x450.jpg'; split out the width and height
            image_size = pic['href'].split('_')[-1].split('.')[0].split('x')
            image_size_sum += int(image_size[0]) * int(image_size[1])  # sum of image sizes (width * height)
    except Exception:
        pass
    try:
        average_image_size = image_size_sum / image_number
    except ZeroDivisionError:
        pass  # listings with no images keep the default of 0
    return image_number, average_image_size


# In[22]:

# Get location data, including latitude, longitude, country, city, state, and the googlemaps
# metric: location_data_accuracy
def get_property_location(property):
    location_dict = {}

    # get longitude, latitude, and the location accuracy metric
    location = property.find('div', class_='viewposting')
    try:
        location_dict['location_data_accuracy'] = location['data-accuracy']
    except Exception:
        pass
    try:
        location_dict['latitude'] = location['data-latitude']
    except Exception:
        pass
    try:
        location_dict['longitude'] = location['data-longitude']
    except Exception:
        pass

    # get country, state, and city
    try:
        # use the yahoo maps link - easier to extract data from than google maps
        map_link = property.find('p', class_='mapaddress').find_all('a')[1]['href']
        location_dict['country'] = map_link.split('country=')[1]
        location_dict['state'] = map_link.split('csz=')[1].split('&')[0].split('+')[1]
        location_dict['city'] = map_link.split('csz=')[1].split('&')[0].split('+')[0]
    except Exception:
        pass

    # get the street address
    try:
        location_dict['address'] = property.find('div', class_='mapaddress').text
    except Exception:
        pass

    return location_dict
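
# In[ ]:

# An offline illustration of the yahoo maps link parsing in get_property_location above.
# The href below is a hypothetical example of the '...&csz=City+ST&country=CC' format
# that the string splits assume.
example_href = 'http://maps.yahoo.com/#q1=123+Main+St&csz=Arlington+VA&country=US'
print(example_href.split('country=')[1])                          # country: 'US'
print(example_href.split('csz=')[1].split('&')[0].split('+')[1])  # state:   'VA'
print(example_href.split('csz=')[1].split('&')[0].split('+')[0])  # city:    'Arlington'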

# In[24]:

# Main loop - iterate through all listings in listing_ids and extract their features

# initialize lists to collect attribute and location data
property_attributes_data = []
property_location_data = []

markets = define_markets_to_search()  # define which markets to search (e.g. Washington DC)
for market in markets:
    pages_of_listings = count_number_of_listings(market)  # count the number of listings in the specified market
    listing_ids = get_listing_url_list(market, pages_of_listings)  # get a list of all the individual listings to search

    for listing_id in listing_ids:
        time.sleep(0.5)  # delay the script each time it gets a new listing

        try:
            # get the listing HTML and the listing URL
            property, url = get_craigslist_listing(listing_id, market)
        except Exception:
            continue  # skip listings that fail to download

        try:
            # create the initial dict of property attributes
            property_attributes = get_property_attributes(property)
            property_attributes['url'] = url

            # add price, description, posting time, and image data to the dict
            property_attributes['price'] = get_property_price(property)
            property_attributes['description'] = get_property_description(property)
            property_attributes['time_of_posting'] = get_posting_date_and_time(property)
            image_number, average_image_size = get_image_data(property)
            property_attributes['image_number'] = image_number
            property_attributes['average_image_size'] = average_image_size

            # add the property attributes to the property_attributes_data list
            property_attributes = pd.Series(property_attributes, name=listing_id)
            property_attributes_data.append(property_attributes)
        except Exception:
            pass

        try:
            # add the location data to the location data list
            location = get_property_location(property)
            location = pd.Series(location, name=listing_id)
            property_location_data.append(location)
        except Exception:
            pass

# put the lists into DataFrames
property_attributes_dataframe = pd.DataFrame(property_attributes_data)
property_location_dataframe = pd.DataFrame(property_location_data)


# In[25]:

# Merge the property attributes and location dataframes on their shared listing-id index
dat = pd.merge(property_location_dataframe, property_attributes_dataframe, left_index=True, right_index=True)


# In[30]:

# Check the results before converting to a csv
print(len(dat))
dat[0:2]


# In[34]:

# Create a csv with utf-8 encoding
dat.to_csv(r'C:\Users\alsherman\Desktop\GitHub\DataScience_GeneralAssembly\Data\Craigslist_Data_May_3_.csv', encoding='utf-8')
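
# In[ ]:

# Optional sanity check (a minimal sketch): read the exported csv back in and confirm
# the row count matches the in-memory DataFrame. The path mirrors the export path above.
check = pd.read_csv(r'C:\Users\alsherman\Desktop\GitHub\DataScience_GeneralAssembly\Data\Craigslist_Data_May_3_.csv', index_col=0, encoding='utf-8')
print(len(check) == len(dat))  # should print True
check.head(2)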