The purpose of this five-part IPython Notebook series is to gather, clean, visualize, and model data for my final project in General Assembly's data science course.
Using listings on craigslist, the below code is a set of functions which programmatically specifies markets of interest and extracts listing data. All defined functions are used in the main script near the bottom of the notebook - which runs all functions to collect the data.
The project data consists of listing data - text descriptions, geolocation, image attributes, and listing metadata - for rentals in Northern Virginia, Maryland, and Washington, DC.
import requests # Helps construct the request to send to the API
import json # JSON helper functions
from bs4 import BeautifulSoup #Data Scraping Library
import pandas as pd
import time
def define_markets_to_search():
    """Return the list of [site, subregion] market pairs to scrape.

    Each pair forms the start of a craigslist search URL, e.g.
    ['washingtondc', 'nva'] -> http://washingtondc.craigslist.org/search/nva/
    To scrape additional locations, append a new [site, subregion] pair.
    """
    return [
        ['washingtondc', 'nva'],  # Northern Virginia
        ['washingtondc', 'mld'],  # Maryland
        ['washingtondc', 'doc'],  # District of Columbia
    ]
def count_number_of_listings(market):
    """Return the total number of apartment listings in *market*.

    market: [site, subregion] pair, e.g. ['washingtondc', 'nva'].
    The value returned is the 'totalcount' shown on the first search
    results page; the caller steps through it 100 listings (one page)
    at a time.
    """
    # Count listings in the specific subregion (market[1]) so the total
    # matches the pages get_listing_url_list will actually fetch; the
    # original omitted the subregion and counted the whole metro area.
    url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'  # apa == apartment
    # Search pages are read-only, so GET (not POST) is the correct verb.
    response = requests.get(url)
    # Parse with the stdlib parser explicitly to avoid parser-guessing
    # warnings and cross-machine differences.
    soup = BeautifulSoup(response.text, 'html.parser')
    # 'totalcount' holds the total number of listings across all pages.
    pages_of_listings = int(soup.find_all('span', class_='totalcount')[0].text)
    return pages_of_listings
# Get a list of all listing ids - a market with 100 listings would return 100 distinct ids
def get_listing_url_list(market, pages_of_listings):
listing_ids = [] #create list of all the listing ids - these are the unique ids, which make each URL distinct
for page in xrange(0, pages_of_listings, 100):
#create url for each specified market
if page == 0:
url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'
else:
#all pages of listings after the first append a page number to the end of the URL
url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa?s=' + str(page) + '&'
# delay script each time it gets a new set of listing urls (100 for each page) to avoid burdening the server
time.sleep(0.5) #seconds
# Make the request
response = requests.post(url)
#place data in Beautiful soup object
soup = BeautifulSoup(response.text)
#create links to each listing page
data_pid = soup.find_all('p', class_='row')
for listing_id in data_pid:
listing_ids.append(listing_id['data-pid'])
print page, url #prints the page number of URL for each page of listings used
return listing_ids[0:5] #currently limited to five listings for test, remove slicing at end to increase results
def get_craigslist_listing(id, market):
    """Fetch one listing page and return (soup, url).

    id: the listing's unique 'data-pid' string.
    market: [site, subregion] pair used to build the listing URL.
    Called in a loop by the main script, once per listing of interest.
    """
    url = 'http://' + market[0] + '.craigslist.org/' + market[1] + '/apa/' + id + '.html'
    # Listing pages are read-only, so GET (not POST) is the correct verb.
    response = requests.get(url)
    # NOTE(review): no status check is performed (the original had a
    # bare 'response.status_code' no-op) -- an error body still parses
    # into a soup with no usable fields, which downstream code discards.
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup, url
# Craigslist example listing page
from IPython.display import HTML
# Render a live craigslist apartments page inline in the notebook as a
# visual reference for the HTML structure the functions above parse.
HTML('<iframe src=http://washingtondc.craigslist.org/apa/ width=1000 height=500></iframe>')
def get_property_attributes(property):
    """Extract structured property attributes from a listing page.

    property: BeautifulSoup object for a single listing page.
    Returns a dict with any of: bedroom, bathroom, square_footage,
    housing_type, cat, dog, laundry, parking, smoking, availability,
    date_available. Returns None when the page lacks the expected
    attribute markup (callers treat None as "skip this listing").
    """
    attribute_dict = {}
    try:
        # each attribute is a <span> inside the 'attrgroup' paragraph
        attributes_data = property.find('p', class_='attrgroup').find_all('span')
        for attribute in attributes_data:
            text = attribute.text
            if 'BR' in text:
                attribute_dict['bedroom'] = text.split('/')[0].replace('BR', '')  # only keep the number
            if 'Ba' in text:
                attribute_dict['bathroom'] = text.split('/')[1].replace('Ba', '')  # only keep the number
            if 'ft' in text:
                attribute_dict['square_footage'] = text.replace('ft2', '')  # only keep the number
            if text in ['apartment', 'condo', 'cottage/cabin', 'duplex', 'flat',
                        'house', 'in-law', 'loft', 'townhouse', 'manufactured', 'assisted living', 'land']:
                attribute_dict['housing_type'] = text
            if 'cat' in text:
                attribute_dict['cat'] = text
            if 'dog' in text:
                attribute_dict['dog'] = text
            if text in ['w/d in unit', 'laundry in bldg', 'laundry on site', 'w/d hookups']:
                attribute_dict['laundry'] = text
            if text in ['carport', 'attached garage', 'detached garage', 'off-street parking', 'street parking', 'valet parking']:
                attribute_dict['parking'] = text
            if 'smoking' in text:
                attribute_dict['smoking'] = text
        # move-in availability lives on a separate span's data attributes
        availability_span = property.find('span', class_='housing_movein_now property_date')
        attribute_dict['availability'] = availability_span['today_msg']
        attribute_dict['date_available'] = availability_span['date']
        return attribute_dict
    except (AttributeError, TypeError, KeyError, IndexError):
        # Narrowed from a bare 'except' so genuine bugs (and Ctrl-C) are
        # not silently swallowed; missing markup still yields None as the
        # original did.
        return None
def get_property_price(property):
    """Return the listed price token (e.g. '$1310') from the page heading.

    Listings without pricing markup raise here; the caller's try/except
    discards them, per the project's "no price, not relevant" rule.
    """
    heading = property.find('h2')
    # the fourth child node of the <h2> carries the price text
    price_text = heading.contents[3].text
    # first whitespace-separated token is the dollar amount
    return price_text.split()[0]
def get_posting_date_and_time(property):
    """Return the posting timestamp text (e.g. '2015-01-01 12:00pm').

    The timestamp is the <time> element inside the 'postinginfo'
    paragraph of the listing page.
    """
    posting_info = property.find('p', class_='postinginfo')
    return posting_info.find('time').text
def get_property_description(property):
    """Return the seller-written free-text description of the listing.

    The description is the inner <section> of the 'userbody' section
    on the listing page.
    """
    user_body = property.find('section', class_='userbody')
    return user_body.find('section').text
def get_image_data(property):
    """Return (image_number, average_image_size) for a listing.

    image_number: count of images attached to the listing (0 if none).
    average_image_size: mean width*height in pixels across the images
        (0 if there are no images).
    """
    image_number = 0
    image_size_sum = 0
    average_image_size = 0
    try:
        images = property.find('figure').find_all('div')
        # The last div holds one link per image: its 'title' is the
        # 1-based image index (last seen == total count) and its 'href'
        # ends in '<width>x<height>.<ext>'.
        for pic in images[-1]:
            image_number = int(pic['title'])
            width, height = pic['href'].split('_')[-1].split('.')[0].split('x')
            image_size_sum += int(width) * int(height)  # width * height in pixels
    except (AttributeError, TypeError, KeyError, IndexError, ValueError):
        # Narrowed from a bare 'except': the listing has no image gallery
        # (or unexpected markup); keep whatever was collected so far.
        pass
    # explicit guard replaces the original's caught ZeroDivisionError
    if image_number:
        average_image_size = image_size_sum / image_number
    return image_number, average_image_size
def get_property_location(property):
    """Extract location data from a listing page.

    property: BeautifulSoup object for a single listing page.
    Returns a dict containing only the fields that could be extracted,
    from: location_data_accuracy, latitude, longitude, country, state,
    city. (location_data_accuracy is a googlemaps accuracy metric.)
    """
    location_dict = {}
    # Geodata is stored as data-* attributes on the 'viewposting' div;
    # any of them may be absent, so each is fetched independently. The
    # original relied on NameError from undefined locals plus seven bare
    # excepts; this builds the same dict directly.
    viewposting = property.find('div', class_='viewposting')
    for key, attr in [('location_data_accuracy', 'data-accuracy'),
                      ('latitude', 'data-latitude'),
                      ('longitude', 'data-longitude')]:
        try:
            location_dict[key] = viewposting[attr]
        except (TypeError, KeyError):
            pass  # div missing or attribute absent - leave key out
    # Country/state/city come from the yahoo maps link in 'mapaddress'
    # (easier to parse than the google maps link). As in the original,
    # a failure partway through keeps whatever was already stored.
    try:
        map_link = property.find('p', class_='mapaddress').find_all('a')[1]['href']
        location_dict['country'] = map_link.split('country=')[1]
        csz = map_link.split('csz=')[1].split('&')[0]  # 'City+ST'
        location_dict['state'] = csz.split('+')[1]
        location_dict['city'] = csz.split('+')[0]
    except (AttributeError, TypeError, KeyError, IndexError):
        pass
    # NOTE(review): the original also scraped the 'mapaddress' div text
    # into an 'address' local but never stored it; dropped as dead code.
    return location_dict
# Main script - loop through all listings in listing_ids and extract features.
# NOTE(review): despite the original "Main class" comment, this is a flat
# driver script, not a class.
# initialize lists to collect attribute and location data
property_attributes_data = []  # one pd.Series of attributes per listing
property_location_data = []    # one pd.Series of location fields per listing
markets = define_markets_to_search()  # Define which markets to search (e.g. Washington DC)
for market in markets:
    pages_of_listings = count_number_of_listings(market)  # Count number of listings in specified market
    listing_ids = get_listing_url_list(market, pages_of_listings)  # Get a list of all the individual listings to search
    for id in listing_ids:
        time.sleep(0.5)  # delay the script each time it gets a new listing, to go easy on the server
        try:
            # get the listing HTML (soup object) and the listing URL
            property, url = get_craigslist_listing(id, market)
        except: pass
        # NOTE(review): if the fetch above fails, 'property' and 'url' keep
        # their values from the PREVIOUS iteration, so the prior listing may
        # be recorded again under the new id -- confirm this is acceptable.
        try:
            # create initial dict with property attributes
            property_attributes = get_property_attributes(property)
            property_attributes['url'] = url
            # add price, description, and image data to dict
            property_attributes['price'] = get_property_price(property)
            property_attributes['description'] = get_property_description(property).encode('utf-8')
            property_attributes['time_of_posting'] = get_posting_date_and_time(property)
            image_number, average_image_size = get_image_data(property)
            property_attributes['image_number'] = image_number
            property_attributes['average_image_size'] = average_image_size
            # add property attributes to property_attributes_data list,
            # keyed by the listing id so the frames can be merged later
            property_attributes = pd.Series(property_attributes, name=id)
            property_attributes_data.append(property_attributes)
        except: pass
        # NOTE(review): listings whose attributes return None are skipped
        # here via the broad except (TypeError on the 'url' assignment).
        try:
            # add location data to location data list (same id key)
            location = get_property_location(property)
            location = pd.Series(location, name=id)
            property_location_data.append(location)
        except: pass
# put lists into DataFrames (index = listing id)
property_attributes_dataframe = pd.DataFrame(property_attributes_data)
property_location_dataframe = pd.DataFrame(property_location_data)
0 http://washingtondc.craigslist.org/search/nva/apa 100 http://washingtondc.craigslist.org/search/nva/apa?s=100& 200 http://washingtondc.craigslist.org/search/nva/apa?s=200& 300 http://washingtondc.craigslist.org/search/nva/apa?s=300& 400 http://washingtondc.craigslist.org/search/nva/apa?s=400& 500 http://washingtondc.craigslist.org/search/nva/apa?s=500& 600 http://washingtondc.craigslist.org/search/nva/apa?s=600& 700 http://washingtondc.craigslist.org/search/nva/apa?s=700& 800 http://washingtondc.craigslist.org/search/nva/apa?s=800& 900 http://washingtondc.craigslist.org/search/nva/apa?s=900& 1000 http://washingtondc.craigslist.org/search/nva/apa?s=1000& 1100 http://washingtondc.craigslist.org/search/nva/apa?s=1100& 1200 http://washingtondc.craigslist.org/search/nva/apa?s=1200& 1300 http://washingtondc.craigslist.org/search/nva/apa?s=1300& 1400 http://washingtondc.craigslist.org/search/nva/apa?s=1400& 1500 http://washingtondc.craigslist.org/search/nva/apa?s=1500& 1600 http://washingtondc.craigslist.org/search/nva/apa?s=1600& 1700 http://washingtondc.craigslist.org/search/nva/apa?s=1700& 1800 http://washingtondc.craigslist.org/search/nva/apa?s=1800& 1900 http://washingtondc.craigslist.org/search/nva/apa?s=1900& 2000 http://washingtondc.craigslist.org/search/nva/apa?s=2000& 2100 http://washingtondc.craigslist.org/search/nva/apa?s=2100& 2200 http://washingtondc.craigslist.org/search/nva/apa?s=2200& 2300 http://washingtondc.craigslist.org/search/nva/apa?s=2300& 2400 http://washingtondc.craigslist.org/search/nva/apa?s=2400& 0 http://washingtondc.craigslist.org/search/mld/apa 100 http://washingtondc.craigslist.org/search/mld/apa?s=100& 200 http://washingtondc.craigslist.org/search/mld/apa?s=200& 300 http://washingtondc.craigslist.org/search/mld/apa?s=300& 400 http://washingtondc.craigslist.org/search/mld/apa?s=400& 500 http://washingtondc.craigslist.org/search/mld/apa?s=500& 600 http://washingtondc.craigslist.org/search/mld/apa?s=600& 700 
http://washingtondc.craigslist.org/search/mld/apa?s=700& 800 http://washingtondc.craigslist.org/search/mld/apa?s=800& 900 http://washingtondc.craigslist.org/search/mld/apa?s=900& 1000 http://washingtondc.craigslist.org/search/mld/apa?s=1000& 1100 http://washingtondc.craigslist.org/search/mld/apa?s=1100& 1200 http://washingtondc.craigslist.org/search/mld/apa?s=1200& 1300 http://washingtondc.craigslist.org/search/mld/apa?s=1300& 1400 http://washingtondc.craigslist.org/search/mld/apa?s=1400& 1500 http://washingtondc.craigslist.org/search/mld/apa?s=1500& 1600 http://washingtondc.craigslist.org/search/mld/apa?s=1600& 1700 http://washingtondc.craigslist.org/search/mld/apa?s=1700& 1800 http://washingtondc.craigslist.org/search/mld/apa?s=1800& 1900 http://washingtondc.craigslist.org/search/mld/apa?s=1900& 2000 http://washingtondc.craigslist.org/search/mld/apa?s=2000& 2100 http://washingtondc.craigslist.org/search/mld/apa?s=2100& 2200 http://washingtondc.craigslist.org/search/mld/apa?s=2200& 2300 http://washingtondc.craigslist.org/search/mld/apa?s=2300& 2400 http://washingtondc.craigslist.org/search/mld/apa?s=2400& 0 http://washingtondc.craigslist.org/search/doc/apa 100 http://washingtondc.craigslist.org/search/doc/apa?s=100& 200 http://washingtondc.craigslist.org/search/doc/apa?s=200& 300 http://washingtondc.craigslist.org/search/doc/apa?s=300& 400 http://washingtondc.craigslist.org/search/doc/apa?s=400& 500 http://washingtondc.craigslist.org/search/doc/apa?s=500& 600 http://washingtondc.craigslist.org/search/doc/apa?s=600& 700 http://washingtondc.craigslist.org/search/doc/apa?s=700& 800 http://washingtondc.craigslist.org/search/doc/apa?s=800& 900 http://washingtondc.craigslist.org/search/doc/apa?s=900& 1000 http://washingtondc.craigslist.org/search/doc/apa?s=1000& 1100 http://washingtondc.craigslist.org/search/doc/apa?s=1100& 1200 http://washingtondc.craigslist.org/search/doc/apa?s=1200& 1300 http://washingtondc.craigslist.org/search/doc/apa?s=1300& 1400 
http://washingtondc.craigslist.org/search/doc/apa?s=1400& 1500 http://washingtondc.craigslist.org/search/doc/apa?s=1500& 1600 http://washingtondc.craigslist.org/search/doc/apa?s=1600& 1700 http://washingtondc.craigslist.org/search/doc/apa?s=1700& 1800 http://washingtondc.craigslist.org/search/doc/apa?s=1800& 1900 http://washingtondc.craigslist.org/search/doc/apa?s=1900& 2000 http://washingtondc.craigslist.org/search/doc/apa?s=2000& 2100 http://washingtondc.craigslist.org/search/doc/apa?s=2100& 2200 http://washingtondc.craigslist.org/search/doc/apa?s=2200& 2300 http://washingtondc.craigslist.org/search/doc/apa?s=2300& 2400 http://washingtondc.craigslist.org/search/doc/apa?s=2400&
# Merge the property attributes and location dataframes on the listing id
# (both frames are indexed by the craigslist 'data-pid'); pd.merge's
# default inner join keeps only ids present in both frames.
dat = pd.merge(property_location_dataframe, property_attributes_dataframe, left_index=True, right_index=True)
# Check results before converting to a csv
len(dat)  # number of merged listings (notebook cell output)
dat[0:2]  # preview the first two rows (notebook cell output)
city | country | latitude | location_data_accuracy | longitude | state | availability | average_image_size | bathroom | bedroom | ... | dog | housing_type | image_number | laundry | parking | price | smoking | square_footage | time_of_posting | url | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4959351766 | NaN | NaN | NaN | NaN | NaN | NaN | available now | 270000 | 1 | 2 | ... | NaN | condo | 18 | NaN | NaN | $1310 | NaN | NaN | 2015-04-01 2:44pm | http://washingtondc.craigslist.org/mld/apa/495... |
4959370650 | Alexandria | US | 38.806000 | 22 | -77.052900 | DC | available now | 0 | NaN | 0 | ... | dogs are OK - wooof | house | 0 | NaN | attached garage | $860 | NaN | NaN | 2015-04-01 2:55pm | http://washingtondc.craigslist.org/mld/apa/495... |
2 rows × 23 columns
# Create a csv with a utf-8 encoding (listing descriptions contain
# non-ASCII characters). Path is machine-specific -- update per user.
dat.to_csv(r'C:\Users\alsherman\Desktop\GitHub\DataScience_GeneralAssembly\Data\Craigslist_Data_May_3_.csv', encoding='utf-8')