The purpose of this five-part IPython Notebook series is to gather, clean, visualize, and model data for my final project in General Assembly's data science course.
Using listings on craigslist, the below code is a set of functions which programmatically specifies markets of interest and extracts listing data. All defined functions are used in the main script near the bottom of the notebook - which runs all functions to collect the data.
The project data consists of listing data - text descriptions, geolocation, image attributes, and listing metadata - for rentals in Northern Virginia, Maryland, and Washington, DC.
import requests # Helps construct the request to send to the API
import json # JSON helper functions
from bs4 import BeautifulSoup #Data Scraping Library
import pandas as pd
import time
def define_markets_to_search():
    """Return the list of [site, subregion] market pairs to scrape.

    Each pair forms the start of a craigslist search URL, e.g.
    ['washingtondc', 'nva'] -> http://washingtondc.craigslist.org/search/nva/
    To scrape additional locations, append a new [site, subregion] pair.
    """
    return [
        ['washingtondc', 'nva'],  # Northern Virginia
        ['washingtondc', 'mld'],  # Maryland
        ['washingtondc', 'doc'],  # District of Columbia
    ]
def count_number_of_listings(market):
    """Return the total number of apartment listings in *market*.

    market: [site, subregion] pair, e.g. ['washingtondc', 'nva'].
    The value returned is the 'totalcount' shown on the first search
    results page; the caller steps through it 100 listings (one page)
    at a time.
    """
    # Count listings in the specific subregion (market[1]) so the total
    # matches the pages get_listing_url_list will actually fetch; the
    # original omitted the subregion and counted the whole metro area.
    url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'  # apa == apartment
    # Search pages are read-only, so GET (not POST) is the correct verb.
    response = requests.get(url)
    # Parse with the stdlib parser explicitly to avoid parser-guessing
    # warnings and cross-machine differences.
    soup = BeautifulSoup(response.text, 'html.parser')
    # 'totalcount' holds the total number of listings across all pages.
    pages_of_listings = int(soup.find_all('span', class_='totalcount')[0].text)
    return pages_of_listings
# Get a list of all listing ids - a market with 100 listings would return 100 distinct ids
def get_listing_url_list(market, pages_of_listings):
listing_ids = [] #create list of all the listing ids - these are the unique ids, which make each URL distinct
for page in xrange(0, pages_of_listings, 100):
#create url for each specified market
if page == 0:
url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'
else:
#all pages of listings after the first append a page number to the end of the URL
url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa?s=' + str(page) + '&'
# delay script each time it gets a new set of listing urls (100 for each page) to avoid burdening the server
time.sleep(0.5) #seconds
# Make the request
response = requests.post(url)
#place data in Beautiful soup object
soup = BeautifulSoup(response.text)
#create links to each listing page
data_pid = soup.find_all('p', class_='row')
for listing_id in data_pid:
listing_ids.append(listing_id['data-pid'])
print page, url #prints the page number of URL for each page of listings used
return listing_ids[0:5] #currently limited to five listings for test, remove slicing at end to increase results
def get_craigslist_listing(id, market):
    """Fetch one listing page and return (soup, url).

    id: the listing's unique 'data-pid' string.
    market: [site, subregion] pair used to build the listing URL.
    Called in a loop by the main script, once per listing of interest.
    """
    url = 'http://' + market[0] + '.craigslist.org/' + market[1] + '/apa/' + id + '.html'
    # Listing pages are read-only, so GET (not POST) is the correct verb.
    response = requests.get(url)
    # NOTE(review): no status check is performed (the original had a
    # bare 'response.status_code' no-op) -- an error body still parses
    # into a soup with no usable fields, which downstream code discards.
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup, url
# Craigslist example listing page
from IPython.display import HTML
# Render a live craigslist apartments page inline in the notebook as a
# visual reference for the HTML structure the functions above parse.
HTML('<iframe src=http://washingtondc.craigslist.org/apa/ width=1000 height=500></iframe>')
def get_property_attributes(property):
    """Extract structured property attributes from a listing page.

    property: BeautifulSoup object for a single listing page.
    Returns a dict with any of: bedroom, bathroom, square_footage,
    housing_type, cat, dog, laundry, parking, smoking, availability,
    date_available. Returns None when the page lacks the expected
    attribute markup (callers treat None as "skip this listing").
    """
    attribute_dict = {}
    try:
        # each attribute is a <span> inside the 'attrgroup' paragraph
        attributes_data = property.find('p', class_='attrgroup').find_all('span')
        for attribute in attributes_data:
            text = attribute.text
            if 'BR' in text:
                attribute_dict['bedroom'] = text.split('/')[0].replace('BR', '')  # only keep the number
            if 'Ba' in text:
                attribute_dict['bathroom'] = text.split('/')[1].replace('Ba', '')  # only keep the number
            if 'ft' in text:
                attribute_dict['square_footage'] = text.replace('ft2', '')  # only keep the number
            if text in ['apartment', 'condo', 'cottage/cabin', 'duplex', 'flat',
                        'house', 'in-law', 'loft', 'townhouse', 'manufactured', 'assisted living', 'land']:
                attribute_dict['housing_type'] = text
            if 'cat' in text:
                attribute_dict['cat'] = text
            if 'dog' in text:
                attribute_dict['dog'] = text
            if text in ['w/d in unit', 'laundry in bldg', 'laundry on site', 'w/d hookups']:
                attribute_dict['laundry'] = text
            if text in ['carport', 'attached garage', 'detached garage', 'off-street parking', 'street parking', 'valet parking']:
                attribute_dict['parking'] = text
            if 'smoking' in text:
                attribute_dict['smoking'] = text
        # move-in availability lives on a separate span's data attributes
        availability_span = property.find('span', class_='housing_movein_now property_date')
        attribute_dict['availability'] = availability_span['today_msg']
        attribute_dict['date_available'] = availability_span['date']
        return attribute_dict
    except (AttributeError, TypeError, KeyError, IndexError):
        # Narrowed from a bare 'except' so genuine bugs (and Ctrl-C) are
        # not silently swallowed; missing markup still yields None as the
        # original did.
        return None
def get_property_price(property):
    """Return the listed price token (e.g. '$1310') from the page heading.

    Listings without pricing markup raise here; the caller's try/except
    discards them, per the project's "no price, not relevant" rule.
    """
    heading = property.find('h2')
    # the fourth child node of the <h2> carries the price text
    price_text = heading.contents[3].text
    # first whitespace-separated token is the dollar amount
    return price_text.split()[0]
def get_posting_date_and_time(property):
    """Return the posting timestamp text (e.g. '2015-01-01 12:00pm').

    The timestamp is the <time> element inside the 'postinginfo'
    paragraph of the listing page.
    """
    posting_info = property.find('p', class_='postinginfo')
    return posting_info.find('time').text
def get_property_description(property):
    """Return the seller-written free-text description of the listing.

    The description is the inner <section> of the 'userbody' section
    on the listing page.
    """
    user_body = property.find('section', class_='userbody')
    return user_body.find('section').text
def get_image_data(property):
    """Return (image_number, average_image_size) for a listing.

    image_number: count of images attached to the listing (0 if none).
    average_image_size: mean width*height in pixels across the images
        (0 if there are no images).
    """
    image_number = 0
    image_size_sum = 0
    average_image_size = 0
    try:
        images = property.find('figure').find_all('div')
        # The last div holds one link per image: its 'title' is the
        # 1-based image index (last seen == total count) and its 'href'
        # ends in '<width>x<height>.<ext>'.
        for pic in images[-1]:
            image_number = int(pic['title'])
            width, height = pic['href'].split('_')[-1].split('.')[0].split('x')
            image_size_sum += int(width) * int(height)  # width * height in pixels
    except (AttributeError, TypeError, KeyError, IndexError, ValueError):
        # Narrowed from a bare 'except': the listing has no image gallery
        # (or unexpected markup); keep whatever was collected so far.
        pass
    # explicit guard replaces the original's caught ZeroDivisionError
    if image_number:
        average_image_size = image_size_sum / image_number
    return image_number, average_image_size
def get_property_location(property):
    """Extract location data from a listing page.

    property: BeautifulSoup object for a single listing page.
    Returns a dict containing only the fields that could be extracted,
    from: location_data_accuracy, latitude, longitude, country, state,
    city. (location_data_accuracy is a googlemaps accuracy metric.)
    """
    location_dict = {}
    # Geodata is stored as data-* attributes on the 'viewposting' div;
    # any of them may be absent, so each is fetched independently. The
    # original relied on NameError from undefined locals plus seven bare
    # excepts; this builds the same dict directly.
    viewposting = property.find('div', class_='viewposting')
    for key, attr in [('location_data_accuracy', 'data-accuracy'),
                      ('latitude', 'data-latitude'),
                      ('longitude', 'data-longitude')]:
        try:
            location_dict[key] = viewposting[attr]
        except (TypeError, KeyError):
            pass  # div missing or attribute absent - leave key out
    # Country/state/city come from the yahoo maps link in 'mapaddress'
    # (easier to parse than the google maps link). As in the original,
    # a failure partway through keeps whatever was already stored.
    try:
        map_link = property.find('p', class_='mapaddress').find_all('a')[1]['href']
        location_dict['country'] = map_link.split('country=')[1]
        csz = map_link.split('csz=')[1].split('&')[0]  # 'City+ST'
        location_dict['state'] = csz.split('+')[1]
        location_dict['city'] = csz.split('+')[0]
    except (AttributeError, TypeError, KeyError, IndexError):
        pass
    # NOTE(review): the original also scraped the 'mapaddress' div text
    # into an 'address' local but never stored it; dropped as dead code.
    return location_dict
# Main script - loop through all listings in listing_ids and extract features.
# NOTE(review): despite the original "Main class" comment, this is a flat
# driver script, not a class.
# initialize lists to collect attribute and location data
property_attributes_data = []  # one pd.Series of attributes per listing
property_location_data = []    # one pd.Series of location fields per listing
markets = define_markets_to_search()  # Define which markets to search (e.g. Washington DC)
for market in markets:
    pages_of_listings = count_number_of_listings(market)  # Count number of listings in specified market
    listing_ids = get_listing_url_list(market, pages_of_listings)  # Get a list of all the individual listings to search
    for id in listing_ids:
        time.sleep(0.5)  # delay the script each time it gets a new listing, to go easy on the server
        try:
            # get the listing HTML (soup object) and the listing URL
            property, url = get_craigslist_listing(id, market)
        except: pass
        # NOTE(review): if the fetch above fails, 'property' and 'url' keep
        # their values from the PREVIOUS iteration, so the prior listing may
        # be recorded again under the new id -- confirm this is acceptable.
        try:
            # create initial dict with property attributes
            property_attributes = get_property_attributes(property)
            property_attributes['url'] = url
            # add price, description, and image data to dict
            property_attributes['price'] = get_property_price(property)
            property_attributes['description'] = get_property_description(property).encode('utf-8')
            property_attributes['time_of_posting'] = get_posting_date_and_time(property)
            image_number, average_image_size = get_image_data(property)
            property_attributes['image_number'] = image_number
            property_attributes['average_image_size'] = average_image_size
            # add property attributes to property_attributes_data list,
            # keyed by the listing id so the frames can be merged later
            property_attributes = pd.Series(property_attributes, name=id)
            property_attributes_data.append(property_attributes)
        except: pass
        # NOTE(review): listings whose attributes return None are skipped
        # here via the broad except (TypeError on the 'url' assignment).
        try:
            # add location data to location data list (same id key)
            location = get_property_location(property)
            location = pd.Series(location, name=id)
            property_location_data.append(location)
        except: pass
# put lists into DataFrames (index = listing id)
property_attributes_dataframe = pd.DataFrame(property_attributes_data)
property_location_dataframe = pd.DataFrame(property_location_data)
0 http://washingtondc.craigslist.org/search/nva/apa 100 http://washingtondc.craigslist.org/search/nva/apa?s=100& 200 http://washingtondc.craigslist.org/search/nva/apa?s=200& 300 http://washingtondc.craigslist.org/search/nva/apa?s=300& 400 http://washingtondc.craigslist.org/search/nva/apa?s=400& 500 http://washingtondc.craigslist.org/search/nva/apa?s=500& 600 http://washingtondc.craigslist.org/search/nva/apa?s=600& 700 http://washingtondc.craigslist.org/search/nva/apa?s=700& 800 http://washingtondc.craigslist.org/search/nva/apa?s=800& 900 http://washingtondc.craigslist.org/search/nva/apa?s=900& 1000 http://washingtondc.craigslist.org/search/nva/apa?s=1000& 1100 http://washingtondc.craigslist.org/search/nva/apa?s=1100& 1200 http://washingtondc.craigslist.org/search/nva/apa?s=1200& 1300 http://washingtondc.craigslist.org/search/nva/apa?s=1300& 1400 http://washingtondc.craigslist.org/search/nva/apa?s=1400& 1500 http://washingtondc.craigslist.org/search/nva/apa?s=1500& 1600 http://washingtondc.craigslist.org/search/nva/apa?s=1600& 1700 http://washingtondc.craigslist.org/search/nva/apa?s=1700& 1800 http://washingtondc.craigslist.org/search/nva/apa?s=1800& 1900 http://washingtondc.craigslist.org/search/nva/apa?s=1900& 2000 http://washingtondc.craigslist.org/search/nva/apa?s=2000& 2100 http://washingtondc.craigslist.org/search/nva/apa?s=2100& 2200 http://washingtondc.craigslist.org/search/nva/apa?s=2200& 2300 http://washingtondc.craigslist.org/search/nva/apa?s=2300& 2400 http://washingtondc.craigslist.org/search/nva/apa?s=2400& 0 http://washingtondc.craigslist.org/search/mld/apa 100 http://washingtondc.craigslist.org/search/mld/apa?s=100& 200 http://washingtondc.craigslist.org/search/mld/apa?s=200& 300 http://washingtondc.craigslist.org/search/mld/apa?s=300& 400 http://washingtondc.craigslist.org/search/mld/apa?s=400& 500 http://washingtondc.craigslist.org/search/mld/apa?s=500& 600 http://washingtondc.craigslist.org/search/mld/apa?s=600& 700 
http://washingtondc.craigslist.org/search/mld/apa?s=700& 800 http://washingtondc.craigslist.org/search/mld/apa?s=800& 900 http://washingtondc.craigslist.org/search/mld/apa?s=900& 1000 http://washingtondc.craigslist.org/search/mld/apa?s=1000& 1100 http://washingtondc.craigslist.org/search/mld/apa?s=1100& 1200 http://washingtondc.craigslist.org/search/mld/apa?s=1200& 1300 http://washingtondc.craigslist.org/search/mld/apa?s=1300& 1400 http://washingtondc.craigslist.org/search/mld/apa?s=1400& 1500 http://washingtondc.craigslist.org/search/mld/apa?s=1500& 1600 http://washingtondc.craigslist.org/search/mld/apa?s=1600& 1700 http://washingtondc.craigslist.org/search/mld/apa?s=1700& 1800 http://washingtondc.craigslist.org/search/mld/apa?s=1800& 1900 http://washingtondc.craigslist.org/search/mld/apa?s=1900& 2000 http://washingtondc.craigslist.org/search/mld/apa?s=2000& 2100 http://washingtondc.craigslist.org/search/mld/apa?s=2100& 2200 http://washingtondc.craigslist.org/search/mld/apa?s=2200& 2300 http://washingtondc.craigslist.org/search/mld/apa?s=2300& 2400 http://washingtondc.craigslist.org/search/mld/apa?s=2400& 0 http://washingtondc.craigslist.org/search/doc/apa 100 http://washingtondc.craigslist.org/search/doc/apa?s=100& 200 http://washingtondc.craigslist.org/search/doc/apa?s=200& 300 http://washingtondc.craigslist.org/search/doc/apa?s=300& 400 http://washingtondc.craigslist.org/search/doc/apa?s=400& 500 http://washingtondc.craigslist.org/search/doc/apa?s=500& 600 http://washingtondc.craigslist.org/search/doc/apa?s=600& 700 http://washingtondc.craigslist.org/search/doc/apa?s=700& 800 http://washingtondc.craigslist.org/search/doc/apa?s=800& 900 http://washingtondc.craigslist.org/search/doc/apa?s=900& 1000 http://washingtondc.craigslist.org/search/doc/apa?s=1000& 1100 http://washingtondc.craigslist.org/search/doc/apa?s=1100& 1200 http://washingtondc.craigslist.org/search/doc/apa?s=1200& 1300 http://washingtondc.craigslist.org/search/doc/apa?s=1300& 1400 
http://washingtondc.craigslist.org/search/doc/apa?s=1400& 1500 http://washingtondc.craigslist.org/search/doc/apa?s=1500& 1600 http://washingtondc.craigslist.org/search/doc/apa?s=1600& 1700 http://washingtondc.craigslist.org/search/doc/apa?s=1700& 1800 http://washingtondc.craigslist.org/search/doc/apa?s=1800& 1900 http://washingtondc.craigslist.org/search/doc/apa?s=1900& 2000 http://washingtondc.craigslist.org/search/doc/apa?s=2000& 2100 http://washingtondc.craigslist.org/search/doc/apa?s=2100& 2200 http://washingtondc.craigslist.org/search/doc/apa?s=2200& 2300 http://washingtondc.craigslist.org/search/doc/apa?s=2300& 2400 http://washingtondc.craigslist.org/search/doc/apa?s=2400&
# Merge the property attributes and location dataframes on the listing id
# (both frames are indexed by the craigslist 'data-pid'); pd.merge's
# default inner join keeps only ids present in both frames.
dat = pd.merge(property_location_dataframe, property_attributes_dataframe, left_index=True, right_index=True)
# Check results before converting to a csv
len(dat)  # number of merged listings (notebook cell output)
dat[0:2]  # preview the first two rows (notebook cell output)
city | country | latitude | location_data_accuracy | longitude | state | availability | average_image_size | bathroom | bedroom | ... | dog | housing_type | image_number | laundry | parking | price | smoking | square_footage | time_of_posting | url | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4959351766 | NaN | NaN | NaN | NaN | NaN | NaN | available now | 270000 | 1 | 2 | ... | NaN | condo | 18 | NaN | NaN | $1310 | NaN | NaN | 2015-04-01 2:44pm | http://washingtondc.craigslist.org/mld/apa/495... |
4959370650 | Alexandria | US | 38.806000 | 22 | -77.052900 | DC | available now | 0 | NaN | 0 | ... | dogs are OK - wooof | house | 0 | NaN | attached garage | $860 | NaN | NaN | 2015-04-01 2:55pm | http://washingtondc.craigslist.org/mld/apa/495... |
2 rows × 23 columns
# Create a csv with a utf-8 encoding (listing descriptions contain
# non-ASCII characters). Path is machine-specific -- update per user.
dat.to_csv(r'C:\Users\alsherman\Desktop\GitHub\DataScience_GeneralAssembly\Data\Craigslist_Data_May_3_.csv', encoding='utf-8')