#!/usr/bin/env python
# coding: utf-8

# The purpose of this five-part IPython Notebook series is to gather, clean, visualize, and model data for my final project in General Assembly's data science course.
#
# Using listings on craigslist, the code below is a set of functions that programmatically specifies markets of interest and extracts listing data. All defined functions are used in the main loop near the bottom of the notebook, which runs all functions to collect the data.
#
# The project data consists of listing data - text descriptions, geolocation, image attributes, and listing metadata - for rentals in Northern Virginia, Maryland, and Washington, DC.

# In[12]:

import requests                # constructs and sends HTTP requests
from bs4 import BeautifulSoup  # HTML parsing/scraping library
import pandas as pd
import time


# In[13]:

# Specify markets to search for housing ads.
# Each market has two levels - e.g. ['washingtondc', 'nva'] for listings in Northern VA creates the start of the
# following URL: http://washingtondc.craigslist.org/search/nva/
def define_markets_to_search():
    # to get listings for new locations, add the new location to the markets list
    markets = [['washingtondc', 'nva'], ['washingtondc', 'mld'], ['washingtondc', 'doc']]
    return markets


# In[14]:

# Count how many listings exist in a market. Results are split into pages of 100 listings each, so a
# market (e.g. Northern Virginia) with 2,500 listings returns 2500, which the pagination loop below
# steps through in increments of 100.
def count_number_of_listings(market):
    # create the url for the specified market
    url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'  # apa == apartments/housing for rent

    # make the request
    response = requests.get(url)

    # place the data in a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')

    # the 'totalcount' span holds the total number of listings in the market
    pages_of_listings = int(soup.find_all('span', class_='totalcount')[0].text)
    return pages_of_listings


# In[15]:

# Get a list of all listing ids - a market with 100 listings would return 100 distinct ids
def get_listing_url_list(market, pages_of_listings):
    listing_ids = []

    # create a list of all the listing ids - these are the unique ids that make each listing URL distinct
    for page in range(0, pages_of_listings, 100):
        # create the url for each page of results in the specified market
        if page == 0:
            url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa'
        else:
            # all pages of listings after the first append an offset (s) to the end of the URL
            url = 'http://' + market[0] + '.craigslist.org/search/' + market[1] + '/apa?s=' + str(page)

        # delay the script each time it gets a new page of listing urls (100 per page) to avoid burdening the server
        time.sleep(0.5)  # seconds

        # make the request
        response = requests.get(url)

        # place the data in a BeautifulSoup object
        soup = BeautifulSoup(response.text, 'html.parser')

        # collect the unique id of each listing on the page
        data_pid = soup.find_all('p', class_='row')
        for listing in data_pid:
            listing_ids.append(listing['data-pid'])

        print(page, url)  # prints the page offset and URL for each page of listings used

    return listing_ids[0:5]  # currently limited to five listings for testing - remove the slicing to increase results
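
# In[ ]:

# A minimal offline sketch of the pagination logic above - no network calls are made.
# The market and total below are hypothetical examples; craigslist serves 100 listings
# per page, so the search URLs step through offsets s=0, 100, 200, ...
example_market = ['washingtondc', 'nva']  # example entry from define_markets_to_search()
example_total = 250                       # hypothetical 'totalcount' value
for offset in range(0, example_total, 100):
    if offset == 0:
        example_url = 'http://' + example_market[0] + '.craigslist.org/search/' + example_market[1] + '/apa'
    else:
        example_url = 'http://' + example_market[0] + '.craigslist.org/search/' + example_market[1] + '/apa?s=' + str(offset)
    print(example_url)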

# In[16]:

# Create a URL and get the data from craigslist for a single listing id.
# This function is rerun in a for loop to create a distinct url and get the data for each listing of interest
def get_craigslist_listing(listing_id, market):
    url = 'http://' + market[0] + '.craigslist.org/' + market[1] + '/apa/' + listing_id + '.html'

    # make the request
    response = requests.get(url)

    # confirm the response worked
    response.raise_for_status()

    # place the data in a BeautifulSoup object
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup, url


# In[17]:

# Get the property attributes from the BeautifulSoup object for each listing, including # of bedrooms, bathrooms, and
# square feet; housing type, pets allowed, laundry availability, parking access, smoking permissions, and availability
def get_property_attributes(property):
    # create an empty dict to collect the property attributes
    attribute_dict = {}
    try:
        attributes_data = property.find('p', class_='attrgroup').find_all('span')
        for attribute in attributes_data:
            if 'BR' in attribute.text:
                attribute_dict['bedroom'] = attribute.text.split('/')[0].replace('BR', '')  # only keep the number
            if 'Ba' in attribute.text:
                attribute_dict['bathroom'] = attribute.text.split('/')[1].replace('Ba', '')  # only keep the number
            if 'ft' in attribute.text:
                attribute_dict['square_footage'] = attribute.text.replace('ft2', '')  # only keep the number
            if attribute.text in ['apartment', 'condo', 'cottage/cabin', 'duplex', 'flat', 'house', 'in-law',
                                  'loft', 'townhouse', 'manufactured', 'assisted living', 'land']:
                attribute_dict['housing_type'] = attribute.text
            if 'cat' in attribute.text:
                attribute_dict['cat'] = attribute.text
            if 'dog' in attribute.text:
                attribute_dict['dog'] = attribute.text
            if attribute.text in ['w/d in unit', 'laundry in bldg', 'laundry on site', 'w/d hookups']:
                attribute_dict['laundry'] = attribute.text
            if attribute.text in ['carport', 'attached garage', 'detached garage', 'off-street parking',
                                  'street parking', 'valet parking']:
                attribute_dict['parking'] = attribute.text
            if 'smoking' in attribute.text:
                attribute_dict['smoking'] = attribute.text
        attribute_dict['availability'] = property.find('span', class_='housing_movein_now property_date')['today_msg']
        attribute_dict['date_available'] = property.find('span', class_='housing_movein_now property_date')['date']
        return attribute_dict
    except Exception:
        # listings missing the expected markup return None and are skipped by the caller
        return None


# In[18]:

# Get the listed price - listings without pricing data are not relevant and will be discarded in later code
def get_property_price(property):
    price = property.find('h2')
    price = price.contents[3].text  # the price is the fourth element of the title's contents
    price = price.split()
    return price[0]
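
# In[ ]:

# A quick offline test of get_property_attributes above. The HTML fragment below is a
# hypothetical, simplified version of craigslist's attrgroup markup, not a real listing.
example_html = """
<p class="attrgroup">
  <span>2BR / 1Ba</span>
  <span>850ft2</span>
  <span>apartment</span>
  <span>cats are OK - purrr</span>
  <span>w/d in unit</span>
</p>
<span class="housing_movein_now property_date" today_msg="available now" date="2015-05-03"></span>
"""
example_soup = BeautifulSoup(example_html, 'html.parser')
print(get_property_attributes(example_soup))  # prints the parsed attribute dict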

# In[19]:

# Get the date and time of the posting (e.g. 2015-01-01 12:00pm)
def get_posting_date_and_time(property):
    posting_time = property.find('p', class_='postinginfo').find('time').text
    return posting_time


# In[20]:

# Get the user-created text description of the listing
def get_property_description(property):
    listing_description = property.find('section', class_='userbody').find('section')
    return listing_description.text


# In[21]:

# Get image attributes, such as the number of images and the average image size
def get_image_data(property):
    average_image_size = 0
    image_size_sum = 0
    image_number = 0
    try:
        images = property.find('figure').find_all('div')
        for pic in images[-1]:
            image_number = int(pic['title'])  # the image count - only the last (max) number is kept
            # image hrefs end in e.g. '600x450.jpg'; split out the width and height
            image_size = pic['href'].split('_')[-1].split('.')[0].split('x')
            image_size_sum += int(image_size[0]) * int(image_size[1])  # sum of image sizes (width * height)
    except Exception:
        pass
    try:
        average_image_size = image_size_sum / image_number
    except ZeroDivisionError:
        pass  # listings with no images keep the default of 0
    return image_number, average_image_size


# In[22]:

# Get location data, including latitude, longitude, country, city, state, and the googlemaps
# metric: location_data_accuracy
def get_property_location(property):
    location_dict = {}

    # get longitude, latitude, and the location accuracy metric
    location = property.find('div', class_='viewposting')
    try:
        location_dict['location_data_accuracy'] = location['data-accuracy']
    except Exception:
        pass
    try:
        location_dict['latitude'] = location['data-latitude']
    except Exception:
        pass
    try:
        location_dict['longitude'] = location['data-longitude']
    except Exception:
        pass

    # get country, state, and city
    try:
        # use the yahoo maps link - easier to extract data from than google maps
        map_link = property.find('p', class_='mapaddress').find_all('a')[1]['href']
        location_dict['country'] = map_link.split('country=')[1]
        location_dict['state'] = map_link.split('csz=')[1].split('&')[0].split('+')[1]
        location_dict['city'] = map_link.split('csz=')[1].split('&')[0].split('+')[0]
    except Exception:
        pass

    # get the street address
    try:
        location_dict['address'] = property.find('div', class_='mapaddress').text
    except Exception:
        pass

    return location_dict
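
# In[ ]:

# An offline illustration of the yahoo maps link parsing in get_property_location above.
# The href below is a hypothetical example of the '...&csz=City+ST&country=CC' format
# that the string splits assume.
example_href = 'http://maps.yahoo.com/#q1=123+Main+St&csz=Arlington+VA&country=US'
print(example_href.split('country=')[1])                          # country: 'US'
print(example_href.split('csz=')[1].split('&')[0].split('+')[1])  # state:   'VA'
print(example_href.split('csz=')[1].split('&')[0].split('+')[0])  # city:    'Arlington'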

# In[24]:

# Main loop - iterate through all listings in listing_ids and extract their features

# initialize lists to collect attribute and location data
property_attributes_data = []
property_location_data = []

markets = define_markets_to_search()  # define which markets to search (e.g. Washington DC)
for market in markets:
    pages_of_listings = count_number_of_listings(market)  # count the number of listings in the specified market
    listing_ids = get_listing_url_list(market, pages_of_listings)  # get a list of all the individual listings to search

    for listing_id in listing_ids:
        time.sleep(0.5)  # delay the script each time it gets a new listing

        try:
            # get the listing HTML and the listing URL
            property, url = get_craigslist_listing(listing_id, market)
        except Exception:
            continue  # skip listings that fail to download

        try:
            # create the initial dict of property attributes
            property_attributes = get_property_attributes(property)
            property_attributes['url'] = url

            # add price, description, posting time, and image data to the dict
            property_attributes['price'] = get_property_price(property)
            property_attributes['description'] = get_property_description(property)
            property_attributes['time_of_posting'] = get_posting_date_and_time(property)
            image_number, average_image_size = get_image_data(property)
            property_attributes['image_number'] = image_number
            property_attributes['average_image_size'] = average_image_size

            # add the property attributes to the property_attributes_data list
            property_attributes = pd.Series(property_attributes, name=listing_id)
            property_attributes_data.append(property_attributes)
        except Exception:
            pass

        try:
            # add the location data to the location data list
            location = get_property_location(property)
            location = pd.Series(location, name=listing_id)
            property_location_data.append(location)
        except Exception:
            pass

# put the lists into DataFrames
property_attributes_dataframe = pd.DataFrame(property_attributes_data)
property_location_dataframe = pd.DataFrame(property_location_data)


# In[25]:

# Merge the property attributes and location dataframes on their shared listing-id index
dat = pd.merge(property_location_dataframe, property_attributes_dataframe, left_index=True, right_index=True)


# In[30]:

# Check the results before converting to a csv
print(len(dat))
dat[0:2]


# In[34]:

# Create a csv with utf-8 encoding
dat.to_csv(r'C:\Users\alsherman\Desktop\GitHub\DataScience_GeneralAssembly\Data\Craigslist_Data_May_3_.csv', encoding='utf-8')
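
# In[ ]:

# Optional sanity check (a minimal sketch): read the exported csv back in and confirm
# the row count matches the in-memory DataFrame. The path mirrors the export path above.
check = pd.read_csv(r'C:\Users\alsherman\Desktop\GitHub\DataScience_GeneralAssembly\Data\Craigslist_Data_May_3_.csv', index_col=0, encoding='utf-8')
print(len(check) == len(dat))  # should print True
check.head(2)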