#!/usr/bin/env python
# coding: utf-8

# # 2016 Phillies Games Broadcast on National Television
#
# I like watching the Phillies. I do not have cable. Some Phillies games are broadcast on national television. This is how I made a list of those games.
#
# ## [Pandas](http://pandas.pydata.org/)
#
# Pandas is a data analysis tool for the [Python](https://www.python.org/) programming language. It can do a tremendous amount of really powerful data analysis and visualization. It's a gun in this CSV knife fight.

# In[1]:


import pandas as pd


# A downloadable [CSV schedule](http://philadelphia.phillies.mlb.com/schedule/downloadable.jsp#csv-format) is available from [mlb.com](http://mlb.com). Here is a [direct link](http://mlb.mlb.com/ticketing-client/csv/EventTicketPromotionPrice.tiksrv?team_id=108&display_in=singlegame&ticket_category=Tickets&site_section=Default&sub_category=Default&leave_empty_games=true&event_type=T&event_type=Y) to the Phillies schedule.
#
# The CSV schedule will be used to instantiate a Pandas [DataFrame](http://pandas.pydata.org/pandas-docs/version/0.13.1/generated/pandas.DataFrame.html) object.

# In[2]:


# `DataFrame.from_csv` was deprecated and later removed from pandas; this
# `read_csv` call is its documented equivalent (first column becomes the
# index and is parsed as dates).
schedule = pd.read_csv("phillies-2016.csv", index_col=0, parse_dates=True)


# ## What does the schedule metadata look like?

# In[3]:


schedule.info()


# 190 games and 16 columns of data for each game.
#
# ## What does the schedule data itself look like?

# In[4]:


schedule.head()


# ## Cleaning up the schedule
#
# The `DESCRIPTION` column contains the broadcast information. Less interesting columns can be removed.

# In[6]:


# Drop the reminder / end-time / free-busy columns in place; only the
# remaining columns are used further down.
schedule.drop(
    [
        "REMINDER OFF",
        "REMINDER ON",
        "START TIME ET",
        "END DATE",
        "END DATE ET",
        "END TIME",
        "END TIME ET",
        "REMINDER TIME",
        "REMINDER TIME ET",
        "SHOWTIMEAS FREE",
        "SHOWTIMEAS BUSY",
        "REMINDER DATE",
    ],
    axis=1,
    inplace=True,
)
schedule.head()


# ## What are all of the stations that games are broadcast on this season?
#
# The `DESCRIPTION` column is nice because it mentions the stations that games are broadcast on.
# Sometimes a game is broadcast on two channels at once. There is also radio broadcast information that I'm not interested in right now.

# In[11]:


schedule.DESCRIPTION.head(50)


# ### Parse television station broadcast channels from `DESCRIPTION`
#
# Thankfully, the `DESCRIPTION` column data is parseable. Getting a list of television broadcast stations for each game is not _too_ difficult.

# In[73]:


# Positional lookup: take the seventh row regardless of the index labels.
description = schedule.DESCRIPTION.iloc[6]
print(description)


# Grab the rough station string with a regular expression.

# In[123]:


import re

TV_STATION_RE = re.compile(
    r"""Local\s+TV:\s+    # TV token
        (?P<stations>.*)  # Group the rest of that line as stations
    """,
    re.X,
)


# Use that to pull them out and do some text wrangling.

# In[ ]:


def tv_stations_from_description(description):
    """Return a list of television stations embedded in the given description.

    Args:
        description: A schedule description cell. It is converted with
            ``str()`` before searching, so non-string values do not raise.

    Returns:
        A list of whitespace-stripped station names found after the
        ``Local TV:`` token, or an empty list when the token is absent.
    """
    match = TV_STATION_RE.search(str(description))
    if match is None:
        return []

    # Everything after the "-----" delimiter describes other media, so only
    # the text before it is kept.
    media_delimiter = "-----"
    station_text = match.group("stations").split(media_delimiter)[0]

    # Multiple stations are separated by "- ".
    return [station.strip() for station in station_text.split("- ")]


# Test it out on all of the descriptions.

# In[126]:


tv_stations = set()
for d in schedule.DESCRIPTION:
    tv_stations.update(tv_stations_from_description(d))
tv_stations


# Applying this function to the DataFrame yields a [`Series`](http://pandas.pydata.org/pandas-docs/stable/dsintro.html#series) of all television stations on which the Phillies are broadcast this season.

# In[127]:


stations_series = schedule.DESCRIPTION.apply(tv_stations_from_description)
stations_series


# Double check the `set` of stations from that `Series`.

# In[129]:


{station for stations in stations_series.values for station in stations}


# The 190 Phillies games are broadcast on 6 television channels. Unfortunately only 1 of those 6 stations is available without a cable television subscription. This means that I can only watch games on NBC.
#
# ## The Phillies national television broadcast schedule
#
# Filtering the `DESCRIPTION` column to national television broadcast stations yields only the games which I can watch over the air with my [HD antenna](http://amzn.to/1r5eZmQ).

# In[117]:


# `na=False` makes rows whose description is missing count as "no match",
# which yields a plain boolean mask without comparing against `True`.
national_broadcast_schedule = schedule[
    schedule.DESCRIPTION.str.contains("NBC 10", na=False)
]
national_broadcast_schedule


# In[118]:


national_broadcast_schedule.describe()


# This means that I have the possibility to watch 10 out of 190 Phillies games this season which is roughly 5%.