#!/usr/bin/env python
# coding: utf-8

# In[13]:

import re
import pandas as pd
import json
from tidylib import tidy_document
import html2text

# In[14]:

fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'
t1 = pd.read_csv(fn, header=0)

# In[15]:

# Characters deleted from the converted text; presumably the emphasis markers
# that html2text emits for bold/italic HTML -- TODO confirm.
_STRIP_CHARS = str.maketrans('', '', '*_')


def cleanup(txt):
    """Return a lower-cased, stripped plain-text rendering of an HTML snippet.

    The value is stringified, passed through ``tidy_document`` (whose error
    report is discarded), converted with ``html2text.html2text``, and then
    stripped of every ``*`` and ``_`` character.
    """
    tidied, _ = tidy_document(str(txt))
    as_text = html2text.html2text(tidied)
    return as_text.translate(_STRIP_CHARS).lower().strip()


def add_clean_text(row):
    """Store the cleaned ``AdditionalDescription`` of *row* under ``scrape``.

    Meant to be used with ``DataFrame.apply(..., axis=1)``; the (mutated) row
    is returned so that ``apply`` assembles a new frame from it.
    """
    row['scrape'] = cleanup(row['AdditionalDescription'])
    return row

# In[16]:

t1_fix = t1.apply(add_clean_text, axis=1)

# In[17]:

# Work with these columns for now
cols = [
    'RequestID',
    'ConfirmationNumber',
    'AgencyCode',
    'AgencyName',
    'AgencyDivision',
    'SectionID',
    'SectionName',
    'scrape',
]
fixed = t1_fix[cols]

# top 10: breakdown of ads by agencies
fixed['AgencyName'].value_counts().head(10)

# # Parsing the Mayor's Office of Contract Services Ads
#
# 67 ads to parse
#
#
# **Plan of Attack**
# 1. Experiment to figure out the parsing pipeline
# 2. Write tests to verify parsing results
# 3. Use the tests to build confidence in the process, to sign off on data.
#
# ***
#
# The data is in a very consistent format which can be tackled by tokenizing and extracting
# with regular expressions.
#
# :
# :
# :
#
# ## PUBLIC NOTICE ADVERTS
# There are about 3 or 4 of these date-time of meeting announcements. We use the presence of
# **"public notice is hereby given"** to extract the inevitable date and time. The full text of
# the advert should be further processed to extract Named Entities as well as Street Addresses
#
# > * Sample Message
# > > public notice is hereby given that the
# > > franchise and concession review committee will hold a
# > > public meeting on **wednesday, october 8, 2014 at 2:30 p.m.**,
# > > at 22 reade street, spector hall, borough of manhattan.
# > >
# > > note: individuals requesting sign language interpreters
# > > should contact the mayor's office of contracts services,
# > > public hearings unit, 253 broadway, 9th floor,
# > > new york, ny 10007 (212) 788-7490, no later
# > > than seven (7) business days prior to the public meeting. tdd
# > > users should call verizon relay service.
# > * Extracted Result
# > > data: wednesday october 8 2014 at 2 30 p.m.
#
# ## Notice of intent Adverts
# This type of message makes up the majority of adverts to parse in this set. There can be
# multiple ads in an announcement, where each ad is a collection of key -> value pairs,
# starting with the **agency: agency name** key pair, which we will leverage to tokenize and
# parse the data.
#
# Once the presence of **"notice of intent to"** is detected, the parsing/extracting process is:
# 1. Split the whole text into a preamble + list-of-ads
# 2. For each ad in the list-of-ads, iterate over each key-value pair.
#
# > * Sample Message
# > > notice of intent to extend contract(s) not included in fy 2015 annual
# > > contracting plan and schedule
# > >
# > > notice is hereby given that the mayor will be entering into the following extension(s) of
# > > (a) contract(s) not included in the fy 2015 annual contracting plan and schedule that is
# > > published pursuant to new york city charter § 312(a):
# > >
# > > agency: department of information technology & telecommunications
# > >
# > > vendor: accenture llp
# > >
# > > description of services: design, development and deployment of application enhancements
# > > and extensions to the existing apt system along with the appropriate documentation
# > > required.
# > >
# > > award method of original contract: intergovernmental
# > >
# > > fms contract type: consultant
# > >
# > > end date of original contract: 1/31/2015
# > >
# > > method of renewal/extension the agency intends to utilize: extension
# > >
# > > new start date of the proposed renewed/extended contract: 2/1/15
# > >
# > > new end date of the proposed renewed/extended contract: 7/31/15
# > >
# > > modifications sought to the nature of services performed under the contract: none
# > >
# > > reason(s) the agency intends to renew/extend the contract: continuation of services
# > >
# > > personnel in substantially similar titles within agency: apt project manager – 1; apt
# > > technical lead – 1; apt developer - 2 headcount of personnel in substantially similar
# > > titles within agency: 4
# > * Extracted Result
# > > * agency
# > > * * department of information technology & telecommunications
# > > * award method of original contract
# > > * * intergovernmental
# > > * description of services
# > > * * design, development and deployment of application
# > > * end date of original contract
# > > * * 1/31/2015
# > > * fms contract type
# > > * * consultant
# > > * headcount of personnel in substantially similar titles within agency
# > > * * 4
# > > * method of renewal/extension the agency intends to utilize
# > > * * extension
# > > * new end date of the proposed renewed/extended contract
# > > * * 7/31/15
# > > * new start date of the proposed renewed/extended contract
# > > * * 2/1/15
# > > * personnel in substantially similar titles within agency
# > > * * apt project manager – 1; apt technical lead – 1; apt developer - 2
# > > * reason(s) the agency intends to renew/extend the contract
# > > * * continuation
# > > * vendor
# > > * * accenture llp
# > > * preamble
# > > * * notice of intent to extend contract(s) not included in fy 2015 annual contracting plan and schedule\n\nnotice is hereby given that the mayor will be entering into the following
# extension(s) of (a) contract(s) not included in the fy 2015 annual contracting plan and schedule that is published pursuant to new york city charter § 312(a)

# In[6]:

# select Office of Contract Services Ads
mocs = fixed['AgencyName'] == "Mayor's Office of Contract Services"
# NOTE(review): only RequestID and scrape are kept here, so the other
# pass-through columns that parse() looks up via row.get() come back as None.
# Confirm whether the remaining columns should be carried along.
mocs_ads = fixed[mocs][['RequestID', 'scrape']]

# In[7]:

# NOTE: the named groups below had lost their ``<name>`` part (``(?P[...])``),
# which ``re`` rejects at compile time.  The names are reconstructed from the
# order of the captured fields; nothing in this file looks groups up by name.

# build a regex for public meeting notice
rex_time = re.compile(
    r'on\s+(?P<weekday>[^\s,]+)[\s,]\s+(?P<month>[^\s,]+)\s+(?P<day>\d+)'
    r'[,\s]+(?P<year>\d+)\s+(at)?\s*(?P<hour>\d+):(?P<minute>\d+)\s+'
    r'(?P<ampm>\w+\.?\w+\.?)',
    re.IGNORECASE | re.DOTALL | re.MULTILINE,
)

# date patterns (kept as plain strings; not compiled or used in this section)
rex_mm_dd_year = r'(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})'
rex_month_d_y = r'(?P<month>\w+)\s+(?P<day>\d+),?\s+(?P<year>\d+)'

# capture <key>:<value>
rex_generic = re.compile(
    r'(?P<key>^[^:]+):(?P<value>[^:]+)(?=^[^:]|$)',
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)

# In[18]:

import pprint


def _parse_ad(token, verbose=False):
    """Parse one ``agency: ...`` chunk into a ``{key: value}`` dict.

    The chunk is split at every newline that is followed (anywhere later in
    the chunk) by a colon, and each piece is matched against ``rex_generic``.
    Pieces that do not match are skipped.
    """
    fragment = re.split(r'\n(?=[^:]+:)', token)
    if verbose:
        print('fragment...')
        print(fragment)
        print('---\n\n')

    entry = {}
    for tok in fragment:
        if verbose:
            print('\tTokens:')
            print('\t\t{}'.format(tok))
        match = rex_generic.match(tok)
        if match is None:
            continue
        # get the captured key: value pair
        entry[match.group('key').strip()] = match.group('value').strip()

    if verbose:
        print('Parsed Entry:')
        print(entry)
        print('---\n\n')
    return entry


def parse_regs(txt, *, verbose=False):
    """Classify an advert and extract its structured content.

    Runs of spaces are collapsed and three or more consecutive newlines are
    reduced to two before matching.

    Args:
        txt: Cleaned, lower-cased advert text.
        verbose: When true, print the intermediate fragments and entries.

    Returns:
        A ``(message_type, preamble, json_payload)`` tuple:

        * ``'public notice'`` -- preamble is the normalised text, payload is
          the JSON list of ``rex_time`` matches.
        * ``'intent to solicit'`` -- preamble is the text before the first
          ``agency:`` line, payload is the JSON list of per-ad dicts (empty
          dicts are dropped).
        * ``'unknown'`` -- preamble is ``'unknown'`` and payload is JSON
          ``null``; the unparsed text is printed.
    """
    txt = re.sub(' +', ' ', txt)
    txt = re.sub(r'\n{3,}', '\n\n', txt)

    if 'public notice is hereby given that' in txt:
        return 'public notice', txt, json.dumps(rex_time.findall(txt))

    if 'notice of intent to' in txt:
        # a notice can have multiple ads
        tokens = re.split(r'\n(?=agency:)', txt)
        preamble = tokens[0]
        out = []
        for token in tokens[1:]:
            entry = _parse_ad(token, verbose)
            if entry:
                out.append(entry)
        return 'intent to solicit', preamble, json.dumps(out)

    print('NOT PARSED!!! --\n{}---\n\n'.format(txt))
    return 'unknown', 'unknown', json.dumps(None)


def parse(row):
    """Parse ``row.scrape`` and store a JSON document in ``row['result']``.

    For recognised message types the JSON object holds the pass-through
    columns (``None`` for any column missing from *row*) plus the extracted
    data.  For unrecognised messages ``row['result']`` is left as the JSON
    ``null`` produced by ``parse_regs``.  The mutated row is returned so the
    function can be used with ``DataFrame.apply(..., axis=1)``.
    """
    msg_type, preamble, row['result'] = parse_regs(row.scrape)
    preamble = preamble.strip()
    data = json.loads(row['result'])

    # add these to the result
    pass_through = ['RequestID',
                    'ConfirmationNumber',
                    'AgencyCode',
                    'AgencyName',
                    'AgencyDivision',
                    'SectionID',
                    'SectionName']
    dic = {k: row.get(k) for k in pass_through}

    if msg_type == 'public notice':
        dic['notice'] = preamble
        # first date/time match joined into one string, or '' when none found
        dic['data'] = ' '.join(data[0]) if data else ''
        row['result'] = json.dumps(dic)
    elif msg_type == 'intent to solicit':
        dic['preamble'] = preamble
        dic['data'] = data
        row['result'] = json.dumps(dic)

    return row


t1_parsed = mocs_ads.apply(parse, axis=1)

mayors_ocs = [json.loads(ad) for ad in t1_parsed['result'].values]

pprint.pprint(mayors_ocs)

# In[ ]: