#!/usr/bin/env python
# coding: utf-8
"""Scrape structured fields out of Mayor's Office of Contract Services ads.

The script loads a CSV export of publication requests, converts the HTML in
the ``AdditionalDescription`` column to lower-cased plain text (stored in a
new ``scrape`` column) and then tries to parse two kinds of ads with regular
expressions:

* public meeting notices (meeting date and time), and
* notices of intent to extend a contract / issue a new solicitation
  (vendor, dates, headcount, ...).

Ads that yield no data are printed and listed again at the end of the run.
"""

import re

import pandas as pd

fn = 'procPublicationRequest Oct-Dec 2014 (Updated) - Sheet1-2.csv'

# I only care about these columns for now
cols = ['RequestID', 'ConfirmationNumber', 'AgencyCode', 'AgencyName',
        'AgencyDivision', 'SectionID', 'SectionName', 'scrape']

# Focus on parsing the Mayor's Office of Contract Services ads.
MOCS_AGENCY_NAME = "Mayor's Office of Contract Services"

# Phrases that identify an ad handled by the "extend contract" regexes.
EXTEND_CONTRACT_MARKERS = (
    'notice of intent to extend contract(s)',
    'notice of intent to issue new solicitation',
)

# Field reported by the debugging print-out in scrape_ads().
DEBUG_FIELD = 'personnel in substantially similar titles within agency'

# Example entries
# ===============
#
# Public meeting notice
# ---------------------
#
#   public notice is hereby given that the
#   franchise and concession review committee will hold a
#   public meeting on wednesday, october 8, 2014 at 2:30 p.m.,
#   at 22 reade street, spector hall, borough of manhattan.
#
# Fields to extract | description      | Parsing Status
# ------------------|------------------|---------------
# datetime          | meeting datetime | working
#
# Notice of intent to extend contract
# -----------------------------------
#
#   vendor: accenture llp
#   description of services: design, development and deployment of application
#   enhancements and extensions to the existing apt system along with the
#   appropriate documentation required.
#
#   award method of original contract: intergovernmental
#   fms contract type: consultant
#   end date of original contract: 1/31/2015
#   method of renewal/extenction the agency intends to utilize: extension
#   new start date of the proposed renewed/extended contract: 2/1/15
#   new end date of the proposed renewed/extended contract: 7/31/15
#   modifications sought to the nature of services performed under the contract: none
#   reason(s) the agency intends to renew/extend the contract: continuation of services
#   personnel in substantially similar titles within agency: apt project manager –
#   1; apt technical lead – 1; apt developer - 2
#   headcount of personnel in substantially similar titles within agency: 4
#
# Fields to extract (these are the keys of ``rex_extend_contract``)           | Parsing Status
# ----------------------------------------------------------------------------|---------------
# vendor                                                                      | needs test
# description of services                                                     | needs test
# award method of origian contract                                            | needs test
# fms contract type                                                           | needs test
# end date of original contract                                               | needs test
# method of renewalextension                                                  | needs test
# new start date of proposed renewed/extended contract                        | needs test
# new end date of proposed renewed/extended contract                          | needs test
# modifications sought to the nature of services performed under the contract | needs test
# reason(s) the agency intends to renew/extend the contract                   | needs test
# personnel in substantially similar titles within agency                     | debugging & needs test
# headcount of personnel in substantially similar titles within agency        | needs test

_REX_FLAGS = re.IGNORECASE | re.MULTILINE | re.DOTALL

# Regex for public meeting notices, e.g.
# "on wednesday, october 8, 2014 at 2:30 p.m."
rex_time = re.compile(
    r'on\s+(?P<weekday>[^\s,]+)[\s,]\s+(?P<month>[^\s,]+)\s+(?P<day>\d+)'
    r'[,\s]+(?P<year>\d+)\s+(at)?\s*(?P<hour>\d+):(?P<minute>\d+)'
    r'\s+(?P<ampm>\w+\.?\w+\.?)',
    _REX_FLAGS,
)

# Fields of a "notice of intent to extend contract" ad, in parsing order.
# NOTE: the keys (including their misspellings) are kept as they were so that
# anything consuming the parsed dictionaries keeps working.
extend_contract_keys = [
    'vendor',
    'description of services',
    'award method of origian contract',
    'fms contract type',
    'end date of original contract',
    'method of renewalextension',
    'new start date of proposed renewed/extended contract',
    'new end date of proposed renewed/extended contract',
    'modifications sought to the nature of services performed under the contract',
    'reason(s) the agency intends to renew/extend the contract',
    'personnel in substantially similar titles within agency',
    'headcount of personnel in substantially similar titles within agency',
]

rex_mm_dd_year = r'(?P<month>\d{1,2})/(?P<day>\d{1,2})/(?P<year>\d{2,4})'

# Multi-line fields are delimited by the label that starts the next field.
# They are matched non-greedily so that an ad listing several contracts
# yields the first contract instead of a span across all of them.
rex_extend_contract = {
    'vendor':
        r'vendor:\s+(?P<value>[^\n]+$)\n',
    'description of services':
        r'description\sof\sservices:\s+(?P<value>.+?)\n+award',
    'award method of origian contract':
        r'award\smethod\sof\soriginal\scontract:\s+(?P<value>[^\n]+)',
    'fms contract type':
        r'fms\scontract\stype:\s+(?P<value>[^\n]+$)\n',
    'end date of original contract':
        r'end\sdate\sof\soriginal\scontract:\s' + rex_mm_dd_year,
    # The ads spell the label both "extension" and "extenction" (see the
    # example above), so only the common prefix is required.
    'method of renewalextension':
        r'method\sof\srenewal/exten[^:]+:\s(?P<value>[^\n]+$)\n',
    'new start date of proposed renewed/extended contract':
        r'new\sstart\sdate\sof\sthe\sproposed\srenewed.extended\scontract:\s'
        + rex_mm_dd_year,
    'new end date of proposed renewed/extended contract':
        r'new\send\sdate\sof\sthe\sproposed\srenewed.extended\scontract:\s'
        + rex_mm_dd_year,
    'modifications sought to the nature of services performed under the contract':
        r'modifications\ssought\sto\sthe\snature\sof\sservices\sperformed'
        r'\sunder\sthe\scontract:\s+(?P<value>.+?)\n+reason',
    'reason(s) the agency intends to renew/extend the contract':
        r'reason\(s\)\sthe\sagency\sintends\sto\srenew/extend\sthe\scontract:'
        r'\s(?P<value>.+?)\n+personnel',
    # TODO: still needs testing against the whole dataset.  The pattern is
    # anchored at the start of a line because the "headcount of personnel
    # in ..." label contains this label as a substring.
    'personnel in substantially similar titles within agency':
        r'^personnel\sin\ssubstantially\ssimilar\stitles\swithin\sagency:'
        r'\s+(?P<value>.+?)(?:\n+headcount|\Z)',
    # cleanup() strips the text, so the headcount is often the very last
    # thing in the ad and is not followed by a newline.
    'headcount of personnel in substantially similar titles within agency':
        r'headcount\sof\spersonnel\sin\ssubstantially\ssimilar\stitles'
        r'\swithin\sagency:\s+(?P<value>\d+)(?:\n|\Z)',
}
rex_extend_contract = {
    key: re.compile(pattern, _REX_FLAGS)
    for key, pattern in rex_extend_contract.items()
}


def cleanup(txt):
    """Convert an HTML fragment to stripped, lower-cased plain text.

    The value is passed through ``str()``, tidied with ``tidy_document`` and
    converted with ``html2text``; ``*`` and ``_`` characters are then removed
    from the converted text.
    """
    # Imported lazily: the HTML tooling is only needed here, which keeps the
    # regex parsers importable on machines without tidylib / html2text.
    from tidylib import tidy_document
    import html2text

    doc, _errors = tidy_document(str(txt))
    doc = html2text.html2text(doc)
    return doc.replace('*', '').replace('_', '').lower().strip()


def add_clean_text(row):
    """Store the cleaned ``AdditionalDescription`` of *row* under ``scrape``.

    Intended for ``DataFrame.apply(..., axis=1)``; returns the same row.
    """
    row['scrape'] = cleanup(row.AdditionalDescription)
    return row


def normalize_whitespace(ad):
    """Collapse runs of spaces, and runs of 3+ newlines, into a single one."""
    txt = re.sub(' +', ' ', ad)
    return re.sub('\n{3,}', '\n', txt)


def parse_meeting_time(txt):
    """Return every meeting date/time found in *txt*.

    Each hit is a tuple ``(weekday, month, day, year, at, hour, minute,
    ampm)``; the list is empty when *txt* is not a public meeting notice.
    """
    return rex_time.findall(txt)


def is_extend_contract_ad(txt):
    """Tell whether *txt* is an ad the extend-contract regexes apply to."""
    return any(marker in txt for marker in EXTEND_CONTRACT_MARKERS)


def parse_extend_contract(txt):
    """Extract the fields of a "notice of intent to extend contract" ad.

    Returns a dict keyed by the entries of ``extend_contract_keys``; fields
    that cannot be found are simply absent.  Date fields map to a
    ``(month, day, year)`` tuple of strings, every other field to a stripped
    string.  Only the first occurrence of each field is used.
    """
    out = {}
    for key in extend_contract_keys:
        pattern = rex_extend_contract.get(key)
        if pattern is None:
            # No regex available (yet) for this field.
            continue
        found = pattern.search(txt)
        if found is None:
            continue
        groups = found.groups()
        out[key] = groups[0].strip() if len(groups) == 1 else groups
    return out


def scrape_ads(ads):
    """Try to parse every ad in *ads*, printing what was extracted.

    Returns the list of (whitespace-normalized) ads nothing could be
    extracted from.
    """
    unparsed = []
    for ad in ads:
        txt = normalize_whitespace(ad)
        meeting = parse_meeting_time(txt)
        if meeting:
            print('Extracted: {}'.format(meeting))
        else:
            # dispatch parsing to the appropriate collection of regexes
            parsed = parse_extend_contract(txt) if is_extend_contract_ad(txt) else {}
            if parsed:
                # .get(): the field may be missing even though others parsed.
                print('****{} == [{}]'.format(DEBUG_FIELD, parsed.get(DEBUG_FIELD)))
            else:
                print(txt)
                unparsed.append(txt)
        print('--------\n')
    return unparsed


def main():
    """Load the CSV, clean the ad text and scrape the MOCS ads."""
    t1 = pd.read_csv(fn, header=0)
    t1_fix = t1.apply(add_clean_text, axis=1)
    fixed = t1_fix[cols]

    # top 10: breakdown of ads by agencies
    print(fixed['AgencyName'].value_counts()[:10])

    # select Office of Contract Services ads
    mocs = fixed['AgencyName'] == MOCS_AGENCY_NAME
    mocs_ads = fixed[mocs][['RequestID', 'scrape']]

    # Iterate over the data and see if we can scrape
    moc_ads = mocs_ads['scrape'].values
    unparsed = scrape_ads(moc_ads)

    print('{} out of {} entries not parsed\nThey are...'.format(len(unparsed), len(moc_ads)))
    for t in unparsed:
        print('{}\n---\n\n'.format(t))


if __name__ == '__main__':
    main()