import glob
import operator
import os
import zipfile
from urllib.request import urlretrieve

import lxml.html as lh
import pandas as pd
import requests

gdelt_base_url = 'http://data.gdeltproject.org/events/'

# Get the list of all the links on the GDELT file page.
page = requests.get(gdelt_base_url + 'index.html')
doc = lh.fromstring(page.content)
link_list = doc.xpath("//*/ul/li/a/@href")

# Separate out those links that begin with four digits (the dated event files).
file_list = [x for x in link_list if x[0:4].isdigit()]

infilecounter = 0
outfilecounter = 0

local_path = '/Users/me/Desktop/GDELT_Data/'
fips_country_code = 'UK'

# Make sure the working subdirectories exist.
os.makedirs(local_path + 'tmp/', exist_ok=True)
os.makedirs(local_path + 'country/', exist_ok=True)

for compressed_file in file_list[infilecounter:]:
    print(compressed_file, end=' ')

    # If we don't have the compressed file stored locally, go get it.
    # Keep trying if necessary.
    while not os.path.isfile(local_path + compressed_file):
        print('downloading,', end=' ')
        urlretrieve(url=gdelt_base_url + compressed_file,
                    filename=local_path + compressed_file)

    # Extract the contents of the compressed file to a temporary directory.
    print('extracting,', end=' ')
    with zipfile.ZipFile(local_path + compressed_file, mode='r') as z:
        z.extractall(path=local_path + 'tmp/')

    # Parse each of the csv files in the working directory.
    print('parsing,', end=' ')
    for infile_name in glob.glob(local_path + 'tmp/*'):
        outfile_name = (local_path + 'country/' + fips_country_code
                        + '%04i.tsv' % outfilecounter)

        # Open the infile and outfile.
        with open(infile_name, mode='r') as infile, \
             open(outfile_name, mode='w') as outfile:
            for line in infile:
                # Keep lines whose ActionGeo, Actor1Geo, or Actor2Geo
                # country-code fields contain our country of interest.
                if fips_country_code in operator.itemgetter(51, 37, 44)(line.split('\t')):
                    outfile.write(line)
        outfilecounter += 1

        # Delete the temporary file.
        os.remove(infile_name)

    infilecounter += 1
    print('done')

# Get the GDELT field names from a helper file.
colnames = pd.read_excel('CSV.header.fieldids.xlsx', sheet_name='Sheet1',
                         index_col='Column ID', usecols=[0, 1])['Field Name']

# Build DataFrames from each of the intermediary files.
files = glob.glob(local_path + 'country/' + fips_country_code + '*')
DFlist = []
for active_file in files:
    print(active_file)
    DFlist.append(pd.read_csv(active_file, sep='\t', header=None, dtype=str,
                              names=colnames, index_col=['GLOBALEVENTID']))

# Merge the file-based dataframes and save a pickle.
DF = pd.concat(DFlist)
DF.to_pickle(local_path + 'backup' + fips_country_code + '.pickle')

# Once everything is safely stored away, remove the temporary files.
for active_file in files:
    os.remove(active_file)
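To confirm the extraction worked, the pickle can be read straight back into a DataFrame and aggregated. The sketch below assumes the same local_path and fips_country_code as above, and that the helper spreadsheet supplies the standard GDELT 1.0 field names (in particular 'SQLDATE'); adjust the column name if your header file differs.

import pandas as pd

local_path = '/Users/me/Desktop/GDELT_Data/'
fips_country_code = 'UK'

# Load the merged per-country events back from the pickle saved above.
DF = pd.read_pickle(local_path + 'backup' + fips_country_code + '.pickle')
print(DF.shape)

# Everything was read as strings (dtype=str), so parse the date field first.
# 'SQLDATE' is the assumed GDELT 1.0 name for the YYYYMMDD event date column.
DF['date'] = pd.to_datetime(DF['SQLDATE'], format='%Y%m%d')

# Quick sanity check: number of recorded events per year for this country.
print(DF.groupby(DF['date'].dt.year).size())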