!wget http://msb.embopress.org/content/11/10/831/DC1/embed/inline-supplementary-material-1.xlsx
--2016-02-18 01:39:09-- http://msb.embopress.org/content/11/10/831/DC1/embed/inline-supplementary-material-1.xlsx Resolving msb.embopress.org (msb.embopress.org)... 104.232.27.115 Connecting to msb.embopress.org (msb.embopress.org)|104.232.27.115|:80... connected. HTTP request sent, awaiting response... 301 Moved Permanently Location: http://msb.embopress.org/content/msb/11/10/831/DC1/embed/inline-supplementary-material-1.xlsx [following] --2016-02-18 01:39:09-- http://msb.embopress.org/content/msb/11/10/831/DC1/embed/inline-supplementary-material-1.xlsx Connecting to msb.embopress.org (msb.embopress.org)|104.232.27.115|:80... connected. HTTP request sent, awaiting response... 200 OK Length: 16509 (16K) [application/vnd.openxmlformats-officedocument.spreadsheetml.sheet] Saving to: 'inline-supplementary-material-1.xlsx' 100%[======================================>] 16,509 --.-K/s in 0.02s 2016-02-18 01:39:10 (704 KB/s) - 'inline-supplementary-material-1.xlsx' saved [16509/16509]
import codecs
import openpyxl
workbook = openpyxl.load_workbook("inline-supplementary-material-1.xlsx", read_only=True)
sheet = workbook.active
/usr/local/lib/python2.7/dist-packages/openpyxl/workbook/names/named_range.py:125: UserWarning: Discarded range with reserved name warnings.warn("Discarded range with reserved name")
Parse the spreadsheet
data = []
for i, row in enumerate(sheet.rows):
if i == 0: # skip header
continue
if row[2].value is None or len(row[2].value) <= 1:
ref_str = ""
else:
hyperlink_tokens = openpyxl.formula.Tokenizer(row[2].value)
hyperlink_tokens.parse()
ref = hyperlink_tokens.items[3].value.strip('"')
ref_url = hyperlink_tokens.items[1].value.strip('"').replace(" ", "")
ref_str = "[%s](%s)" % (ref, ref_url)
try:
pmid = int(row[3].value)
except:
pmid_str = ""
else:
pmid_str = "[%d](http://www.ncbi.nlm.nih.gov/pubmed/%d)" % (pmid, pmid)
data.append((row[0].value.strip(), row[1].value.strip(), ref_str, pmid_str))
col_lengths = tuple(max(len(i[col]) for i in data) for col in range(4))
Print out the markdown version
row_format = u"| %%-%ds | %%-%ds | %%-%ds | %%-%ds |\n" % col_lengths
with codecs.open("model_key.md", "w", encoding="utf-8") as outfile:
# print header
outfile.write(row_format % (sheet["A1"].value, sheet["B1"].value, sheet["C1"].value, sheet["D1"].value))
# print hrule
outfile.write("|%s|%s|%s|%s|\n" % tuple("-" * (i + 2) for i in col_lengths))
for d in data:
outfile.write(row_format % d)