#!/usr/bin/env python # coding: utf-8 # # Load and Process models # # This script will load the M models in the collection using cobrapy, and convert them to a normalized format. They will also be exported to the "mat" format used by the COBRA toolbox. # # This requires [cobrapy](https://opencobra.github.io/cobrapy) version 0.4.0b1 or later. # In[1]: import os import warnings import re from itertools import chain import sympy import scipy import scipy.io import cobra from read_excel import read_excel # ## Read in Models # In[2]: def open_exchanges(model, amount=10): for reaction in model.reactions: if len(reaction.metabolites) == 1: # Ensure we are not creating any new sinks if reaction.metabolites.values()[0] > 0: reaction.upper_bound = max(reaction.upper_bound, amount) else: reaction.lower_bound = min(reaction.lower_bound, -amount) def add_exchanges(model, extracellular_suffix="[e]", uptake_amount=10): for metabolite in model.metabolites: if str(metabolite).endswith(extracellular_suffix): if len(metabolite.reactions) == 0: print "no reactions for " + metabolite.id continue if min(len(i.metabolites) for i in metabolite.reactions) > 1: EX_reaction = cobra.Reaction("EX_" + metabolite.id) EX_reaction.add_metabolites({metabolite: 1}) m.add_reaction(EX_reaction) EX_reaction.upper_bound = uptake_amount EX_reaction.lower_bound = -uptake_amount # ### SBML models # # These models will be read in using [libSBML](http://sbml.org/Software/libSBML) through cobrapy. Some models will need their exchanges opened. # In[3]: legacy_SBML = {"T_Maritima", "iNJ661m", "iSR432", "iTH366"} open_boundaries = {"iRsp1095", "iWV1314", "iFF708", "iZM363"} models = cobra.DictList() for i in sorted(os.listdir("sbml")): if not i.endswith(".xml"): continue model_id = i[:-4] filepath = os.path.join("sbml", i) with warnings.catch_warnings(): warnings.simplefilter("ignore") m = cobra.io.read_legacy_sbml(filepath) if model_id in legacy_SBML \ else cobra.io.read_sbml_model(filepath) m.id = m.description = model_id.replace(".", "_") if m.id in open_boundaries: open_exchanges(m) models.append(m) # ### Models available in COBRA Toolbox "mat" format # In[4]: for i in sorted(os.listdir("mat")): if not i.endswith(".mat"): continue m = cobra.io.load_matlab_model(os.path.join("mat", i)) m.id = i[:-4] if m.id in open_boundaries: open_exchanges(m) models.append(m) # ### Some models are only available as Microsoft Excel files # In[5]: m = read_excel("xls/iJS747.xls", verbose=False, rxn_sheet_header=7) models.append(m) # In[6]: m = read_excel("xls/iRM588.xls", verbose=False, rxn_sheet_header=5) models.append(m) # In[7]: m = read_excel("xls/iSO783.xls", verbose=False, rxn_sheet_header=2) models.append(m) # In[8]: m = read_excel("xls/iCR744.xls", rxn_sheet_header=4, verbose=False) models.append(m) # In[9]: m = read_excel("xls/iNV213.xls", rxn_str_key="Reaction Formula", verbose=False) # remove boundary metabolites for met in list(m.metabolites): if met.id.endswith("[b]"): met.remove_from_model() models.append(m) # In[10]: m = read_excel("xls/iTL885.xls", verbose=False, rxn_id_key="Rxn name", rxn_gpr_key="Gene-reaction association", met_sheet_name="ignore") models.append(m) # In[11]: m = read_excel("xls/iWZ663.xls", verbose=False, rxn_id_key="auto", rxn_name_key="Reaction name", rxn_gpr_key="Local gene") models.append(m) # In[12]: m = read_excel("xls/iOR363.xls", verbose=False) models.append(m) # In[13]: m = read_excel("xls/iMA945.xls", verbose=False) models.append(m) # In[14]: m = read_excel("xls/iPP668.xls", verbose=False) add_exchanges(m) models.append(m) # In[15]: m = read_excel("xls/iVM679.xls", verbose=False, met_sheet_name="ignore", rxn_id_key="Name", rxn_name_key="Description", rxn_str_key="Reaction") open_exchanges(m) models.append(m) # In[16]: m = read_excel("xls/iTY425.xls", rxn_sheet_header=1, rxn_sheet_name="S8", rxn_id_key="Number", rxn_str_key="Reaction", verbose=False) add_exchanges(m, "xt") # Protein production reaction does not prdocue "PROTEIN" metabolite m.reactions.R511.add_metabolites({m.metabolites.PROTEIN: 1}) m.id = m.id + "_fixed" models.append(m) # In[17]: m = read_excel("xls/iSS724.xls", rxn_str_key="Reactions", rxn_sheet_header=1, met_sheet_header=1, rxn_id_key="Name", verbose=False) add_exchanges(m, "xt") models.append(m) # In[18]: m = read_excel("xls/iCS400.xls", rxn_sheet_name="Complete Rxn List", rxn_sheet_header=2, rxn_str_key="Reaction", rxn_id_key="Name", verbose=False) add_exchanges(m, "xt") models.append(m) # In[19]: m = read_excel("xls/iLL672.xls", rxn_id_key="auto", met_sheet_name="Appendix 3 iLL672 metabolites",\ rxn_str_key="REACTION", rxn_gpr_key="skip", verbose=False, rxn_sheet_name='Appendix 3 iLL672 reactions') m.reactions[-1].objective_coefficient = 1 m.metabolites.BM.remove_from_model() add_exchanges(m, "xt") models.append(m) # In[20]: plus_re = re.compile("(?<=\S)\+") # substitute H+ with H, etc. m = read_excel("xls/iMH551.xls", rxn_sheet_name="GPR Annotation", rxn_sheet_header=4, rxn_id_key="auto", rxn_str_key="REACTION", rxn_gpr_key="skip", rxn_name_key="ENZYME", rxn_skip_rows=[625, 782, 787], verbose=False, rxn_sheet_converters={"REACTION": lambda x: plus_re.sub("", x)}) for met in m.metabolites: if met.id.endswith("(extracellular)"): met.id = met.id[:-15] + "_e" m.repair() add_exchanges(m, "_e") models.append(m) # In[21]: m = read_excel("xls/iCS291.xls", rxn_sheet_name="Sheet1", rxn_str_key="Reaction", rxn_sheet_header=5, rxn_id_key="Name", verbose=False) add_exchanges(m, "xt") # BIOMASS is just all model metabolites in the Demands list m.add_reaction(cobra.Reaction("BIOMASS")) # taken from Table 1 in publication biomass_mets = {} for i in {"ALA", "ARG", "ASN", "ASP", "CYS", "GLU", "GLN", "GLY", "HIS", "ILE", "LEU", "LYS", "MET", "PHE", "PRO", "SER", "THR", "TRP", "TYR", "VAL", "PTRC", "SPMD", "ATP", "GTP", "CTP", "UTP", "DATP", "DGTP", "DCTP", "DTTP", "PS", "PE", "PG", "PEPTIDO", "LPS", "OPP", "UDPP", "NAD", "NADP", "FAD", "COA", "ACP", "PTH", "THIAMIN", "MTHF", "MK", "DMK" }: biomass_mets[m.metabolites.get_by_id(i)] = -1 dm = cobra.Reaction("DM_" + i) m.add_reaction(dm) dm.add_metabolites({m.metabolites.get_by_id(i): -1}) m.reactions.BIOMASS.add_metabolites(biomass_mets) m.change_objective("BIOMASS") add_exchanges(m, "xt") models.append(m) # In[22]: m = read_excel("xls/iYO844.xls", rxn_sheet_name="Reaction and locus", verbose=False, rxn_gpr_key="Locus name", rxn_str_key=u'Equation (note [c] and [e] at the beginning refer to the compartment \n' 'the reaction takes place in, cytosolic and extracellular respectively)') add_exchanges(m) # create the biomass reaction from supplementary data table # http://www.jbc.org/content/suppl/2007/06/29/M703759200.DC1/Biomass_composition.doc r = cobra.Reaction("biomass") r.objective_coefficient = 1. m.add_reaction(r) r.reaction = ("408.3 gly[c] + 266.9 ala-L[c] + 306.7 val-L[c] + 346.4 leu-L[c] + 269.9 ile-L[c] + " "216.2 ser-L[c] + 186.3 thr-L[c] + 175.9 phe-L[c] + 110.8 tyr-L[c] + 54.3 trp-L[c] + " "56.7 cys-L[c] + 113.3 met-L[c] + 323.1 lys-L[c] + 193.0 arg-L[c] + 81.7 his-L[c] + " "148.0 asp-L[c] + 260.4 glu-L[c] + 148.0 asp-L[c] + 260.3 gln-L[c] + 160.6 pro-L[c] + " "62.7 gtp[c] + 38.9 ctp[c] + 41.5 utp[c] + 23.0 datp[c] + 17.4 dgtp[c] + 17.4 dctp[c] + " "22.9 dttp[c] + 0.085750 m12dg_BS[c] + 0.110292 d12dg_BS[c] + 0.065833 t12dg_BS[c] + " "0.004642 cdlp_BS[c] + 0.175859 pgly_BS[c] + 0.022057 lysylpgly_BS[c] + 0.559509 psetha_BS[c] + " "0.006837 lipo1-24_BS[c] + 0.006123 lipo2-24_BS[c] + 0.018162 lipo3-24_BS[c] + " "0.014676 lipo4-24_BS[c] + 101.82 peptido_BS[c] + 3.62 gtca1-45_BS[c] + 2.35 gtca2-45_BS[c] + " "1.82 gtca3-45_BS[c] + 3.11 tcam_BS[c] + 706.3 k[c] + 101.7 mg2[c] + 3.4 fe3[c] + 3.2 ca2[c] + " "0.9 ppi[c] + 0.3 mql7[c] + 0.4 10fthf[c] + 16.2 nad[c] + 4.7 amp[c] + 2.6 adp[c] + 1.0 cmp[c] + " "0.9 nadp[c] + 0.5 ctp[c] + 0.5 gmp[c] + 0.4 gtp[c] + 0.3 cdp[c] + 0.2 nadph[c] + 0.2 gdp[c] + " "105053.5 atp[c] + 105000 h2o[c] --> 104985.6 pi[c] + 104997.4 adp[c] + 105000 h[c]") # units are in mg for this reaction, so scale to grams r *= 0.001 models.append(m) # In[23]: models.sort() # ## Determine Objective Reactions # # Some of these models do not specify an objective (or biomass) reaction. These will be automatically detected if possible, or set from a manually curated list. # In[24]: # regular expression to detect "biomass" biomass_re = re.compile("biomass", re.IGNORECASE) # manually identified objective reactions curated_objectives = {"VvuMBEL943": "R806", "iAI549": "BIO_CBDB1_DM_855", "mus_musculus": "BIO028", "iRsp1095": "RXN1391", "iLC915": "r1133", "PpaMBEL1254": "R01288", "AbyMBEL891": "R761", "iAbaylyiV4": "GROWTH_DASH_RXN", "iOG654": "RM00001", "iOR363": "OF14e_Retli", "iRM588": "agg_GS13m", "iJS747": "agg_GS13m_2", "iTL885": "SS1240", "iMH551": "R0227"} # In[25]: for m in models: if len(m.reactions.query(lambda x: x > 0, "objective_coefficient")): continue if m.id in curated_objectives: m.change_objective(curated_objectives[m.id]) continue # look for reactions with "biomass" in the id or name possible_objectives = m.reactions.query(biomass_re) if len(possible_objectives) == 0: possible_objectives = m.reactions.query(biomass_re, "name") # In some cases, a biomass "metabolite" is produced, whose production # should be the objective function. possible_biomass_metabolites = m.metabolites.query(biomass_re) if len(possible_biomass_metabolites) == 0: possible_biomass_metabolites = m.metabolites.query(biomass_re, "name") if len(possible_biomass_metabolites) > 0: biomass_met = possible_biomass_metabolites[0] r = cobra.Reaction("added_biomass_sink") r.objective_coefficient = 1 r.add_metabolites({biomass_met: -1}) m.add_reaction(r) print ("autodetected biomass metabolite '%s' for model '%s'" % (biomass_met.id, m.id)) elif len(possible_objectives) > 0: print("autodetected objective reaction '%s' for model '%s'" % (possible_objectives[0].id, m.id)) m.change_objective(possible_objectives[0]) else: print("no objective found for " + m.id) # Ensure the biomass objective flux is unconstrained for m in models: for reaction in m.reactions.query(lambda x: x > 0, "objective_coefficient"): reaction.lower_bound = min(reaction.lower_bound, 0) reaction.upper_bound = max(reaction.upper_bound, 1000) # ## Fixes of various encoding bugs # ### General # GSMN_TB does not use the convention of extracellular metabolites with exchanges. Although the model still solves with this formulation, this is still normalized here. This process does not change the mathematical structure of the model. # In[26]: h_c = models.GSMN_TB.metabolites.H_c for r in models.GSMN_TB.reactions: if len(r.metabolites) == 2 and h_c in r.metabolites: met = [i for i in r.metabolites if i is not h_c][0] EX_met = cobra.Metabolite(met.id[:-1] + "e") r.add_metabolites({EX_met: -r.metabolites[met]}) if "EX_" + EX_met.id not in models.GSMN_TB.reactions: exchange = cobra.Reaction("EX_" + EX_met.id) exchange.add_metabolites({EX_met: -1}) exchange.lower_bound = -1000000.0 exchange.upper_bound = 1000000.0 models.GSMN_TB.add_reaction(exchange) # ### Reaction and Metabolites # ### id's # In[27]: # reaction id's with spaces in them models.iJS747.reactions.get_by_id("HDH [deleted 01/16/2007 12:02:30 PM]").id = "HDH_del" models.iJS747.reactions.get_by_id("HIBD [deleted 03/21/2007 01:06:12 PM]").id = "HIBD_del" models.iAC560.reactions.get_by_id("GLUDx [m]").id = "GLUDx[m]" for r in models.iOR363.reactions: if " " in r.id: r.id = r.id.split()[0] models.textbook.reactions.query("Biomass")[0].id = "Biomass_Ecoli_core" # Use the convention underscore + compartment i.e. _c instead of [c] (c) etc. # In[28]: SQBKT_re = re.compile("\[([a-z])\]$") def fix_brackets(id_str, compiled_re): result = compiled_re.findall(id_str) if len(result) > 0: return compiled_re.sub("_" + result[0], id_str) else: return id_str for r in models.iRS1597.reactions: r.id = fix_brackets(r.id, re.compile("_LSQBKT_([a-z])_RSQBKT_$")) for m_id in ["iJS747", "iRM588", "iSO783", "iCR744", "iNV213", "iWZ663", "iOR363", "iMA945", "iPP668", "iTL885", "iVM679", "iYO844", "iZM363"]: for met in models.get_by_id(m_id).metabolites: met.id = fix_brackets(met.id, SQBKT_re) for met in models.S_coilicolor_fixed.metabolites: if met.id.endswith("_None_"): met.id = met.id[:-6] # Some models only have intra and extracellular metabolites, but don't use _c and _e. for m_id in ["iCS291", "iCS400", "iTY425_fixed", "iSS724"]: for metabolite in models.get_by_id(m_id).metabolites: if metabolite.id.endswith("xt"): metabolite.id = metabolite.id[:-2] + "_e" elif len(metabolite.id) < 2 or metabolite.id[-2] != "_": metabolite.id = metabolite.id + "_c" # Exchange reactions should have the id of the metabolite after with the same convention for m_id in ["iAF1260", "iJO1366", "iAF692", "iJN746", "iRC1080", "textbook", "iNV213", "iIT341", "iJN678", "iJR904", "iND750", "iNJ661", "iPS189_fixed", "iSB619", "iZM363", "iMH551"]: for r in models.get_by_id(m_id).reactions: if len(r.metabolites) != 1: continue if r.id.startswith("EX_"): r.id = "EX_" + list(r.metabolites.keys())[0].id if r.id.startswith("DM_"): r.id = "DM_" + list(r.metabolites.keys())[0].id for m in models: m.repair() # ### Metabolite Formulas # In[29]: for model in models: for metabolite in model.metabolites: if metabolite.formula is None: metabolite.formula = "" continue if str(metabolite.formula).lower() == "none": metabolite.formula = "" continue # some characters should not be in a formula if "(" in metabolite.formula or \ ")" in metabolite.formula or \ "." in metabolite.formula: metabolite.formula = "" # ### Metabolite Compartments # In[30]: compartments = { 'c': 'Cytoplasm', 'e': 'Extracellular', 'p': 'Periplasm', 'm': 'Mitochondria', 'g': 'Golgi', 'n': "Nucleus", 'r': "Endoplasmic reticulum", 'x': "Peroxisome", 'v': "Vacuole", "h": "Chloroplast", "x": "Glyoxysome", "s": "Eyespot", "default": "No Compartment"} for model in models: for metabolite in model.metabolites: if metabolite.compartment is None or len(metabolite.compartment.strip()) == 0 or metabolite.compartment == "[": if len(metabolite.id) > 2 and metabolite.id[-2] == "_" and metabolite.id[-1].isalpha(): metabolite.compartment = metabolite.id[-1] else: metabolite.compartment = "default" if metabolite.compartment not in model.compartments: model.compartments[metabolite.compartment] = compartments.get(metabolite.compartment, metabolite.compartment) # ### Metabolite and Reaction Names # Names which start with numbers don't need to be escaped with underscores. # In[31]: for model in models: for x in chain(model.metabolites, model.reactions): if x.name is not None and x.name.startswith("_"): x.name = x.name.lstrip("_") if x.name is not None: x.name = x.name.strip() if x.name is None: x.name = x.id # ### MISC fixes # In[32]: models.iMM1415.reactions.EX_lnlc_dup_e.remove_from_model() models.iMM1415.reactions.EX_retpalm_e.remove_from_model(remove_orphans=True) # these reaction names are reaction strings for r in models.iCac802.reactions: r.name = "" # ## Fix Genes and GPR's # A lot of genes have characters which won't work in their names # In[33]: # nonbreaking spaces models.iCB925.reactions.FDXNRy.gene_reaction_rule = '( Cbei_0661 or Cbei_2182 )' for r in models.iCB925.reactions: if "\xa0" in r.gene_reaction_rule: r.gene_reaction_rule = r.gene_reaction_rule.replace("\xc2", " ").replace("\xa0", " ") for g in list(models.iCB925.genes): if len(g.reactions) == 0: models.iCB925.genes.remove(g) # Some GPR's are not valid boolean expressions. # In[34]: multiple_ors = re.compile("(\s*or\s+){2,}") multiple_ands = re.compile("(\s*and\s+){2,}") for model_id in ["iRS1563", "iRS1597", "iMM1415"]: model = models.get_by_id(model_id) for reaction in model.reactions: gpr = reaction.gene_reaction_rule gpr = multiple_ors.sub(" or ", gpr) gpr = multiple_ands.sub(" and ", gpr) if "[" in gpr: gpr = gpr.replace("[", "(").replace("]", ")") if gpr.endswith(" or"): gpr = gpr[:-3] if gpr.count("(") != gpr.count(")"): gpr = "" # mismatched parenthesis somewhere reaction.gene_reaction_rule = gpr for gene in list(model.genes): if gene.id.startswith("[") or gene.id.endswith("]"): if len(gene.reactions) == 0: model.genes.remove(gene.id) # Some models are missing spaces between the ands/ors in some of their GPR's for m_id in ["iJN678", "iTL885"]: for r in models.get_by_id(m_id).reactions: r.gene_reaction_rule = r.gene_reaction_rule.replace("and", " and ").replace("or", " or ") models.iCac802.reactions.R0095.gene_reaction_rule = \ models.iCac802.reactions.R0095.gene_reaction_rule.replace(" AND ", " and ") # make sbml3 output deterministic by sorting genes for m in models: m.genes.sort() # ## Ensure all ID's are SBML compliant # In[35]: for m in models: cobra.manipulation.escape_ID(m) # ## Export Models # ### SBML 3 # Export the models to the use the fbc version 2 (draft RC6) extension to SBML level 3 version 1. # In[36]: for model in models: cobra.io.write_sbml_model(model, "sbml3/%s.xml" % model.id) # ### mat # Save all the models into a single mat file. In addition to the usual fields in the "mat" struct, we will also include S_num and S_denom, which are the numerator and denominator of the stoichiometric coefficients encoded as rational numbers. # In[37]: def convert_to_rational(value): return sympy.Rational("%.15g" % value) def construct_S_num_denom(model): """convert model to two S matrices they encode the numerator and denominator of stoichiometric coefficients encoded as rational numbers """ # intialize to 0 dimensions = (len(model.metabolites), len(model.reactions)) S_num = scipy.sparse.lil_matrix(dimensions) S_denom = scipy.sparse.lil_matrix(dimensions) # populate with stoichiometry for i, r in enumerate(model.reactions): for met, value in r._metabolites.iteritems(): rational_value = convert_to_rational(value) num, denom = (rational_value.p, rational_value.q) S_num[model.metabolites.index(met), i] = num S_denom[model.metabolites.index(met), i] = denom return S_num, S_denom # In[38]: all_model_dict = {} for model in models: model_dict = cobra.io.mat.create_mat_dict(model) model_dict["S_num"], model_dict["S_denom"] = construct_S_num_denom(model) all_model_dict[model.id] = model_dict scipy.io.savemat("all_models.mat", all_model_dict, oned_as="column")