This notebook walks through a use case of getting a property (in this case, a material's structure in CIF format) for tens of thousands of materials in the MP database for offline analysis. Two helpful ideas presented here are to (1) chunk your API requests and (2) save the IDs for materials you have so that incremental updates to your offline collection can be done efficiently.
It is assumed that your MP API key is set in the "MAPI_KEY" environment variable. If it is not, pass your API key as a string argument to the MPRester() call in the first code cell below.
import json

try:
    # Python 2 name.
    from itertools import izip_longest
except ImportError:
    # Python 3 renamed it to zip_longest; keep the old name bound so the
    # rest of the notebook works on either version.
    from itertools import zip_longest as izip_longest
# An optional utility to display a progress bar
# for long-running loops. `pip install tqdm`.
from tqdm import tqdm
from pymatgen.ext.matproj import MPRester
# Open a client for the Materials Project API. With no argument the API key
# is expected to come from the environment (see the note at the top).
mpr = MPRester()

# Criteria: contains the element O and has two or more elements in total.
# Only the material IDs are requested here; the structures come later.
oxide_criteria = {"elements": "O", "nelements": {"$gte": 2}}
entries = mpr.query(oxide_criteria, ["material_id"])
oxide_mp_ids = [entry['material_id'] for entry in entries]

# Bare expression so the notebook displays how many IDs were found.
len(oxide_mp_ids)
38371
# A utility function to "chunk" our queries
def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks or blocks.

    Example: grouper('ABCDEFG', 3, 'x') --> ABC DEF Gxx

    Args:
        iterable: Any iterable of items to split up.
        n: Number of items per chunk.
        fillvalue: Value used to pad the final chunk when the number of
            items is not a multiple of ``n``.

    Returns:
        An iterator of ``n``-tuples. The last tuple is padded with
        ``fillvalue`` if the input runs out early.
    """
    # The helper is called izip_longest on Python 2 and zip_longest on
    # Python 3; resolve it here so the function works on both.
    try:
        from itertools import zip_longest
    except ImportError:
        from itertools import izip_longest as zip_longest

    # n references to the *same* iterator: each output tuple pulls n
    # consecutive items from the input.
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)
# Download the CIFs in chunks of 1000 material IDs per request instead of
# one request for everything.
data = []
mpid_groups = list(grouper(oxide_mp_ids, 1000))
for group in tqdm(mpid_groups):
    # The last group may have fewer than 1000 actual ids; grouper pads it
    # with `None`, so drop the padding. Build a real list: on Python 3
    # `filter()` returns a lazy iterator rather than a list, which is not
    # a usable value for the "$in" criteria.
    mpid_list = [mpid for mpid in group if mpid is not None]
    entries = mpr.query({"material_id": {"$in": mpid_list}}, ["material_id", "cif"])
    data.extend(entries)
100%|██████████| 39/39 [00:20<00:00, 1.97it/s]
import os

# Make sure the output directory is there before writing into it.
cif_dir_missing = not os.path.exists('mp_oxide_cifs')
if cif_dir_missing:
    os.mkdir('mp_oxide_cifs')

# One file per material, named after its material ID.
for record in tqdm(data):
    cif_path = "mp_oxide_cifs/{}.cif".format(record["material_id"])
    with open(cif_path, 'w') as cif_file:
        cif_file.write(record["cif"])
100%|██████████| 38371/38371 [00:03<00:00, 12502.72it/s]
# Record which material IDs have been downloaded so a later run can
# fetch only what is missing.
with open('oxide_mp_ids.json', 'w') as ids_file:
    json.dump(oxide_mp_ids, ids_file)
# Do fun things with pymatgen, e.g. read one of the saved CIF files back
# in as a Structure object.
from pymatgen.core import Structure

example_cif = 'mp_oxide_cifs/mp-190.cif'
# Bare expression so the notebook displays the loaded structure.
Structure.from_file(example_cif)
Structure Summary Lattice abc : 3.7931997599999998 3.7931997599999998 3.7931997599999998 angles : 90.0 90.0 90.0 volume : 54.577940461944955 A : 3.7931997599999998 0.0 2.3226649723052543e-16 B : -2.3226649723052543e-16 3.7931997599999998 2.3226649723052543e-16 C : 0.0 0.0 3.7931997599999998 PeriodicSite: Re (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000] PeriodicSite: O (1.8966, 0.0000, 0.0000) [0.5000, 0.0000, 0.0000] PeriodicSite: O (0.0000, 0.0000, 1.8966) [0.0000, 0.0000, 0.5000] PeriodicSite: O (-0.0000, 1.8966, 0.0000) [0.0000, 0.5000, 0.0000]
import os

# Re-run the original search to get the material IDs currently available.
entries = mpr.query({"elements": "O", "nelements": {"$gte": 2}}, ["material_id"])
oxide_mp_ids = [e['material_id'] for e in entries]

# Figure out what you don't have: IDs from the fresh search that are not
# in the list saved by the previous run.
with open('oxide_mp_ids.json', 'r') as f:
    old_mp_ids = json.load(f)
new_mp_ids = list(set(oxide_mp_ids) - set(old_mp_ids))

# Fetch only the missing structures, chunked like the initial download so
# a large backlog does not become one oversized request.
new_data = []
for group in grouper(new_mp_ids, 1000):
    # Drop the `None` padding grouper adds to the last chunk.
    mpid_list = [mpid for mpid in group if mpid is not None]
    new_data.extend(
        mpr.query({"material_id": {"$in": mpid_list}}, ["material_id", "cif"])
    )

# Write the new CIFs *before* updating the ID list, so the list never
# names a structure that is not actually on disk.
if not os.path.isdir('mp_oxide_cifs'):
    os.mkdir('mp_oxide_cifs')
for d in new_data:
    with open("mp_oxide_cifs/{}.cif".format(d["material_id"]), 'w') as f:
        f.write(d["cif"])

# Record the current ID list for the next incremental update.
with open('oxide_mp_ids.json', 'w') as f:
    json.dump(oxide_mp_ids, f)