import gzip
import os
import re
import pandas as pd
import numpy as np
from anndata import AnnData
from scipy.io import mmread
from scipy.sparse import coo_matrix
from utils import download_binary_file
# Gene program lists obtained by cross-referencing the heatmap here
# https://github.com/thomasmaxwellnorman/Perturbseq_GI/blob/master/GI_optimal_umap.ipynb
# with Figure 2b in Norman 2019
G1_CYCLE = [
"CDKN1C+CDKN1B",
"CDKN1B+ctrl",
"CDKN1B+CDKN1A",
"CDKN1C+ctrl",
"ctrl+CDKN1A",
"CDKN1C+CDKN1A",
"CDKN1A+ctrl",
]
ERYTHROID = [
"BPGM+SAMD1",
"ATL1+ctrl",
"UBASH3B+ZBTB25",
"PTPN12+PTPN9",
"PTPN12+UBASH3A",
"CBL+CNN1",
"UBASH3B+CNN1",
"CBL+UBASH3B",
"UBASH3B+PTPN9",
"PTPN1+ctrl",
"CBL+PTPN9",
"CNN1+UBASH3A",
"CBL+PTPN12",
"PTPN12+ZBTB25",
"UBASH3B+PTPN12",
"SAMD1+PTPN12",
"SAMD1+UBASH3B",
"UBASH3B+UBASH3A",
]
PIONEER_FACTORS = [
"ZBTB10+SNAI1",
"FOXL2+MEIS1",
"POU3F2+CBFA2T3",
"DUSP9+SNAI1",
"FOXA3+FOXA1",
"FOXA3+ctrl",
"LYL1+IER5L",
"FOXA1+FOXF1",
"FOXF1+HOXB9",
"FOXA1+HOXB9",
"FOXA3+HOXB9",
"FOXA3+FOXA1",
"FOXA3+FOXL2",
"POU3F2+FOXL2",
"FOXF1+FOXL2",
"FOXA1+FOXL2",
"HOXA13+ctrl",
"ctrl+HOXC13",
"HOXC13+ctrl",
"MIDN+ctrl",
"TP73+ctrl",
]
GRANULOCYTE_APOPTOSIS = [
"SPI1+ctrl",
"ctrl+SPI1",
"ctrl+CEBPB",
"CEBPB+ctrl",
"JUN+CEBPA",
"CEBPB+CEBPA",
"FOSB+CEBPE",
"ZC3HAV1+CEBPA",
"KLF1+CEBPA",
"ctrl+CEBPA",
"CEBPA+ctrl",
"CEBPE+CEBPA",
"CEBPE+SPI1",
"CEBPE+ctrl",
"ctrl+CEBPE",
"CEBPE+RUNX1T1",
"CEBPE+CEBPB",
"FOSB+CEBPB",
"ETS2+CEBPE",
]
MEGAKARYOCYTE = [
"ctrl+ETS2",
"MAPK1+ctrl",
"ctrl+MAPK1",
"ETS2+MAPK1",
"CEBPB+MAPK1",
"MAPK1+TGFBR2",
]
PRO_GROWTH = [
"CEBPE+KLF1",
"KLF1+MAP2K6",
"AHR+KLF1",
"ctrl+KLF1",
"KLF1+ctrl",
"KLF1+BAK1",
"KLF1+TGFBR2",
]
def download_norman_2019(output_path: str) -> None:
"""
Download Norman et al. 2019 data and metadata files from the hosting URLs.
Args:
----
output_path: Output path to store the downloaded and unzipped
directories.
Returns
-------
None. File directories are downloaded to output_path.
"""
file_urls = (
"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl"
"/GSE133344_filtered_matrix.mtx.gz",
"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl"
"/GSE133344_filtered_genes.tsv.gz",
"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl"
"/GSE133344_filtered_barcodes.tsv.gz",
"https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl"
"/GSE133344_filtered_cell_identities.csv.gz",
)
for url in file_urls:
output_filename = os.path.join(output_path, url.split("/")[-1])
download_binary_file(url, output_filename)
def read_norman_2019(file_directory: str) -> coo_matrix:
"""
Read the expression data for Norman et al. 2019 in the given directory.
Args:
----
file_directory: Directory containing Norman et al. 2019 data.
Returns
-------
A sparse matrix containing single-cell gene expression count, with rows
representing genes and columns representing cells.
"""
with gzip.open(
os.path.join(file_directory, "GSE133344_filtered_matrix.mtx.gz"), "rb"
) as f:
matrix = mmread(f)
return matrix
download_path = "./norman2019/"
download_norman_2019(download_path)
matrix = read_norman_2019(download_path)
# List of cell barcodes. The barcodes in this list are stored in the same order
# as cells are in the count matrix.
cell_barcodes = pd.read_csv(
os.path.join(download_path, "GSE133344_filtered_barcodes.tsv.gz"),
sep="\t",
header=None,
names=["cell_barcode"],
)
# IDs/names of the gene features.
gene_list = pd.read_csv(
os.path.join(download_path, "GSE133344_filtered_genes.tsv.gz"),
sep="\t",
header=None,
names=["gene_id", "gene_name"],
)
# Dataframe where each row corresponds to a cell, and each column corresponds
# to a gene feature.
matrix = pd.DataFrame(
matrix.transpose().todense(),
columns=gene_list["gene_id"],
index=cell_barcodes["cell_barcode"],
dtype="int32",
)
# Dataframe mapping cell barcodes to metadata about that cell (e.g. which CRISPR
# guides were applied to that cell). Unfortunately, this list has a different
# ordering from the count matrix, so we have to be careful combining the metadata
# and count data.
cell_identities = pd.read_csv(
os.path.join(download_path, "GSE133344_filtered_cell_identities.csv.gz")
).set_index("cell_barcode")
# This merge call reorders our metadata dataframe to match the ordering in the
# count matrix. Some cells in `cell_barcodes` do not have metadata associated with
# them, and their metadata values will be filled in as NaN.
aligned_metadata = pd.merge(
cell_barcodes,
cell_identities,
left_on="cell_barcode",
right_index=True,
how="left",
).set_index("cell_barcode")
adata = AnnData(matrix)
adata.obs = aligned_metadata
# Filter out any cells that don't have metadata values.
rows_without_nans = [
index for index, row in adata.obs.iterrows() if not row.isnull().any()
]
adata = adata[rows_without_nans, :]
# Remove these as suggested by the authors. See lines referring to
# NegCtrl1_NegCtrl0 in GI_generate_populations.ipynb in the Norman 2019 paper's
# Github repo https://github.com/thomasmaxwellnorman/Perturbseq_GI/
adata = adata[adata.obs["guide_identity"] != "NegCtrl1_NegCtrl0__NegCtrl1_NegCtrl0"]
# We create a new metadata column with cleaner representations of CRISPR guide
# identities. The original format is <Guide1>_<Guide2>__<Guide1>_<Guide2>_<number>
adata.obs["guide_merged"] = adata.obs["guide_identity"]
control_regex = re.compile(r"NegCtrl(.*)_NegCtrl(.*)+NegCtrl(.*)_NegCtrl(.*)")
for i in adata.obs["guide_merged"].unique():
if control_regex.match(i):
# For any cells that only had control guides, we don't care about the
# specific IDs of the guides. Here we relabel them just as "ctrl".
adata.obs["guide_merged"].replace(i, "ctrl", inplace=True)
else:
# Otherwise, we reformat the guide label to be <Guide1>+<Guide2>. If Guide1
# or Guide2 was a control, we replace it with "ctrl".
split = i.split("__")[0]
split = split.split("_")
for j, string in enumerate(split):
if "NegCtrl" in split[j]:
split[j] = "ctrl"
adata.obs["guide_merged"].replace(i, f"{split[0]}+{split[1]}", inplace=True)
guides_to_programs = {}
guides_to_programs.update(dict.fromkeys(G1_CYCLE, "G1 cell cycle arrest"))
guides_to_programs.update(dict.fromkeys(ERYTHROID, "Erythroid"))
guides_to_programs.update(dict.fromkeys(PIONEER_FACTORS, "Pioneer factors"))
guides_to_programs.update(
dict.fromkeys(GRANULOCYTE_APOPTOSIS, "Granulocyte/apoptosis")
)
guides_to_programs.update(dict.fromkeys(PRO_GROWTH, "Pro-growth"))
guides_to_programs.update(dict.fromkeys(MEGAKARYOCYTE, "Megakaryocyte"))
guides_to_programs.update(dict.fromkeys(["ctrl"], "Ctrl"))
adata.obs["gene_program"] = [guides_to_programs[x] if x in guides_to_programs else "N/A" for x in adata.obs["guide_merged"]]
adata.obs["good_coverage"] = adata.obs["good_coverage"].astype(bool)
Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_matrix.mtx.gz at ./GSE133344_filtered_matrix.mtx.gz Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_genes.tsv.gz at ./GSE133344_filtered_genes.tsv.gz Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_barcodes.tsv.gz at ./GSE133344_filtered_barcodes.tsv.gz Downloaded data from https://ftp.ncbi.nlm.nih.gov/geo/series/GSE133nnn/GSE133344/suppl/GSE133344_filtered_cell_identities.csv.gz at ./GSE133344_filtered_cell_identities.csv.gz
Trying to set attribute `.obs` of view, copying.
adata.write('Norman_2019_raw.h5ad')