Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
jkobject committed Apr 15, 2024
2 parents 1528186 + 428c3e8 commit f78e547
Showing 1 changed file with 0 additions and 206 deletions.
206 changes: 0 additions & 206 deletions scdataloader/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,209 +692,3 @@ def additional_postprocess(adata):
# to query N next time points we just get the N elements below and check they are in the group
# to query the N nearest neighbors we just get the N elements above and N below and check they are in the group
return adata


# NOTE: everything between the triple quotes below is a bare string literal
# that is not assigned to anything, so none of it runs. It keeps a disabled
# one-off snippet that annotates an `adata` object and writes it out in blocks:
#   1. lookup tables (`sexr`, `tissuer`, `ager`, `celltyper`) from the dataset's
#      own labels to ontology ids / cell type names;
#   2. `adata.obs` ontology columns filled via `.replace(...)` with those tables
#      or with constant strings;
#   3. a loop over `num_blocks` row slices that keeps rows with
#      `Gene_count > 400`, keeps only genes present in `genesdf.index`, pads the
#      genes missing from the block with an empty sparse matrix, records them in
#      `block.uns["unseen_genes"]`, copies the `genesdf` columns into
#      `block.var` (dropping "stable_id", "created_at", "updated_at" if present)
#      and writes 'zhang2024_adata_<i>.h5ad'.
# NOTE(review): the dict comprehension right after `genesdf = ...` is never
# assigned, so `celltyper` still holds names (not ontology ids) when it is
# passed to `.replace` -- presumably it was meant to be assigned back; confirm
# before re-enabling.
# NOTE(review): `adata`, `num_blocks` and `block_size` are not defined inside
# the snippet, and the loop body carries no indentation as stored here, so it
# would need repair before it could be executed.
"""
sexr = {
"Male": "PATO:0000384",
"Female": "PATO:0000383",
}
tissuer = {
"Kidney": "UBERON:0002113",
"Lung": "UBERON:0002048",
"Heart": "UBERON:0000948",
"Liver": "UBERON:0002107",
"Brain": "UBERON:0000955",
"BAT": "UBERON:0001348",
"Jejunum": "UBERON:0002115",
"Colon": "UBERON:0001155",
"Ileum": "UBERON:0002116",
"Stomach": "UBERON:0000945",
"gWAT": "UBERON:0001347",
"Duodenum": "UBERON:0002114",
"iWAT": "UBERON:0001347",
"Muscle": "UBERON:0001630",
}
ager = {
"03_months": "MmusDv:0000063",
"16_months": "MmusDv:0000087",
"06_months": "MmusDv:0000077",
"23_months": "MmusDv:0000127",
"12_months": "MmusDv:0000083",
"21_months": "MmusDv:0000125",
}
celltyper = {
"Proximal tubule cells": "epithelial cell of proximal tubule",
"Vascular endothelial cells": "endothelial cell of vascular tree",
"Intestinal epithelial cells": "intestinal epithelial cell",
"Hepatocytes": "hepatocyte",
"Fibroblasts": "fibroblast",
"Lymphoid cells_T cells": "T cell",
"Myeloid cells": "myeloid cell",
"Brown adipocytes": "brown fat cell",
"Lymphoid cells_B cells": "B cell",
"Adipocytes": "fat cell",
"Type II alveolar epithelial cells": "type II pneumocyte",
"Colonic epithelial cells": "colon epithelial cell",
"Mural cells": "mural cell",
"Cerebellum granule neurons": "cerebellar neuron",
"Goblet cells": "goblet cell",
"Vascular endothelial cells_General capillary cells": "endothelial cell of vascular tree",
"Ventricular cardiomyocytes": "regular ventricular cardiac myocyte",
"Type II myonuclei": "type II muscle cell",
"Thick ascending limb of LOH cells": "vasa recta ascending limb cell",
"Gastric mucous cells": "mucous cell of stomach",
"Distal convoluted tubule cells": "kidney distal convoluted tubule epithelial cell",
"Adipoce stem and progenitor cells": "hepatic oval stem cell",
"Chief cells": "chief cell of parathyroid gland",
"Paneth cells": "paneth cell",
"Myeloid cells_Alveolar macrophages": "alveolar macrophage",
"Lymphoid cells_Plasma cells": "plasma cell",
"Secretory cells": "secretory cell",
"Lymphoid cells_Resting B cells": "B cell",
"Cortical projection neurons 1": "corticothalamic-projecting glutamatergic cortical neuron",
"Endocardial endothelial cells": "endocardial cell",
"Type I alveolar epithelial cells": "type I pneumocyte",
"Interbrain and midbrain neurons 1": "midbrain dopaminergic neuron",
"Interbrain and midbrain neurons 2": "midbrain dopaminergic neuron",
"Myeloid cells_Monocytes": "monocyte",
"Myeloid cells_Dendritic cells": "myeloid dendritic cell",
"Oligodendrocytes": "oligodendrocyte",
"Lymphatic endothelial cells": "endothelial cell of lymphatic vessel",
"Enteroendocrine cells": "enteroendocrine cell",
"Vascular endothelial cells_Aerocytes": "endothelial cell of vascular tree",
"Gastric epithelial cells": "epithelial cell of stomach",
"Fibro–adipogenic progenitors": "fibro/adipogenic progenitor cell",
"Parietal cells": "parietal cell",
"Astrocytes": "astrocyte",
"Connecting tubule cells": "kidney connecting tubule beta-intercalated cell",
"Hepatic stellate cells": "hepatic stellate cell",
"Striatal neurons 1": "striatum neuron",
"Mesothelial cells": "mesothelial cell",
"Lymphoid cells_Cycling B cells": "germinal center B cell",
"Type B intercalated cells": "renal beta-intercalated cell",
"Type A intercalated cells": "renal alpha-intercalated cell",
"Myeloid cells_Neutrophils": "neutrophil",
"Principal cells": "renal principal cell",
"Cortical projection neurons 2": "corticothalamic-projecting glutamatergic cortical neuron",
"Muc2-producing goblet cells": "intestine goblet cell",
"OB neurons 1": "olfactory bulb interneuron",
"Atrial cardiomyocytes": "regular atrial cardiac myocyte",
"Lymphoid cells": "leukocyte",
"Skeletal muscle cells": "cell of skeletal muscle",
"Neural cells": "neural cell",
"Cerebellum interneurons": "cerebellar neuron",
"Interneurons 1": "interneuron",
"Descending thin limb of LOH cells": "vasa recta descending limb cell",
"Tuft cells": "intestinal tuft cell",
"Oligodendrocyte progenitor cells": "oligodendrocyte precursor cell",
"Enteric glia": "enteroglial cell",
"Endothelial cells": "endothelial cell",
"Dentate gyrus neurons": "dentate gyrus neuron",
"Myeloid cells_Interstitial macrophages": "tissue-resident macrophage",
"Ciliated cells": "ciliated cell",
"Microglia": "microglial cell",
"Interneurons 2": "interneuron",
"Ncam1 positive cells": "parafollicular cell",
"Rdh16 positive cells": "unknown",
"Circulating hepatoblasts": "hepatoblast",
"Enteric neurons": "enteric neuron",
"Ascending thin limb of LOH cells": "vasa recta ascending limb cell",
"Mfge8 positive cells": "unknown",
"Cholangiocytes": "cholangiocyte",
"Podocytes": "podocyte",
"Muscle satellite cells": "skeletal muscle satellite cell",
"Purkinje neurons": "Purkinje cell",
"Juxtaglomerular cells": "juxtaglomerular complex cell",
"Ngf positive cells": "neurogliaform cell",
"Bergmann glia": "Bergmann glial cell",
"Megf11 positive cells": "unknown",
"Myotendinous junction myonuclei": "unknown",
"Vascular leptomeningeal cells": "vascular leptomeningeal cell",
"Urothelial cells": "urothelial cell",
"Tenocytes": "tendon cell",
"Myelinating Schwann cells": "myelinating Schwann cell",
"Epididymal cells": "epididymis glandular cell",
"Muc6-producing goblet cells": "lung goblet cell",
"Type I myonuclei": "type I muscle cell",
"OB neurons 2": "olfactory bulb interneuron",
"Sis positive cells": "unknown",
"Lgr5 positive cells": "unknown",
"Macula densa cells": "macula densa epithelial cell",
"Choroid plexus epithelial cells": "choroid plexus epithelial cell",
"Cortical projection neurons 3": "corticothalamic-projecting glutamatergic cortical neuron",
"Interstitial cells of Cajal": "interstitial cell of Cajal",
"Cacna1b positive cells": "unknown",
"Hindbrain neurons 2": "neuron",
"Myeloid cells_Basophils": "basophil",
"Ependymal cells": "ependymal cell",
"Muc5ac-producing goblet cells": "lung goblet cell",
"Myeloid cells_Mast cells": "mast cell",
"Pulmonary neuroendocrine cells": "lung neuroendocrine cell",
"Basal cells": "basal cell",
"OB neurons 3": "olfactory bulb interneuron",
"Non-myelinating Schwann cells": "non-myelinating Schwann cell",
"Asic2 positive cells": "unknown",
"Striatal neurons 2": "striatum neuron",
"Erythroblasts": "erythroblast",
"Hindbrain neurons 1": "neuron",
"Neuromuscular junction myonuclei": "unknown",
"Habenula neurons": "unknown",
"Pituitary cells": "pituitary gland cell",
"Unipolar brush cells": "unipolar brush cell",
"Pde4c positive cells": "unknown",
"Pancreatic acinar cells": "pancreatic acinar cell",
"Inferior olivary nucleus neurons": "bushy cell",
"Colec10 positive cells": "unknown",
"Fcgbp positive cells": "unknown",
"Fut9 positive cells": "unknown",
"Mirg positive cells": "unknown",
"Alox15 positive cells": "unknown",
"Osteoblasts": "osteoblast",
}
genesdf = utils.load_genes("NCBITaxon:10090")
{k: v if v =="unknown" else bt.CellType.filter(name=v).one().ontology_id for k, v in celltyper.items()}
adata.obs["organism_ontology_term_id"] = "NCBITaxon:10090"
adata.obs["tissue_ontology_term_id"] = adata.obs["Organ_name"].replace(tissuer)
adata.obs["cell_type_ontology_term_id"] = adata.obs["Main_cell_type"].replace(
celltyper
)
adata.obs["disease_ontology_term_id"] = "PATO:0000461"
adata.obs["assay_ontology_term_id"] = "unknown"
adata.obs["self_reported_ethnicity_ontology_term_id"] = "unknown"
adata.obs["development_stage_ontology_term_id"] = adata.obs["Age_group"].replace(
ager
)
adata.obs["sex_ontology_term_id"] = adata.obs["Gender"].replace(sexr)
for i in range(num_blocks):
start_index = i * block_size
end_index = min((i + 1) * block_size, len(adata))
block = adata[start_index:end_index].to_memory()
# process block here
block = block[(block.obs["Gene_count"] > 400)]
intersect_genes = set(block.var.index).intersection(set(genesdf.index))
print(f"Removed {len(block.var.index) - len(intersect_genes)} genes.")
block = block[:, list(intersect_genes)]
# marking unseen genes
unseen = set(genesdf.index) - set(block.var.index)
# adding them to adata
emptyda = ad.AnnData(
csr_matrix((block.shape[0], len(unseen)), dtype=np.float32),
var=pd.DataFrame(index=list(unseen)),
obs=pd.DataFrame(index=block.obs.index),
)
block = ad.concat([block, emptyda], axis=1, join="outer", merge="only")
# do a validation function
block.uns["unseen_genes"] = list(unseen)
block = block[:, block.var.sort_index().index]
block.var[genesdf.columns] = genesdf.loc[block.var.index]
for name in ["stable_id", "created_at", "updated_at"]:
if name in block.var.columns:
block.var = block.var.drop(columns=name)
block.write_h5ad('zhang2024_adata_'+str(i)+".h5ad")
"""

0 comments on commit f78e547

Please sign in to comment.