Skip to content

Commit

Permalink
finish cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
jkobject committed Dec 30, 2023
1 parent d9f0ca9 commit b585cf6
Show file tree
Hide file tree
Showing 5 changed files with 103 additions and 47 deletions.
4 changes: 4 additions & 0 deletions docs/dataset.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Documentation for `Dataset`

::: scdataloader.data.Dataset
handler: python
9 changes: 0 additions & 9 deletions docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md

This file was deleted.

10 changes: 0 additions & 10 deletions docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md

This file was deleted.

13 changes: 8 additions & 5 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ theme:
site_url: https://www.jkobject.com/scdataloader/
nav:
- Home: index.md
- download and preprocess: notebooks/1_download_and_preprocess.ipynb
- use the dataloader: notebooks/2_create_dataloader.ipynb
- Dataset: Dataset.md
- preprocess: preprocess.md
- utils: utils.md
- Example notebooks:
- download and preprocess: notebooks/1_download_and_preprocess.ipynb
- use the dataloader: notebooks/2_create_dataloader.ipynb
- documentation:
- dataset: dataset.md
- preprocess: preprocess.md
- utils: utils.md
plugins:
- search
- mkdocstrings:
Expand All @@ -23,6 +25,7 @@ plugins:
summary: true
merge_init_into_class: true
show_signature: false
do_heading: true
default_handler: python
- git-revision-date-localized
- git-authors
Expand Down
114 changes: 91 additions & 23 deletions scdataloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,15 @@ def getBiomartTable(
"""generate a genelist dataframe from ensembl's biomart
Args:
ensemble_server (str, optional): [description]. Defaults to ENSEMBL_SERVER_V.
useCache (bool, optional): [description]. Defaults to False.
cache_folder (str, optional): [description]. Defaults to CACHE_PATH.
ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart".
useCache (bool, optional): whether to use the cache or not. Defaults to False.
cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
Raises:
ValueError: should be a dataframe (when the result from the server is something else)
Returns:
[type]: [description]
pd.DataFrame: the dataframe
"""
attr = (
[
Expand Down Expand Up @@ -65,29 +65,26 @@ def getBiomartTable(

def validate(adata, lb, organism):
"""
validate _summary_
validate checks if the adata object is valid for lamindb
Args:
adata (_type_): _description_
lb (_type_): _description_
organism (_type_): _description_
adata (anndata): the anndata object
lb (lamindb): the lamindb instance
organism (str): the organism
Raises:
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: _description_
ValueError: if the adata object is not valid
ValueError: if the anndata contains invalid ethnicity ontology term id according to the lb instance
ValueError: if the anndata contains invalid organism ontology term id according to the lb instance
ValueError: if the anndata contains invalid sex ontology term id according to the lb instance
ValueError: if the anndata contains invalid disease ontology term id according to the lb instance
ValueError: if the anndata contains invalid cell_type ontology term id according to the lb instance
ValueError: if the anndata contains invalid development_stage ontology term id according to the lb instance
ValueError: if the anndata contains invalid tissue ontology term id according to the lb instance
ValueError: if the anndata contains invalid assay ontology term id according to the lb instance
Returns:
_type_: _description_
bool: True if the adata object is valid
"""
organism = lb.Organism.filter(ontology_id=organism).one().name
lb.settings.organism = organism
Expand Down Expand Up @@ -167,6 +164,16 @@ def get_all_ancestors(val, df):


def get_ancestry_mapping(all_elem, onto_df):
"""
This function generates a mapping of all elements to their ancestors in the ontology dataframe.
Args:
all_elem (list): A list of all elements.
onto_df (DataFrame): The ontology dataframe.
Returns:
dict: A dictionary mapping each element to its ancestors.
"""
ancestors = {}
full_ancestors = set()
for val in all_elem:
Expand All @@ -193,6 +200,21 @@ def get_ancestry_mapping(all_elem, onto_df):
def load_dataset_local(
lb, remote_dataset, download_folder, name, description, use_cache=True, only=None
):
"""
This function loads a remote lamindb dataset to local.
Args:
lb (lamindb): The lamindb instance.
remote_dataset (lamindb.Dataset): The remote Dataset.
download_folder (str): The path to the download folder.
name (str): The name of the dataset.
description (str): The description of the dataset.
use_cache (bool, optional): Whether to use cache. Defaults to True.
only (list, optional): A list of indices to specify which files to download. Defaults to None.
Returns:
lamindb.Dataset: The local dataset.
"""
saved_files = []
default_storage = ln.Storage.filter(root=ln.settings.storage.as_posix()).one()
files = (
Expand Down Expand Up @@ -250,8 +272,22 @@ def populate_my_ontology(
erase everything with lb.$ontology.filter().delete()
add whatever value you need afterward like it is done here with lb.$ontology(name="ddd", ontology_id="ddddd").save()
# df["assay_ontology_term_id"].unique()
add whatever value you need afterward like it is done here with:
`lb.$ontology(name="ddd", ontology_id="ddddd").save()`
`df["assay_ontology_term_id"].unique()`
Args:
lb (lamindb): lamindb instance.
organisms (list, optional): List of organisms. Defaults to ["NCBITaxon:10090", "NCBITaxon:9606"].
sex (list, optional): List of sexes. Defaults to ["PATO:0000384", "PATO:0000383"].
celltypes (list, optional): List of cell types. Defaults to [].
ethnicities (list, optional): List of ethnicities. Defaults to [].
assays (list, optional): List of assays. Defaults to [].
tissues (list, optional): List of tissues. Defaults to [].
diseases (list, optional): List of diseases. Defaults to [].
dev_stages (list, optional): List of developmental stages. Defaults to [].
"""

names = bt.CellType().df().index if not celltypes else celltypes
Expand Down Expand Up @@ -322,6 +358,17 @@ def populate_my_ontology(


def is_outlier(adata, metric: str, nmads: int):
"""
is_outlier detects outliers in adata.obs[metric]
Args:
adata (annData): the anndata object
metric (str): the metric column to use
nmads (int): the number of median absolute deviations to use as a threshold
Returns:
pd.Series: a boolean series indicating whether a cell is an outlier or not
"""
M = adata.obs[metric]
outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
np.median(M) + nmads * median_abs_deviation(M) < M
Expand All @@ -330,11 +377,32 @@ def is_outlier(adata, metric: str, nmads: int):


def length_normalize(adata, gene_lengths):
    """
    Normalize the count matrix of an AnnData object by gene length.

    Each entry of ``adata.X`` is divided by the corresponding value in
    ``gene_lengths`` (the division is applied on the transposed matrix,
    so ``gene_lengths`` must broadcast against ``adata.X.T``), and the
    result is stored back on ``adata.X`` as a sparse CSR matrix.

    Args:
        adata (anndata): the anndata object to normalize in place
        gene_lengths (list): the gene lengths to divide the counts by

    Returns:
        anndata: the same anndata object, with its ``.X`` replaced by the
            length-normalized sparse matrix
    """
    normalized_counts = (adata.X.T / gene_lengths).T
    adata.X = csr_matrix(normalized_counts)
    return adata


def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
"""
pd_load_cached downloads a file from a url and loads it as a pandas dataframe
Args:
url (str): the url to download the file from
loc (str, optional): the location to save the file to. Defaults to "/tmp/".
cache (bool, optional): whether to use the cached file or not. Defaults to True.
Returns:
pd.DataFrame: the dataframe
"""
# Check if the file exists, if not, download it
loc += url.split("/")[-1]
if not os.path.isfile(loc) or not cache:
Expand Down

0 comments on commit b585cf6

Please sign in to comment.