From b585cf604089cf1e3ea6a1ba23d62ac7b7222311 Mon Sep 17 00:00:00 2001 From: jkobject Date: Sat, 30 Dec 2023 14:31:46 +0100 Subject: [PATCH] finish cleanup --- docs/dataset.md | 4 + ...s.sync-conflict-20231230-045206-2FEYXUZ.md | 9 -- ...s.sync-conflict-20231230-045226-2FEYXUZ.md | 10 -- mkdocs.yml | 13 +- scdataloader/utils.py | 114 ++++++++++++++---- 5 files changed, 103 insertions(+), 47 deletions(-) create mode 100644 docs/dataset.md delete mode 100644 docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md delete mode 100644 docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md diff --git a/docs/dataset.md b/docs/dataset.md new file mode 100644 index 0000000..ae37cb4 --- /dev/null +++ b/docs/dataset.md @@ -0,0 +1,4 @@ +# Documentation for `Dataset` + +::: scdataloader.data.Dataset + handler: python \ No newline at end of file diff --git a/docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md b/docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md deleted file mode 100644 index b3bb5a2..0000000 --- a/docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md +++ /dev/null @@ -1,9 +0,0 @@ -# Documentation for `Preprocessor` - -::: scdataloader.preprocess.Preprocessor - handler: python - options: - show_root_heading: true - show_source: true - summary: true - merge_init_into_class: true \ No newline at end of file diff --git a/docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md b/docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md deleted file mode 100644 index 6e2000a..0000000 --- a/docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md +++ /dev/null @@ -1,10 +0,0 @@ -# Documentation for `Preprocessor` - -::: scdataloader.preprocess.Preprocessor - handler: python - options: - show_root_heading: true - show_source: true - summary: true - merge_init_into_class: true - show_signature: false \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index a4c42db..1bcca38 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -6,11 +6,13 @@ theme: 
site_url: https://www.jkobject.com/scdataloader/ nav: - Home: index.md - - download and preprocess: notebooks/1_download_and_preprocess.ipynb - - use the dataloader: notebooks/2_create_dataloader.ipynb - - Dataset: Dataset.md - - preprocess: preprocess.md - - utils: utils.md + - Example notebooks: + - download and preprocess: notebooks/1_download_and_preprocess.ipynb + - use the dataloader: notebooks/2_create_dataloader.ipynb + - documentation: + - dataset: dataset.md + - preprocess: preprocess.md + - utils: utils.md plugins: - search - mkdocstrings: @@ -23,6 +25,7 @@ plugins: summary: true merge_init_into_class: true show_signature: false + do_heading: true default_handler: python - git-revision-date-localized - git-authors diff --git a/scdataloader/utils.py b/scdataloader/utils.py index 5e34a7f..e353d15 100644 --- a/scdataloader/utils.py +++ b/scdataloader/utils.py @@ -18,15 +18,15 @@ def getBiomartTable( """generate a genelist dataframe from ensembl's biomart Args: - ensemble_server (str, optional): [description]. Defaults to ENSEMBL_SERVER_V. - useCache (bool, optional): [description]. Defaults to False. - cache_folder (str, optional): [description]. Defaults to CACHE_PATH. + ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart". + useCache (bool, optional): whether to use the cache or not. Defaults to False. + cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/". 
Raises: ValueError: should be a dataframe (when the result from the server is something else) Returns: - [type]: [description] + pd.DataFrame: the dataframe """ attr = ( [ @@ -65,29 +65,26 @@ def getBiomartTable( def validate(adata, lb, organism): """ - validate _summary_ + validate checks if the adata object is valid for lamindb Args: - adata (_type_): _description_ - lb (_type_): _description_ - organism (_type_): _description_ + adata (anndata): the anndata object + lb (lamindb): the lamindb instance + organism (str): the organism Raises: - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ - ValueError: _description_ + ValueError: if the adata object is not valid + ValueError: if the anndata contains invalid ethnicity ontology term id according to the lb instance + ValueError: if the anndata contains invalid organism ontology term id according to the lb instance + ValueError: if the anndata contains invalid sex ontology term id according to the lb instance + ValueError: if the anndata contains invalid disease ontology term id according to the lb instance + ValueError: if the anndata contains invalid cell_type ontology term id according to the lb instance + ValueError: if the anndata contains invalid development_stage ontology term id according to the lb instance + ValueError: if the anndata contains invalid tissue ontology term id according to the lb instance + ValueError: if the anndata contains invalid assay ontology term id according to the lb instance Returns: - _type_: _description_ + bool: True if the adata object is valid """ organism = lb.Organism.filter(ontology_id=organism).one().name lb.settings.organism = organism @@ -167,6 +164,16 @@ def get_all_ancestors(val, df): def 
get_ancestry_mapping(all_elem, onto_df): + """ + This function generates a mapping of all elements to their ancestors in the ontology dataframe. + + Args: + all_elem (list): A list of all elements. + onto_df (DataFrame): The ontology dataframe. + + Returns: + dict: A dictionary mapping each element to its ancestors. + """ ancestors = {} full_ancestors = set() for val in all_elem: @@ -193,6 +200,21 @@ def get_ancestry_mapping(all_elem, onto_df): def load_dataset_local( lb, remote_dataset, download_folder, name, description, use_cache=True, only=None ): + """ + This function loads a remote lamindb dataset to local. + + Args: + lb (lamindb): The lamindb instance. + remote_dataset (lamindb.Dataset): The remote Dataset. + download_folder (str): The path to the download folder. + name (str): The name of the dataset. + description (str): The description of the dataset. + use_cache (bool, optional): Whether to use cache. Defaults to True. + only (list, optional): A list of indices to specify which files to download. Defaults to None. + + Returns: + lamindb.Dataset: The local dataset. + """ saved_files = [] default_storage = ln.Storage.filter(root=ln.settings.storage.as_posix()).one() files = ( @@ -250,8 +272,22 @@ def populate_my_ontology( erase everything with lb.$ontology.filter().delete() - add whatever value you need afterward like it is done here with lb.$ontology(name="ddd", ontology_id="ddddd").save() - # df["assay_ontology_term_id"].unique() + add whatever value you need afterward like it is done here with: + + `lb.$ontology(name="ddd", ontology_id="ddddd").save()` + + `df["assay_ontology_term_id"].unique()` + + Args: + lb (lamindb): lamindb instance. + organisms (list, optional): List of organisms. Defaults to ["NCBITaxon:10090", "NCBITaxon:9606"]. + sex (list, optional): List of sexes. Defaults to ["PATO:0000384", "PATO:0000383"]. + celltypes (list, optional): List of cell types. Defaults to []. + ethnicities (list, optional): List of ethnicities. Defaults to []. 
+ assays (list, optional): List of assays. Defaults to []. + tissues (list, optional): List of tissues. Defaults to []. + diseases (list, optional): List of diseases. Defaults to []. + dev_stages (list, optional): List of developmental stages. Defaults to []. """ names = bt.CellType().df().index if not celltypes else celltypes @@ -322,6 +358,17 @@ def populate_my_ontology( def is_outlier(adata, metric: str, nmads: int): + """ + is_outlier detects outliers in adata.obs[metric] + + Args: + adata (anndata): the anndata object + metric (str): the metric column to use + nmads (int): the number of median absolute deviations to use as a threshold + + Returns: + pd.Series: a boolean series indicating whether a cell is an outlier or not + """ M = adata.obs[metric] outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | ( np.median(M) + nmads * median_abs_deviation(M) < M @@ -330,11 +377,32 @@ def is_outlier(adata, metric: str, nmads: int): def length_normalize(adata, gene_lengths): + """ + length_normalize normalizes the counts by the gene length + + Args: + adata (anndata): the anndata object + gene_lengths (list): the gene lengths + + Returns: + anndata: the anndata object + """ adata.X = csr_matrix((adata.X.T / gene_lengths).T) return adata def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs): + """ + pd_load_cached downloads a file from a url and loads it as a pandas dataframe + + Args: + url (str): the url to download the file from + loc (str, optional): the location to save the file to. Defaults to "/tmp/". + cache (bool, optional): whether to use the cached file or not. Defaults to True. + + Returns: + pd.DataFrame: the dataframe + """ # Check if the file exists, if not, download it loc += url.split("/")[-1] if not os.path.isfile(loc) or not cache: