From b585cf604089cf1e3ea6a1ba23d62ac7b7222311 Mon Sep 17 00:00:00 2001
From: jkobject <jkobject@gmail.com>
Date: Sat, 30 Dec 2023 14:31:46 +0100
Subject: [PATCH] finish cleanup

---
 docs/dataset.md                               |   4 +
 ...s.sync-conflict-20231230-045206-2FEYXUZ.md |   9 --
 ...s.sync-conflict-20231230-045226-2FEYXUZ.md |  10 --
 mkdocs.yml                                    |  13 +-
 scdataloader/utils.py                         | 114 ++++++++++++++----
 5 files changed, 103 insertions(+), 47 deletions(-)
 create mode 100644 docs/dataset.md
 delete mode 100644 docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md
 delete mode 100644 docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md

diff --git a/docs/dataset.md b/docs/dataset.md
new file mode 100644
index 0000000..ae37cb4
--- /dev/null
+++ b/docs/dataset.md
@@ -0,0 +1,4 @@
+# Documentation for `Dataset`
+
+::: scdataloader.data.Dataset
+    handler: python
\ No newline at end of file
diff --git a/docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md b/docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md
deleted file mode 100644
index b3bb5a2..0000000
--- a/docs/preprocess.sync-conflict-20231230-045206-2FEYXUZ.md
+++ /dev/null
@@ -1,9 +0,0 @@
-# Documentation for `Preprocessor`
-
-::: scdataloader.preprocess.Preprocessor
-    handler: python
-    options:
-      show_root_heading: true
-      show_source: true
-      summary: true
-      merge_init_into_class: true
\ No newline at end of file
diff --git a/docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md b/docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md
deleted file mode 100644
index 6e2000a..0000000
--- a/docs/preprocess.sync-conflict-20231230-045226-2FEYXUZ.md
+++ /dev/null
@@ -1,10 +0,0 @@
-# Documentation for `Preprocessor`
-
-::: scdataloader.preprocess.Preprocessor
-    handler: python
-    options:
-      show_root_heading: true
-      show_source: true
-      summary: true
-      merge_init_into_class: true
-      show_signature: false
\ No newline at end of file
diff --git a/mkdocs.yml b/mkdocs.yml
index a4c42db..1bcca38 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -6,11 +6,13 @@ theme:
 site_url: https://www.jkobject.com/scdataloader/
 nav:
   - Home: index.md
-  - download and preprocess: notebooks/1_download_and_preprocess.ipynb
-  - use the dataloader: notebooks/2_create_dataloader.ipynb
-  - Dataset: Dataset.md
-  - preprocess: preprocess.md
-  - utils: utils.md
+  - Example notebooks:
+    - download and preprocess: notebooks/1_download_and_preprocess.ipynb
+    - use the dataloader: notebooks/2_create_dataloader.ipynb
+  - documentation:
+    - dataset: dataset.md
+    - preprocess: preprocess.md
+    - utils: utils.md
 plugins: 
   - search
   - mkdocstrings:
@@ -23,6 +25,7 @@ plugins:
             summary: true
             merge_init_into_class: true
             show_signature: false
+            do_heading: true
       default_handler: python
   - git-revision-date-localized
   - git-authors
diff --git a/scdataloader/utils.py b/scdataloader/utils.py
index 5e34a7f..e353d15 100644
--- a/scdataloader/utils.py
+++ b/scdataloader/utils.py
@@ -18,15 +18,15 @@ def getBiomartTable(
     """generate a genelist dataframe from ensembl's biomart
 
     Args:
-        ensemble_server (str, optional): [description]. Defaults to ENSEMBL_SERVER_V.
-        useCache (bool, optional): [description]. Defaults to False.
-        cache_folder (str, optional): [description]. Defaults to CACHE_PATH.
+        ensemble_server (str, optional): the biomart server. Defaults to "http://jul2023.archive.ensembl.org/biomart".
+        useCache (bool, optional): whether to use the cache or not. Defaults to False.
+        cache_folder (str, optional): the cache folder. Defaults to "/tmp/biomart/".
 
     Raises:
         ValueError: should be a dataframe (when the result from the server is something else)
 
     Returns:
-        [type]: [description]
+        pd.DataFrame: the genelist dataframe retrieved from biomart
     """
     attr = (
         [
@@ -65,29 +65,26 @@ def getBiomartTable(
 
 def validate(adata, lb, organism):
     """
-    validate _summary_
+    validate checks if the adata object is valid for lamindb
 
     Args:
-        adata (_type_): _description_
-        lb (_type_): _description_
-        organism (_type_): _description_
+        adata (anndata): the anndata object
+        lb (lamindb): the lamindb instance
+        organism (str): the organism
 
     Raises:
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
-        ValueError: _description_
+        ValueError: if the adata object is not valid
+        ValueError: if the anndata contains invalid ethnicity ontology term id according to the lb instance
+        ValueError: if the anndata contains invalid organism ontology term id according to the lb instance
+        ValueError: if the anndata contains invalid sex ontology term id according to the lb instance
+        ValueError: if the anndata contains invalid disease ontology term id according to the lb instance
+        ValueError: if the anndata contains invalid cell_type ontology term id according to the lb instance
+        ValueError: if the anndata contains invalid development_stage ontology term id according to the lb instance
+        ValueError: if the anndata contains invalid tissue ontology term id according to the lb instance
+        ValueError: if the anndata contains invalid assay ontology term id according to the lb instance
 
     Returns:
-        _type_: _description_
+        bool: True if the adata object is valid
     """
     organism = lb.Organism.filter(ontology_id=organism).one().name
     lb.settings.organism = organism
@@ -167,6 +164,16 @@ def get_all_ancestors(val, df):
 
 
 def get_ancestry_mapping(all_elem, onto_df):
+    """
+    This function generates a mapping of all elements to their ancestors in the ontology dataframe.
+
+    Args:
+        all_elem (list): A list of all elements.
+        onto_df (DataFrame): The ontology dataframe.
+
+    Returns:
+        dict: A dictionary mapping each element to its ancestors.
+    """
     ancestors = {}
     full_ancestors = set()
     for val in all_elem:
@@ -193,6 +200,21 @@ def get_ancestry_mapping(all_elem, onto_df):
 def load_dataset_local(
     lb, remote_dataset, download_folder, name, description, use_cache=True, only=None
 ):
+    """
+    This function loads a remote lamindb dataset to local.
+
+    Args:
+        lb (lamindb): The lamindb instance.
+        remote_dataset (lamindb.Dataset): The remote Dataset.
+        download_folder (str): The path to the download folder.
+        name (str): The name of the dataset.
+        description (str): The description of the dataset.
+        use_cache (bool, optional): Whether to use cache. Defaults to True.
+        only (list, optional): A list of indices to specify which files to download. Defaults to None.
+
+    Returns:
+        lamindb.Dataset: The local dataset.
+    """
     saved_files = []
     default_storage = ln.Storage.filter(root=ln.settings.storage.as_posix()).one()
     files = (
@@ -250,8 +272,22 @@ def populate_my_ontology(
 
     erase everything with lb.$ontology.filter().delete()
 
-    add whatever value you need afterward like it is done here with lb.$ontology(name="ddd", ontology_id="ddddd").save()
-    # df["assay_ontology_term_id"].unique()
+    add whatever value you need afterward like it is done here with:
+
+    `lb.$ontology(name="ddd", ontology_id="ddddd").save()`
+
+    `df["assay_ontology_term_id"].unique()`
+
+    Args:
+        lb (lamindb): lamindb instance.
+        organisms (list, optional): List of organisms. Defaults to ["NCBITaxon:10090", "NCBITaxon:9606"].
+        sex (list, optional): List of sexes. Defaults to ["PATO:0000384", "PATO:0000383"].
+        celltypes (list, optional): List of cell types. Defaults to [].
+        ethnicities (list, optional): List of ethnicities. Defaults to [].
+        assays (list, optional): List of assays. Defaults to [].
+        tissues (list, optional): List of tissues. Defaults to [].
+        diseases (list, optional): List of diseases. Defaults to [].
+        dev_stages (list, optional): List of developmental stages. Defaults to [].
     """
 
     names = bt.CellType().df().index if not celltypes else celltypes
@@ -322,6 +358,17 @@ def populate_my_ontology(
 
 
 def is_outlier(adata, metric: str, nmads: int):
+    """
+    is_outlier detects outliers in adata.obs[metric]
+
+    Args:
+        adata (anndata): the anndata object
+        metric (str): the metric column to use
+        nmads (int): the number of median absolute deviations to use as a threshold
+
+    Returns:
+        pd.Series: a boolean series indicating whether a cell is an outlier or not
+    """
     M = adata.obs[metric]
     outlier = (M < np.median(M) - nmads * median_abs_deviation(M)) | (
         np.median(M) + nmads * median_abs_deviation(M) < M
@@ -330,11 +377,32 @@ def is_outlier(adata, metric: str, nmads: int):
 
 
 def length_normalize(adata, gene_lengths):
+    """
+    length_normalize normalizes the counts by the gene length
+
+    Args:
+        adata (anndata): the anndata object
+        gene_lengths (list): the gene lengths
+
+    Returns:
+        anndata: the same anndata object, with X replaced by the length-normalized sparse matrix
+    """
     adata.X = csr_matrix((adata.X.T / gene_lengths).T)
     return adata
 
 
 def pd_load_cached(url, loc="/tmp/", cache=True, **kwargs):
+    """
+    pd_load_cached downloads a file from a url and loads it as a pandas dataframe
+
+    Args:
+        url (str): the url to download the file from
+        loc (str, optional): the location to save the file to. Defaults to "/tmp/".
+        cache (bool, optional): whether to use the cached file or not. Defaults to True.
+
+    Returns:
+        pd.DataFrame: the dataframe loaded from the downloaded (or cached) file
+    """
     # Check if the file exists, if not, download it
     loc += url.split("/")[-1]
     if not os.path.isfile(loc) or not cache: