From 05b69ab04dbd93bccbac307b7b4ff4f06c9ea2d2 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer <cornelius.roemer@gmail.com>
Date: Mon, 3 Apr 2023 20:30:22 +0200
Subject: [PATCH] Create and upload partitioned datasets by year-month, clade,
 continent

---
 Snakefile                              |  2 +
 workflow/snakemake_rules/partition.smk | 52 ++++++++++++++++++++++++++
 workflow/snakemake_rules/upload.smk    | 26 +++++++++++++
 3 files changed, 80 insertions(+)
 create mode 100644 workflow/snakemake_rules/partition.smk

diff --git a/Snakefile b/Snakefile
index 7ca9f6f0..e2b5c90c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -55,6 +55,8 @@ include: "workflow/snakemake_rules/curate.smk"
 
 include: "workflow/snakemake_rules/nextclade.smk"
 
+include: "workflow/snakemake_rules/partition.smk"
+
 if send_notifications and config.get("s3_src"):
     include: "workflow/snakemake_rules/slack_notifications.smk"
 
diff --git a/workflow/snakemake_rules/partition.smk b/workflow/snakemake_rules/partition.smk
new file mode 100644
index 00000000..d8f0f530
--- /dev/null
+++ b/workflow/snakemake_rules/partition.smk
@@ -0,0 +1,52 @@
+"""
+Creates partitioned datasets (filtered metadata plus the matching sequences):
+- by year_month: `date` contains "<year>-<month>" (case-insensitive substring)
+- by clade: `Nextstrain_clade` contains the clade (case-insensitive substring)
+- by continent: `region` equals the continent (case-insensitive, "-" as " ")
+"""
+
+rule metadata_by_year_month:
+    input:
+        "data/{database}/metadata.tsv",
+    output:
+        "data/{database}/metadata_year-month_{year}-{month}.tsv",
+    shell:
+        """
+        tsv-filter -H --istr-in-fld "date:{wildcards.year}-{wildcards.month}" {input} > {output}
+        """
+
+rule metadata_by_clade:
+    input:
+        "data/{database}/metadata.tsv",
+    output:
+        "data/{database}/metadata_clade_{clade}.tsv",
+    shell:
+        """
+        tsv-filter -H --istr-in-fld "Nextstrain_clade:{wildcards.clade}" {input} > {output}
+        """
+
+rule metadata_by_continent:
+    input:
+        "data/{database}/metadata.tsv",
+    output:
+        "data/{database}/metadata_region_{continent}.tsv",
+    params:
+        # Requested names are hyphenated ("north-america"); region values are assumed to use spaces.
+        region=lambda wildcards: wildcards.continent.replace("-", " "),
+    shell:
+        """
+        tsv-filter -H --istr-eq "region:{params.region}" {input} > {output}
+        """
+
+rule sequences_by_metadata:
+    input:
+        sequences="data/{database}/sequences.fasta",
+        metadata="data/{database}/metadata_{partition}.tsv",
+    output:
+        sequences="data/{database}/sequences_{partition}.fasta",
+        strains=temp("data/{database}/strains_{partition}.txt"),
+    shell:
+        """
+        tsv-select -H -f strain {input.metadata} > {output.strains}
+        seqkit grep -f {output.strains} {input.sequences} > {output.sequences}
+        """
diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk
index 5be9ee9c..20d42371 100644
--- a/workflow/snakemake_rules/upload.smk
+++ b/workflow/snakemake_rules/upload.smk
@@ -12,6 +12,8 @@ These output files are empty flag files to force Snakemake to run the upload rul
 Note: we are doing parallel uploads of zstd compressed files to slowly make the transition to this format.
 """
 
+import datetime
+
 def compute_files_to_upload():
     """
     Compute files to upload
@@ -33,6 +35,30 @@ def compute_files_to_upload():
                         "aligned.fasta.zst":           f"data/{database}/aligned.fasta",
                         "nextclade_21L.tsv.zst":       f"data/{database}/nextclade_21L.tsv",
                     }
+    
+    now = datetime.datetime.now()
+    months_since_2020_01 = {f"{year}-{month:02d}" for year in range(2020, now.year+1) for month in range(1, 12+1) if year < now.year or month <= now.month}
+    regions={"europe", "north-america", "south-america", "asia", "africa", "oceania"}
+
+    max_per_year = {"19": "B", "20":"K", "21":"M", "22":"F","23":"A"}
+    clades = set()
+    for year, max_letter in max_per_year.items():
+            for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+                if letter > max_letter:
+                    break
+                clades.add(f"{year}{letter}")
+    
+    for clade in clades:
+        files_to_upload[f"metadata_{clade}.tsv.zst"] = f"data/{database}/metadata_clade_{clade}.tsv"
+        files_to_upload[f"sequences_{clade}.fasta.zst"] = f"data/{database}/sequences_clade_{clade}.fasta"
+
+    for region in regions:
+        files_to_upload[f"metadata_{region}.tsv.zst"] = f"data/{database}/metadata_region_{region}.tsv"
+        files_to_upload[f"sequences_{region}.fasta.zst"] = f"data/{database}/sequences_region_{region}.fasta"
+    
+    for year_month in months_since_2020_01:
+        files_to_upload[f"metadata_{year_month}.tsv.zst"] = f"data/{database}/metadata_year-month_{year_month}.tsv"
+        files_to_upload[f"sequences_{year_month}.fasta.zst"] = f"data/{database}/sequences_year-month_{year_month}.fasta"
 
     if database=="genbank":
         files_to_upload["biosample.tsv.gz"] =           f"data/{database}/biosample.tsv"