From 05b69ab04dbd93bccbac307b7b4ff4f06c9ea2d2 Mon Sep 17 00:00:00 2001
From: Cornelius Roemer
Date: Mon, 3 Apr 2023 20:30:22 +0200
Subject: [PATCH] Create and upload partitioned datasets by year-month, clade, continent

---
Review note: metadata_by_continent now turns hyphens in the continent
wildcard into spaces before filtering, and each partition rule carries a
short comment. The blob ids on the "index" lines predate these changes.

 Snakefile                              |  2 +
 workflow/snakemake_rules/partition.smk | 63 ++++++++++++++++++++++++++
 workflow/snakemake_rules/upload.smk    | 26 +++++++++++
 3 files changed, 91 insertions(+)
 create mode 100644 workflow/snakemake_rules/partition.smk

diff --git a/Snakefile b/Snakefile
index 7ca9f6f0..e2b5c90c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -55,6 +55,8 @@ include: "workflow/snakemake_rules/curate.smk"
 
 include: "workflow/snakemake_rules/nextclade.smk"
 
+include: "workflow/snakemake_rules/partition.smk"
+
 if send_notifications and config.get("s3_src"):
     include: "workflow/snakemake_rules/slack_notifications.smk"
 
diff --git a/workflow/snakemake_rules/partition.smk b/workflow/snakemake_rules/partition.smk
new file mode 100644
index 00000000..d8f0f530
--- /dev/null
+++ b/workflow/snakemake_rules/partition.smk
@@ -0,0 +1,63 @@
+"""
+Creates partitioned datasets:
+- by year_month
+- by clade
+- by continent
+"""
+
+
+# Keep the rows whose `date` field contains "<year>-<month>".
+rule metadata_by_year_month:
+    input:
+        "data/{database}/metadata.tsv",
+    output:
+        "data/{database}/metadata_year-month_{year}-{month}.tsv",
+    shell:
+        """
+        tsv-filter -H --istr-in-fld "date:{wildcards.year}-{wildcards.month}" {input} > {output}
+        """
+
+
+# Keep the rows whose `Nextstrain_clade` field contains the clade name
+# (case-insensitive substring match).
+rule metadata_by_clade:
+    input:
+        "data/{database}/metadata.tsv",
+    output:
+        "data/{database}/metadata_clade_{clade}.tsv",
+    shell:
+        """
+        tsv-filter -H --istr-in-fld "Nextstrain_clade:{wildcards.clade}" {input} > {output}
+        """
+
+
+# Keep the rows whose `region` field equals the requested continent
+# (case-insensitive). Continent wildcards are hyphenated ("north-america"),
+# so hyphens are turned into spaces before matching.
+# NOTE(review): assumes `region` values use spaces ("North America") - confirm.
+rule metadata_by_continent:
+    input:
+        "data/{database}/metadata.tsv",
+    output:
+        "data/{database}/metadata_region_{continent}.tsv",
+    params:
+        region=lambda wildcards: wildcards.continent.replace("-", " "),
+    shell:
+        """
+        tsv-filter -H --istr-eq "region:{params.region}" {input} > {output}
+        """
+
+# Select the sequences named in the `strain` column of a partitioned
+# metadata file.
+rule sequences_by_metadata:
+    input:
+        sequences="data/{database}/sequences.fasta",
+        metadata="data/{database}/metadata_{partition}.tsv",
+    output:
+        sequences="data/{database}/sequences_{partition}.fasta",
+        strains=temp("data/{database}/strains_{partition}.txt"),
+    shell:
+        """
+        tsv-select -H -f strain {input.metadata} > {output.strains}
+        seqkit grep -f {output.strains} {input.sequences} > {output.sequences}
+        """
diff --git a/workflow/snakemake_rules/upload.smk b/workflow/snakemake_rules/upload.smk
index 5be9ee9c..20d42371 100644
--- a/workflow/snakemake_rules/upload.smk
+++ b/workflow/snakemake_rules/upload.smk
@@ -12,6 +12,8 @@ These output files are empty flag files to force Snakemake to run the upload rul
 Note: we are doing parallel uploads of zstd compressed files to slowly make the transition to this format.
 """
 
+import datetime
+
 def compute_files_to_upload():
     """
     Compute files to upload
@@ -33,6 +35,30 @@ def compute_files_to_upload():
         "aligned.fasta.zst": f"data/{database}/aligned.fasta",
         "nextclade_21L.tsv.zst": f"data/{database}/nextclade_21L.tsv",
     }
+
+    now = datetime.datetime.now()
+    months_since_2020_01 = {f"{year}-{month:02d}" for year in range(2020, now.year+1) for month in range(1, 12+1) if year < now.year or month <= now.month}
+    regions={"europe", "north-america", "south-america", "asia", "africa", "oceania"}
+
+    max_per_year = {"19": "B", "20":"K", "21":"M", "22":"F","23":"A"}
+    clades = set()
+    for year, max_letter in max_per_year.items():
+        for letter in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
+            if letter > max_letter:
+                break
+            clades.add(f"{year}{letter}")
+
+    for clade in clades:
+        files_to_upload[f"metadata_{clade}.tsv.zst"] = f"data/{database}/metadata_clade_{clade}.tsv"
+        files_to_upload[f"sequences_{clade}.fasta.zst"] = f"data/{database}/sequences_clade_{clade}.fasta"
+
+    for region in regions:
+        files_to_upload[f"metadata_{region}.tsv.zst"] = f"data/{database}/metadata_region_{region}.tsv"
+        files_to_upload[f"sequences_{region}.fasta.zst"] = f"data/{database}/sequences_region_{region}.fasta"
+
+    for year_month in months_since_2020_01:
+        files_to_upload[f"metadata_{year_month}.tsv.zst"] = f"data/{database}/metadata_year-month_{year_month}.tsv"
+        files_to_upload[f"sequences_{year_month}.fasta.zst"] = f"data/{database}/sequences_year-month_{year_month}.fasta"
 
     if database=="genbank":
         files_to_upload["biosample.tsv.gz"] = f"data/{database}/biosample.tsv"