Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add script and process for generating low-coverage bed file #13

Merged
merged 9 commits into from
Sep 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*~
.nextflow*
work
test_input
Expand Down
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@ flowchart TD
fastp -- trimmed_reads --> tbprofiler
tbprofiler -- vcf --> snpit
tbprofiler -- bam --> qualimap_bamqc
tbprofiler -- bam --> mpileup
mpileup -- depths --> plot_coverage
mpileup -- depths --> generate_low_coverage_bed
```

## Usage
Expand Down Expand Up @@ -40,8 +43,10 @@ The following files will be produced for each sample:
.
└── sample-01
├── sample-01_TIMESTAMP_provenance.yml
├── sample-01_coverage_plot.png
├── sample-01_fastp.csv
├── sample-01_fastp.json
├── sample-01_low_coverage_regions.bed
├── sample-01_qualimap_alignment_qc.csv
├── sample-01_snpit.tsv
├── sample-01_tbprofiler.bam
Expand All @@ -50,6 +55,7 @@ The following files will be produced for each sample:
├── sample-01_tbprofiler_full_report.json
├── sample-01_tbprofiler_lineage.csv
├── sample-01_tbprofiler_resistance.csv
├── sample-01_tbprofiler_resistance_mutations.csv
├── sample-01_tbprofiler_summary.csv
├── sample-01_tbprofiler_targets.vcf
└── sample-01_tbprofiler_whole_genome.vcf
Expand Down
55,146 changes: 55,146 additions & 0 deletions assets/NC_000962.3.fa

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions assets/resistance_genes.bed
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
Chromosome 5240 7267 gyrB 0 +
Chromosome 7302 9818 gyrA 0 +
Chromosome 490783 491793 fgd1 0 +
Chromosome 759807 763325 rpoB 0 +
Chromosome 763370 767320 rpoC 0 +
Chromosome 778990 779487 mmpR5 0 +
Chromosome 781560 781934 rpsL 0 +
Chromosome 800809 801462 rplC 0 +
Chromosome 1416181 1417347 embR 0 -
Chromosome 1471846 1473382 rrs 0 +
Chromosome 1473658 1476795 rrl 0 +
Chromosome 1673440 1674183 fabG1 0 +
Chromosome 1674202 1675011 inhA 0 +
Chromosome 1833542 1834987 rpsA 0 +
Chromosome 1917940 1918746 tlyA 0 +
Chromosome 2153889 2156111 katG 0 -
Chromosome 2288681 2289241 pncA 0 -
Chromosome 2518115 2519365 kasA 0 +
Chromosome 2714124 2715332 eis 0 -
Chromosome 2726193 2726780 ahpC 0 +
Chromosome 2746135 2747598 folC 0 -
Chromosome 2986839 2987615 ribD 0 +
Chromosome 3067193 3067945 thyX 0 -
Chromosome 3073680 3074471 thyA 0 -
Chromosome 3086820 3087935 ald 0 +
Chromosome 3640543 3641538 fbiA 0 +
Chromosome 3840194 3841420 alr 0 -
Chromosome 3986844 3987299 ddn 0 +
Chromosome 4043862 4044281 panD 0 -
Chromosome 4239863 4243147 embC 0 +
Chromosome 4243233 4246517 embA 0 +
Chromosome 4246514 4249810 embB 0 +
Chromosome 4326004 4327473 ethA 0 -
Chromosome 4327549 4328199 ethR 0 +
Chromosome 4407528 4408202 gid 0 -
73,527 changes: 73,527 additions & 0 deletions assets/tbdb_genome.fa

Large diffs are not rendered by default.

35 changes: 35 additions & 0 deletions bin/generate_low_coverage_bed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

import argparse
import csv
import sys

def main(args):
writer = csv.DictWriter(sys.stdout, fieldnames=['chrom', 'start', 'end'], delimiter='\t')
current_bed_record = None
with open(args.input, 'r') as f:
header = f.readline()
for line in f:
line_split = line.strip().split('\t')
chrom = line_split[0]
pos = int(line_split[1]) - 1
depth = int(line_split[3])
if depth < args.threshold:
if current_bed_record is None:
current_bed_record = {
'chrom': chrom,
'start': pos,
}
else:
if current_bed_record is not None:
current_bed_record['end'] = pos
writer.writerow(current_bed_record)
current_bed_record = None


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Summarize depth of coverage')
parser.add_argument('-i', '--input', help='Input file', required=True)
parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for depth of coverage')
args = parser.parse_args()
main(args)
165 changes: 165 additions & 0 deletions bin/plot_coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
#!/usr/bin/env python3

import argparse
import csv

from pathlib import Path
from typing import Optional

import matplotlib
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import seaborn as sns

from matplotlib.patches import Rectangle

def parse_bed(bed_path: Path) -> dict:
"""
Parse BED file

:param bed: BED file
:type bed: Path
:return: Dictionary of genes by name
:rtype: dict
"""
genes_by_name = {}
with open(bed_path, 'r') as f:
for line in f:
line_split = line.strip().split('\t')
chrom = line_split[0]
start = int(line_split[1])
end = int(line_split[2])
name = line_split[3]
strand = line_split[5]
genes_by_name[name] = {
'name': name,
'chrom': chrom,
'start': start,
'end': end,
'strand': strand,
}

return genes_by_name


def plot_coverage(depths: pd.DataFrame, resistance_genes: dict={}, sample_name: str="Sample", threshold: int=10, rolling_window: int=100, y_limit: int=500, log_scale: bool=False) -> matplotlib.figure.Figure:
"""
Plot coverage

:param depths: DataFrame of depths (columns: chrom, pos, depth)
:type depths: pandas.DataFrame
:param resistance_genes: Dictionary of resistance genes by name
:type resistance_genes: dict
:param sample_name: Sample name
:type sample_name: str
:param threshold: Threshold for depth of coverage
:type threshold: int
:param rolling_window: Rolling window for coverage
:type rolling_window: int
:param log_scale: Log scale y-axis (default: False)
:type log_scale: bool
:return: matplotlib figure
:rtype: matplotlib.figure.Figure
"""
percent_coverage_above_threshold = (
sum(1 if x > threshold else 0 for x in depths.depth)
/ depths.shape[0]
* 100
)

if resistance_genes:
coverage_plot, (ax_depth, ax_genes) = plt.subplots(2, gridspec_kw={'height_ratios': [10, 1]}, figsize=(64, 8), sharex=True)
depth_plot = sns.lineplot(data=depths, x="pos", y="depth", linewidth=0.5, ax=ax_depth)
threshold_line = ax_depth.axhline(y=threshold, color="red", linestyle="--", linewidth=0.5)
ax_depth.set_ylabel("Depth of coverage")
ax_depth.set_title(f"Percent bases with coverage above {threshold}X: {percent_coverage_above_threshold: .1f}% | Rolling window: {rolling_window} nt")
coverage_plot.suptitle(f"Ref: {depths.iloc[0].chrom} | Sample: {sample_name}")
if log_scale:
ax_depth.set_yscale('log')
ax_depth.set_ylim(1, y_limit)
ax_depth.set_ylabel("Depth of coverage (log scale)")

prev_gene_end = 0
prev_gene_name_vertical_padding = 0
for gene_name, gene in resistance_genes.items():
print(gene)
if gene['strand'] == '+':
gene_color = 'blue'
gene_vertical_padding = 3
else:
gene_color = 'red'
gene_vertical_padding = 2
gene_rectangle = Rectangle((gene['start'], gene_vertical_padding), 1000, 1, color=gene_color, alpha=0.5)
rx, ry = gene_rectangle.get_xy()
cx = rx + gene_rectangle.get_width() / 2.0
cy = ry + gene_rectangle.get_height() / 2.0
ax_genes.add_patch(gene_rectangle)
if gene['start'] - prev_gene_end < 10000:
if gene['strand'] == '+':
gene_name_vertical_padding = prev_gene_name_vertical_padding + 1
else:
gene_name_vertical_padding = prev_gene_name_vertical_padding - 1
else:
if gene['strand'] == '+':
gene_name_vertical_padding = 1
else:
gene_name_vertical_padding = -1
ax_genes.annotate(gene_name, (cx, cy + gene_name_vertical_padding), color='black', fontsize=6, ha='center', va='center')
prev_gene_end = gene['start'] + 1000
prev_gene_name_vertical_padding = gene_name_vertical_padding

ax_genes.set_ylim(top=6, bottom=0)
ax_genes.yaxis.set_visible(False)
ax_genes.xaxis.set_visible(False)
else:
coverage_plot = plt.figure(figsize=(64, 8))
line_plot = sns.lineplot(data=depths, x="pos", y="depth", linewidth=0.5)
plt.axhline(y=threshold, color="red", linestyle="--", linewidth=0.5)
plt.ylabel("Depth of coverage")
if log_scale:
plt.yscale("log")
plt.ylim(1, y_limit)
plt.ylabel("Depth of coverage (log scale)")
else:
plt.ylim(0, y_limit)
plt.ylabel("Depth of coverage")
plt.title(
f"Percent bases with coverage above {threshold}X: {percent_coverage_above_threshold: .1f}% | Rolling window: {rolling_window} nt"
)
plt.suptitle(f"Ref: {depths.iloc[0].chrom} | Sample: {sample_name}")

# line_plot.set_xticks(range(0, depths.shape[0], 100000), minor=True)

plt.close()

return coverage_plot


def main(args):

all_depths = pd.read_csv(args.input, sep="\t")
rolling_window_depths = all_depths.assign(
depth=lambda x: x.depth.rolling(args.rolling_window, center=True).mean()
)

resistance_genes = {}
if args.genes is not None:
resistance_genes = parse_bed(args.genes)

coverage_plot = plot_coverage(rolling_window_depths, resistance_genes, args.sample_id, args.threshold, args.rolling_window, args.y_limit, args.log_scale)
coverage_plot.savefig(f"{args.sample_id}_coverage_plot.svg")
coverage_plot.savefig(f"{args.sample_id}_coverage_plot.png")


if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Plot coverage')
parser.add_argument('-i', '--input', help='Input file', required=True)
parser.add_argument('-g', '--genes', type=Path, help='Resistance genes (bed file)')
parser.add_argument('-s', '--sample-id', default="Sample", help='Sample ID')
parser.add_argument('-t', '--threshold', type=int, default=10, help='Threshold for depth of coverage')
parser.add_argument('-r', '--rolling-window', type=int, default=100, help='Rolling window for coverage')
parser.add_argument('--log-scale', action='store_true', help='Log scale y-axis')
parser.add_argument('--y-limit', type=int, default=500, help='Maximum y-axis value in plot (default: 500)')
args = parser.parse_args()
main(args)
7 changes: 7 additions & 0 deletions environments/seaborn.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
name: seaborn
channels:
- conda-forge
- bioconda
- defaults
dependencies:
- seaborn=0.12.2
42 changes: 28 additions & 14 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,20 @@ import java.time.LocalDateTime

nextflow.enable.dsl = 2

include { fastp } from './modules/tbprofiler.nf'
include { tbprofiler } from './modules/tbprofiler.nf'
include { rename_ref_in_alignment } from './modules/tbprofiler.nf'
include { fastp } from './modules/tbprofiler.nf'
include { tbprofiler } from './modules/tbprofiler.nf'
include { snpit } from './modules/tbprofiler.nf'
include { rename_ref_in_alignment } from './modules/tbprofiler.nf'
include { rename_ref_in_variants as rename_ref_in_targets_variants } from './modules/tbprofiler.nf'
include { rename_ref_in_variants as rename_ref_in_whole_genome_variants } from './modules/tbprofiler.nf'
include { qualimap_bamqc } from './modules/tbprofiler.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'
include { qualimap_bamqc } from './modules/tbprofiler.nf'
include { mpileup } from './modules/tbprofiler.nf'
include { plot_coverage } from './modules/tbprofiler.nf'
include { generate_low_coverage_bed } from './modules/tbprofiler.nf'
include { pipeline_provenance } from './modules/provenance.nf'
include { collect_provenance } from './modules/provenance.nf'


include { snpit } from './modules/tbprofiler.nf'

workflow {

Expand All @@ -29,25 +33,35 @@ workflow {
ch_fastq = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] }
}

ch_resistance_genes_bed = Channel.fromPath("${baseDir}/assets/resistance_genes.bed")

main:
fastp(ch_fastq)

tbprofiler(fastp.out.reads)

if (params.rename_ref) {
ch_ref = Channel.fromPath("${baseDir}/assets/NC_000962.3.fa")
rename_ref_in_alignment(tbprofiler.out.alignment)
rename_ref_in_targets_variants(tbprofiler.out.targets_vcf)
rename_ref_in_whole_genome_variants(tbprofiler.out.whole_genome_vcf)
qualimap_bamqc(rename_ref_in_alignment.out)
ch_alignment = rename_ref_in_alignment.out
ch_whole_genome_variants = rename_ref_in_whole_genome_variants.out
} else {
qualimap_bamqc(tbprofiler.out.alignment)
ch_ref = Channel.fromPath("${baseDir}/assets/tbdb_genome.fa")
ch_alignment = tbprofiler.out.alignment
ch_whole_genome_variants = tbprofiler.out.whole_genome_vcf
}

if (params.rename_ref) {
snpit(rename_ref_in_whole_genome_variants.out)
} else {
snpit(tbprofiler.out.whole_genome_vcf)
}
snpit(ch_whole_genome_variants)

qualimap_bamqc(ch_alignment)

ch_depths = mpileup(ch_alignment.combine(ch_ref))

plot_coverage(ch_depths.combine(ch_resistance_genes_bed))

generate_low_coverage_bed(ch_depths)

ch_provenance = fastp.out.provenance
ch_provenance = ch_provenance.join(tbprofiler.out.provenance).map{ it -> [it[0], [it[1], it[2]]] }
Expand Down
Loading