-
Notifications
You must be signed in to change notification settings - Fork 0
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add reading alevin metadata (and refactor) #26
Changes from 13 commits
5fd0a58
79380eb
eafbf1b
06d8519
d129150
4f6feca
37acf26
661ab33
f043128
be0349b
0e287ec
615b7ef
f98eaaa
290a226
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -3,7 +3,7 @@ | |||||||||
#' @param quant_dir Path to directory where output files are located. | ||||||||||
#' @param intron_mode Logical indicating if the files included alignment to intronic regions. | ||||||||||
#' Default is FALSE. | ||||||||||
#' @param usa_mode Logical indicating if Alevin-fry was used, if the USA mode was invoked. | ||||||||||
#' @param usa_mode Logical indicating if Alevin-fry was used, if USA mode was invoked. | ||||||||||
#' Default is FALSE. | ||||||||||
#' @param which_counts Which type of counts should be included, | ||||||||||
#' only counts aligned to spliced cDNA ("spliced") or all spliced and unspliced cDNA ("unspliced"). | ||||||||||
|
@@ -13,6 +13,9 @@ | |||||||||
#' @return SingleCellExperiment of unfiltered gene x cell counts matrix. | ||||||||||
#' @export | ||||||||||
#' | ||||||||||
#' @import SingleCellExperiment | ||||||||||
#' @import SummarizedExperiment | ||||||||||
#' | ||||||||||
#' @examples | ||||||||||
#' \dontrun{ | ||||||||||
#' | ||||||||||
|
@@ -53,53 +56,41 @@ read_alevin <- function(quant_dir, | |||||||||
stop("Can only read counts using either usa mode or intron mode.") | ||||||||||
} | ||||||||||
|
||||||||||
if(usa_mode) { | ||||||||||
# read in counts using read_usa mode | ||||||||||
counts <- read_usa_mode(quant_dir, which_counts) | ||||||||||
|
||||||||||
} else { | ||||||||||
# use tximport for all non-usa mode | ||||||||||
alevin_files <- c("quants_mat_cols.txt", "quants_mat_rows.txt", "quants_mat.gz") | ||||||||||
|
||||||||||
# check that all files exist in quant directory | ||||||||||
if(!dir.exists(file.path(quant_dir, "alevin"))){ | ||||||||||
stop("Missing alevin directory with output files") | ||||||||||
} | ||||||||||
|
||||||||||
missing <- !file.exists(file.path(quant_dir, "alevin", alevin_files)) | ||||||||||
if(any(missing)) { | ||||||||||
missing_files <- paste(alevin_files[missing], collapse = ", ") | ||||||||||
stop(paste0("Missing Alevin output file(s): ", missing_files)) | ||||||||||
} | ||||||||||
|
||||||||||
if(!file.exists(file.path(quant_dir, "cmd_info.json"))){ | ||||||||||
stop("Missing cmd_info.json in Alevin output directory") | ||||||||||
} | ||||||||||
# check that the expected quant directory exists | ||||||||||
if(!dir.exists(file.path(quant_dir, "alevin"))){ | ||||||||||
stop("Missing alevin directory with output files") | ||||||||||
} | ||||||||||
|
||||||||||
txi <- suppressMessages(tximport::tximport(file.path(quant_dir, "alevin", "quants_mat.gz"), type = "alevin")) | ||||||||||
counts <- txi$counts | ||||||||||
# read metadata | ||||||||||
meta <- read_alevin_metadata(quant_dir) | ||||||||||
|
||||||||||
# collapse intron counts for intron_mode = TRUE | ||||||||||
if (intron_mode) { | ||||||||||
counts <- collapse_intron_counts(counts, which_counts) | ||||||||||
# Read the count data | ||||||||||
if(usa_mode) { | ||||||||||
if(meta$usa_mode != TRUE){ | ||||||||||
stop("Output files not in USA mode") | ||||||||||
} | ||||||||||
counts <- read_alevin_mtx(quant_dir) | ||||||||||
} else { | ||||||||||
counts <- read_tximport(quant_dir) | ||||||||||
} | ||||||||||
sce <- SingleCellExperiment(list(counts = counts)) | ||||||||||
if (intron_mode | usa_mode) { | ||||||||||
counts <- collapse_intron_counts(counts, which_counts) | ||||||||||
meta$transcript_type <- which_counts | ||||||||||
} | ||||||||||
|
||||||||||
# make the SCE object | ||||||||||
sce <- SingleCellExperiment(assays = list(counts = counts), | ||||||||||
metadata = meta) | ||||||||||
return(sce) | ||||||||||
} | ||||||||||
|
||||||||||
#' Read in counts data processed with Alevin-fry in USA mode | ||||||||||
#' Read in counts data processed with Alevin-fry in with mtx output. | ||||||||||
#' | ||||||||||
#' @param quant_dir Path to directory where output files are located. | ||||||||||
#' @param which_counts Which type of counts should be included, | ||||||||||
#' only counts aligned to spliced cDNA ("spliced") or all spliced and unspliced cDNA ("unspliced"). | ||||||||||
#' Default is "spliced". | ||||||||||
#' @param quant_dir Path to alevin output directory. | ||||||||||
#' | ||||||||||
#' @return unfiltered gene x cell counts matrix | ||||||||||
#' @return unfiltered and uncollapsed gene x cell counts matrix | ||||||||||
#' | ||||||||||
read_usa_mode <- function(quant_dir, | ||||||||||
which_counts = c("spliced", "unspliced")){ | ||||||||||
which_counts <- match.arg(which_counts) | ||||||||||
read_alevin_mtx <- function(quant_dir){ | ||||||||||
|
||||||||||
# check that all files exist in quant_dir | ||||||||||
alevin_files <- c("quants_mat_cols.txt", "quants_mat_rows.txt", "quants_mat.mtx") | ||||||||||
|
@@ -115,29 +106,101 @@ read_usa_mode <- function(quant_dir, | |||||||||
stop(paste0("Missing Alevin output file(s): ", missing_files)) | ||||||||||
} | ||||||||||
|
||||||||||
# read in .mtx files | ||||||||||
counts <- Matrix::readMM(file = file.path(quant_dir, "alevin", "quants_mat.mtx"))%>% | ||||||||||
Matrix::t() %>% | ||||||||||
as("dgCMatrix") | ||||||||||
cols <- readLines(file.path(quant_dir, "alevin", "quants_mat_cols.txt")) | ||||||||||
rows <- readLines(file.path(quant_dir, "alevin", "quants_mat_rows.txt")) | ||||||||||
dimnames(counts) <- list(cols, rows) | ||||||||||
return(counts) | ||||||||||
} | ||||||||||
|
||||||||||
#' Read in counts data processed with Alevin or alevin-fry in tximport-compatible formats | ||||||||||
#' | ||||||||||
#' @param quant_dir Path to alevin output directory. | ||||||||||
#' | ||||||||||
#' @return unfiltered & uncollapsed gene x cell counts matrix | ||||||||||
#' | ||||||||||
read_tximport <- function(quant_dir){ | ||||||||||
|
||||||||||
# check that all files exist in quant_dir | ||||||||||
# use tximport for all non-usa mode | ||||||||||
alevin_files <- c("quants_mat_cols.txt", "quants_mat_rows.txt", "quants_mat.gz") | ||||||||||
|
||||||||||
missing <- !file.exists(file.path(quant_dir, "alevin", alevin_files)) | ||||||||||
if(any(missing)) { | ||||||||||
missing_files <- paste(alevin_files[missing], collapse = ", ") | ||||||||||
stop(paste0("Missing Alevin output file(s): ", missing_files)) | ||||||||||
} | ||||||||||
|
||||||||||
txi <- suppressMessages(tximport::tximport( | ||||||||||
file.path(quant_dir, "alevin", "quants_mat.gz"), | ||||||||||
type = "alevin" | ||||||||||
)) | ||||||||||
counts <- txi$counts | ||||||||||
} | ||||||||||
|
||||||||||
#' Read alevin metadata from json files | ||||||||||
#' | ||||||||||
#' @param quant_dir Path alevin output directory. | ||||||||||
#' | ||||||||||
#' @return A list containing alevin run metadata, | ||||||||||
#' with NULL values for missing elements. | ||||||||||
#' | ||||||||||
#' @noRd | ||||||||||
read_alevin_metadata <- function(quant_dir){ | ||||||||||
cmd_info_path <- file.path(quant_dir, "cmd_info.json") | ||||||||||
permit_json_path <- file.path(quant_dir, "generate_permit_list.json") | ||||||||||
collate_json_path <- file.path(quant_dir, "collate.json") | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we even need this file? We don't need any of the metadata from it so unless we just want to check it exists for completeness of the run I think we can remove this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, I guess we never use it... |
||||||||||
quant_json_path <- file.path(quant_dir, "quant.json") | ||||||||||
if(!file.exists(quant_json_path)){ | ||||||||||
# file for alevin-fry < 0.4.1 | ||||||||||
quant_json_path <- file.path(quant_dir, "meta_info.json") | ||||||||||
if(!file.exists(quant_json_path)){ | ||||||||||
stop("Missing quant.json (or meta_info.json) in Alevin output directory") | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
# check that USA mode is true in JSON file | ||||||||||
quant_json <- jsonlite::fromJSON(quant_json_path) | ||||||||||
if(quant_json$usa_mode != "TRUE"){ | ||||||||||
stop("Output files not in USA mode") | ||||||||||
# get cmd_info, which should always be present | ||||||||||
if (file.exists(cmd_info_path)){ | ||||||||||
cmd_info <- jsonlite::read_json(cmd_info_path) | ||||||||||
} else { | ||||||||||
stop("cmd_info.json is missing") | ||||||||||
} | ||||||||||
|
||||||||||
# read in .mtx files | ||||||||||
mtx <- Matrix::readMM(file = file.path(quant_dir, "alevin", "quants_mat.mtx"))%>% | ||||||||||
Matrix::t() %>% | ||||||||||
as("dgCMatrix") | ||||||||||
cols <- readLines(file.path(quant_dir, "alevin", "quants_mat_cols.txt")) | ||||||||||
rows <- readLines(file.path(quant_dir, "alevin", "quants_mat_rows.txt")) | ||||||||||
dimnames(mtx) <- list(cols, rows) | ||||||||||
# Read other info files if they exist. Otherwise, create dummy values | ||||||||||
if (file.exists(permit_json_path)){ | ||||||||||
permit_info <- jsonlite::read_json(permit_json_path) | ||||||||||
} else { | ||||||||||
permit_info <- list() | ||||||||||
} | ||||||||||
if (file.exists(quant_json_path)){ | ||||||||||
collate_info <- jsonlite::read_json(collate_json_path) | ||||||||||
} else { | ||||||||||
collate_info <- list() | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Based on my comment above, we could remove this. |
||||||||||
} | ||||||||||
if (file.exists(quant_json_path)){ | ||||||||||
quant_info <- jsonlite::read_json(quant_json_path) | ||||||||||
} else { | ||||||||||
quant_info <- list() | ||||||||||
} | ||||||||||
|
||||||||||
counts <- collapse_intron_counts(mtx, which_counts) | ||||||||||
return(counts) | ||||||||||
# Create a metadata list | ||||||||||
meta <- list(salmon_version = cmd_info[['salmon_version']], | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
This technically has a colon at the end of only the most recent salmon versions. To account for the variation we can use the $ instead. I submitted an issue on Salmon's repo about this bug COMBINE-lab/salmon#691. |
||||||||||
reference_index = cmd_info[['index']]) | ||||||||||
|
||||||||||
# if we have permit_info data, we used alevin-fry, otherwise alevin | ||||||||||
if (length(permit_info) == 0){ | ||||||||||
meta$mapping_tool <- "alevin" | ||||||||||
} else { | ||||||||||
meta$mapping_tool <- "alevin-fry" | ||||||||||
} | ||||||||||
|
||||||||||
# Add other metadata | ||||||||||
# assume all alevin-fry tool versions are the same | ||||||||||
meta$alevinfry_version <- permit_info[['version_str']] | ||||||||||
meta$af_permit_type <- permit_info[['permit-list-type']] | ||||||||||
meta$af_resolution <- quant_info[['resolution_strategy']] | ||||||||||
meta$usa_mode <- quant_info[['usa_mode']] | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
If you like my earlier suggestion of including the transcript type in this function. |
||||||||||
|
||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Do we want to add a line to grab the tx2gene map information if we are using alevin? That will also help grab some of the version information since the names of those files have the ensembl version in them. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the idea of this, but for I think we probably want to explore other ways of passing along this information along the workflow... not sure the best way as of now. |
||||||||||
return(meta) | ||||||||||
} | ||||||||||
|
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
This file was deleted.
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -15,17 +15,23 @@ test_that("reading salmon alevin data works", { | |||||||||
# check that column names are barcodes | ||||||||||
col_barcode <- str_detect(colnames(sce), "^[ACGT]+$") | ||||||||||
expect_true(all(col_barcode)) | ||||||||||
expect_equal(sce@metadata$mapping_tool, "alevin") | ||||||||||
expect_null(sce@metadata$alevin_fry_version) | ||||||||||
}) | ||||||||||
|
||||||||||
test_that("reading alevin-fry USA mode works", { | ||||||||||
sce <- read_alevin(usa_dir, | ||||||||||
usa_mode = TRUE, | ||||||||||
which_counts = "spliced") | ||||||||||
which_counts = "spliced") | ||||||||||
expect_s4_class(sce, "SingleCellExperiment") | ||||||||||
expect_equal(dim(sce), sce_af_size) | ||||||||||
# check that column names are barcodes | ||||||||||
col_barcode <- str_detect(colnames(sce), "^[ACGT]+$") | ||||||||||
expect_true(all(col_barcode)) | ||||||||||
# check metadata | ||||||||||
expect_equal(sce@metadata$mapping_tool, "alevin-fry") | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
Probably a good idea to add checks that the salmon version and reference index aren't empty to ensure the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yup, good idea... made me realize the line after is wrong though: should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh, actually, I know why I wasn't testing the reference index: it would fail with some versions where that data wasn't reported properly... so null values there are possible. (this was also why I wasn't testing for salmon version probably. But that is there (if sometimes wrong) in all cases. |
||||||||||
expect_equal(sce@metadata$transcript_type, "spliced") | ||||||||||
|
||||||||||
# no remaining unspliced | ||||||||||
unmerged_genes <- str_subset(rownames(sce), "-[IUA]$") | ||||||||||
expect_length(unmerged_genes, 0) | ||||||||||
|
@@ -41,6 +47,10 @@ test_that("reading alevin-fry intron mode works", { | |||||||||
# check that column names are barcodes | ||||||||||
col_barcode <- stringr::str_detect(colnames(sce), "^[ACGT]+$") | ||||||||||
expect_true(all(col_barcode)) | ||||||||||
# check metadata | ||||||||||
expect_equal(sce@metadata$mapping_tool, "alevin-fry") | ||||||||||
expect_equal(sce@metadata$transcript_type, "unspliced") | ||||||||||
|
||||||||||
# no remaining unspliced | ||||||||||
unmerged_genes <- str_subset(rownames(sce), "-[IUA]$") | ||||||||||
expect_length(unmerged_genes, 0) | ||||||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we want to add transcript type during this part why don't we include it as an argument here and then append it later?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I thought about doing this (it was there in a pervious version) but I think my logic was that this function is (by its name) about reading metadata, so anything not read in could be added separately.