Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add reading alevin metadata (and refactor) #26

Merged
merged 14 commits into from
Jul 30, 2021
171 changes: 117 additions & 54 deletions R/read_alevin.R
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#' @param quant_dir Path to directory where output files are located.
#' @param intron_mode Logical indicating if the files included alignment to intronic regions.
#' Default is FALSE.
#' @param usa_mode Logical indicating if Alevin-fry was used, if the USA mode was invoked.
#' @param usa_mode Logical indicating if Alevin-fry was used, if USA mode was invoked.
#' Default is FALSE.
#' @param which_counts Which type of counts should be included,
#' only counts aligned to spliced cDNA ("spliced") or all spliced and unspliced cDNA ("unspliced").
Expand All @@ -13,6 +13,9 @@
#' @return SingleCellExperiment of unfiltered gene x cell counts matrix.
#' @export
#'
#' @import SingleCellExperiment
#' @import SummarizedExperiment
#'
#' @examples
#' \dontrun{
#'
Expand Down Expand Up @@ -53,53 +56,41 @@ read_alevin <- function(quant_dir,
stop("Can only read counts using either usa mode or intron mode.")
}

if(usa_mode) {
# read in counts using read_usa mode
counts <- read_usa_mode(quant_dir, which_counts)

} else {
# use tximport for all non-usa mode
alevin_files <- c("quants_mat_cols.txt", "quants_mat_rows.txt", "quants_mat.gz")

# check that all files exist in quant directory
if(!dir.exists(file.path(quant_dir, "alevin"))){
stop("Missing alevin directory with output files")
}

missing <- !file.exists(file.path(quant_dir, "alevin", alevin_files))
if(any(missing)) {
missing_files <- paste(alevin_files[missing], collapse = ", ")
stop(paste0("Missing Alevin output file(s): ", missing_files))
}

if(!file.exists(file.path(quant_dir, "cmd_info.json"))){
stop("Missing cmd_info.json in Alevin output directory")
}
# check that the expected quant directory exists
if(!dir.exists(file.path(quant_dir, "alevin"))){
stop("Missing alevin directory with output files")
}

txi <- suppressMessages(tximport::tximport(file.path(quant_dir, "alevin", "quants_mat.gz"), type = "alevin"))
counts <- txi$counts
# read metadata
meta <- read_alevin_metadata(quant_dir)

# collapse intron counts for intron_mode = TRUE
if (intron_mode) {
counts <- collapse_intron_counts(counts, which_counts)
# Read the count data
if(usa_mode) {
if(meta$usa_mode != TRUE){
stop("Output files not in USA mode")
}
counts <- read_alevin_mtx(quant_dir)
} else {
counts <- read_tximport(quant_dir)
}
sce <- SingleCellExperiment(list(counts = counts))
if (intron_mode | usa_mode) {
counts <- collapse_intron_counts(counts, which_counts)
meta$transcript_type <- which_counts
}

# make the SCE object
sce <- SingleCellExperiment(assays = list(counts = counts),
metadata = meta)
return(sce)
}

#' Read in counts data processed with Alevin-fry in USA mode
#' Read in counts data processed with Alevin-fry in with mtx output.
#'
#' @param quant_dir Path to directory where output files are located.
#' @param which_counts Which type of counts should be included,
#' only counts aligned to spliced cDNA ("spliced") or all spliced and unspliced cDNA ("unspliced").
#' Default is "spliced".
#' @param quant_dir Path to alevin output directory.
#'
#' @return unfiltered gene x cell counts matrix
#' @return unfiltered and uncollapsed gene x cell counts matrix
#'
read_usa_mode <- function(quant_dir,
which_counts = c("spliced", "unspliced")){
which_counts <- match.arg(which_counts)
read_alevin_mtx <- function(quant_dir){

# check that all files exist in quant_dir
alevin_files <- c("quants_mat_cols.txt", "quants_mat_rows.txt", "quants_mat.mtx")
Expand All @@ -115,29 +106,101 @@ read_usa_mode <- function(quant_dir,
stop(paste0("Missing Alevin output file(s): ", missing_files))
}

# read in .mtx files
counts <- Matrix::readMM(file = file.path(quant_dir, "alevin", "quants_mat.mtx"))%>%
Matrix::t() %>%
as("dgCMatrix")
cols <- readLines(file.path(quant_dir, "alevin", "quants_mat_cols.txt"))
rows <- readLines(file.path(quant_dir, "alevin", "quants_mat_rows.txt"))
dimnames(counts) <- list(cols, rows)
return(counts)
}

#' Read in counts data processed with Alevin or alevin-fry in tximport-compatible formats
#'
#' @param quant_dir Path to alevin output directory.
#'
#' @return unfiltered & uncollapsed gene x cell counts matrix
#'
read_tximport <- function(quant_dir){

# check that all files exist in quant_dir
# use tximport for all non-usa mode
alevin_files <- c("quants_mat_cols.txt", "quants_mat_rows.txt", "quants_mat.gz")

missing <- !file.exists(file.path(quant_dir, "alevin", alevin_files))
if(any(missing)) {
missing_files <- paste(alevin_files[missing], collapse = ", ")
stop(paste0("Missing Alevin output file(s): ", missing_files))
}

txi <- suppressMessages(tximport::tximport(
file.path(quant_dir, "alevin", "quants_mat.gz"),
type = "alevin"
))
counts <- txi$counts
}

#' Read alevin metadata from json files
#'
#' @param quant_dir Path alevin output directory.
#'
#' @return A list containing alevin run metadata,
#' with NULL values for missing elements.
#'
#' @noRd
read_alevin_metadata <- function(quant_dir){
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
read_alevin_metadata <- function(quant_dir){
read_alevin_metadata <- function(quant_dir, which_counts = c("spliced", "unspliced"){

If we want to add transcript type during this part why don't we include it as an argument here and then append it later?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought about doing this (it was there in a pervious version) but I think my logic was that this function is (by its name) about reading metadata, so anything not read in could be added separately.

cmd_info_path <- file.path(quant_dir, "cmd_info.json")
permit_json_path <- file.path(quant_dir, "generate_permit_list.json")
collate_json_path <- file.path(quant_dir, "collate.json")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we even need this file? We don't need any of the metadata from it so unless we just want to check it exists for completeness of the run I think we can remove this.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I guess we never use it...

quant_json_path <- file.path(quant_dir, "quant.json")
if(!file.exists(quant_json_path)){
# file for alevin-fry < 0.4.1
quant_json_path <- file.path(quant_dir, "meta_info.json")
if(!file.exists(quant_json_path)){
stop("Missing quant.json (or meta_info.json) in Alevin output directory")
}
}

# check that USA mode is true in JSON file
quant_json <- jsonlite::fromJSON(quant_json_path)
if(quant_json$usa_mode != "TRUE"){
stop("Output files not in USA mode")
# get cmd_info, which should always be present
if (file.exists(cmd_info_path)){
cmd_info <- jsonlite::read_json(cmd_info_path)
} else {
stop("cmd_info.json is missing")
}

# read in .mtx files
mtx <- Matrix::readMM(file = file.path(quant_dir, "alevin", "quants_mat.mtx"))%>%
Matrix::t() %>%
as("dgCMatrix")
cols <- readLines(file.path(quant_dir, "alevin", "quants_mat_cols.txt"))
rows <- readLines(file.path(quant_dir, "alevin", "quants_mat_rows.txt"))
dimnames(mtx) <- list(cols, rows)
# Read other info files if they exist. Otherwise, create dummy values
if (file.exists(permit_json_path)){
permit_info <- jsonlite::read_json(permit_json_path)
} else {
permit_info <- list()
}
if (file.exists(quant_json_path)){
collate_info <- jsonlite::read_json(collate_json_path)
} else {
collate_info <- list()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if (file.exists(quant_json_path)){
collate_info <- jsonlite::read_json(collate_json_path)
} else {
collate_info <- list()

Based on my comment above, we could remove this.

}
if (file.exists(quant_json_path)){
quant_info <- jsonlite::read_json(quant_json_path)
} else {
quant_info <- list()
}

counts <- collapse_intron_counts(mtx, which_counts)
return(counts)
# Create a metadata list
meta <- list(salmon_version = cmd_info[['salmon_version']],
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
meta <- list(salmon_version = cmd_info[['salmon_version']],
meta <- list(salmon_version = cmd_info$salmon_version,

This technically has a colon at the end of only the most recent salmon versions. To account for the variation we can use the $ instead. I submitted an issue on Salmon's repo about this bug COMBINE-lab/salmon#691.

reference_index = cmd_info[['index']])

# if we have permit_info data, we used alevin-fry, otherwise alevin
if (length(permit_info) == 0){
meta$mapping_tool <- "alevin"
} else {
meta$mapping_tool <- "alevin-fry"
}

# Add other metadata
# assume all alevin-fry tool versions are the same
meta$alevinfry_version <- permit_info[['version_str']]
meta$af_permit_type <- permit_info[['permit-list-type']]
meta$af_resolution <- quant_info[['resolution_strategy']]
meta$usa_mode <- quant_info[['usa_mode']]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
meta$usa_mode <- quant_info[['usa_mode']]
meta$usa_mode <- quant_info[['usa_mode']]
meta$transcript_type <- which_counts

If you like my earlier suggestion of including the transcript type in this function.


Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
meta$alevin_tx2gene <- cmd_info[['tgMap']]

Do we want to add a line to grab the tx2gene map information if we are using alevin? That will also help grab some of the version information since the names of those files have the ensembl version in them.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I like the idea of this, but for --rad mode this might not actually be there (the argument is no longer required, as the file is unused). There should be an equivalent in the quant.json file, but there isn't a specific field, so we would have to parse it out of the command line call stored in "cmd". I don't love that.

I think we probably want to explore other ways of passing along this information along the workflow... not sure the best way as of now.

return(meta)
}

2 changes: 1 addition & 1 deletion man/read_alevin.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/read_alevin_mtx.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions man/read_tximport.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

21 changes: 0 additions & 21 deletions man/read_usa_mode.Rd

This file was deleted.

12 changes: 11 additions & 1 deletion tests/testthat/test-read_alevin.R
Original file line number Diff line number Diff line change
Expand Up @@ -15,17 +15,23 @@ test_that("reading salmon alevin data works", {
# check that column names are barcodes
col_barcode <- str_detect(colnames(sce), "^[ACGT]+$")
expect_true(all(col_barcode))
expect_equal(sce@metadata$mapping_tool, "alevin")
expect_null(sce@metadata$alevin_fry_version)
})

test_that("reading alevin-fry USA mode works", {
sce <- read_alevin(usa_dir,
usa_mode = TRUE,
which_counts = "spliced")
which_counts = "spliced")
expect_s4_class(sce, "SingleCellExperiment")
expect_equal(dim(sce), sce_af_size)
# check that column names are barcodes
col_barcode <- str_detect(colnames(sce), "^[ACGT]+$")
expect_true(all(col_barcode))
# check metadata
expect_equal(sce@metadata$mapping_tool, "alevin-fry")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
expect_equal(sce@metadata$mapping_tool, "alevin-fry")
expect_false(is.na(sce@metadata$salmon_version))
expect_false(is.na(sce@metadata$reference_index))
expect_equal(sce@metadata$mapping_tool, "alevin-fry")

Probably a good idea to add checks that the salmon version and reference index aren't empty to ensure the cmd_info.json was read in properly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yup, good idea... made me realize the line after is wrong though: should be is.null!

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, actually, I know why I wasn't testing the reference index: it would fail with some versions where that data wasn't reported properly... so null values there are possible. (this was also why I wasn't testing for salmon version probably. But that is there (if sometimes wrong) in all cases.

expect_equal(sce@metadata$transcript_type, "spliced")

# no remaining unspliced
unmerged_genes <- str_subset(rownames(sce), "-[IUA]$")
expect_length(unmerged_genes, 0)
Expand All @@ -41,6 +47,10 @@ test_that("reading alevin-fry intron mode works", {
# check that column names are barcodes
col_barcode <- stringr::str_detect(colnames(sce), "^[ACGT]+$")
expect_true(all(col_barcode))
# check metadata
expect_equal(sce@metadata$mapping_tool, "alevin-fry")
expect_equal(sce@metadata$transcript_type, "unspliced")

# no remaining unspliced
unmerged_genes <- str_subset(rownames(sce), "-[IUA]$")
expect_length(unmerged_genes, 0)
Expand Down