vignettes/Benchmark_msqrob2.Rmd
Benchmark_msqrob2.RmdWe benchmark the msqrob2 msqrobHurdle method across
three datasets. The hurdle model produces inference using
rlm from protein intensity estimates and uses
glm to obtain estimates in those cases where no
intensity-based estimates are found.
library(QFeatures)
library(msqrob2)
my_medianPolish <- function(x, verbose = FALSE, ...) {
medpol <- stats::medpolish(x, na.rm = TRUE, trace.iter = verbose, maxiter = 10)
return(medpol$overall + medpol$col)
}
msqrob2_col_map <- c(
"name" = "protein_id",
"contrast" = "contrast",
"logFC" = "log_fc",
"t" = "t_statistic",
"pval" = "p_value",
"FDR" = "p_value_adjusted"
)
ionstar_ground_truth <- list(
id_column = "protein_id",
positive = list(label = "ECOLI", pattern = "ECOLI"),
negative = list(label = "HUMAN", pattern = "HUMAN")
)
ionstar_contrasts_L <- makeContrast(
c("dilution.e-dilution.d=0",
"dilution.d-dilution.c=0",
"dilution.c-dilution.b=0",
"dilution.b=0"),
parameterNames = c("dilution.e",
"dilution.d",
"dilution.c",
"dilution.b"))
ionstar_contrast_map <- c(
"hurdle_dilution.e - dilution.d" = "dilution_(9/7.5)_1.2",
"hurdle_dilution.d - dilution.c" = "dilution_(7.5/6)_1.25",
"hurdle_dilution.c - dilution.b" = "dilution_(6/4.5)_1.3(3)",
"hurdle_dilution.b" = "dilution_(4.5/3)_1.5")
ionstar_abd_contrast_map <- c(
"avgAbd.e.d" = "dilution_(9/7.5)_1.2",
"avgAbd.d.c" = "dilution_(7.5/6)_1.25",
"avgAbd.c.b" = "dilution_(6/4.5)_1.3(3)",
"avgAbd.b.a" = "dilution_(4.5/3)_1.5")
#' Run msqrob2 hurdle and extract merged results
#'
#' @param lfqdataPeptideNorm normalized peptide-level LFQData
#' @param lfqdataProtein protein-level LFQData (for average abundances)
#' @param formula model formula
#' @param L contrast matrix from makeContrast
#' @param contrast_map named vector mapping hurdle contrast names to standard names
#' @param abd_contrast_map named vector mapping abundance contrast names to standard names
#' @param factor_col name of the factor column in data
run_msqrob2_hurdle <- function(lfqdataPeptideNorm, lfqdataProtein,
formula, L, contrast_map, abd_contrast_map,
factor_col) {
# Create QFeatures and aggregate
se <- prolfqua::LFQDataToSummarizedExperiment(lfqdata = lfqdataPeptideNorm)
pe <- QFeatures::QFeatures(list(peptide = se), colData = colData(se))
pe <- QFeatures::aggregateFeatures(
pe, i = "peptide", fcol = "protein_Id",
name = "protein", fun = my_medianPolish)
# Fit hurdle model
prlm <- msqrobHurdle(pe, i = "protein", formula = formula, overwrite = TRUE)
prlm <- hypothesisTestHurdle(prlm, i = "protein", L, overwrite = TRUE)
# Extract hurdle results
xx <- rowData(prlm[["protein"]])
hurdle <- xx[grepl("hurdle_", names(xx))]
res <- list()
for (i in names(hurdle)) {
hurdle[[i]]$contrast <- i
res[[i]] <- prolfqua::matrix_to_tibble(hurdle[[i]], preserve_row_names = "name")
}
hurdle <- dplyr::bind_rows(res)
# Merge intensity and count models
logFC <- hurdle |> dplyr::select("name", "contrast", starts_with("logFC"))
logFC <- dplyr::filter(logFC, !is.na(logFCt))
logFC$modelName <- "msqrobHurdleIntensity"
names(logFC) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")
logOR <- hurdle |> dplyr::select("name", "contrast", starts_with("logOR"))
logOR$modelName <- "msqrobHurdleCount"
names(logOR) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")
ddd <- dplyr::anti_join(logOR, logFC, by = c("name", "contrast"))
all <- dplyr::bind_rows(ddd, logFC) |> dplyr::arrange(contrast, name)
all <- prolfqua::adjust_p_values(all, column = "pval", group_by_col = "contrast")
# Remap contrast names
all <- all |> dplyr::mutate(
contrast = dplyr::case_when(
contrast %in% names(contrast_map) ~ contrast_map[contrast],
TRUE ~ contrast))
stopifnot(sum(all$contrast == "something wrong") == 0)
# Compute average abundances per contrast
st <- lfqdataProtein$get_Stats()
protAbundanceIngroup <- st$stats()
protAbundanceIngroup <- protAbundanceIngroup |>
tidyr::pivot_wider(
id_cols = protein_Id,
names_from = !!rlang::sym(factor_col),
names_prefix = "abd.",
values_from = meanAbundance)
# Compute average abundances for each contrast pair
abd_cols <- grep("^abd\\.", names(protAbundanceIngroup), value = TRUE)
abd_values <- sort(abd_cols)
for (nm in names(abd_contrast_map)) {
# Extract the two group suffixes from the abd_contrast_map name (e.g., "avgAbd.e.d" -> "e", "d")
parts <- strsplit(sub("^avgAbd\\.", "", nm), "\\.")[[1]]
col1 <- paste0("abd.", parts[1])
col2 <- paste0("abd.", parts[2])
protAbundanceIngroup <- protAbundanceIngroup |>
dplyr::mutate(!!nm := mean(c(.data[[col1]], .data[[col2]]), na.rm = TRUE))
}
protAbundanceIngroup <- protAbundanceIngroup |>
dplyr::select(-starts_with("abd")) |>
tidyr::pivot_longer(starts_with("avgAbd"), names_to = "contrast", values_to = "avgAbd")
protAbundanceIngroup <- protAbundanceIngroup |>
dplyr::mutate(contrast = dplyr::case_when(
contrast %in% names(abd_contrast_map) ~ abd_contrast_map[contrast],
TRUE ~ contrast))
bb <- dplyr::inner_join(all, protAbundanceIngroup,
by = c("name" = "protein_Id", "contrast" = "contrast"))
bb
}
datadir <- file.path(find.package("prolfquadata"), "quantdata")
inputMQfile <- file.path(datadir, "MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
mqdata <- list()
mqdata$data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
mqdata$config <- prolfqua::create_config_MQ_peptide()
annotation <- readxl::read_xlsx(inputAnnotation)
data <- dplyr::inner_join(mqdata$data, annotation, by = "raw.file")
mqdata$config$table$factors[["dilution."]] <- "sample"
mqdata$config$table$factors[["run_Id"]] <- "run_ID"
mqdata$config$table$factorDepth <- 1
mqdata$data <- prolfqua::setup_analysis(data, mqdata$config)
lfqdata_mq <- prolfqua::LFQData$new(mqdata$data, mqdata$config)
lfqdata_mq$data <- lfqdata_mq$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id))
lfqdata_mq$filter_proteins_by_peptide_count()
lfqdata_mq$remove_small_intensities()
lfqdata_mq$hierarchy_counts()
tr <- lfqdata_mq$get_Transformer()
subset_h <- lfqdata_mq$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNorm_mq <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
lfqAgg_mq <- lfqdataNorm_mq$get_Aggregator()
lfqAgg_mq$medpolish()
lfqProt_mq <- lfqAgg_mq$lfq_agg
bb_mq <- run_msqrob2_hurdle(
lfqdataPeptideNorm = lfqdataNorm_mq,
lfqdataProtein = lfqProt_mq,
formula = ~dilution.,
L = ionstar_contrasts_L,
contrast_map = ionstar_contrast_map,
abd_contrast_map = ionstar_abd_contrast_map,
factor_col = "dilution.")
ttd <- ionstar_bench_preprocess(bb_mq, idcol = "name")
benchmark_mq <- make_benchmark(ttd$data,
contrast = "contrast",
toscale = c("pval"),
fcestimate = "logFC",
benchmark = list(
list(score = "logFC", desc = TRUE),
list(score = "t", desc = TRUE),
list(score = "scaled.pval", desc = TRUE)),
model_description = "msqrob2 hurdle on IonStar/MQ peptides",
model_name = "msqrob2_MQ",
FDRvsFDP = list(list(score = "FDR", desc = FALSE)),
hierarchy = c("name"), summarizeNA = "t")
prolfqua::table_facade(benchmark_mq$smc$summary,
caption = "Nr of proteins with 0, 1, 2, 3 missing contrasts.")
benchmark_mq$plot_ROC(xlim = 0.2)
benchmark_mq$plot_FDRvsFDP()
write_contrast_results(
bb_mq,
path = "../inst/Benchresults/msqrob2_IonStar_MQ",
metadata = list(
dataset = "IonStar_MaxQuant",
method = "msqrob2",
method_description = "msqrob2 hurdle on MQ peptides + median polish",
input_file = "MAXQuant_IonStar2018_PXD003881.zip/peptides.txt",
software_version = paste0("msqrob2 ", packageVersion("msqrob2")),
date = as.character(Sys.Date()),
ground_truth = ionstar_ground_truth),
col_map = msqrob2_col_map)
inputFragfile <- file.path(datadir, "MSFragger_IonStar2018_PXD003881.zip")
annotation_fp <- readxl::read_xlsx(inputAnnotation)
msstats <- read.csv(
unz(inputFragfile, "IonstarWithMSFragger/MSstats.csv"),
header = TRUE, sep = ",", stringsAsFactors = FALSE)
msstats <- tibble::tibble(msstats)
msstats$Run <- tolower(msstats$Run)
merged <- dplyr::inner_join(annotation_fp, msstats, by = c("raw.file" = "Run"))
summedPep <- merged |>
dplyr::group_by(raw.file, run_ID, sample, ProteinName, PeptideSequence) |>
dplyr::summarize(nprecursor = dplyr::n(),
pepIntensity = sum(Intensity, na.rm = TRUE), .groups = "drop")
atable <- prolfqua::AnalysisConfiguration$new()
atable$fileName <- "raw.file"
atable$hierarchy[["protein_Id"]] <- c("ProteinName")
atable$hierarchy[["peptide_Id"]] <- c("PeptideSequence")
atable$hierarchyDepth <- 1
atable$set_response("pepIntensity")
atable$factors[["dilution."]] <- "sample"
atable$factors[["run"]] <- "run_ID"
atable$factorDepth <- 1
config_fp <- prolfqua::AnalysisConfiguration$new(atable)
adata_fp <- prolfqua::setup_analysis(summedPep, config_fp)
lfqdata_fp <- prolfqua::LFQData$new(adata_fp, config_fp)
lfqdata_fp$hierarchy_counts()
lfqdata_fp$filter_proteins_by_peptide_count()
lfqdata_fp$remove_small_intensities()
knitr::kable(lfqdata_fp$hierarchy_counts(), caption = "number of proteins and peptides.")
pl <- lfqdata_fp$get_Plotter()
pl$intensity_distribution_density()
subset_h <- lfqdata_fp$get_copy()$get_Transformer()$log2()$lfq
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
tr <- lfqdata_fp$get_Transformer()
lfqdataNorm_fp <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
agg_fp <- lfqdataNorm_fp$get_Aggregator()
agg_fp$medpolish()
lfqProt_fp <- agg_fp$lfq_agg
lfqProt_fp$hierarchy_counts()
bb_fp <- run_msqrob2_hurdle(
lfqdataPeptideNorm = lfqdataNorm_fp,
lfqdataProtein = lfqProt_fp,
formula = ~dilution.,
L = ionstar_contrasts_L,
contrast_map = ionstar_contrast_map,
abd_contrast_map = ionstar_abd_contrast_map,
factor_col = "dilution.")
ttd <- ionstar_bench_preprocess(bb_fp, idcol = "name")
benchmark_fp <- make_benchmark(ttd$data,
contrast = "contrast",
toscale = c("pval"),
fcestimate = "logFC",
benchmark = list(
list(score = "logFC", desc = TRUE),
list(score = "t", desc = TRUE),
list(score = "scaled.pval", desc = TRUE)),
model_description = "msqrob2 hurdle on IonStar/FragPipe",
model_name = "msqrob2_FragPipe",
FDRvsFDP = list(list(score = "FDR", desc = FALSE)),
hierarchy = c("name"), summarizeNA = "t")
prolfqua::table_facade(benchmark_fp$smc$summary,
caption = "Nr of proteins with 0, 1, 2, 3 missing contrasts.")
benchmark_fp$plot_ROC(xlim = 0.2)
benchmark_fp$plot_FDRvsFDP()
write_contrast_results(
bb_fp,
path = "../inst/Benchresults/msqrob2_IonStar_FragPipe",
metadata = list(
dataset = "IonStar_FragPipe",
method = "msqrob2",
method_description = "msqrob2 hurdle on FragPipe MSstats.csv peptides + median polish",
input_file = "MSFragger_IonStar2018_PXD003881.zip/IonstarWithMSFragger/MSstats.csv",
software_version = paste0("msqrob2 ", packageVersion("msqrob2")),
date = as.character(Sys.Date()),
ground_truth = ionstar_ground_truth),
col_map = msqrob2_col_map)
evalCPTAC <- requireNamespace("msdata", quietly = TRUE) && evalAll
tmp <- msdata::quant(full.names = TRUE)
xx <- read.csv(tmp, sep = "\t")
peptides <- prolfquapp::tidyMQ_Peptides(xx)
annotation_cptac <- data.frame(raw.file = unique(peptides$raw.file))
annotation_cptac <- annotation_cptac |>
dplyr::mutate(group = gsub("^6", "", raw.file)) |>
dplyr::mutate(group = gsub("_[0-9]$", "", group))
peptides <- dplyr::inner_join(annotation_cptac, peptides)
config_cptac <- prolfqua::create_config_MQ_peptide()
config_cptac$table$factors["group."] <- "group"
peptides <- prolfqua::setup_analysis(peptides, config_cptac)
lfqpeptides <- prolfqua::LFQData$new(peptides, config_cptac)
lfqpeptides$filter_proteins_by_peptide_count()
lfqpeptides$remove_small_intensities()
lfqpeptides$hierarchy_counts()
tr <- lfqpeptides$get_Transformer()
tr <- tr$intensity_matrix(vsn::justvsn)
lfqtransformed <- tr$lfq
agg <- lfqtransformed$get_Aggregator()
agg$medpolish()
lfqProt_cptac <- agg$lfq_agg
# CPTAC has a single contrast: group.b vs group.a
L_cptac <- makeContrast(c("group.b=0"), parameterNames = c("group.b"))
cptac_contrast_map <- c("hurdle_group.b" = "b_vs_a")
cptac_abd_contrast_map <- c("avgAbd.b.a" = "b_vs_a")
se <- prolfqua::LFQDataToSummarizedExperiment(lfqtransformed)
pe <- QFeatures::QFeatures(list(peptide = se), colData = colData(se))
pe <- QFeatures::aggregateFeatures(
pe, i = "peptide", fcol = "protein_Id",
name = "protein", fun = my_medianPolish)
prlm <- msqrobHurdle(pe, i = "protein", formula = ~group., overwrite = TRUE)
prlm <- hypothesisTestHurdle(prlm, i = "protein", L_cptac, overwrite = TRUE)
xx <- rowData(prlm[["protein"]])
hurdle <- xx[grepl("hurdle_", names(xx))]
res <- list()
for (i in names(hurdle)) {
hurdle[[i]]$contrast <- i
res[[i]] <- prolfqua::matrix_to_tibble(hurdle[[i]], preserve_row_names = "name")
}
hurdle <- dplyr::bind_rows(res)
logFC <- hurdle |> dplyr::select("name", "contrast", starts_with("logFC"))
logFC <- dplyr::filter(logFC, !is.na(logFCt))
logFC$modelName <- "msqrobHurdleIntensity"
names(logFC) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")
logOR <- hurdle |> dplyr::select("name", "contrast", starts_with("logOR"))
logOR$modelName <- "msqrobHurdleCount"
names(logOR) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")
ddd <- dplyr::anti_join(logOR, logFC, by = c("name", "contrast"))
all <- dplyr::bind_rows(ddd, logFC) |> dplyr::arrange(contrast, name)
all <- prolfqua::adjust_p_values(all, column = "pval", group_by_col = "contrast")
all$contrast <- "b_vs_a"
st <- lfqProt_cptac$get_Stats()
protAbundanceIngroup <- st$stats()
protAbundanceIngroup <- protAbundanceIngroup |>
tidyr::pivot_wider(id_cols = protein_Id,
names_from = group., names_prefix = "abd.",
values_from = meanAbundance)
protAbundanceIngroup <- protAbundanceIngroup |>
dplyr::mutate(avgAbd.b.a = mean(c(abd.b, abd.a), na.rm = TRUE))
protAbundanceIngroup <- protAbundanceIngroup |>
dplyr::select(-starts_with("abd")) |>
tidyr::pivot_longer(starts_with("avgAbd"), names_to = "contrast", values_to = "avgAbd")
protAbundanceIngroup$contrast <- "b_vs_a"
bb_cptac <- dplyr::inner_join(all, protAbundanceIngroup,
by = c("name" = "protein_Id", "contrast" = "contrast"))
ttd <- prolfquabenchmark::cptac_bench_preprocess(bb_cptac, idcol = "name")
benchmark_cptac <- make_benchmark(ttd$data,
contrast = "contrast",
toscale = c("pval"),
fcestimate = "logFC",
benchmark = list(
list(score = "logFC", desc = TRUE),
list(score = "t", desc = TRUE),
list(score = "scaled.pval", desc = TRUE)),
model_description = "msqrob2 hurdle on CPTAC",
model_name = "msqrob2_CPTAC",
FDRvsFDP = list(list(score = "pval.adjusted", desc = FALSE)),
hierarchy = c("name"), summarizeNA = "t")
prolfqua::table_facade(benchmark_cptac$smc$summary,
caption = "Nr of proteins with 0, 1, 2, 3 missing contrasts.")
benchmark_cptac$plot_ROC(xlim = 0.2)
benchmark_cptac$plot_FDRvsFDP()
write_contrast_results(
bb_cptac,
path = "../inst/Benchresults/msqrob2_CPTAC",
metadata = list(
dataset = "CPTAC",
method = "msqrob2",
method_description = "msqrob2 hurdle on CPTAC MQ peptides + VSN + median polish",
input_file = "msdata::quant()",
software_version = paste0("msqrob2 ", packageVersion("msqrob2")),
date = as.character(Sys.Date()),
ground_truth = list(
id_column = "protein_id",
positive = list(label = "UPS", pattern = "_UPS"),
negative = list(label = "YEAST", pattern = "_YEAST")
)),
col_map = msqrob2_col_map)