Setup

We benchmark the msqrob2 msqrobHurdle method across three datasets. The hurdle model produces inference using rlm from protein intensity estimates and uses glm to obtain estimates in those cases where no intensity-based estimates are found.

library(QFeatures)
library(msqrob2)

my_medianPolish <- function(x, verbose = FALSE, ...) {
  medpol <- stats::medpolish(x, na.rm = TRUE, trace.iter = verbose, maxiter = 10)
  return(medpol$overall + medpol$col)
}

msqrob2_col_map <- c(
  "name" = "protein_id",
  "contrast" = "contrast",
  "logFC" = "log_fc",
  "t" = "t_statistic",
  "pval" = "p_value",
  "FDR" = "p_value_adjusted"
)

ionstar_ground_truth <- list(
  id_column = "protein_id",
  positive = list(label = "ECOLI", pattern = "ECOLI"),
  negative = list(label = "HUMAN", pattern = "HUMAN")
)

ionstar_contrasts_L <- makeContrast(
  c("dilution.e-dilution.d=0",
    "dilution.d-dilution.c=0",
    "dilution.c-dilution.b=0",
    "dilution.b=0"),
  parameterNames = c("dilution.e",
                     "dilution.d",
                     "dilution.c",
                     "dilution.b"))

ionstar_contrast_map <- c(
  "hurdle_dilution.e - dilution.d" = "dilution_(9/7.5)_1.2",
  "hurdle_dilution.d - dilution.c" = "dilution_(7.5/6)_1.25",
  "hurdle_dilution.c - dilution.b" = "dilution_(6/4.5)_1.3(3)",
  "hurdle_dilution.b" = "dilution_(4.5/3)_1.5")

ionstar_abd_contrast_map <- c(
  "avgAbd.e.d" = "dilution_(9/7.5)_1.2",
  "avgAbd.d.c" = "dilution_(7.5/6)_1.25",
  "avgAbd.c.b" = "dilution_(6/4.5)_1.3(3)",
  "avgAbd.b.a" = "dilution_(4.5/3)_1.5")
#' Run msqrob2 hurdle and extract merged results
#'
#' @param lfqdataPeptideNorm normalized peptide-level LFQData
#' @param lfqdataProtein protein-level LFQData (for average abundances)
#' @param formula model formula
#' @param L contrast matrix from makeContrast
#' @param contrast_map named vector mapping hurdle contrast names to standard names
#' @param abd_contrast_map named vector mapping abundance contrast names to standard names
#' @param factor_col name of the factor column in data
run_msqrob2_hurdle <- function(lfqdataPeptideNorm, lfqdataProtein,
                               formula, L, contrast_map, abd_contrast_map,
                               factor_col) {
  # Create QFeatures and aggregate
  se <- prolfqua::LFQDataToSummarizedExperiment(lfqdata = lfqdataPeptideNorm)
  pe <- QFeatures::QFeatures(list(peptide = se), colData = colData(se))
  pe <- QFeatures::aggregateFeatures(
    pe, i = "peptide", fcol = "protein_Id",
    name = "protein", fun = my_medianPolish)

  # Fit hurdle model
  prlm <- msqrobHurdle(pe, i = "protein", formula = formula, overwrite = TRUE)
  prlm <- hypothesisTestHurdle(prlm, i = "protein", L, overwrite = TRUE)

  # Extract hurdle results
  xx <- rowData(prlm[["protein"]])
  hurdle <- xx[grepl("hurdle_", names(xx))]
  res <- list()
  for (i in names(hurdle)) {
    hurdle[[i]]$contrast <- i
    res[[i]] <- prolfqua::matrix_to_tibble(hurdle[[i]], preserve_row_names = "name")
  }
  hurdle <- dplyr::bind_rows(res)

  # Merge intensity and count models
  logFC <- hurdle |> dplyr::select("name", "contrast", starts_with("logFC"))
  logFC <- dplyr::filter(logFC, !is.na(logFCt))
  logFC$modelName <- "msqrobHurdleIntensity"
  names(logFC) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")

  logOR <- hurdle |> dplyr::select("name", "contrast", starts_with("logOR"))
  logOR$modelName <- "msqrobHurdleCount"
  names(logOR) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")

  ddd <- dplyr::anti_join(logOR, logFC, by = c("name", "contrast"))
  all <- dplyr::bind_rows(ddd, logFC) |> dplyr::arrange(contrast, name)
  all <- prolfqua::adjust_p_values(all, column = "pval", group_by_col = "contrast")

  # Remap contrast names
  all <- all |> dplyr::mutate(
    contrast = dplyr::case_when(
      contrast %in% names(contrast_map) ~ contrast_map[contrast],
      TRUE ~ contrast))
  stopifnot(sum(all$contrast == "something wrong") == 0)

  # Compute average abundances per contrast
  st <- lfqdataProtein$get_Stats()
  protAbundanceIngroup <- st$stats()
  protAbundanceIngroup <- protAbundanceIngroup |>
    tidyr::pivot_wider(
      id_cols = protein_Id,
      names_from = !!rlang::sym(factor_col),
      names_prefix = "abd.",
      values_from = meanAbundance)

  # Compute average abundances for each contrast pair
  abd_cols <- grep("^abd\\.", names(protAbundanceIngroup), value = TRUE)
  abd_values <- sort(abd_cols)

  for (nm in names(abd_contrast_map)) {
    # Extract the two group suffixes from the abd_contrast_map name (e.g., "avgAbd.e.d" -> "e", "d")
    parts <- strsplit(sub("^avgAbd\\.", "", nm), "\\.")[[1]]
    col1 <- paste0("abd.", parts[1])
    col2 <- paste0("abd.", parts[2])
    protAbundanceIngroup <- protAbundanceIngroup |>
      dplyr::mutate(!!nm := mean(c(.data[[col1]], .data[[col2]]), na.rm = TRUE))
  }

  protAbundanceIngroup <- protAbundanceIngroup |>
    dplyr::select(-starts_with("abd")) |>
    tidyr::pivot_longer(starts_with("avgAbd"), names_to = "contrast", values_to = "avgAbd")

  protAbundanceIngroup <- protAbundanceIngroup |>
    dplyr::mutate(contrast = dplyr::case_when(
      contrast %in% names(abd_contrast_map) ~ abd_contrast_map[contrast],
      TRUE ~ contrast))

  bb <- dplyr::inner_join(all, protAbundanceIngroup,
                          by = c("name" = "protein_Id", "contrast" = "contrast"))
  bb
}

Dataset 1: IonStar / MaxQuant peptides

datadir <- file.path(find.package("prolfquadata"), "quantdata")
inputMQfile <- file.path(datadir, "MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
mqdata <- list()
mqdata$data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
mqdata$config <- prolfqua::create_config_MQ_peptide()

annotation <- readxl::read_xlsx(inputAnnotation)
data <- dplyr::inner_join(mqdata$data, annotation, by = "raw.file")

mqdata$config$table$factors[["dilution."]] <- "sample"
mqdata$config$table$factors[["run_Id"]] <- "run_ID"
mqdata$config$table$factorDepth <- 1
mqdata$data <- prolfqua::setup_analysis(data, mqdata$config)

lfqdata_mq <- prolfqua::LFQData$new(mqdata$data, mqdata$config)
lfqdata_mq$data <- lfqdata_mq$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id))
lfqdata_mq$filter_proteins_by_peptide_count()
lfqdata_mq$remove_small_intensities()
lfqdata_mq$hierarchy_counts()
tr <- lfqdata_mq$get_Transformer()
subset_h <- lfqdata_mq$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNorm_mq <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
lfqAgg_mq <- lfqdataNorm_mq$get_Aggregator()
lfqAgg_mq$medpolish()
lfqProt_mq <- lfqAgg_mq$lfq_agg
bb_mq <- run_msqrob2_hurdle(
  lfqdataPeptideNorm = lfqdataNorm_mq,
  lfqdataProtein = lfqProt_mq,
  formula = ~dilution.,
  L = ionstar_contrasts_L,
  contrast_map = ionstar_contrast_map,
  abd_contrast_map = ionstar_abd_contrast_map,
  factor_col = "dilution.")
ttd <- ionstar_bench_preprocess(bb_mq, idcol = "name")
benchmark_mq <- make_benchmark(ttd$data,
  contrast = "contrast",
  toscale = c("pval"),
  fcestimate = "logFC",
  benchmark = list(
    list(score = "logFC", desc = TRUE),
    list(score = "t", desc = TRUE),
    list(score = "scaled.pval", desc = TRUE)),
  model_description = "msqrob2 hurdle on IonStar/MQ peptides",
  model_name = "msqrob2_MQ",
  FDRvsFDP = list(list(score = "FDR", desc = FALSE)),
  hierarchy = c("name"), summarizeNA = "t")

prolfqua::table_facade(benchmark_mq$smc$summary,
  caption = "Nr of proteins with 0, 1, 2, 3 missing contrasts.")
benchmark_mq$plot_ROC(xlim = 0.2)
benchmark_mq$plot_FDRvsFDP()
write_contrast_results(
  bb_mq,
  path = "../inst/Benchresults/msqrob2_IonStar_MQ",
  metadata = list(
    dataset = "IonStar_MaxQuant",
    method = "msqrob2",
    method_description = "msqrob2 hurdle on MQ peptides + median polish",
    input_file = "MAXQuant_IonStar2018_PXD003881.zip/peptides.txt",
    software_version = paste0("msqrob2 ", packageVersion("msqrob2")),
    date = as.character(Sys.Date()),
    ground_truth = ionstar_ground_truth),
  col_map = msqrob2_col_map)

Dataset 2: IonStar / FragPipe MSstats.csv

inputFragfile <- file.path(datadir, "MSFragger_IonStar2018_PXD003881.zip")
annotation_fp <- readxl::read_xlsx(inputAnnotation)

msstats <- read.csv(
  unz(inputFragfile, "IonstarWithMSFragger/MSstats.csv"),
  header = TRUE, sep = ",", stringsAsFactors = FALSE)
msstats <- tibble::tibble(msstats)
msstats$Run <- tolower(msstats$Run)

merged <- dplyr::inner_join(annotation_fp, msstats, by = c("raw.file" = "Run"))
summedPep <- merged |>
  dplyr::group_by(raw.file, run_ID, sample, ProteinName, PeptideSequence) |>
  dplyr::summarize(nprecursor = dplyr::n(),
                   pepIntensity = sum(Intensity, na.rm = TRUE), .groups = "drop")
atable <- prolfqua::AnalysisConfiguration$new()
atable$fileName <- "raw.file"
atable$hierarchy[["protein_Id"]] <- c("ProteinName")
atable$hierarchy[["peptide_Id"]] <- c("PeptideSequence")
atable$hierarchyDepth <- 1
atable$set_response("pepIntensity")
atable$factors[["dilution."]] <- "sample"
atable$factors[["run"]] <- "run_ID"
atable$factorDepth <- 1

config_fp <- prolfqua::AnalysisConfiguration$new(atable)
adata_fp <- prolfqua::setup_analysis(summedPep, config_fp)
lfqdata_fp <- prolfqua::LFQData$new(adata_fp, config_fp)
lfqdata_fp$hierarchy_counts()
lfqdata_fp$filter_proteins_by_peptide_count()
lfqdata_fp$remove_small_intensities()

knitr::kable(lfqdata_fp$hierarchy_counts(), caption = "number of proteins and peptides.")
pl <- lfqdata_fp$get_Plotter()
pl$intensity_distribution_density()
subset_h <- lfqdata_fp$get_copy()$get_Transformer()$log2()$lfq
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
tr <- lfqdata_fp$get_Transformer()
lfqdataNorm_fp <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
agg_fp <- lfqdataNorm_fp$get_Aggregator()
agg_fp$medpolish()
lfqProt_fp <- agg_fp$lfq_agg
lfqProt_fp$hierarchy_counts()
bb_fp <- run_msqrob2_hurdle(
  lfqdataPeptideNorm = lfqdataNorm_fp,
  lfqdataProtein = lfqProt_fp,
  formula = ~dilution.,
  L = ionstar_contrasts_L,
  contrast_map = ionstar_contrast_map,
  abd_contrast_map = ionstar_abd_contrast_map,
  factor_col = "dilution.")
ttd <- ionstar_bench_preprocess(bb_fp, idcol = "name")
benchmark_fp <- make_benchmark(ttd$data,
  contrast = "contrast",
  toscale = c("pval"),
  fcestimate = "logFC",
  benchmark = list(
    list(score = "logFC", desc = TRUE),
    list(score = "t", desc = TRUE),
    list(score = "scaled.pval", desc = TRUE)),
  model_description = "msqrob2 hurdle on IonStar/FragPipe",
  model_name = "msqrob2_FragPipe",
  FDRvsFDP = list(list(score = "FDR", desc = FALSE)),
  hierarchy = c("name"), summarizeNA = "t")

prolfqua::table_facade(benchmark_fp$smc$summary,
  caption = "Nr of proteins with 0, 1, 2, 3 missing contrasts.")
benchmark_fp$plot_ROC(xlim = 0.2)
benchmark_fp$plot_FDRvsFDP()
write_contrast_results(
  bb_fp,
  path = "../inst/Benchresults/msqrob2_IonStar_FragPipe",
  metadata = list(
    dataset = "IonStar_FragPipe",
    method = "msqrob2",
    method_description = "msqrob2 hurdle on FragPipe MSstats.csv peptides + median polish",
    input_file = "MSFragger_IonStar2018_PXD003881.zip/IonstarWithMSFragger/MSstats.csv",
    software_version = paste0("msqrob2 ", packageVersion("msqrob2")),
    date = as.character(Sys.Date()),
    ground_truth = ionstar_ground_truth),
  col_map = msqrob2_col_map)

Dataset 3: CPTAC

evalCPTAC <- requireNamespace("msdata", quietly = TRUE) && evalAll
tmp <- msdata::quant(full.names = TRUE)
xx <- read.csv(tmp, sep = "\t")
peptides <- prolfquapp::tidyMQ_Peptides(xx)

annotation_cptac <- data.frame(raw.file = unique(peptides$raw.file))
annotation_cptac <- annotation_cptac |>
  dplyr::mutate(group = gsub("^6", "", raw.file)) |>
  dplyr::mutate(group = gsub("_[0-9]$", "", group))

peptides <- dplyr::inner_join(annotation_cptac, peptides)

config_cptac <- prolfqua::create_config_MQ_peptide()
config_cptac$table$factors["group."] <- "group"

peptides <- prolfqua::setup_analysis(peptides, config_cptac)
lfqpeptides <- prolfqua::LFQData$new(peptides, config_cptac)
lfqpeptides$filter_proteins_by_peptide_count()
lfqpeptides$remove_small_intensities()
lfqpeptides$hierarchy_counts()
tr <- lfqpeptides$get_Transformer()
tr <- tr$intensity_matrix(vsn::justvsn)
lfqtransformed <- tr$lfq
agg <- lfqtransformed$get_Aggregator()
agg$medpolish()
lfqProt_cptac <- agg$lfq_agg
# CPTAC has a single contrast: group.b vs group.a
L_cptac <- makeContrast(c("group.b=0"), parameterNames = c("group.b"))

cptac_contrast_map <- c("hurdle_group.b" = "b_vs_a")
cptac_abd_contrast_map <- c("avgAbd.b.a" = "b_vs_a")

se <- prolfqua::LFQDataToSummarizedExperiment(lfqtransformed)
pe <- QFeatures::QFeatures(list(peptide = se), colData = colData(se))
pe <- QFeatures::aggregateFeatures(
  pe, i = "peptide", fcol = "protein_Id",
  name = "protein", fun = my_medianPolish)

prlm <- msqrobHurdle(pe, i = "protein", formula = ~group., overwrite = TRUE)
prlm <- hypothesisTestHurdle(prlm, i = "protein", L_cptac, overwrite = TRUE)

xx <- rowData(prlm[["protein"]])
hurdle <- xx[grepl("hurdle_", names(xx))]
res <- list()
for (i in names(hurdle)) {
  hurdle[[i]]$contrast <- i
  res[[i]] <- prolfqua::matrix_to_tibble(hurdle[[i]], preserve_row_names = "name")
}
hurdle <- dplyr::bind_rows(res)

logFC <- hurdle |> dplyr::select("name", "contrast", starts_with("logFC"))
logFC <- dplyr::filter(logFC, !is.na(logFCt))
logFC$modelName <- "msqrobHurdleIntensity"
names(logFC) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")

logOR <- hurdle |> dplyr::select("name", "contrast", starts_with("logOR"))
logOR$modelName <- "msqrobHurdleCount"
names(logOR) <- c("name", "contrast", "logFC", "se", "df", "t", "pval", "modelName")
ddd <- dplyr::anti_join(logOR, logFC, by = c("name", "contrast"))
all <- dplyr::bind_rows(ddd, logFC) |> dplyr::arrange(contrast, name)
all <- prolfqua::adjust_p_values(all, column = "pval", group_by_col = "contrast")
all$contrast <- "b_vs_a"

st <- lfqProt_cptac$get_Stats()
protAbundanceIngroup <- st$stats()
protAbundanceIngroup <- protAbundanceIngroup |>
  tidyr::pivot_wider(id_cols = protein_Id,
                     names_from = group., names_prefix = "abd.",
                     values_from = meanAbundance)
protAbundanceIngroup <- protAbundanceIngroup |>
  dplyr::mutate(avgAbd.b.a = mean(c(abd.b, abd.a), na.rm = TRUE))
protAbundanceIngroup <- protAbundanceIngroup |>
  dplyr::select(-starts_with("abd")) |>
  tidyr::pivot_longer(starts_with("avgAbd"), names_to = "contrast", values_to = "avgAbd")
protAbundanceIngroup$contrast <- "b_vs_a"

bb_cptac <- dplyr::inner_join(all, protAbundanceIngroup,
  by = c("name" = "protein_Id", "contrast" = "contrast"))
ttd <- prolfquabenchmark::cptac_bench_preprocess(bb_cptac, idcol = "name")
benchmark_cptac <- make_benchmark(ttd$data,
  contrast = "contrast",
  toscale = c("pval"),
  fcestimate = "logFC",
  benchmark = list(
    list(score = "logFC", desc = TRUE),
    list(score = "t", desc = TRUE),
    list(score = "scaled.pval", desc = TRUE)),
  model_description = "msqrob2 hurdle on CPTAC",
  model_name = "msqrob2_CPTAC",
  FDRvsFDP = list(list(score = "pval.adjusted", desc = FALSE)),
  hierarchy = c("name"), summarizeNA = "t")

prolfqua::table_facade(benchmark_cptac$smc$summary,
  caption = "Nr of proteins with 0, 1, 2, 3 missing contrasts.")
benchmark_cptac$plot_ROC(xlim = 0.2)
benchmark_cptac$plot_FDRvsFDP()
write_contrast_results(
  bb_cptac,
  path = "../inst/Benchresults/msqrob2_CPTAC",
  metadata = list(
    dataset = "CPTAC",
    method = "msqrob2",
    method_description = "msqrob2 hurdle on CPTAC MQ peptides + VSN + median polish",
    input_file = "msdata::quant()",
    software_version = paste0("msqrob2 ", packageVersion("msqrob2")),
    date = as.character(Sys.Date()),
    ground_truth = list(
      id_column = "protein_id",
      positive = list(label = "UPS", pattern = "_UPS"),
      negative = list(label = "YEAST", pattern = "_YEAST")
    )),
  col_map = msqrob2_col_map)