Benchmarking the msqrob2 package using the Ionstar Dataset starting from peptides.txt

library(tidyverse)

## Error in get(paste0(generic, ".", class), envir = get_method_env()) : 
##   object 'type_sum.accel' not found

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(limma)
library(QFeatures)

## Loading required package: MultiAssayExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following object is masked from 'package:utils':
## 
##     findMatches
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges

## Warning: package 'IRanges' was built under R version 4.4.2

## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Loading required package: GenomeInfoDb

## Warning: package 'GenomeInfoDb' was built under R version 4.4.2

## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## 
## Attaching package: 'QFeatures'
## 
## The following object is masked from 'package:MultiAssayExperiment':
## 
##     longFormat
## 
## The following object is masked from 'package:base':
## 
##     sweep

library(msqrob2)

## Warning: package 'msqrob2' was built under R version 4.4.2

library(plotly)

## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:IRanges':
## 
##     slice
## 
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout

library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:Biobase':
## 
##     combine
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
evalAll <- require(proDA)

## Loading required package: proDA

SAVE = TRUE

Here we benchmark the msqrob2 package using the peptide.txt intensities and median polish

We will be examining the msqrobHurdle method, which does produce inference using rlm from protein intensity estimates and uses glm to obtain estimates in those cases where no estimates are found.

datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <-  file.path(datadir,
                          "MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
mqdata <- list()
mqdata$data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
length(unique(mqdata$data$proteins))

## [1] 5295

mqdata$config <- prolfqua::create_config_MQ_peptide()

annotation <- readxl::read_xlsx(inputAnnotation)

data <- dplyr::inner_join(
  mqdata$data,
  annotation,
  by = "raw.file"
)

mqdata$config$table$factors[["dilution."]] = "sample"
mqdata$config$table$factors[["run_Id"]] = "run_ID"
mqdata$config$table$factorDepth <- 1
mqdata$data <- prolfqua::setup_analysis(data, mqdata$config)


lfqdata <- prolfqua::LFQData$new(mqdata$data, mqdata$config)

Filter the data for small intensities (maxquant reports missing values as 0) and for two peptides per protein.

lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id)) 
lfqdata$filter_proteins_by_peptide_count()
lfqdata$remove_small_intensities()
lfqdata$hierarchy_counts()

## # A tibble: 1 × 3
##   isotope protein_Id peptide_Id
##   <chr>        <int>      <int>
## 1 light         4178      29879

tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq

lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqtrans <- lfqAggMedpol$lfq_agg
st <- lfqtrans$get_Stats()
protAbundanceIngroup <- st$stats()
protAbundanceIngroup <- protAbundanceIngroup |>
  tidyr::pivot_wider(
    id_cols = protein_Id,
    names_from = dilution.,
    names_prefix = "abd.",
    values_from = meanAbundance)

To use proDA, we need to create an SummarizedExperiment. We use the to_wide function of prolfqua to get the data in in the SummarizedExperiment compatible format.

Defining Contrasts and computing group comparisons

As usual, two steps are required, first fit the models, then compute the contrasts.

prlm <- msqrobHurdle(pe,
                     i = "protein",
                     formula = ~dilution.,
                     overwrite = TRUE)

Since msqrob does not report average abundances, we are computing them for each contrast.

L <- makeContrast(c("dilution.e-dilution.d=0",
                    "dilution.d-dilution.c=0",
                    "dilution.c-dilution.b=0",
                    "dilution.b=0"),
                  parameterNames = c("dilution.e",
                                     "dilution.d",
                                     "dilution.c",
                                     "dilution.b"))
prlm <- hypothesisTestHurdle(prlm, i = "protein", L, overwrite = TRUE)
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.e.d =  mean( c(abd.e,abd.d), na.rm = TRUE)  )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.d.c =  mean( c(abd.d,abd.c), na.rm = TRUE)  )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.c.b =  mean( c(abd.c,abd.b), na.rm = TRUE)  )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.b.a =  mean( c(abd.b,abd.a), na.rm = TRUE)  )


xx <- rowData(prlm[["protein"]])
hurdle <- xx[grepl("hurdle_",names(xx))]

res <- list()
for (i in names(hurdle)) { 
  hurdle[[i]]$contrast <- i
  res[[i]] <- prolfqua::matrix_to_tibble(hurdle[[i]], preserve_row_names = "name")
}

hurdle <- dplyr::bind_rows(res)

Now we need to merge the results of both models msqrobHurdleIntensity and msqrobHurdleCount. To find out which models were not estimated by msqrobHurdleIntensity we check for NA’s and use the anti_join to select those from the msqrobHurdleCount models.

logFC <- hurdle |> dplyr::select("name","contrast", starts_with("logFC"))
logFC <- filter(logFC ,!is.na(logFCt))
logFC$modelName <- "msqrobHurdleIntensity"
names(logFC) <- c("name","contrast","logFC","se","df","t","pval","modelName")

logOR <- hurdle |> dplyr::select("name","contrast", starts_with("logOR"))
logOR$modelName <- "msqrobHurdleCount"
names(logOR) <- c("name","contrast","logFC","se","df","t","pval","modelName")


ddd <- dplyr::anti_join(logOR , logFC, by = c("name", "contrast"))
all <- dplyr::bind_rows(ddd , logFC) |> dplyr::arrange(contrast, name)
all <- prolfqua::adjust_p_values(all, column = "pval", group_by_col = "contrast")
all$contrast |> unique()

## [1] "hurdle_dilution.b"              "hurdle_dilution.c - dilution.b"
## [3] "hurdle_dilution.d - dilution.c" "hurdle_dilution.e - dilution.d"

protAbundanceIngroup <- protAbundanceIngroup |> 
  dplyr::select(-starts_with("abd")) |> 
  tidyr::pivot_longer(starts_with("avgAbd"), names_to = "contrast" ,values_to = "avgAbd")

protAbundanceIngroup <- protAbundanceIngroup |> 
  mutate(contrast = 
           case_when(contrast == "avgAbd.e.d" ~ "dilution_(9/7.5)_1.2",
                     contrast == "avgAbd.d.c" ~ "dilution_(7.5/6)_1.25",
                     contrast == "avgAbd.c.b" ~ "dilution_(6/4.5)_1.3(3)",
                     contrast == "avgAbd.b.a" ~ "dilution_(4.5/3)_1.5",
                     TRUE ~ "something wrong"))
all <- all |> mutate(contrast = case_when(
  contrast == "hurdle_dilution.e - dilution.d" ~ "dilution_(9/7.5)_1.2",
  contrast == "hurdle_dilution.d - dilution.c" ~ "dilution_(7.5/6)_1.25",
  contrast == "hurdle_dilution.c - dilution.b" ~ "dilution_(6/4.5)_1.3(3)",
  contrast == "hurdle_dilution.b" ~ "dilution_(4.5/3)_1.5",
  TRUE ~ "something wrong"))
stopifnot(sum(all$contrast == "something wrong") == 0 )
stopifnot(sum(all$contrast == "something wrong") == 0 )


bb <- dplyr::inner_join(all, protAbundanceIngroup, by = c("name" = "protein_Id", "contrast" = "contrast"))

Benchmarking

Here we use proflqua benchmark functions to generate some summaries.

ttd <- prolfqua::ionstar_bench_preprocess( bb , idcol = "name" )
benchmark_msqrob <- prolfqua::make_benchmark(ttd$data,
                                             contrast = "contrast",
                                             toscale = c("pval"),
                                             fcestimate = "logFC",
                                             benchmark = list(
                                               list(score = "logFC", desc = TRUE),
                                               list(score = "t", desc = TRUE),
                                               list(score = "scaled.pval", desc = TRUE)
                                             ),  
                                             model_description = "msqrob_QFeature",
                                             model_name = "msqrob_QFeature",
                                             FDRvsFDP = list(list(score = "FDR", desc = FALSE))
                                             , hierarchy = c("name"), summarizeNA = "t"
)

sumarry <- benchmark_msqrob$smc$summary
prolfqua::table_facade(sumarry, caption = "nr of proteins with 0, 1, 2, 3 missing contrasts.")

nr of proteins with 0, 1, 2, 3 missing contrasts.
nr_missing	name
0	4178

res <- benchmark_msqrob$pAUC_summaries()
knitr::kable(res$ftable$content,caption = res$ftable$caption)

AUC, and pAUC at 0.1 and 0.2 FPR for (NC) msqrob_QFeature
contrast	what	AUC	pAUC_10	pAUC_20
all	logFC	92.63229	66.24404	78.59403
all	scaled.pval	93.01754	73.14452	79.60171
all	t	92.98309	72.89377	79.43704
dilution_(4.5/3)_1.5	logFC	93.86049	79.46350	85.68809
dilution_(4.5/3)_1.5	scaled.pval	93.72252	79.48732	83.93065
dilution_(4.5/3)_1.5	t	93.67864	79.29457	83.75974
dilution_(6/4.5)_1.3(3)	logFC	91.35905	64.93423	76.98337
dilution_(6/4.5)_1.3(3)	scaled.pval	91.60129	72.43374	77.82537
dilution_(6/4.5)_1.3(3)	t	91.57134	72.21100	77.69018
dilution_(7.5/6)_1.25	logFC	92.34012	59.73226	75.30775
dilution_(7.5/6)_1.25	scaled.pval	92.93888	69.21834	77.32784
dilution_(7.5/6)_1.25	t	92.90068	68.93694	77.16134
dilution_(9/7.5)_1.2	logFC	92.84969	60.08247	76.25803
dilution_(9/7.5)_1.2	scaled.pval	93.74453	71.32533	79.51444
dilution_(9/7.5)_1.2	t	93.72019	71.08318	79.36250

res$barp

ROC curves

#res$ftable
benchmark_msqrob$plot_ROC(xlim = 0.2)

plot ROC curves

benchmark_msqrob$plot_FDRvsFDP()

plot FDR vs FDP

benchmark_msqrob$plot_precision_recall()

Witold E. Wolski

2025-01-09

Defining Contrasts and computing group comparisons

Benchmarking