## Error in get(paste0(generic, ".", class), envir = get_method_env()) : 
##   object 'type_sum.accel' not found
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
##  dplyr     1.1.4      readr     2.1.5
##  forcats   1.0.0      stringr   1.5.1
##  ggplot2   3.5.1      tibble    3.2.1
##  lubridate 1.9.4      tidyr     1.3.1
##  purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
##  dplyr::filter() masks stats::filter()
##  dplyr::lag()    masks stats::lag()
##  Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: MultiAssayExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## 
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## 
## Attaching package: 'MatrixGenerics'
## 
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## 
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## 
## The following object is masked from 'package:limma':
## 
##     plotMA
## 
## The following objects are masked from 'package:lubridate':
## 
##     intersect, setdiff, union
## 
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## 
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## 
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Loading required package: S4Vectors
## 
## Attaching package: 'S4Vectors'
## 
## The following objects are masked from 'package:lubridate':
## 
##     second, second<-
## 
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The following object is masked from 'package:utils':
## 
##     findMatches
## 
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## 
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.4.2
## 
## Attaching package: 'IRanges'
## 
## The following object is masked from 'package:lubridate':
## 
##     %within%
## 
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## 
## The following object is masked from 'package:purrr':
## 
##     reduce
## 
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.4.2
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## 
## Attaching package: 'Biobase'
## 
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## 
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
## 
## 
## Attaching package: 'QFeatures'
## 
## The following object is masked from 'package:MultiAssayExperiment':
## 
##     longFormat
## 
## The following object is masked from 'package:base':
## 
##     sweep
## Warning: package 'msqrob2' was built under R version 4.4.2
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:IRanges':
## 
##     slice
## 
## The following object is masked from 'package:S4Vectors':
## 
##     rename
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:Biobase':
## 
##     combine
## 
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
evalAll <- require(proDA)
## Loading required package: proDA
SAVE = TRUE

Here we benchmark the msqrob2 package using the peptide.txt intensities and median polish

We will be examining the msqrobHurdle method, which does produce inference using rlm from protein intensity estimates and uses glm to obtain estimates in those cases where no estimates are found.

datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <-  file.path(datadir,
                          "MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
mqdata <- list()
mqdata$data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
length(unique(mqdata$data$proteins))
## [1] 5295
mqdata$config <- prolfqua::create_config_MQ_peptide()

annotation <- readxl::read_xlsx(inputAnnotation)

data <- dplyr::inner_join(
  mqdata$data,
  annotation,
  by = "raw.file"
)

mqdata$config$table$factors[["dilution."]] = "sample"
mqdata$config$table$factors[["run_Id"]] = "run_ID"
mqdata$config$table$factorDepth <- 1
mqdata$data <- prolfqua::setup_analysis(data, mqdata$config)


lfqdata <- prolfqua::LFQData$new(mqdata$data, mqdata$config)

Filter the data for small intensities (maxquant reports missing values as 0) and for two peptides per protein.

lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id)) 
lfqdata$filter_proteins_by_peptide_count()
lfqdata$remove_small_intensities()
lfqdata$hierarchy_counts()
## # A tibble: 1 × 3
##   isotope protein_Id peptide_Id
##   <chr>        <int>      <int>
## 1 light         4178      29879
tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqtrans <- lfqAggMedpol$lfq_agg
st <- lfqtrans$get_Stats()
protAbundanceIngroup <- st$stats()
protAbundanceIngroup <- protAbundanceIngroup |>
  tidyr::pivot_wider(
    id_cols = protein_Id,
    names_from = dilution.,
    names_prefix = "abd.",
    values_from = meanAbundance)

To use proDA, we need to create an SummarizedExperiment. We use the to_wide function of prolfqua to get the data in in the SummarizedExperiment compatible format.

Defining Contrasts and computing group comparisons

As usual, two steps are required, first fit the models, then compute the contrasts.

prlm <- msqrobHurdle(pe,
                     i = "protein",
                     formula = ~dilution.,
                     overwrite = TRUE)

Since msqrob does not report average abundances, we are computing them for each contrast.

L <- makeContrast(c("dilution.e-dilution.d=0",
                    "dilution.d-dilution.c=0",
                    "dilution.c-dilution.b=0",
                    "dilution.b=0"),
                  parameterNames = c("dilution.e",
                                     "dilution.d",
                                     "dilution.c",
                                     "dilution.b"))
prlm <- hypothesisTestHurdle(prlm, i = "protein", L, overwrite = TRUE)
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.e.d =  mean( c(abd.e,abd.d), na.rm = TRUE)  )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.d.c =  mean( c(abd.d,abd.c), na.rm = TRUE)  )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.c.b =  mean( c(abd.c,abd.b), na.rm = TRUE)  )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.b.a =  mean( c(abd.b,abd.a), na.rm = TRUE)  )


xx <- rowData(prlm[["protein"]])
hurdle <- xx[grepl("hurdle_",names(xx))]

res <- list()
for (i in names(hurdle)) { 
  hurdle[[i]]$contrast <- i
  res[[i]] <- prolfqua::matrix_to_tibble(hurdle[[i]], preserve_row_names = "name")
}

hurdle <- dplyr::bind_rows(res)

Now we need to merge the results of both models msqrobHurdleIntensity and msqrobHurdleCount. To find out which models were not estimated by msqrobHurdleIntensity we check for NA’s and use the anti_join to select those from the msqrobHurdleCount models.

logFC <- hurdle |> dplyr::select("name","contrast", starts_with("logFC"))
logFC <- filter(logFC ,!is.na(logFCt))
logFC$modelName <- "msqrobHurdleIntensity"
names(logFC) <- c("name","contrast","logFC","se","df","t","pval","modelName")

logOR <- hurdle |> dplyr::select("name","contrast", starts_with("logOR"))
logOR$modelName <- "msqrobHurdleCount"
names(logOR) <- c("name","contrast","logFC","se","df","t","pval","modelName")


ddd <- dplyr::anti_join(logOR , logFC, by = c("name", "contrast"))
all <- dplyr::bind_rows(ddd , logFC) |> dplyr::arrange(contrast, name)
all <- prolfqua::adjust_p_values(all, column = "pval", group_by_col = "contrast")
all$contrast |> unique()
## [1] "hurdle_dilution.b"              "hurdle_dilution.c - dilution.b"
## [3] "hurdle_dilution.d - dilution.c" "hurdle_dilution.e - dilution.d"
protAbundanceIngroup <- protAbundanceIngroup |> 
  dplyr::select(-starts_with("abd")) |> 
  tidyr::pivot_longer(starts_with("avgAbd"), names_to = "contrast" ,values_to = "avgAbd")

protAbundanceIngroup <- protAbundanceIngroup |> 
  mutate(contrast = 
           case_when(contrast == "avgAbd.e.d" ~ "dilution_(9/7.5)_1.2",
                     contrast == "avgAbd.d.c" ~ "dilution_(7.5/6)_1.25",
                     contrast == "avgAbd.c.b" ~ "dilution_(6/4.5)_1.3(3)",
                     contrast == "avgAbd.b.a" ~ "dilution_(4.5/3)_1.5",
                     TRUE ~ "something wrong"))
all <- all |> mutate(contrast = case_when(
  contrast == "hurdle_dilution.e - dilution.d" ~ "dilution_(9/7.5)_1.2",
  contrast == "hurdle_dilution.d - dilution.c" ~ "dilution_(7.5/6)_1.25",
  contrast == "hurdle_dilution.c - dilution.b" ~ "dilution_(6/4.5)_1.3(3)",
  contrast == "hurdle_dilution.b" ~ "dilution_(4.5/3)_1.5",
  TRUE ~ "something wrong"))
stopifnot(sum(all$contrast == "something wrong") == 0 )
stopifnot(sum(all$contrast == "something wrong") == 0 )


bb <- dplyr::inner_join(all, protAbundanceIngroup, by = c("name" = "protein_Id", "contrast" = "contrast"))

Benchmarking

Here we use proflqua benchmark functions to generate some summaries.

ttd <- prolfqua::ionstar_bench_preprocess( bb , idcol = "name" )
benchmark_msqrob <- prolfqua::make_benchmark(ttd$data,
                                             contrast = "contrast",
                                             toscale = c("pval"),
                                             fcestimate = "logFC",
                                             benchmark = list(
                                               list(score = "logFC", desc = TRUE),
                                               list(score = "t", desc = TRUE),
                                               list(score = "scaled.pval", desc = TRUE)
                                             ),  
                                             model_description = "msqrob_QFeature",
                                             model_name = "msqrob_QFeature",
                                             FDRvsFDP = list(list(score = "FDR", desc = FALSE))
                                             , hierarchy = c("name"), summarizeNA = "t"
)

sumarry <- benchmark_msqrob$smc$summary
prolfqua::table_facade(sumarry, caption = "nr of proteins with 0, 1, 2, 3 missing contrasts.")
nr of proteins with 0, 1, 2, 3 missing contrasts.
nr_missing name
0 4178
res <- benchmark_msqrob$pAUC_summaries()
knitr::kable(res$ftable$content,caption = res$ftable$caption)
AUC, and pAUC at 0.1 and 0.2 FPR for (NC) msqrob_QFeature
contrast what AUC pAUC_10 pAUC_20
all logFC 92.63229 66.24404 78.59403
all scaled.pval 93.01754 73.14452 79.60171
all t 92.98309 72.89377 79.43704
dilution_(4.5/3)_1.5 logFC 93.86049 79.46350 85.68809
dilution_(4.5/3)_1.5 scaled.pval 93.72252 79.48732 83.93065
dilution_(4.5/3)_1.5 t 93.67864 79.29457 83.75974
dilution_(6/4.5)_1.3(3) logFC 91.35905 64.93423 76.98337
dilution_(6/4.5)_1.3(3) scaled.pval 91.60129 72.43374 77.82537
dilution_(6/4.5)_1.3(3) t 91.57134 72.21100 77.69018
dilution_(7.5/6)_1.25 logFC 92.34012 59.73226 75.30775
dilution_(7.5/6)_1.25 scaled.pval 92.93888 69.21834 77.32784
dilution_(7.5/6)_1.25 t 92.90068 68.93694 77.16134
dilution_(9/7.5)_1.2 logFC 92.84969 60.08247 76.25803
dilution_(9/7.5)_1.2 scaled.pval 93.74453 71.32533 79.51444
dilution_(9/7.5)_1.2 t 93.72019 71.08318 79.36250
res$barp
ROC curves

ROC curves

#res$ftable
benchmark_msqrob$plot_ROC(xlim = 0.2)
plot ROC curves

plot ROC curves

benchmark_msqrob$plot_FDRvsFDP()
plot FDR vs FDP

plot FDR vs FDP

benchmark_msqrob$plot_precision_recall()