vignettes/BenchmarkMSqRob2.Rmd
BenchmarkMSqRob2.Rmd
## Error in get(paste0(generic, ".", class), envir = get_method_env()) :
## object 'type_sum.accel' not found
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: MultiAssayExperiment
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
##
## Attaching package: 'matrixStats'
##
## The following object is masked from 'package:dplyr':
##
## count
##
##
## Attaching package: 'MatrixGenerics'
##
## The following objects are masked from 'package:matrixStats':
##
## colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
## colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
## colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
## colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
## colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
## colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
## colWeightedMeans, colWeightedMedians, colWeightedSds,
## colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
## rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
## rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
## rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
## rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
## rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
## rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
## rowWeightedSds, rowWeightedVars
##
## Loading required package: GenomicRanges
## Loading required package: stats4
## Loading required package: BiocGenerics
##
## Attaching package: 'BiocGenerics'
##
## The following object is masked from 'package:limma':
##
## plotMA
##
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
##
## The following objects are masked from 'package:dplyr':
##
## combine, intersect, setdiff, union
##
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
##
## The following objects are masked from 'package:base':
##
## anyDuplicated, aperm, append, as.data.frame, basename, cbind,
## colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
## get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
## match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
## Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
## table, tapply, union, unique, unsplit, which.max, which.min
##
## Loading required package: S4Vectors
##
## Attaching package: 'S4Vectors'
##
## The following objects are masked from 'package:lubridate':
##
## second, second<-
##
## The following objects are masked from 'package:dplyr':
##
## first, rename
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## The following object is masked from 'package:utils':
##
## findMatches
##
## The following objects are masked from 'package:base':
##
## expand.grid, I, unname
##
## Loading required package: IRanges
## Warning: package 'IRanges' was built under R version 4.4.2
##
## Attaching package: 'IRanges'
##
## The following object is masked from 'package:lubridate':
##
## %within%
##
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
##
## The following object is masked from 'package:purrr':
##
## reduce
##
## Loading required package: GenomeInfoDb
## Warning: package 'GenomeInfoDb' was built under R version 4.4.2
## Loading required package: Biobase
## Welcome to Bioconductor
##
## Vignettes contain introductory material; view with
## 'browseVignettes()'. To cite Bioconductor, see
## 'citation("Biobase")', and for packages 'citation("pkgname")'.
##
##
## Attaching package: 'Biobase'
##
## The following object is masked from 'package:MatrixGenerics':
##
## rowMedians
##
## The following objects are masked from 'package:matrixStats':
##
## anyMissing, rowMedians
##
##
## Attaching package: 'QFeatures'
##
## The following object is masked from 'package:MultiAssayExperiment':
##
## longFormat
##
## The following object is masked from 'package:base':
##
## sweep
## Warning: package 'msqrob2' was built under R version 4.4.2
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:IRanges':
##
## slice
##
## The following object is masked from 'package:S4Vectors':
##
## rename
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:Biobase':
##
## combine
##
## The following object is masked from 'package:BiocGenerics':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
evalAll <- require(proDA)
## Loading required package: proDA
SAVE = TRUE
Here we benchmark the msqrob2 package using the
peptide.txt
intensities and median polish
We will be examining the msqrobHurdle
method, which does
produce inference using rlm
from protein intensity
estimates and uses glm
to obtain estimates in those cases
where no estimates are found.
datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <- file.path(datadir,
"MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
mqdata <- list()
mqdata$data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
length(unique(mqdata$data$proteins))
## [1] 5295
mqdata$config <- prolfqua::create_config_MQ_peptide()
annotation <- readxl::read_xlsx(inputAnnotation)
data <- dplyr::inner_join(
mqdata$data,
annotation,
by = "raw.file"
)
mqdata$config$table$factors[["dilution."]] = "sample"
mqdata$config$table$factors[["run_Id"]] = "run_ID"
mqdata$config$table$factorDepth <- 1
mqdata$data <- prolfqua::setup_analysis(data, mqdata$config)
lfqdata <- prolfqua::LFQData$new(mqdata$data, mqdata$config)
Filter the data for small intensities (maxquant reports missing values as 0) and for two peptides per protein.
lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id))
lfqdata$filter_proteins_by_peptide_count()
lfqdata$remove_small_intensities()
lfqdata$hierarchy_counts()
## # A tibble: 1 × 3
## isotope protein_Id peptide_Id
## <chr> <int> <int>
## 1 light 4178 29879
tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqtrans <- lfqAggMedpol$lfq_agg
st <- lfqtrans$get_Stats()
protAbundanceIngroup <- st$stats()
protAbundanceIngroup <- protAbundanceIngroup |>
tidyr::pivot_wider(
id_cols = protein_Id,
names_from = dilution.,
names_prefix = "abd.",
values_from = meanAbundance)
To use proDA
, we need to create an
SummarizedExperiment
. We use the to_wide
function of prolfqua
to get the data in in the
SummarizedExperiment
compatible format.
As usual, two steps are required, first fit the models, then compute the contrasts.
prlm <- msqrobHurdle(pe,
i = "protein",
formula = ~dilution.,
overwrite = TRUE)
Since msqrob does not report average abundances, we are computing them for each contrast.
L <- makeContrast(c("dilution.e-dilution.d=0",
"dilution.d-dilution.c=0",
"dilution.c-dilution.b=0",
"dilution.b=0"),
parameterNames = c("dilution.e",
"dilution.d",
"dilution.c",
"dilution.b"))
prlm <- hypothesisTestHurdle(prlm, i = "protein", L, overwrite = TRUE)
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.e.d = mean( c(abd.e,abd.d), na.rm = TRUE) )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.d.c = mean( c(abd.d,abd.c), na.rm = TRUE) )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.c.b = mean( c(abd.c,abd.b), na.rm = TRUE) )
protAbundanceIngroup <- protAbundanceIngroup |> dplyr::mutate( avgAbd.b.a = mean( c(abd.b,abd.a), na.rm = TRUE) )
xx <- rowData(prlm[["protein"]])
hurdle <- xx[grepl("hurdle_",names(xx))]
res <- list()
for (i in names(hurdle)) {
hurdle[[i]]$contrast <- i
res[[i]] <- prolfqua::matrix_to_tibble(hurdle[[i]], preserve_row_names = "name")
}
hurdle <- dplyr::bind_rows(res)
Now we need to merge the results of both models msqrobHurdleIntensity
and msqrobHurdleCount. To find out which models were not estimated by
msqrobHurdleIntensity
we check for NA’s and use the
anti_join
to select those from the
msqrobHurdleCount
models.
logFC <- hurdle |> dplyr::select("name","contrast", starts_with("logFC"))
logFC <- filter(logFC ,!is.na(logFCt))
logFC$modelName <- "msqrobHurdleIntensity"
names(logFC) <- c("name","contrast","logFC","se","df","t","pval","modelName")
logOR <- hurdle |> dplyr::select("name","contrast", starts_with("logOR"))
logOR$modelName <- "msqrobHurdleCount"
names(logOR) <- c("name","contrast","logFC","se","df","t","pval","modelName")
ddd <- dplyr::anti_join(logOR , logFC, by = c("name", "contrast"))
all <- dplyr::bind_rows(ddd , logFC) |> dplyr::arrange(contrast, name)
all <- prolfqua::adjust_p_values(all, column = "pval", group_by_col = "contrast")
all$contrast |> unique()
## [1] "hurdle_dilution.b" "hurdle_dilution.c - dilution.b"
## [3] "hurdle_dilution.d - dilution.c" "hurdle_dilution.e - dilution.d"
protAbundanceIngroup <- protAbundanceIngroup |>
dplyr::select(-starts_with("abd")) |>
tidyr::pivot_longer(starts_with("avgAbd"), names_to = "contrast" ,values_to = "avgAbd")
protAbundanceIngroup <- protAbundanceIngroup |>
mutate(contrast =
case_when(contrast == "avgAbd.e.d" ~ "dilution_(9/7.5)_1.2",
contrast == "avgAbd.d.c" ~ "dilution_(7.5/6)_1.25",
contrast == "avgAbd.c.b" ~ "dilution_(6/4.5)_1.3(3)",
contrast == "avgAbd.b.a" ~ "dilution_(4.5/3)_1.5",
TRUE ~ "something wrong"))
all <- all |> mutate(contrast = case_when(
contrast == "hurdle_dilution.e - dilution.d" ~ "dilution_(9/7.5)_1.2",
contrast == "hurdle_dilution.d - dilution.c" ~ "dilution_(7.5/6)_1.25",
contrast == "hurdle_dilution.c - dilution.b" ~ "dilution_(6/4.5)_1.3(3)",
contrast == "hurdle_dilution.b" ~ "dilution_(4.5/3)_1.5",
TRUE ~ "something wrong"))
stopifnot(sum(all$contrast == "something wrong") == 0 )
stopifnot(sum(all$contrast == "something wrong") == 0 )
bb <- dplyr::inner_join(all, protAbundanceIngroup, by = c("name" = "protein_Id", "contrast" = "contrast"))
Here we use proflqua
benchmark functions to generate
some summaries.
ttd <- prolfqua::ionstar_bench_preprocess( bb , idcol = "name" )
benchmark_msqrob <- prolfqua::make_benchmark(ttd$data,
contrast = "contrast",
toscale = c("pval"),
fcestimate = "logFC",
benchmark = list(
list(score = "logFC", desc = TRUE),
list(score = "t", desc = TRUE),
list(score = "scaled.pval", desc = TRUE)
),
model_description = "msqrob_QFeature",
model_name = "msqrob_QFeature",
FDRvsFDP = list(list(score = "FDR", desc = FALSE))
, hierarchy = c("name"), summarizeNA = "t"
)
sumarry <- benchmark_msqrob$smc$summary
prolfqua::table_facade(sumarry, caption = "nr of proteins with 0, 1, 2, 3 missing contrasts.")
nr_missing | name |
---|---|
0 | 4178 |
res <- benchmark_msqrob$pAUC_summaries()
knitr::kable(res$ftable$content,caption = res$ftable$caption)
contrast | what | AUC | pAUC_10 | pAUC_20 |
---|---|---|---|---|
all | logFC | 92.63229 | 66.24404 | 78.59403 |
all | scaled.pval | 93.01754 | 73.14452 | 79.60171 |
all | t | 92.98309 | 72.89377 | 79.43704 |
dilution_(4.5/3)_1.5 | logFC | 93.86049 | 79.46350 | 85.68809 |
dilution_(4.5/3)_1.5 | scaled.pval | 93.72252 | 79.48732 | 83.93065 |
dilution_(4.5/3)_1.5 | t | 93.67864 | 79.29457 | 83.75974 |
dilution_(6/4.5)_1.3(3) | logFC | 91.35905 | 64.93423 | 76.98337 |
dilution_(6/4.5)_1.3(3) | scaled.pval | 91.60129 | 72.43374 | 77.82537 |
dilution_(6/4.5)_1.3(3) | t | 91.57134 | 72.21100 | 77.69018 |
dilution_(7.5/6)_1.25 | logFC | 92.34012 | 59.73226 | 75.30775 |
dilution_(7.5/6)_1.25 | scaled.pval | 92.93888 | 69.21834 | 77.32784 |
dilution_(7.5/6)_1.25 | t | 92.90068 | 68.93694 | 77.16134 |
dilution_(9/7.5)_1.2 | logFC | 92.84969 | 60.08247 | 76.25803 |
dilution_(9/7.5)_1.2 | scaled.pval | 93.74453 | 71.32533 | 79.51444 |
dilution_(9/7.5)_1.2 | t | 93.72019 | 71.08318 | 79.36250 |
res$barp
#res$ftable
benchmark_msqrob$plot_ROC(xlim = 0.2)
benchmark_msqrob$plot_FDRvsFDP()
benchmark_msqrob$plot_precision_recall()