Benchmarking the proDA package using the Ionstar Dataset starting from peptides

knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
evalAll <- require(proDA)

## Loading required package: proDA

SAVE = TRUE

proDA benchmark based on peptide.txt intensities and median polish

datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <-  file.path(datadir,
                          "MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
mqdata <- list()
mqdata$data <- prolfquapp::tidyMQ_Peptides(inputMQfile)

## Error in get(paste0(generic, ".", class), envir = get_method_env()) : 
##   object 'type_sum.accel' not found

length(unique(mqdata$data$proteins))

## [1] 5295

mqdata$config <- prolfqua::create_config_MQ_peptide()

annotation <- readxl::read_xlsx(inputAnnotation)
res <- dplyr::inner_join(
  mqdata$data,
  annotation,
  by = "raw.file"
)

mqdata$config$table$factors[["dilution."]] = "sample"
mqdata$config$table$factors[["run_Id"]] = "run_ID"
mqdata$config$table$factorDepth <- 1
mqdata$data <- prolfqua::setup_analysis(res, mqdata$config)


lfqdata <- prolfqua::LFQData$new(mqdata$data, mqdata$config)

Filter the data for small intensities (maxquant reports missing values as 0) and for two peptides per protein.

lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id)) 
lfqdata$filter_proteins_by_peptide_count()
lfqdata$remove_small_intensities()
lfqdata$hierarchy_counts()

## # A tibble: 1 × 3
##   isotope protein_Id peptide_Id
##   <chr>        <int>      <int>
## 1 light         4178      29879

tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq

lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqtrans <- lfqAggMedpol$lfq_agg

To use proDA, we need to create an SummarizedExperiment. We use the to_wide function of prolfqua to get the data in in the SummarizedExperiment compatible format.

se <- prolfqua::LFQDataToSummarizedExperiment(lfqtrans)

Defining Contrasts and computing group comparisons

As usual, two steps are required, first fit the models, then comptue the contrasts.

fit <- proDA::proDA(se, design = ~ dilution. - 1,data_is_log_transformed = TRUE)
contr <- list()
contr[["dilution_(9/7.5)_1.2"]] <- data.frame(
  contrast = "dilution_(9/7.5)_1.2",
  proDA::test_diff(fit, contrast = "dilution.e - dilution.d"))
contr[["dilution_(7.5/6)_1.25"]] <- data.frame(
  contrast = "dilution_(7.5/6)_1.25",
  proDA::test_diff(fit, contrast = "dilution.d - dilution.c"))
contr[["dilution_(6/4.5)_1.3(3)"]] <- data.frame(
  contrast = "dilution_(6/4.5)_1.3(3)", 
  proDA::test_diff(fit, contrast = "dilution.c - dilution.b"))
contr[["dilution_(4.5/3)_1.5"]] <- data.frame(
  contrast = "dilution_(4.5/3)_1.5", 
  proDA::test_diff(fit, contrast = "dilution.b - dilution.a" ))

bb <- dplyr::bind_rows(contr)

Benchmarking

bb$name |> unique() |> length()

## [1] 4178

ttd <- prolfqua::ionstar_bench_preprocess( bb , idcol = "name" )

benchmark_proDA <- prolfqua::make_benchmark(ttd$data,
                                            contrast = "contrast",
                                            toscale = c("pval"),
                                            fcestimate = "diff",
                                            benchmark = list(
                                              list(score = "diff", desc = TRUE),
                                              list(score = "t_statistic", desc = TRUE),
                                              list(score = "scaled.pval", desc = TRUE)
                                            ),  
                                            model_description = "proDA_medpolishInt",
                                            model_name = "proDA_medpolishInt",
                                            FDRvsFDP = list(list(score = "adj_pval", desc = FALSE))
                                            , hierarchy = c("name"), summarizeNA = "t_statistic"
)

sum(benchmark_proDA$smc$summary$name)

## [1] 4178

sumarry <- benchmark_proDA$smc$summary
prolfqua::table_facade(sumarry, caption = "nr of proteins with 0, 1, 2, 3 missing contrasts.")

nr of proteins with 0, 1, 2, 3 missing contrasts.
nr_missing	name
0	4178

res <- benchmark_proDA$pAUC_summaries()
knitr::kable(res$ftable$content,caption = res$ftable$caption)

AUC, and pAUC at 0.1 and 0.2 FPR for (NC) proDA_medpolishInt
contrast	what	AUC	pAUC_10	pAUC_20
all	diff	92.87096	66.41758	78.70558
all	scaled.pval	92.98494	72.56085	79.12681
all	t_statistic	92.98494	72.56085	79.12681
dilution_(4.5/3)_1.5	diff	94.61194	81.30629	86.53539
dilution_(4.5/3)_1.5	scaled.pval	94.19345	81.52859	85.09507
dilution_(4.5/3)_1.5	t_statistic	94.19345	81.52859	85.09507
dilution_(6/4.5)_1.3(3)	diff	92.05863	66.72145	78.40251
dilution_(6/4.5)_1.3(3)	scaled.pval	92.10508	72.99698	78.92297
dilution_(6/4.5)_1.3(3)	t_statistic	92.10508	72.99698	78.92297
dilution_(7.5/6)_1.25	diff	91.74305	58.60092	74.26792
dilution_(7.5/6)_1.25	scaled.pval	92.07200	67.19075	75.01046
dilution_(7.5/6)_1.25	t_statistic	92.07200	67.19075	75.01046
dilution_(9/7.5)_1.2	diff	92.94016	57.98739	75.34832
dilution_(9/7.5)_1.2	scaled.pval	93.50843	68.18403	77.57506
dilution_(9/7.5)_1.2	t_statistic	93.50843	68.18403	77.57506

res$barp

ROC curves

#res$ftable
benchmark_proDA$plot_ROC(xlim = 0.2)

plot ROC curves

benchmark_proDA$plot_FDRvsFDP()

plot FDR vs FDP

benchmark_proDA$plot_precision_recall()

Witold E Wolski

2025-01-09

proDA benchmark based on peptide.txt intensities and median polish

Defining Contrasts and computing group comparisons

Benchmarking