Benchmarking the proDA package using the Ionstar Dataset MQ LFQ intensities

proDA benchmark based on maxquant lfq intensities

# Read in MaxQuant files
datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <-  file.path(datadir,
                          "MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")



mqdata <- prolfquapp::tidyMQ_ProteinGroups(inputMQfile)

## Error in get(paste0(generic, ".", class), envir = get_method_env()) : 
##   object 'type_sum.accel' not found

mqdata <- mqdata |> dplyr::filter(!grepl("^REV__|^CON__", proteinID))

annotation <- readxl::read_xlsx(inputAnnotation)
res <- dplyr::inner_join(
  mqdata,
  annotation,
  by = "raw.file"
)
res <- dplyr::filter(res, nr.peptides > 1)

config <- prolfqua::AnalysisTableAnnotation$new()
config$factors[["dilution."]] = "sample"
config$fileName = "raw.file"
config$workIntensity = "mq.protein.lfq.intensity"
config$hierarchy[["proteinID"]] = "proteinID"

config <- prolfqua::AnalysisConfiguration$new(config)



lfqdata <- prolfqua::setup_analysis(res, config)
lfqPROT <- prolfqua::LFQData$new(lfqdata, config)
lfqPROT$remove_small_intensities()

sr <- lfqPROT$get_Summariser()
sr$hierarchy_counts()

## # A tibble: 1 × 2
##   isotopeLabel proteinID
##   <chr>            <int>
## 1 light             4235

Calibrate the data using the human protein subset.

lfqhum <- lfqPROT$get_subset(dplyr::filter(lfqPROT$data, grepl("HUMAN", proteinID)))

tr <- lfqPROT$get_Transformer()
log2data <- tr$log2()$lfq
p1 <- log2data$get_Plotter()$intensity_distribution_density()
tr$robscale_subset(lfqhum$get_Transformer()$log2()$lfq)

lfqtrans <- tr$lfq
pl <- lfqtrans$get_Plotter()

p2 <- pl$intensity_distribution_density()
gridExtra::grid.arrange(p1, p2)

Distribution of intensites before (left panel) and after (right panel) robscaling.

To use proDA, we need to create an SummarizedExperiment, which is easily to be done using the to_wide function of prolfqua.

library(SummarizedExperiment)
wide <- lfqtrans$to_wide(as.matrix = TRUE)
library(SummarizedExperiment)
ann <- data.frame(wide$annotation)
rownames(ann) <- wide$annotation$sampleName

se <- SummarizedExperiment(SimpleList(LFQ=wide$data), colData = ann)

Defining Contrasts and computing group comparisons

As usual, two steps are required, first fit the models, then comptue the contrasts.

fit <- proDA::proDA(se, design = ~ dilution. - 1,data_is_log_transformed = TRUE)
contr <- list()
contr[["dilution_(9/7.5)_1.2"]] <- data.frame(
  contrast = "dilution_(9/7.5)_1.2",
  proDA::test_diff(fit, contrast = "dilution.e - dilution.d"))
contr[["dilution_(7.5/6)_1.25"]] <- data.frame(
  contrast = "dilution_(7.5/6)_1.25",
  proDA::test_diff(fit, contrast = "dilution.d - dilution.c"))
contr[["dilution_(6/4.5)_1.3(3)"]] <- data.frame(
  contrast = "dilution_(6/4.5)_1.3(3)", 
  proDA::test_diff(fit, contrast = "dilution.c - dilution.b"))
contr[["dilution_(4.5/3)_1.5"]] <- data.frame(
  contrast = "dilution_(4.5/3)_1.5", 
  proDA::test_diff(fit, contrast = "dilution.b - dilution.a" ))

bb <- dplyr::bind_rows(contr)

Benchmarking

bb$name |> unique() |> length()

## [1] 4235

ttd <- prolfqua::ionstar_bench_preprocess( bb , idcol = "name" )
benchmark_proDA <- prolfqua::make_benchmark(ttd$data,
                                            contrast = "contrast",
                                            avgInt = "avg_abundance",
                                            toscale = c("pval"),
                                            fcestimate = "diff",
                                            benchmark = list(
                                              list(score = "diff", desc = TRUE),
                                              list(score = "t_statistic", desc = TRUE),
                                              list(score = "scaled.pval", desc = TRUE)
                                            ),  
                                            model_description = "proDA_lfqInt",
                                            model_name = "proDA_lfqInt",
                                            FDRvsFDP = list(list(score = "adj_pval", desc = FALSE))
                                            , hierarchy = c("name"), summarizeNA = "t_statistic"
)

sum(benchmark_proDA$smc$summary$name)

## [1] 4235

sumarry <- benchmark_proDA$smc$summary
prolfqua::table_facade(sumarry, caption = "nr of proteins with 0, 1, 2, 3 missing contrasts.")

nr of proteins with 0, 1, 2, 3 missing contrasts.
nr_missing	name
0	4229
4	6

res <- benchmark_proDA$pAUC_summaries()
knitr::kable(res$ftable$content,caption = res$ftable$caption)

AUC, and pAUC at 0.1 and 0.2 FPR for (NC) proDA_lfqInt
contrast	what	AUC	pAUC_10	pAUC_20
all	diff	87.09803	49.73144	65.37793
all	scaled.pval	88.72228	65.75249	70.89996
all	t_statistic	88.72228	65.75249	70.89996
dilution_(4.5/3)_1.5	diff	86.65613	42.39515	58.71656
dilution_(4.5/3)_1.5	scaled.pval	87.89895	63.23340	67.08660
dilution_(4.5/3)_1.5	t_statistic	87.89895	63.23340	67.08660
dilution_(6/4.5)_1.3(3)	diff	84.53858	51.30062	64.70244
dilution_(6/4.5)_1.3(3)	scaled.pval	87.54415	64.41084	69.19538
dilution_(6/4.5)_1.3(3)	t_statistic	87.54415	64.41084	69.19538
dilution_(7.5/6)_1.25	diff	87.03299	48.36020	64.93879
dilution_(7.5/6)_1.25	scaled.pval	88.66216	64.50954	70.58000
dilution_(7.5/6)_1.25	t_statistic	88.66216	64.50954	70.58000
dilution_(9/7.5)_1.2	diff	89.94016	58.26534	73.48897
dilution_(9/7.5)_1.2	scaled.pval	90.79159	70.18498	76.43741
dilution_(9/7.5)_1.2	t_statistic	90.79159	70.18498	76.43741

res$barp

ROC curves

#res$ftable
benchmark_proDA$plot_ROC(xlim = 0.2)

plot ROC curves

benchmark_proDA$plot_FDRvsFDP()

plot FDR vs FDP

benchmark_proDA$plot_precision_recall()

Witold E Wolski

2025-01-09

proDA benchmark based on maxquant lfq intensities

Defining Contrasts and computing group comparisons

Benchmarking