Benchmarking SaintExpress

SAINT express provides only binaries for WINDOWS and LINUX. Therefore If you are on MacOS this vingette will not render.

if( Sys.info()["sysname"] == "Darwin") {
    evalAll = FALSE
}

Importing MQ data into SAINTExpress

First read data and annotation.

library(prolfqua)
datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <-  file.path(datadir,
                          "MAXQuant_IonStar2018_PXD003881.zip")
data <- prolfquapp::tidyMQ_Peptides(inputMQfile)

inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
annotation <- readxl::read_xlsx(inputAnnotation)

Create proLFQua configruation for MQ peptide.txt file and annotate data.

config <- prolfqua::create_config_MQ_peptide()
res <- dplyr::inner_join(
  data,
  annotation,
  by = "raw.file"
)

config$table$factors[["dilution."]] = "sample"
config$table$factors[["run_Id"]] = "run_ID"
config$table$factorDepth <- 1
data <- prolfqua::setup_analysis(res, config)

Setup LFQData class, filter, transform and aggregate peptide intensities: - remove proteins with only one peptide identified (filter_proteins_by_peptide_count). - remove small (zero) intensities (remove_small_intensities) - log2 transform and scale intensities (get_Transformer) - aggregate peptides to proteins (get_Aggregator)

lfqdata <- prolfqua::LFQData$new(data,config)
lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id)) 
lfqdata$filter_proteins_by_peptide_count()
lfqdata$hierarchy_counts()
lfqdata$remove_small_intensities()

tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq

lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqtrans <- lfqAggMedpol$lfq_agg

Prepare protein data for SaintExpress analysis and run SaintExpress

SeintExpress requires specifying a control condition. As control C we will use the samples with the lowest E-Coli spike in concentration (dilution a). Treatments T are samples with higher concentrations (dilution ‘b’, ‘c’, ‘d’, ‘e’).

The code snipped below specifies control and treatment samples using case_when.
adds protein lengths (readPeptideFasta,addProteinLengths).
Creates SaintExpress compatible inputs (protein_2localSaint).
run saintExpress (runSaint)

exampleDat <- lfqtrans$data |> dplyr::mutate(CorT = case_when(dilution. == "a" ~ "C", TRUE ~ "T"))
exampleDat$protein_Id <- gsub("~.*","", exampleDat$protein_Id)

# sample protein lengths


Ecolifasta <- system.file("fastaDBs/uniprot-proteome_UP000000625_reviewed_yes.fasta.gz",package = "prolfquadata")
Humanfasta <- system.file("fastaDBs/uniprot-proteome_UP000005640_reviewed_yes.fasta.gz",package = "prolfquadata")

Ecolifasta <- prozor::readPeptideFasta(Ecolifasta)
Humanfasta <- prozor::readPeptideFasta(Humanfasta)
fasta <- c(Ecolifasta, Humanfasta)
exampleDat <- prolfqua::addProteinLengths(exampleDat, fasta)



res <- protein_2localSaint(exampleDat,quantcolumn = "medpolish",
                           proteinID = "protein_Id",
                           proteinLength = "protein.length",
                           IP_name = "raw.file",
                           baitCol = "dilution.",
                           CorTCol = "CorT"
)
stopifnot(names(res) == c( "inter", "prey",  "bait"))
resSaint <- runSaint(res,filedir = tempdir())

names(resSaint)
ctr <- prolfqua::ContrastsSaintExpress$new(resSaint$list)

pl <- ctr$get_Plotter()
pl$score_plot()$SaintScore
pl$histogram()

Benchmarking

We benchmark SaintExpress using the Ionstar dataset. We are using here the lowest ECOLI spike in concentration as control.

bb <- ctr$get_contrasts()
bb <- mutate(bb , PEP = 1-SaintScore)
ttd <- prolfqua::ionstar_bench_preprocess( bb , idcol = "Prey" )
benchmark_SaintExpres <- prolfqua::make_benchmark(ttd$data,
                                   contrast = "Bait",
                                fcestimate = "log2FC",
                                toscale = NULL,
                                benchmark = list(
                                  list(score = "log2FC", desc = TRUE),    
                                  list(score = "SaintScore", desc = TRUE),
                                  list(score = "BFDR", desc = FALSE)
                                ),  
                                model_description = "SaintExpress_medpolishInt",
                                model_name = "SaintExpress_medpolishInt",
                                FDRvsFDP = list(list(score = "BFDR", desc = FALSE))
, hierarchy = c("Prey"), summarizeNA = "SaintScore"
)
colnames(ttd$data)
sum(benchmark_SaintExpres$smc$summary$Prey)
sumarry <- benchmark_SaintExpres$smc$summary
prolfqua::table_facade(sumarry, caption = "nr of proteins with 0, 1, 2, 3 missing contrasts.")

res <- benchmark_SaintExpres$pAUC_summaries()
knitr::kable(res$ftable$content,caption = res$ftable$caption)
res$barp

#res$ftable
benchmark_SaintExpres$plot_ROC(xlim = 0.2)

benchmark_SaintExpres$plot_FDRvsFDP()

benchmark_SaintExpres$plot_precision_recall()

Conclusions

There are several problems with the results produced by SaintExpress:

First, the fold changes reported, have no relation with the true fold change (see volcano plots)
Secondly, the score with the highest performance (largest pAUC) is the $\log_2(FC)$ . However, since the SaintScore also model the observed variances we would expect to perform better.
Third, the performance of SaintExpress is significantly worse when compared with other algorithms, e.g. limma or PRORA implemented in prolfqua or proDA (see other benchmark vignettes).

Saint Express BFDR.

SaintExpress infers several statistics. - the SaintScore - Saint Probability, vaguely - probability that the protein is a true interactor. - the BFDR Bayesian FDR.

The following article [http://varianceexplained.org/r/bayesian_fdr_baseball/] describes how the BFDR can be derived from the posterior error probability (PEP). The BFDR equals the cumulative mean of the PEP.

SaintExpress does not report the PEP. But if the SaintScore is the probability that a protein is an interactor then, $1 - SaintScore$ can be interpreted as posterior error probability (that it is not an interactor) and hence we can compute the BFDR:

computeFDR <- function(mdata ){
    mdata <- dplyr::mutate(mdata, PEP = 1 - SaintScore)
    mdata <- mdata |> arrange(PEP)
    mdata <- mdata |> mutate( myFDR = cummean(PEP))
    return(mdata)
}

There are two options, either to determine the BFDR for all Baits (first code snipped) or for each Bait (second one with the for loop).

reslist <- resSaint$list

reslist <- computeFDR(reslist)
ggplot(reslist, aes(x = BFDR, y = myFDR)) + geom_point() + geom_abline(slope = 1,color = "red")

ad <- list()
for(i in unique(reslist$Bait)){
    print(i)
    set <- filter(reslist, Bait == i)
    ad[[i]] <- computeFDR(set)
}
ad <- dplyr::bind_rows(ad)

ggplot(ad, aes(x = BFDR, y = myFDR)) + geom_point() + facet_wrap(~Bait) + ggplot2::geom_abline(slope = 1,colour = 2)

The SE BFDR is quite similar to the FDR we estimated from the SaintScore. It also seems that SaintExpress computes the BFDR for all Baits not for each.

Witold E. Wolski

2025-01-09

Importing MQ data into SAINTExpress

Prepare protein data for SaintExpress analysis and run SaintExpress

Benchmarking

Conclusions

Saint Express BFDR.