vignettes/Benchmark_SaintExpress.Rmd
Benchmark_SaintExpress.Rmd
SAINT express provides only binaries for WINDOWS and LINUX. Therefore If you are on MacOS this vingette will not render.
if( Sys.info()["sysname"] == "Darwin") {
evalAll = FALSE
}
First read data and annotation.
library(prolfqua)
datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <- file.path(datadir,
"MAXQuant_IonStar2018_PXD003881.zip")
data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
annotation <- readxl::read_xlsx(inputAnnotation)
Create proLFQua
configruation for MQ
peptide.txt
file and annotate data.
config <- prolfqua::create_config_MQ_peptide()
res <- dplyr::inner_join(
data,
annotation,
by = "raw.file"
)
config$table$factors[["dilution."]] = "sample"
config$table$factors[["run_Id"]] = "run_ID"
config$table$factorDepth <- 1
data <- prolfqua::setup_analysis(res, config)
Setup LFQData class, filter, transform and aggregate peptide
intensities: - remove proteins with only one peptide identified
(filter_proteins_by_peptide_count
). - remove small (zero)
intensities (remove_small_intensities
) - log2 transform and
scale intensities (get_Transformer
) - aggregate peptides to
proteins (get_Aggregator
)
lfqdata <- prolfqua::LFQData$new(data,config)
lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id))
lfqdata$filter_proteins_by_peptide_count()
lfqdata$hierarchy_counts()
lfqdata$remove_small_intensities()
tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqtrans <- lfqAggMedpol$lfq_agg
SeintExpress requires specifying a control condition. As control
C
we will use the samples with the lowest E-Coli spike in
concentration (dilution a
). Treatments T
are
samples with higher concentrations (dilution ‘b’, ‘c’, ‘d’, ‘e’).
case_when
.readPeptideFasta
,addProteinLengths
).protein_2localSaint
).runSaint
)
exampleDat <- lfqtrans$data |> dplyr::mutate(CorT = case_when(dilution. == "a" ~ "C", TRUE ~ "T"))
exampleDat$protein_Id <- gsub("~.*","", exampleDat$protein_Id)
# sample protein lengths
Ecolifasta <- system.file("fastaDBs/uniprot-proteome_UP000000625_reviewed_yes.fasta.gz",package = "prolfquadata")
Humanfasta <- system.file("fastaDBs/uniprot-proteome_UP000005640_reviewed_yes.fasta.gz",package = "prolfquadata")
Ecolifasta <- prozor::readPeptideFasta(Ecolifasta)
Humanfasta <- prozor::readPeptideFasta(Humanfasta)
fasta <- c(Ecolifasta, Humanfasta)
exampleDat <- prolfqua::addProteinLengths(exampleDat, fasta)
res <- protein_2localSaint(exampleDat,quantcolumn = "medpolish",
proteinID = "protein_Id",
proteinLength = "protein.length",
IP_name = "raw.file",
baitCol = "dilution.",
CorTCol = "CorT"
)
stopifnot(names(res) == c( "inter", "prey", "bait"))
resSaint <- runSaint(res,filedir = tempdir())
names(resSaint)
ctr <- prolfqua::ContrastsSaintExpress$new(resSaint$list)
pl <- ctr$get_Plotter()
pl$score_plot()$SaintScore
pl$histogram()
We benchmark SaintExpress using the Ionstar dataset. We are using here the lowest ECOLI spike in concentration as control.
bb <- ctr$get_contrasts()
bb <- mutate(bb , PEP = 1-SaintScore)
ttd <- prolfqua::ionstar_bench_preprocess( bb , idcol = "Prey" )
benchmark_SaintExpres <- prolfqua::make_benchmark(ttd$data,
contrast = "Bait",
fcestimate = "log2FC",
toscale = NULL,
benchmark = list(
list(score = "log2FC", desc = TRUE),
list(score = "SaintScore", desc = TRUE),
list(score = "BFDR", desc = FALSE)
),
model_description = "SaintExpress_medpolishInt",
model_name = "SaintExpress_medpolishInt",
FDRvsFDP = list(list(score = "BFDR", desc = FALSE))
, hierarchy = c("Prey"), summarizeNA = "SaintScore"
)
colnames(ttd$data)
sum(benchmark_SaintExpres$smc$summary$Prey)
sumarry <- benchmark_SaintExpres$smc$summary
prolfqua::table_facade(sumarry, caption = "nr of proteins with 0, 1, 2, 3 missing contrasts.")
res <- benchmark_SaintExpres$pAUC_summaries()
knitr::kable(res$ftable$content,caption = res$ftable$caption)
res$barp
#res$ftable
benchmark_SaintExpres$plot_ROC(xlim = 0.2)
benchmark_SaintExpres$plot_FDRvsFDP()
benchmark_SaintExpres$plot_precision_recall()
There are several problems with the results produced by SaintExpress:
SaintExpress infers several statistics. - the SaintScore - Saint Probability, vaguely - probability that the protein is a true interactor. - the BFDR Bayesian FDR.
The following article [http://varianceexplained.org/r/bayesian_fdr_baseball/] describes how the BFDR can be derived from the posterior error probability (PEP). The BFDR equals the cumulative mean of the PEP.
SaintExpress does not report the PEP. But if the SaintScore is the probability that a protein is an interactor then, can be interpreted as posterior error probability (that it is not an interactor) and hence we can compute the BFDR:
computeFDR <- function(mdata ){
mdata <- dplyr::mutate(mdata, PEP = 1 - SaintScore)
mdata <- mdata |> arrange(PEP)
mdata <- mdata |> mutate( myFDR = cummean(PEP))
return(mdata)
}
There are two options, either to determine the BFDR for all Baits
(first code snipped) or for each Bait (second one with the
for
loop).
reslist <- resSaint$list
reslist <- computeFDR(reslist)
ggplot(reslist, aes(x = BFDR, y = myFDR)) + geom_point() + geom_abline(slope = 1,color = "red")
ad <- list()
for(i in unique(reslist$Bait)){
print(i)
set <- filter(reslist, Bait == i)
ad[[i]] <- computeFDR(set)
}
ad <- dplyr::bind_rows(ad)
ggplot(ad, aes(x = BFDR, y = myFDR)) + geom_point() + facet_wrap(~Bait) + ggplot2::geom_abline(slope = 1,colour = 2)
The SE BFDR is quite similar to the FDR we estimated from the SaintScore. It also seems that SaintExpress computes the BFDR for all Baits not for each.