vignettes/Benchmark_prolfqua.Rmd
Benchmark_prolfqua.RmdWe benchmark the prolfqua ContrastsLMMissingFacade (LM +
missing-value merge + variance moderation) and
ContrastsLimmaFacade (Limma empirical Bayes) methods across
four datasets.
ionstar_ground_truth <- list(
id_column = "protein_id",
positive = list(label = "ECOLI", pattern = "ECOLI"),
negative = list(label = "HUMAN", pattern = "HUMAN")
)
ionstar_contrasts <- c(
"dilution_(9/7.5)_1.2" = "dilution.e - dilution.d",
"dilution_(7.5/6)_1.25" = "dilution.d - dilution.c",
"dilution_(6/4.5)_1.3(3)" = "dilution.c - dilution.b",
"dilution_(4.5/3)_1.5" = "dilution.b - dilution.a"
)
datadir <- file.path(find.package("prolfquadata"), "quantdata")
inputMQfile <- file.path(datadir, "MAXQuant_IonStar2018_PXD003881.zip")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
#' Run prolfqua benchmark with both Facade methods
#'
#' @param lfqdata_protein protein-level LFQData (log2-transformed, normalized)
#' @param formula_str model formula as character (e.g. "~ dilution.")
#' @param contrasts named character vector of contrasts
#' @param path_prefix directory prefix for output (e.g. "../inst/Benchresults/prolfqua_IonStar_MQ")
#' @param dataset_name dataset identifier for metadata
#' @param input_file input file description for metadata
#' @param ground_truth ground truth specification list
run_prolfqua_benchmark <- function(lfqdata_protein, formula_str, contrasts,
path_prefix, dataset_name, input_file,
ground_truth) {
sw_version <- paste0("prolfqua ", packageVersion("prolfqua"))
results <- list()
# LM + missing + moderation
fa_lm <- prolfqua::ContrastsLMMissingFacade$new(
lfqdata_protein, formula_str, contrasts)
results$lm <- fa_lm$get_contrasts()
write_contrast_results(
results$lm,
path = paste0(path_prefix, "_lm"),
metadata = list(
dataset = dataset_name,
method = "prolfqua_lm",
method_description = "prolfqua LM + missing merge + moderation",
input_file = input_file,
software_version = sw_version,
date = as.character(Sys.Date()),
ground_truth = ground_truth))
# Limma empirical Bayes
fa_limma <- prolfqua::ContrastsLimmaFacade$new(
lfqdata_protein, formula_str, contrasts)
results$limma <- fa_limma$get_contrasts()
write_contrast_results(
results$limma,
path = paste0(path_prefix, "_limma"),
metadata = list(
dataset = dataset_name,
method = "prolfqua_limma",
method_description = "prolfqua Limma empirical Bayes",
input_file = input_file,
software_version = sw_version,
date = as.character(Sys.Date()),
ground_truth = ground_truth))
results
}
mqdata <- list()
mqdata$data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
mqdata$config <- prolfqua::create_config_MQ_peptide()
annotation <- readxl::read_xlsx(inputAnnotation)
res <- dplyr::inner_join(mqdata$data, annotation, by = "raw.file")
mqdata$config$table$factors[["dilution."]] <- "sample"
mqdata$config$table$factors[["run_Id"]] <- "run_ID"
mqdata$config$table$factorDepth <- 1
mqdata$data <- prolfqua::setup_analysis(res, mqdata$config)
lfqdata_mq <- prolfqua::LFQData$new(mqdata$data, mqdata$config)
lfqdata_mq$data <- lfqdata_mq$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id))
lfqdata_mq$filter_proteins_by_peptide_count()
lfqdata_mq$remove_small_intensities()
tr <- lfqdata_mq$get_Transformer()
subset_h <- lfqdata_mq$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNorm_mq <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
agg_mq <- lfqdataNorm_mq$get_Aggregator()
agg_mq$medpolish()
lfqProt_mq <- agg_mq$lfq_agg
results_mq <- run_prolfqua_benchmark(
lfqdata_protein = lfqProt_mq,
formula_str = "~ dilution.",
contrasts = ionstar_contrasts,
path_prefix = "../inst/Benchresults/prolfqua_IonStar_MQ",
dataset_name = "IonStar_MQ",
input_file = "MAXQuant_IonStar2018_PXD003881.zip/peptides.txt",
ground_truth = ionstar_ground_truth)
inputFragfile <- file.path(datadir, "MSFragger_IonStar2018_PXD003881.zip")
annotation <- readxl::read_xlsx(inputAnnotation)
msstats <- read.csv(
unz(inputFragfile, "IonstarWithMSFragger/MSstats.csv"),
header = TRUE, sep = ",", stringsAsFactors = FALSE)
msstats <- tibble::tibble(msstats)
msstats$Run <- tolower(msstats$Run)
merged <- dplyr::inner_join(annotation, msstats, by = c("raw.file" = "Run"))
summedPep <- merged |>
dplyr::group_by(raw.file, run_ID, sample, ProteinName, PeptideSequence) |>
dplyr::summarize(nprecursor = dplyr::n(),
pepIntensity = sum(Intensity, na.rm = TRUE),
.groups = "drop")
atable <- prolfqua::AnalysisConfiguration$new()
atable$fileName <- "raw.file"
atable$hierarchy[["protein_Id"]] <- c("ProteinName")
atable$hierarchy[["peptide_Id"]] <- c("PeptideSequence")
atable$hierarchyDepth <- 1
atable$set_response("pepIntensity")
atable$factors[["dilution."]] <- "sample"
atable$factors[["run"]] <- "run_ID"
atable$factorDepth <- 1
config_fp <- prolfqua::AnalysisConfiguration$new(atable)
adata_fp <- prolfqua::setup_analysis(summedPep, config_fp)
lfqdata_fp <- prolfqua::LFQData$new(adata_fp, config_fp)
lfqdata_fp$filter_proteins_by_peptide_count()
lfqdata_fp$remove_small_intensities()
subset_h <- lfqdata_fp$get_copy()$get_Transformer()$log2()$lfq
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
tr <- lfqdata_fp$get_Transformer()
lfqdataNorm_fp <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
agg_fp <- lfqdataNorm_fp$get_Aggregator()
agg_fp$medpolish()
lfqProt_fp <- agg_fp$lfq_agg
results_fp <- run_prolfqua_benchmark(
lfqdata_protein = lfqProt_fp,
formula_str = "~ dilution.",
contrasts = ionstar_contrasts,
path_prefix = "../inst/Benchresults/prolfqua_IonStar_FragPipe_MSstatsCSV",
dataset_name = "IonStar_FragPipe_MSstatsCSV",
input_file = "MSFragger_IonStar2018_PXD003881.zip/IonstarWithMSFragger/MSstats.csv",
ground_truth = ionstar_ground_truth)This dataset is already at protein level, so no peptide aggregation is needed.
annotation <- readxl::read_xlsx(inputAnnotation)
protein <- tibble::as_tibble(read.csv(
unz(inputFragfile, "IonstarWithMSFragger/combined_protein.tsv"),
header = TRUE, sep = "\t", stringsAsFactors = FALSE))
protein <- prolfquapp::tidy_FragPipe_combined_protein_deprec(protein)
protein <- protein |> dplyr::filter(unique.stripped.peptides > 1)
merged <- dplyr::inner_join(annotation, protein)
atable3 <- prolfqua::AnalysisConfiguration$new()
atable3$fileName <- "raw.file"
atable3$hierarchy[["protein_Id"]] <- c("protein")
atable3$hierarchyDepth <- 1
atable3$set_response("razor.intensity")
atable3$factors[["dilution."]] <- "sample"
atable3$factors[["run"]] <- "run_ID"
atable3$factorDepth <- 1
config3 <- prolfqua::AnalysisConfiguration$new(atable3)
adata3 <- prolfqua::setup_analysis(merged, config3)
lfqdata3 <- prolfqua::LFQData$new(adata3, config3)
lfqdata3$remove_small_intensities()
subset_h <- lfqdata3$get_copy()$get_Transformer()$log2()$lfq
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
tr <- lfqdata3$get_Transformer()
lfqNorm3 <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
results_cp <- run_prolfqua_benchmark(
lfqdata_protein = lfqNorm3,
formula_str = "~ dilution.",
contrasts = ionstar_contrasts,
path_prefix = "../inst/Benchresults/prolfqua_IonStar_FragPipe_combined_protein",
dataset_name = "IonStar_FragPipe_combined_protein",
input_file = "MSFragger_IonStar2018_PXD003881.zip/IonstarWithMSFragger/combined_protein.tsv",
ground_truth = ionstar_ground_truth)
evalCPTAC <- requireNamespace("msdata", quietly = TRUE)
tmp <- msdata::quant(full.names = TRUE)
xx <- read.csv(tmp, sep = "\t")
peptides <- prolfquapp::tidyMQ_Peptides(xx)
annotation_cptac <- data.frame(raw.file = unique(peptides$raw.file))
annotation_cptac <- annotation_cptac |>
dplyr::mutate(group = gsub("^6", "", raw.file)) |>
dplyr::mutate(group = gsub("_[0-9]$", "", group))
peptides <- dplyr::inner_join(annotation_cptac, peptides)
config_cptac <- prolfqua::create_config_MQ_peptide()
config_cptac$table$factors["group."] <- "group"
peptides <- prolfqua::setup_analysis(peptides, config_cptac)
lfqpeptides <- prolfqua::LFQData$new(peptides, config_cptac)
lfqpeptides$filter_proteins_by_peptide_count()
lfqpeptides$remove_small_intensities()
tr <- lfqpeptides$get_Transformer()
tr <- tr$intensity_matrix(vsn::justvsn)
lfqtransformed <- tr$lfq
agg <- lfqtransformed$get_Aggregator()
agg$medpolish()
lfqProt_cptac <- agg$lfq_agg
cptac_contrasts <- c("b_vs_a" = "group.b - group.a")
cptac_ground_truth <- list(
id_column = "protein_id",
positive = list(label = "UPS", pattern = "_UPS"),
negative = list(label = "YEAST", pattern = "_YEAST")
)
results_cptac <- run_prolfqua_benchmark(
lfqdata_protein = lfqProt_cptac,
formula_str = "~ group.",
contrasts = cptac_contrasts,
path_prefix = "../inst/Benchresults/prolfqua_CPTAC",
dataset_name = "CPTAC",
input_file = "msdata::quant()/peptides.txt",
ground_truth = cptac_ground_truth)