vignettes/Benchmark_MSStats.Rmd
Benchmark_MSStats.Rmd
datadir <- file.path(find.package("prolfquadata"), "quantdata")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
# Contrast matrix shared by both datasets
comparison1 <- matrix(c(0, 0, 0, -1, 1), nrow = 1)
comparison2 <- matrix(c(0, 0, -1, 1, 0), nrow = 1)
comparison3 <- matrix(c(0, -1, 1, 0, 0), nrow = 1)
comparison4 <- matrix(c(-1, 1, 0, 0, 0), nrow = 1)
comparison <- rbind(comparison1, comparison2, comparison3, comparison4)
row.names(comparison) <- c("dilution_(9/7.5)_1.2", "dilution_(7.5/6)_1.25",
"dilution_(6/4.5)_1.3(3)", "dilution_(4.5/3)_1.5")
colnames(comparison) <- c("a", "b", "c", "d", "e")
inputMQfile <- file.path(datadir, "MAXQuant_IonStar2018_PXD003881.zip")
proteinGroups <- read.table(unz(inputMQfile, "proteinGroups.txt"), sep = "\t", header = TRUE)
infile <- read.table(unz(inputMQfile, "evidence.txt"), sep = "\t", header = TRUE)
annotation <- readxl::read_xlsx(inputAnnotation)
msstas_annotation <- annotation |>
dplyr::select(raw.file, Condition = sample, BioReplicate = run_ID)
msstas_annotation$IsotopeLabelType <- "L"
msstas_annotation <- msstas_annotation |>
tidyr::unite("Experiment", Condition, BioReplicate, sep = "_", remove = FALSE)
rf <- unique(infile$Raw.file)
rf <- data.frame(Raw.file = rf, raw.file = tolower(rf))
msstas_annotation <- dplyr::inner_join(rf, msstas_annotation)
msstas_annotation$raw.file <- NULL
raw <- MSstats::MaxQtoMSstatsFormat(
evidence = infile,
annotation = msstas_annotation,
proteinGroups = proteinGroups,
removeProtein_with1Peptide = TRUE,
use_log_file = FALSE,
verbose = FALSE)Remove reverse sequences and contaminants.
invisible(utils::capture.output(
QuantData_mq <- MSstats::dataProcess(raw3, use_log_file = FALSE, verbose = FALSE)
))
invisible(utils::capture.output(
testResult_mq <- MSstats::groupComparison(
contrast.matrix = comparison,
data = QuantData_mq,
verbose = FALSE,
use_log_file = FALSE)
))
bb_mq <- testResult_mq$ComparisonResult
msstats_col_map <- c(
"Protein" = "protein_id",
"Label" = "contrast",
"log2FC" = "log_fc",
"Tvalue" = "t_statistic",
"pvalue" = "p_value",
"adj.pvalue" = "p_value_adjusted"
)
write_contrast_results(
bb_mq,
path = "../inst/Benchresults/MSstats_MaxQuant",
metadata = list(
dataset = "IonStar_MaxQuant",
method = "MSstats",
method_description = "MSstats on MaxQuant evidence.txt",
input_file = "MAXQuant_IonStar2018_PXD003881.zip/evidence.txt",
software_version = paste0("MSstats ", packageVersion("MSstats")),
date = as.character(Sys.Date()),
ground_truth = list(
id_column = "protein_id",
positive = list(label = "ECOLI", pattern = "ECOLI"),
negative = list(label = "HUMAN", pattern = "HUMAN")
)
),
col_map = msstats_col_map
)
inputFragfile <- file.path(datadir, "MSFragger_IonStar2018_PXD003881.zip")
msstats_fp <- read.csv(
unz(inputFragfile, "IonstarWithMSFragger/MSstats.csv"),
header = TRUE, sep = ",", stringsAsFactors = FALSE)
msstats_fp$BioReplicate <- NULL
msstats_fp$Run <- tolower(msstats_fp$Run)
msstats_fp$Condition <- NULL
annotation_fp <- readxl::read_xlsx(inputAnnotation)
annotation_fp$BioReplicate <- annotation_fp$run_ID
annotation_fp$run_ID <- NULL
annotation_fp$date <- NULL
annotation_fp$Condition <- annotation_fp$sample
annotation_fp$sample <- NULL
msstats_fp <- dplyr::inner_join(msstats_fp, annotation_fp, by = c(Run = "raw.file"))
invisible(utils::capture.output(
QuantData_fp <- MSstats::dataProcess(msstats_fp, use_log_file = FALSE, verbose = FALSE)
))
invisible(utils::capture.output(
testResult_fp <- MSstats::groupComparison(
contrast.matrix = comparison,
data = QuantData_fp,
verbose = FALSE,
use_log_file = FALSE)
))
bb_fp <- testResult_fp$ComparisonResult
write_contrast_results(
bb_fp,
path = "../inst/Benchresults/MSstats_FragPipe",
metadata = list(
dataset = "IonStar_FragPipe",
method = "MSstats",
method_description = "MSstats on FragPipe MSstats.csv",
input_file = "MSFragger_IonStar2018_PXD003881.zip/IonstarWithMSFragger/MSstats.csv",
software_version = paste0("MSstats ", packageVersion("MSstats")),
date = as.character(Sys.Date()),
ground_truth = list(
id_column = "protein_id",
positive = list(label = "ECOLI", pattern = "ECOLI"),
negative = list(label = "HUMAN", pattern = "HUMAN")
)
),
col_map = msstats_col_map
)