vignettes/Benchmark_Model_IonStar_With2Factors.Rmd
Benchmark_Model_IonStar_With2Factors.Rmd
TODO(wew,jg): Can you please add a one-liner of the purpose of the vignette?
a <- c(a = 3, b = 4.5, c = 6, d = 7.5, e = 9)
F1 <- list(L1 = 3, L2 = 4.5)
F2 <- list(L1 = 0, L2 = 3)
a <- F1$L1 + F2$L1 # 3
b <- F1$L2 + F2$L1 # 4.5
c <- F1$L1 + F2$L2 # 6
d <- F1$L2 + F2$L2 # 7.5
#(F1L1 - F1L2) gv F2L1 = log2(3/4.5) = -0.585
#(F1L1 - F1L2) gv F2L2 = log2(6/7.5) = -0.321
c(a, b, c, d)
## [1] 3.0 4.5 6.0 7.5
First, we load the data and do the configuration.
datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")
annotation <- readxl::read_xlsx(inputAnnotation)
## Error in get(paste0(generic, ".", class), envir = get_method_env()) :
## object 'type_sum.accel' not found
annotation <- annotation |> dplyr::filter(sample != "e")
annotation <- annotation |>
dplyr::mutate(F1 = dplyr::case_when(sample %in% c("a","c") ~ "L1", TRUE ~ "L2"),
F2 = dplyr::case_when(sample %in% c("a","b") ~ "L1", TRUE ~ "L2")) |>
dplyr::arrange(sample)
datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <- file.path(datadir,
"MAXQuant_IonStar2018_PXD003881.zip")
data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
length(unique(data$proteins))
## [1] 5295
Read the sample annotation. The sample annotation must contain the
raw.file
name and the explanatory variables of your
experiment, e.g. treatment, time point, genetic background, or other
factors which you would like to check for confounding.
Then you need to tell prolfqua
which columns in
the data frame contain what information. You do it using the
AnalysisTableAnnotation
class.
The AnalysisTableAnnotation
has the following fields
that need to be populated: - fileName - hierarchy - factors -
workingIntensity , and we will discuss it in more detail below.
The fileName
is the column with the raw file names,
however for labelled TMT experiments, it can be used to hold the name of
the TMT channel.
The hierarchy
field describes the structure of the MS
data e.g, - protein - peptides - modified peptides - precursor In case
of the MaxQuant peptide.txt
file we have data on protein
level.
In addition, we need to describe the factors
of the
analysis, i.e., the column containing the explanatory variables.
config <- prolfqua::create_config_MQ_peptide()
res <- dplyr::inner_join(
data,
annotation,
by = "raw.file"
)
config$table$factors[["F1."]] = "F1"
config$table$factors[["F2."]] = "F2"
config$table$factorDepth <- 2
data <- prolfqua::setup_analysis(res, config)
lfqdata <- prolfqua::LFQData$new(data, config)
Filter the data for small intensities (maxquant reports missing values as 0) and for two peptides per protein.
lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id))
lfqdata$filter_proteins_by_peptide_count()
lfqdata$remove_small_intensities()
lfqdata$hierarchy_counts()
## # A tibble: 1 × 3
## isotope protein_Id peptide_Id
## <chr> <int> <int>
## 1 light 4176 29762
tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq
lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqTrans <- lfqAggMedpol$lfq_agg
defines the contrasts
lfqTrans$rename_response("abundance")
formula_2_Factors <- prolfqua::strategy_lm("abundance ~ F1. * F2. ")
# specify model definition
modelName <- "Model"
Contrasts <- c("F1.L1_vs_F1.L2" = "F1.L1 - F1.L2",
"F2.L1_vs_F2.L2" = "F2.L1 - F2.L2",
"F1L1_vs_F1L2_gv_F2L1" = "`F1.L1:F2.L1` - `F1.L2:F2.L1`",
"F1L1_vs_F1L2_gv_F2L2" = "`F1.L1:F2.L2` - `F1.L2:F2.L2`",
"doFCinF2L1_differfromF2L2" = "`F1L1_vs_F1L2_gv_F2L1` - `F1L1_vs_F1L2_gv_F2L2`"
)
The following line fits the model.
mod <- prolfqua::build_model(lfqTrans, formula_2_Factors)
mod$anova_histogram(what = "FDR")
## $plot
##
## $name
## [1] "Anova_p.values_Model.pdf"
Examine proteins with a significant interaction between the two factors treatment and batch.
ANOVA <- mod$get_anova()
ANOVA |>
dplyr::filter(factor == "F1.:F2.") |>
dplyr::arrange(FDR) |>
head(5)
## # A tibble: 5 × 10
## protein_Id isSingular nrcoef factor Df Sum.Sq Mean.Sq F.value p.value
## <chr> <lgl> <int> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 sp|P0A7J3|RL10_… FALSE 4 F1.:F… 1 0.174 0.174 51.2 1.15e-5
## 2 sp|P0A905|SLYB_… FALSE 4 F1.:F… 1 0.122 0.122 51.5 1.12e-5
## 3 sp|P0A953|FABB_… FALSE 4 F1.:F… 1 0.404 0.404 49.5 1.37e-5
## 4 sp|P13029|KATG_… FALSE 4 F1.:F… 1 0.0599 0.0599 50.7 1.21e-5
## 5 sp|P0A763|NDK_E… FALSE 4 F1.:F… 1 0.185 0.185 45.8 2.00e-5
## # ℹ 1 more variable: FDR <dbl>
ANOVA$factor |> unique()
## [1] "F1." "F2." "F1.:F2."
protIntSig <- ANOVA |> dplyr::filter(factor == "F1.") |>
dplyr::filter(FDR < 0.1)
protInt <- lfqTrans$get_copy()
protInt$data <- protInt$data[protInt$data$protein_Id %in%
protIntSig$protein_Id, ]
protInt$hierarchy_counts()
## # A tibble: 1 × 2
## isotope protein_Id
## <chr> <int>
## 1 light 415
## [1] 0.9511936
#ggpubr::ggarrange(plotlist = protInt$get_Plotter()$boxplots()$boxplot)
contr <- prolfqua::ContrastsModerated$new(prolfqua::Contrasts$new(mod, Contrasts))
#contr$get_contrasts_sides()
contrdf <- contr$get_contrasts()
The code snippets graph the volcano and ma plot.
plotter <- contr$get_Plotter()
plotter$volcano()
## $FDR
plotter$ma_plot()
lfqTrans$config$table$factorDepth <- 2
# ContrastsMissing$debug("get_contrasts")
contrSimple <- prolfqua::ContrastsMissing$new(lfqdata = lfqTrans,
Contrasts)
contrdfSimple <- contrSimple$get_contrasts()
# na.omit(contrdfSimple)
pl <- contrSimple$get_Plotter()
pl$histogram_diff()
pl$volcano()
## $p.value
##
## $FDR
dim(contr$get_contrasts())
## [1] 20466 13
dim(contrSimple$get_contrasts())
## [1] 20880 20
mergedContrasts <- prolfqua::merge_contrasts_results(prefer = contr, add = contrSimple)$merged
cM <- mergedContrasts$get_Plotter()
plot <- cM$volcano()
plot$FDR
The prolfqua
package is described in [@Wolski2022.06.07.494524].
## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.0.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.0
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Europe/Zurich
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 xfun_0.49 bslib_0.8.0 ggplot2_3.5.1
## [5] htmlwidgets_1.6.4 ggrepel_0.9.6 vctrs_0.6.5 tools_4.4.1
## [9] generics_0.1.3 tibble_3.2.1 pkgconfig_2.0.3 pheatmap_1.0.12
## [13] data.table_1.16.4 RColorBrewer_1.1-3 desc_1.4.3 readxl_1.4.3
## [17] lifecycle_1.0.4 prolfqua_1.3.6 compiler_4.4.1 farver_2.1.2
## [21] stringr_1.5.1 textshaping_0.4.1 progress_1.2.3 munsell_0.5.1
## [25] htmltools_0.5.8.1 sass_0.4.9 yaml_2.3.10 lazyeval_0.2.2
## [29] plotly_4.10.4 pillar_1.10.0 pkgdown_2.1.1 crayon_1.5.3
## [33] jquerylib_0.1.4 tidyr_1.3.1 MASS_7.3-64 prolfquapp_0.0.8
## [37] cachem_1.1.0 tidyselect_1.2.1 conflicted_1.2.0 digest_0.6.37
## [41] stringi_1.8.4 dplyr_1.1.4 purrr_1.0.2 labeling_0.4.3
## [45] forcats_1.0.0 fastmap_1.2.0 grid_4.4.1 colorspace_2.1-1
## [49] cli_3.6.3 magrittr_2.0.3 utf8_1.2.4 withr_3.0.2
## [53] prettyunits_1.2.0 scales_1.3.0 rmarkdown_2.29 httr_1.4.7
## [57] gridExtra_2.3 cellranger_1.1.0 ragg_1.3.3 hms_1.1.3
## [61] memoise_2.0.1 evaluate_1.0.1 knitr_1.49 viridisLite_0.4.2
## [65] dtplyr_1.3.1 rlang_1.1.4 Rcpp_1.0.13-1 glue_1.8.0
## [69] jsonlite_1.8.9 R6_2.5.1 systemfonts_1.1.0 fs_1.6.5