Modelling with Interactions

TODO(wew,jg): Can you please add a one-liner of the purpose of the vignette?

a <- c(a = 3, b = 4.5, c = 6, d = 7.5, e = 9)

F1 <- list(L1 = 3, L2 = 4.5)
F2 <- list(L1 = 0, L2 = 3) 


a <- F1$L1 + F2$L1 # 3
b <- F1$L2 + F2$L1 # 4.5
c <- F1$L1 + F2$L2 # 6
d <- F1$L2 + F2$L2 # 7.5

#(F1L1 - F1L2) gv F2L1 = log2(3/4.5) = -0.585 
#(F1L1 - F1L2) gv F2L2 = log2(6/7.5) = -0.321 
    
    
c(a, b, c, d)

## [1] 3.0 4.5 6.0 7.5

First, we load the data and do the configuration.

datadir <- file.path(find.package("prolfquadata") , "quantdata")

inputAnnotation <- file.path(datadir, "annotation_Ionstar2018_PXD003881.xlsx")

annotation <- readxl::read_xlsx(inputAnnotation)

## Error in get(paste0(generic, ".", class), envir = get_method_env()) : 
##   object 'type_sum.accel' not found

annotation <- annotation |> dplyr::filter(sample != "e")
annotation <- annotation |> 
  dplyr::mutate(F1 = dplyr::case_when(sample %in% c("a","c") ~ "L1", TRUE ~ "L2"),
    F2 = dplyr::case_when(sample %in% c("a","b") ~ "L1", TRUE ~ "L2")) |>
  dplyr::arrange(sample)

datadir <- file.path(find.package("prolfquadata") , "quantdata")
inputMQfile <-  file.path(datadir,
                          "MAXQuant_IonStar2018_PXD003881.zip")

data <- prolfquapp::tidyMQ_Peptides(inputMQfile)
length(unique(data$proteins))

## [1] 5295

Read the sample annotation. The sample annotation must contain the raw.file name and the explanatory variables of your experiment, e.g. treatment, time point, genetic background, or other factors which you would like to check for confounding.

Then you need to tell prolfqua which columns in the data frame contain what information. You do it using the AnalysisTableAnnotation class.

The AnalysisTableAnnotation has the following fields that need to be populated: - fileName - hierarchy - factors - workingIntensity , and we will discuss it in more detail below.

The fileName is the column with the raw file names, however for labelled TMT experiments, it can be used to hold the name of the TMT channel.

The hierarchy field describes the structure of the MS data e.g, - protein - peptides - modified peptides - precursor In case of the MaxQuant peptide.txt file we have data on protein level.

In addition, we need to describe the factors of the analysis, i.e., the column containing the explanatory variables.

config <- prolfqua::create_config_MQ_peptide()

res <- dplyr::inner_join(
  data,
  annotation,
  by = "raw.file"
)

config$table$factors[["F1."]] = "F1"
config$table$factors[["F2."]] = "F2"
config$table$factorDepth <- 2

data <- prolfqua::setup_analysis(res, config)


lfqdata <- prolfqua::LFQData$new(data, config)

Filter the data for small intensities (maxquant reports missing values as 0) and for two peptides per protein.

lfqdata$data <- lfqdata$data |> dplyr::filter(!grepl("^REV__|^CON__", protein_Id)) 
lfqdata$filter_proteins_by_peptide_count()
lfqdata$remove_small_intensities()
lfqdata$hierarchy_counts()

## # A tibble: 1 × 3
##   isotope protein_Id peptide_Id
##   <chr>        <int>      <int>
## 1 light         4176      29762

tr <- lfqdata$get_Transformer()
subset_h <- lfqdata$get_copy()
subset_h$data <- subset_h$data |> dplyr::filter(grepl("HUMAN", protein_Id))
subset_h <- subset_h$get_Transformer()$log2()$lfq
lfqdataNormalized <- tr$log2()$robscale_subset(lfqsubset = subset_h)$lfq

lfqAggMedpol <- lfqdataNormalized$get_Aggregator()
lfqAggMedpol$medpolish()
lfqTrans <- lfqAggMedpol$lfq_agg

Model Fitting

defines the contrasts

lfqTrans$rename_response("abundance")
formula_2_Factors <- prolfqua::strategy_lm("abundance ~ F1. * F2. ")

# specify model definition
modelName  <- "Model"


Contrasts <- c("F1.L1_vs_F1.L2" = "F1.L1 - F1.L2",
               "F2.L1_vs_F2.L2" = "F2.L1 - F2.L2",
               "F1L1_vs_F1L2_gv_F2L1" = "`F1.L1:F2.L1` - `F1.L2:F2.L1`",
               "F1L1_vs_F1L2_gv_F2L2" = "`F1.L1:F2.L2` - `F1.L2:F2.L2`",
               "doFCinF2L1_differfromF2L2" = "`F1L1_vs_F1L2_gv_F2L1` - `F1L1_vs_F1L2_gv_F2L2`"
               )

The following line fits the model.

mod <- prolfqua::build_model(lfqTrans, formula_2_Factors)

mod$anova_histogram(what = "FDR")

## $plot

p-value distributions for ANOVA analysis.

## 
## $name
## [1] "Anova_p.values_Model.pdf"

ANOVA

Examine proteins with a significant interaction between the two factors treatment and batch.

ANOVA <- mod$get_anova()
ANOVA |>
  dplyr::filter(factor == "F1.:F2.") |>
  dplyr::arrange(FDR) |>
  head(5)

## # A tibble: 5 × 10
##   protein_Id       isSingular nrcoef factor    Df Sum.Sq Mean.Sq F.value p.value
##   <chr>            <lgl>       <int> <chr>  <int>  <dbl>   <dbl>   <dbl>   <dbl>
## 1 sp|P0A7J3|RL10_… FALSE           4 F1.:F…     1 0.174   0.174     51.2 1.15e-5
## 2 sp|P0A905|SLYB_… FALSE           4 F1.:F…     1 0.122   0.122     51.5 1.12e-5
## 3 sp|P0A953|FABB_… FALSE           4 F1.:F…     1 0.404   0.404     49.5 1.37e-5
## 4 sp|P13029|KATG_… FALSE           4 F1.:F…     1 0.0599  0.0599    50.7 1.21e-5
## 5 sp|P0A763|NDK_E… FALSE           4 F1.:F…     1 0.185   0.185     45.8 2.00e-5
## # ℹ 1 more variable: FDR <dbl>

ANOVA$factor |> unique()

## [1] "F1."     "F2."     "F1.:F2."

protIntSig <- ANOVA |> dplyr::filter(factor == "F1.") |>
  dplyr::filter(FDR < 0.1)

protInt <-  lfqTrans$get_copy()
protInt$data <- protInt$data[protInt$data$protein_Id %in%
                               protIntSig$protein_Id, ]
protInt$hierarchy_counts()

## # A tibble: 1 × 2
##   isotope protein_Id
##   <chr>        <int>
## 1 light          415

mean(grepl("ECOLI", protInt$data$protein_Id))

## [1] 0.9511936

#ggpubr::ggarrange(plotlist = protInt$get_Plotter()$boxplots()$boxplot)

Compute contrasts

contr <- prolfqua::ContrastsModerated$new(prolfqua::Contrasts$new(mod, Contrasts))
#contr$get_contrasts_sides()
contrdf <- contr$get_contrasts()

The code snippets graph the volcano and ma plot.

plotter <- contr$get_Plotter()
plotter$volcano()

## $FDR

plotter$ma_plot()

Annalyse contrasts with missing data imputation

lfqTrans$config$table$factorDepth <- 2
# ContrastsMissing$debug("get_contrasts")
contrSimple <- prolfqua::ContrastsMissing$new(lfqdata = lfqTrans,
                                                   Contrasts)
contrdfSimple <- contrSimple$get_contrasts()
# na.omit(contrdfSimple)
pl <- contrSimple$get_Plotter()
pl$histogram_diff()

pl$volcano()

## $p.value

## 
## $FDR

Merge nonimputed and imputed data.

dim(contr$get_contrasts())

## [1] 20466    13

dim(contrSimple$get_contrasts())

## [1] 20880    20

mergedContrasts <- prolfqua::merge_contrasts_results(prefer = contr, add = contrSimple)$merged
cM <- mergedContrasts$get_Plotter()

plot <- cM$volcano()
plot$FDR

The prolfqua package is described in [@Wolski2022.06.07.494524].

Session Info

sessionInfo()

## R version 4.4.1 (2024-06-14)
## Platform: aarch64-apple-darwin20
## Running under: macOS 15.0.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Zurich
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6       xfun_0.49          bslib_0.8.0        ggplot2_3.5.1     
##  [5] htmlwidgets_1.6.4  ggrepel_0.9.6      vctrs_0.6.5        tools_4.4.1       
##  [9] generics_0.1.3     tibble_3.2.1       pkgconfig_2.0.3    pheatmap_1.0.12   
## [13] data.table_1.16.4  RColorBrewer_1.1-3 desc_1.4.3         readxl_1.4.3      
## [17] lifecycle_1.0.4    prolfqua_1.3.6     compiler_4.4.1     farver_2.1.2      
## [21] stringr_1.5.1      textshaping_0.4.1  progress_1.2.3     munsell_0.5.1     
## [25] htmltools_0.5.8.1  sass_0.4.9         yaml_2.3.10        lazyeval_0.2.2    
## [29] plotly_4.10.4      pillar_1.10.0      pkgdown_2.1.1      crayon_1.5.3      
## [33] jquerylib_0.1.4    tidyr_1.3.1        MASS_7.3-64        prolfquapp_0.0.8  
## [37] cachem_1.1.0       tidyselect_1.2.1   conflicted_1.2.0   digest_0.6.37     
## [41] stringi_1.8.4      dplyr_1.1.4        purrr_1.0.2        labeling_0.4.3    
## [45] forcats_1.0.0      fastmap_1.2.0      grid_4.4.1         colorspace_2.1-1  
## [49] cli_3.6.3          magrittr_2.0.3     utf8_1.2.4         withr_3.0.2       
## [53] prettyunits_1.2.0  scales_1.3.0       rmarkdown_2.29     httr_1.4.7        
## [57] gridExtra_2.3      cellranger_1.1.0   ragg_1.3.3         hms_1.1.3         
## [61] memoise_2.0.1      evaluate_1.0.1     knitr_1.49         viridisLite_0.4.2 
## [65] dtplyr_1.3.1       rlang_1.1.4        Rcpp_1.0.13-1      glue_1.8.0        
## [69] jsonlite_1.8.9     R6_2.5.1           systemfonts_1.1.0  fs_1.6.5

Witold E. Wolski