suppressPackageStartupMessages({
library("QFeatures")
library("dplyr")
library("tidyr")
library("ggplot2")
library("msqrob2")
library("stringr")
library("ExploreModelMatrix")
library("MsCoreUtils")
library("matrixStats")
library("patchwork")
library("kableExtra")
library("ComplexHeatmap")
library("purrr")
library("tibble")
library("scater")
library("BiocFileCache")
})13 msqrob2PTM: DIA-NN case study - Paired phospho-enriched and non-enriched samples (three bioreps)
13.1 Background
- Mass spectrometry–based (MS) proteomics allows the identification and quantification of a myriad of posttranslational modifications (PTMs)
- This reveals additional complexity and diversity of the proteome.
- Indeed, PTMs greatly extend the number of different forms of a protein, that is, proteoforms, that can be found.
- More importantly, these PTMs can impact protein functions
- They are linked to a variety of diseases and developmental disorders.
- Aberrant PTM status can cause a number of detrimental effects ranging from the alteration of protein folding to the dysregulation of cell signaling.
- It is thus of great importance to study these PTMs in detail, not only through their correct identification but also by their correct quantification and subsequent statistical analysis.
13.1.1 Vignette
In this vignette we start from quantified precursor level data to infer
- Differential abundance (DA) of precursors carrying post-translation-modifications,
- DA at the protein level
- Differential usage of precursors, i.e. DA of precursors upon correction for DA at protein level
- Differential usage of PTMs, i.e. upon summarising the usages for all precursors that carry the same PTM at a specific residue (position in the protein),
- on enriched and non-enriched paired samples on three bio-repeats
using the msqrob2PTM workflow in the bioconductor package msqrob2.
You can read more about
msqrob2on https://statomics.github.io/msqrob2book/msqrob2PTMin msqrob2PTM: Differential Abundance and Differential Usage Analysis of MS-Based Proteomics Data at the Posttranslational Modification and Peptidoform Level. N. Demeulemeester, M. Gébelin, L. Caldi Gomes, C. Carapito, L. Martens and L. Clement. Molecular & Cellular Proteomics, 2023; 23. https://doi.org/10.1016/j.mcpro.2023.100708
13.1.2 Experiment
Mutations in tRNAs can lead to mis-incorporation of an amino acid into a growing polypeptide chain that differs from what is specified by the mRNA in a process known as mistranslation. To better understand the impact of mistranslating tRNA variants, Berg et al. profiled the proteome and phosphoproteome of yeast expressing three different mistranslating tRNAs. The first is a proline tRNA with a G3:U70 base pair in its acceptor stem that mis-incorporates alanine at proline codons (Pro -> Ala; Hoffman et al. 2017). The other two are serine tRNAs with either a UGG proline or UCU arginine anticodon which mis-incorporate serine at proline (Pro -> Ser) or serine at arginine codons (Arg -> Ser), respectively (Berg et al. 2017, 2019b).
Six replicates of each strain were grown. From each replicate an non-enriched (whole proteome) and a phospho-peptide-enriched sample was prepared and quantified using a data independent acquisition workflow.
The data for the non-enriched and phospho-enriched samples are deposited to PRIDE, dataset identifiers PXD068388 and PXD068392).
We will take a subset of the data to be able to compare the results of a paired analysis to the unpaired analysis in the previous section.
Summary:
- Mutations in tRNAs can lead to mis-incorporation of amino acids during protein translation.
- 4 strains: WT, 3 strains with mutations leading to miss-incorporations (Pro -> Ala, Pro -> Ser and Arg -> Ser).
- 6 biological repeats per strain
- For each biological repeat: non-enriched and phospho-enriched MS run.
- Here we use the enriched and non-enriched MS runs from three random bio-repeats to compare with an unpaired analysis with 6 MS runs where the enriched and unenriched samples were conducted on different biorepeats.
M. Berg, A. Chang, R. Rodriguez-Mias, J. Villén, Mistranslating tRNA variants impact the proteome and phosphoproteome of Saccharomyces cerevisiae, G3 Genes|Genomes|Genetics, Volume 16, Issue 2, February 2026, jkaf284, https://doi.org/10.1093/g3journal/jkaf284
13.2 Load packages
We load the msqrob2 package, along with additional packages for data manipulation and visualisation.
13.3 Data
13.3.1 Precursor table
- We load the output from DIA-NN parquet files.
- Can be file path to local file or url to file that lives on the web.
- Here we use BiocFileCache so that we do not need to download the files again each time that the script is knit.
bfc <- BiocFileCache()
precursorFile <- bfcrpath(bfc,"https://zenodo.org/records/20414816/files/WholeProteome_DIANNreport.parquet?download=1")
precursorFilePTM <- bfcrpath(bfc,"https://zenodo.org/records/20414816/files/Phosphoproteome_DIANNreport.parquet?download=1")We can import the report.parquet file using the read_parquet function from the arrow package.
precursors <- arrow::read_parquet(precursorFile) # function from the arrow package
precursorsPTM <- arrow::read_parquet(precursorFilePTM) # function from the arrow package
#precursors <- data.table::fread(precursorFile) # For older versions the results are stored as tsv files. We subset the precursor files to reduce the memory footprint and only keep the variables that we need in the downstream analysis.
Note that Protein.Sites refers to the exact modified residue position in the full protein sequence, which will be useful when assessing PTM-level quantification.
Note, that we also filter out the strain with the ProSer variant here.
precursors <- precursors |>
filter(!grepl("ProSer",Run)) |>
select(
Run,
Precursor.Id,
Modified.Sequence,
Stripped.Sequence,
Precursor.Charge,
Protein.Group,
Protein.Names,
Protein.Ids,
Protein.Sites, #exact modified residue position in the full protein sequence
Genes,
Precursor.Quantity,
Precursor.Normalised,
Normalisation.Factor,
Ms1.Area,
Ms1.Normalised,
PG.MaxLFQ,
Q.Value,
Lib.Q.Value,
PG.Q.Value,
Lib.PG.Q.Value,
Proteotypic,
Decoy, # Not available in older versions of DIA-NN
RT)
precursorsPTM <- precursorsPTM |>
filter(!grepl("ProSer",Run)) |>
select(
Run,
Precursor.Id,
Modified.Sequence,
Stripped.Sequence,
Precursor.Charge,
Protein.Group,
Protein.Names,
Protein.Ids,
Protein.Sites, #exact modified residue position in the full protein sequence
Genes,
Precursor.Quantity,
Precursor.Normalised,
Normalisation.Factor,
Ms1.Area,
Ms1.Normalised,
PG.MaxLFQ,
Q.Value,
Lib.Q.Value,
PG.Q.Value,
Lib.PG.Q.Value,
Proteotypic,
Decoy, # Not available in older versions of DIA-NN
RT)precursorsPTM |>
select(Protein.Sites, Modified.Sequence, Stripped.Sequence) |>
head(20)# A tibble: 20 × 3
Protein.Sites Modified.Sequence Stripped.Sequence
<chr> <chr> <chr>
1 [P52871:A2,S4] (UniMod:1)AAS(UniMod:21)VPPGGQR AASVPPGGQR
2 [P52871:A2,S4] (UniMod:1)AAS(UniMod:21)VPPGGQR AASVPPGGQR
3 [P52871:A2,S4] (UniMod:1)AAS(UniMod:21)VPPGGQR AASVPPGGQR
4 [P52871:A2,S4] (UniMod:1)AAS(UniMod:21)VPPGGQR AASVPPGGQR
5 [P52871:A2,S4] (UniMod:1)AAS(UniMod:21)VPPGGQR AASVPPGGQR
6 [P52871:A2,S4] (UniMod:1)AAS(UniMod:21)VPPGGQR AASVPPGGQR
7 [P04147:A2,T5] (UniMod:1)ADIT(UniMod:21)DKTAEQLENLNIQD… ADITDKTAEQLENLNI…
8 [P04147:A2,T5] (UniMod:1)ADIT(UniMod:21)DKTAEQLENLNIQD… ADITDKTAEQLENLNI…
9 [P04147:A2,T5] (UniMod:1)ADIT(UniMod:21)DKTAEQLENLNIQD… ADITDKTAEQLENLNI…
10 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
11 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
12 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
13 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
14 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
15 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
16 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
17 [P04147:A2,T8] (UniMod:1)ADITDKT(UniMod:21)AEQLENLNIQD… ADITDKTAEQLENLNI…
18 [P40096:A2,T8,M32] (UniMod:1)ADQVPVT(UniMod:21)TQLPPIKPEHE… ADQVPVTTQLPPIKPE…
19 [P40096:A2,T8,M32] (UniMod:1)ADQVPVT(UniMod:21)TQLPPIKPEHE… ADQVPVTTQLPPIKPE…
20 [P40096:A2,T8,M32] (UniMod:1)ADQVPVT(UniMod:21)TQLPPIKPEHE… ADQVPVTTQLPPIKPE…
Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10185238 544.0 17409510 929.8 NA 14228709 759.9
Vcells 61097122 466.2 251102720 1915.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10185347 544.0 17409510 929.8 NA 14228709 759.9
Vcells 61097413 466.2 200882176 1532.7 24576 312076398 2381.0
13.3.2 Subset phospho
In DIA-NN phosphorylation is indicated with (UniMod:21) in the sequence stored in variable Modified.Sequence. This variable contains the precursor sequence along with its modifications.
Here, we only keep precursors that are phosphorylated, i.e. with pattern (UniMod:21) in the variable Modified.Sequence.
precursorsPTM <- precursorsPTM |>
filter(grepl(pattern = "\\(UniMod:21\\)", Modified.Sequence)) Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10181105 543.8 17409510 929.8 NA 14228709 759.9
Vcells 59445039 453.6 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10181104 543.8 17409510 929.8 NA 14228709 759.9
Vcells 59445070 453.6 200882176 1532.7 24576 312076398 2381.0
13.3.3 Quick check for imputation
Quick check on distribution of precursors MS2 intensities.
precursors |>
filter(is.finite(Precursor.Quantity) & Precursor.Quantity > 0) |>
ggplot(aes(x = log2(Precursor.Quantity))) +
geom_density() +
theme_minimal() +
labs(subtitle = "Abundances (non-enriched)") + precursorsPTM |>
filter(is.finite(Precursor.Quantity) & Precursor.Quantity > 0) |>
ggplot(aes(x = log2(Precursor.Quantity))) +
geom_density() +
theme_minimal() +
labs(subtitle = "Abundances (phospho-enriched)")
Seems no imputation has been done.
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10417415 556.4 17409510 929.8 NA 14442245 771.3
Vcells 99947987 762.6 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10417396 556.4 17409510 929.8 NA 14442245 771.3
Vcells 99947989 762.6 200882176 1532.7 24576 312076398 2381.0
13.3.4 Subset the data to 3 biorepeats
We select at random 3 biorepeats.
set.seed(1245)
sampleNames <- paste0("_0",1:6,"_")
peSamp <- sample(sampleNames,3) |> sort()Subset the data
precursors <- precursors |> filter(grepl(paste(peSamp,collapse="|"), Run))
precursorsPTM <- precursorsPTM |> filter(grepl(paste(peSamp,collapse="|"), Run))#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10417567 556.4 17409510 929.8 NA 14442245 771.3
Vcells 79776973 608.7 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10417560 556.4 17409510 929.8 NA 14442245 771.3
Vcells 79776995 608.7 200882176 1532.7 24576 312076398 2381.0
13.3.5 Sample annotation tables
The sample annotation table is not available and can be generated from the run labels, as the researchers included information on the design in the filenames.
- Extract unique Run labels
- Split Run label into variables using the “_” separator
- Rename the variable Run to runCol (needed for
readQFeaturesfunction) - Add sampleId
- Sort annotation file according to sampleId
annot <- precursors |>
dplyr::distinct(Run) |> ## 1.
separate(Run,
into=c("run","label2","strain","rep","type","acquisition"),
sep="_",
remove=FALSE) |> ## 2.
rename(runCol = Run) |> ## 3.
mutate(sampleId = paste0(strain,rep)) |> #4.
arrange(sampleId) #5.
annot# A tibble: 9 × 8
runCol run label2 strain rep type acquisition sampleId
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 e14639_MB057_ArgSer_03_P… e146… MB057 ArgSer 03 Prot… DIA ArgSer03
2 e14644_MB057_ArgSer_04_P… e146… MB057 ArgSer 04 Prot… DIA ArgSer04
3 e14653_MB057_ArgSer_06_P… e146… MB057 ArgSer 06 Prot… DIA ArgSer06
4 e14638_MB057_Ctrl_03_Pro… e146… MB057 Ctrl 03 Prot… DIA Ctrl03
5 e14642_MB057_Ctrl_04_Pro… e146… MB057 Ctrl 04 Prot… DIA Ctrl04
6 e14654_MB057_Ctrl_06_Pro… e146… MB057 Ctrl 06 Prot… DIA Ctrl06
7 e14640_MB057_ProAla_03_P… e146… MB057 ProAla 03 Prot… DIA ProAla03
8 e14643_MB057_ProAla_04_P… e146… MB057 ProAla 04 Prot… DIA ProAla04
9 e14655_MB057_ProAla_06_P… e146… MB057 ProAla 06 Prot… DIA ProAla06
We do the same for the phospho data.
annotPTM <- precursorsPTM |>
dplyr::distinct(Run) |> ## 1.
separate(Run,
into=c("label1","label2","strain","rep","type","acquisition"),
sep="_",
remove=FALSE) |> ## 2.
rename(runCol = Run) |> ## 3.
mutate(sampleId = paste0(strain,rep)) |> #4.
arrange(sampleId) #5.
annotPTM# A tibble: 9 × 8
runCol label1 label2 strain rep type acquisition sampleId
<chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 e14498_MB057_ArgSer_03_… e14498 MB057 ArgSer 03 Phos… DIA ArgSer03
2 e14503_MB057_ArgSer_04_… e14503 MB057 ArgSer 04 Phos… DIA ArgSer04
3 e14512_MB057_ArgSer_06_… e14512 MB057 ArgSer 06 Phos… DIA ArgSer06
4 e14497_MB057_Ctrl_03_Ph… e14497 MB057 Ctrl 03 Phos… DIA Ctrl03
5 e14501_MB057_Ctrl_04_Ph… e14501 MB057 Ctrl 04 Phos… DIA Ctrl04
6 e14513_MB057_Ctrl_06_Ph… e14513 MB057 Ctrl 06 Phos… DIA Ctrl06
7 e14499_MB057_ProAla_03_… e14499 MB057 ProAla 03 Phos… DIA ProAla03
8 e14502_MB057_ProAla_04_… e14502 MB057 ProAla 04 Phos… DIA ProAla04
9 e14514_MB057_ProAla_06_… e14514 MB057 ProAla 06 Phos… DIA ProAla06
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10429473 557.0 17409510 929.8 NA 14442245 771.3
Vcells 79804494 608.9 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10429418 557.0 17409510 929.8 NA 14442245 771.3
Vcells 79804436 608.9 200882176 1532.7 24576 312076398 2381.0
13.3.6 Convert to QFeatures
First, recall that the precursor table is file in long format. Every quantitative column in the precursor table contains information for multiple runs. Therefore, the function split the table based on the run identifier, given by the runCol argument (for DIA-NN, that identifier is contained in Run). So, the QFeatures object after import will contain as many sets as there are runs. Next, the function links the annotation table with the PSM data. To achieve this, the annotation table must contain a runCol column that provides the run identifier in which each sample has been acquired, and this information will be used to match the identifiers in the Run column of the precursor table.
Here, we will use the Precursor.Quantity column as quantification input.
- We first combine the precursor and precursorPTM table in a list of QFeatures objects,
- which we subsequently reduce in one QFeatures object. Note that in QFeatures, the function
c()is defined to merge QFeatures objects, stacking all assays together into a single object.
This allows us to store the data of the phospho-enriched and the non-enriched samples in the same QFeatures object.
(qf <- list(
readQFeatures(assayData = precursors,
colData = annot,
quantCols = "Precursor.Quantity",
runCol = "Run",
fnames = "Precursor.Id"),
readQFeatures(assayData = precursorsPTM,
colData = annotPTM,
quantCols = "Precursor.Quantity",
runCol = "Run",
fnames = "Precursor.Id") ##1.
) |> purrr::reduce(c) ##2.
)Checking arguments.
Loading data as a 'SummarizedExperiment' object.
Splitting data in runs.
Formatting sample annotations (colData).
Formatting data as a 'QFeatures' object.
Setting assay rownames.
Checking arguments.
Loading data as a 'SummarizedExperiment' object.
Splitting data in runs.
Formatting sample annotations (colData).
Formatting data as a 'QFeatures' object.
Setting assay rownames.
An instance of class QFeatures (type: bulk) with 18 sets:
[1] e14638_MB057_Ctrl_03_Proteome_DIA: SummarizedExperiment with 70529 rows and 1 columns
[2] e14639_MB057_ArgSer_03_Proteome_DIA: SummarizedExperiment with 75839 rows and 1 columns
[3] e14640_MB057_ProAla_03_Proteome_DIA: SummarizedExperiment with 73676 rows and 1 columns
...
[16] e14512_MB057_ArgSer_06_Phospho_DIA: SummarizedExperiment with 28688 rows and 1 columns
[17] e14513_MB057_Ctrl_06_Phospho_DIA: SummarizedExperiment with 29944 rows and 1 columns
[18] e14514_MB057_ProAla_06_Phospho_DIA: SummarizedExperiment with 30520 rows and 1 columns
Remove data tables to free space.
rm(precursors, precursorsPTM)#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10465797 559 17409510 929.8 NA 17409510 929.8
Vcells 80864931 617 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10465793 559 17409510 929.8 NA 17409510 929.8
Vcells 80864958 617 200882176 1532.7 24576 312076398 2381.0
13.4 Data preprocessing
Here we conduct the following processing steps
- Replace zero’s by NA.
- Precursor filtering and assay joining
- Log-transformation
- Normalisation
- Summarisation for non-enriched runs
13.4.1 Encoding missing values
We first replace any zero in the quantitative data with an NA.
qf <- zeroIsNA(qf, names(qf))Note that msqrob2 can handle missing data without having to rely on hard-to-verify imputation assumptions, which is our general recommendation. However, msqrob2 does not prevent users from using imputation, which can be performed with impute() from the QFeatures package.
13.4.2 Precursor Filtering
Filtering removes low-quality and unreliable precursors that would otherwise introduce noise and artefacts in the data.
Remove questionable identifications
We apply standard filtering:
q-value threshold of 0.01 for the identification of precursors (
Q.Value) and protein groups (PG.Q.Value). If the file is processed via matching between runs, it is also useful to filter on Lib.Q.Value and Lib.PG.Q.Value.Remove precursors that could not be mapped, i.e. when
Precursor.Idcolumn is an empty string.Filter decoys, i.e. only keep precursors for which the
Decoycolumn equals(Note, that the
Decoycolumn is not present in the output of older versions of DIA-NN).Keeping only proteotypic peptides, which map uniquely to a specific protein.
qf <- qf |>
filterFeatures(~ Q.Value <= 0.01 & #1.
PG.Q.Value <= 0.01 & #1.
Lib.Q.Value <= 0.01 & #1.
Lib.PG.Q.Value <= 0.01 & #1.
Precursor.Id != "" & #2.
Decoy == 0 & #3. (Note, that this filter is not available with previous versions of DIA-NN, as the report.tsv file did not include a Decoy column. So other strategies are needed if Decoys are in the output file.)
Proteotypic == 1) #4.'Q.Value' found in 18 out of 18 assay(s).'PG.Q.Value' found in 18 out of 18 assay(s).'Lib.Q.Value' found in 18 out of 18 assay(s).'Lib.PG.Q.Value' found in 18 out of 18 assay(s).'Precursor.Id' found in 18 out of 18 assay(s).'Decoy' found in 18 out of 18 assay(s).'Proteotypic' found in 18 out of 18 assay(s).
Note, that it is important that the filtering criteria are not distorting the distribution of the test statistics in the downstream analysis for features that are non-DA. It can be shown that filtering will not induce bias results when the filtering criterion is independent of test statistic. The criteria that we proposed above are all based on the results of the identification step, hence, they are independent of the downstream test statistics that will be used to prioritize DA proteins.
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10471764 559.3 17409510 929.8 NA 17409510 929.8
Vcells 81522079 622.0 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10471766 559.3 17409510 929.8 NA 17409510 929.8
Vcells 81522116 622.0 200882176 1532.7 24576 312076398 2381.0
Assay joining
- Up to now, the data from different runs were kept in separate assays.
- We can now join the filtered sets into an precursor set using joinAssays().
- Sets are joined by stacking the columns (samples) in a matrix and rows (features) are matched according to a row identifier, here the
Precursor.Id. - We do this for the non-enriched and phospho samples, which we store in assays with names:
precursorsandprecursorsPTM, respectively.
qf <- joinAssays(
x = qf,
i = grep("Proteome",names(qf), value = TRUE),
fcol = "Precursor.Id",
name = "precursors"
)Using 'Precursor.Id' to join assays.
(qf <- joinAssays(
x = qf,
i = grep("Phospho",names(qf), value = TRUE),
fcol = "Precursor.Id",
name = "precursorsPTM"
)
)Using 'Precursor.Id' to join assays.
An instance of class QFeatures (type: bulk) with 20 sets:
[1] e14638_MB057_Ctrl_03_Proteome_DIA: SummarizedExperiment with 68197 rows and 1 columns
[2] e14639_MB057_ArgSer_03_Proteome_DIA: SummarizedExperiment with 73453 rows and 1 columns
[3] e14640_MB057_ProAla_03_Proteome_DIA: SummarizedExperiment with 71329 rows and 1 columns
...
[18] e14514_MB057_ProAla_06_Phospho_DIA: SummarizedExperiment with 29322 rows and 1 columns
[19] precursors: SummarizedExperiment with 86283 rows and 9 columns
[20] precursorsPTM: SummarizedExperiment with 52645 rows and 9 columns
Remove the original assays to free space
qf <- qf[,,-c(1:18)]Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 18 sampleMap rows not in names(experiments)
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10477033 559.6 17409510 929.8 NA 17409510 929.8
Vcells 63731614 486.3 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10477029 559.6 17409510 929.8 NA 17409510 929.8
Vcells 63731641 486.3 200882176 1532.7 24576 312076398 2381.0
Filtering: Remove highly missing precursors
We keep peptides that were observed at least 3 times out of the \(n= 9\) samples, so that we can estimate the peptide characteristics.
nObs <- 3
n <- ncol(qf[["precursors"]])
(qf <- filterNA(qf, i = c("precursors", "precursorsPTM"), pNA = (n - nObs) / n))An instance of class QFeatures (type: bulk) with 2 sets:
[1] precursors: SummarizedExperiment with 78411 rows and 9 columns
[2] precursorsPTM: SummarizedExperiment with 34209 rows and 9 columns
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10477200 559.6 17409510 929.8 NA 17409510 929.8
Vcells 63053337 481.1 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10477199 559.6 17409510 929.8 NA 17409510 929.8
Vcells 63053369 481.1 200882176 1532.7 24576 312076398 2381.0
Filter one-hit wonders
We normally remove proteins that can only be found by one peptide, as such proteins may not be trustworthy.
Here, we opt not to do this because we prefer a protein-level summary based on one precursor so that we can estimate usages later on.
We leave the code here for your reference. But commented it out.
We first calculate how many distinct peptides map to each protein group (
PG_ProteinGroups). We use the stripped precursor sequence, i.e. sequence of the base peptide for this purpose.We store this information in the row data of the precursors assay
We filter precursors of one-hit wonder proteins.
## Filter for peptides per protein
# pepsPerProtDf <- qf[["precursors"]] |>
# rowData() |>
# data.frame() |>
# dplyr::select("Stripped.Sequence", "Protein.Group") |>
# group_by(Protein.Group) |>
# mutate(pepsPerProt = Stripped.Sequence |>
# unique() |> length()
# ) #1.
# rowData(qf[["precursors"]])$pepsPerProt <- pepsPerProtDf$pepsPerProt #2.
#qf <- filterFeatures(qf,
# ~ pepsPerProt > 1,
# keep = TRUE) #3.#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10477202 559.6 17409510 929.8 NA 17409510 929.8
Vcells 63053483 481.1 200882176 1532.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10477201 559.6 17409510 929.8 NA 17409510 929.8
Vcells 63053515 481.1 200882176 1532.7 24576 312076398 2381.0
13.4.3 Log-transformation
We perform log2-transformation with logTransform() from the QFeatures package. We use base = 2 and store the result in a new summarized experiment named precursors_log.
qf <- logTransform(qf,
base = 2,
i = "precursors",
name = "precursors_log")
qf <- logTransform(qf,
base = 2,
i = "precursorsPTM",
name = "precursorsPTM_log")13.4.4 Normalisation
We first evaluate the distributions of the non-normalised precursors per sample.
For this, we perform a short data manipulation pipeline:
- We select an assay from the
QFeaturesobject,qf.
- We use
longForm()to convert theQFeaturesobject into a long table, where each row contains the quantitative information about one observation, in which column, row and set it was found. Long tables are particularly useful for manipulating data with thetidyverseecosystem, namely withggplot2for visualisation.longForm()also allows to include annotations, and we here include all variables from the colData for filtering and colouring. longForm()returns aDataFramewhich we convert to adata.frame.- We filter out the missing values.
- We make a plot with ggplot.
- We add aesthetics, we select the log2-intensities (
value) as x, and color according to the strain and group according to the colname. - We add a geom_dens layer to add a density plot.
- We add a title.
- Use a minimal theme.
qf[, , "precursors_log"] |> #1.
longForm(colvars = colnames(colData(qf))) |> #2.
data.frame() |> #3.
filter(!is.na(value)) |> #4.
ggplot() + #5.
aes(x = value,
colour = strain,
group = colname) + #6.
geom_density() + #7.
labs(subtitle = "precursors") + #8.
theme_minimal() #9.harmonizing input:
removing 27 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

qf[, , "precursorsPTM_log"] |> #1.
longForm(colvars = colnames(colData(qf))) |> #2.
data.frame() |> #3.
filter(!is.na(value)) |> #4.
ggplot() + #5.
aes(x = value,
colour = strain,
group = colname) + #6.
geom_density() + #7.
labs(subtitle = "PTM - precursors") + #8.
theme_minimal() #9. harmonizing input:
removing 27 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

The data seems very clean. However, normalisation is still required.
We use the median of ratio normalisation, which addresses both differences in loading as well as difference in compositionality.
- This first calculates a reference sample by using the log2 geometric mean, i.e. the arithmetic mean on a log scale, for every feature.
- Subsequently log2 fold changes are calculated between each feature of a sample and the reference sample.
- Finally, the median of these log2 fold changes are used as the log2-tranformed normalisation factor.
The method is the default method for normalisation in bulk RNA-seq data analysis with DESeq2. In our hands it performs very well in a proteomics context.
qf <- sweep( #4. Subtract log2 norm factor column-by-column (MARGIN = 2)
qf,
MARGIN = 2,
STATS = nfLogMedianOfRatios(qf,"precursors_log"),
i = "precursors_log",
name = "precursors_norm"
)This function aims to calculate norm factors on a log scale,
the input data are assumed to be on the log-scale!
qf <- sweep( #4. Subtract log2 norm factor column-by-column (MARGIN = 2)
qf,
MARGIN = 2,
STATS = nfLogMedianOfRatios(qf,"precursorsPTM_log"),
i = "precursorsPTM_log",
name = "precursorsPTM_norm"
)This function aims to calculate norm factors on a log scale,
the input data are assumed to be on the log-scale!
We explore the effect of the global normalisation in the subsequent plot.
Formally, the function applies the following operation on each sample \(i\) across all precursors \(p\):
\[ y_{ip}^{\text{norm}} = y_{ip} - \log_2(nf_i) \]
with \(y_ip\) the log2-transformed intensities and \(nf_i\) the log2-transformed norm factor.
Upon normalisation, we can see that the distribution of the \(y_{ip}^{\text{norm}}\) nicely overlap (using the same code as above)
qf[, , "precursors_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
labs(subtitle = "Median normalisation precursors") +
theme_minimal()harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

qf[, , "precursorsPTM_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
labs(subtitle = "Median normalisation PTM precursors") +
theme_minimal()harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

We also explore the normalisation for the common precursors.
precursorsCommon <- assay(qf[, , "precursors_log"]) |> na.exclude() |> rownames()harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
qf[precursorsCommon, , "precursors_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
theme_minimal() +
labs(subtitle = "Median normalised log2 precursor intensities (common)")Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

precursorsCommonPTM <- assay(qf[, , "precursorsPTM_log"]) |> na.exclude() |> rownames()harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
qf[precursorsCommonPTM, , "precursorsPTM_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
theme_minimal() +
labs(subtitle = "Median normalised log2 precursor intensities (common)")Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

Remove temporarily objects
rm(precursorsCommon, precursorsCommonPTM)#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10406514 555.8 17409510 929.8 NA 17409510 929.8
Vcells 26713823 203.9 160705741 1226.1 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10406504 555.8 17409510 929.8 NA 17409510 929.8
Vcells 26713840 203.9 128564593 980.9 24576 312076398 2381.0
Note, that normalisation could have been avoided by using Precursor.Normalised which is already internally normalised by DIA-NN. However, we do not know the exact procedure for this.
This generally works well for many datasets.
In this vignette we have chosen to conduct the normalisation explicitly ourselves using a method for which we know the underlying rationale.
13.4.5 Summarisation
Here, we summarise the precursor-level data for the non-enriched runs into protein intensities using maxLFQ, which is one of the default methods used in DIA data analysis.
Note, that the maxLFQ implementation can become slow for large datasets. In that case we recommend the median polish method.
maxLFQ first calculates all pairwise log ratio’s between samples only using their shared precursors. Particularly, it uses the median of the log ratio’s between the shared precursors, i.e. \[r_{ij} = median(y_{sj}-y_{si})\]
Hence, it first eliminates ion effects.It then estimates the summaries by solving
\[ \sum_i\sum_j(y^{prot}_j - y^{prot}_i - r_{ij})^2 \]
It is implemented in the maxLFQ function of the iq package.
aggregateFeatures() in the QFeatures package streamlines summarisation. It requires the name of a rowData column to group the precursors into proteins (or protein groups), here Protein.Group. We provide the summarisation approach through the fun argument.
The function will return a QFeatures object with a new set that we called proteins.
Other summarisation methods are available from the MsCoreUtils package, see ?aggregateFeatures for a comprehensive list.
(qf <- aggregateFeatures(
qf, i = "precursors_norm",
name = "proteins",
fcol = "Protein.Group",
fun = function(X) iq::maxLFQ(X)$estimate
))Your quantitative data contain missing values. Please read the relevant
section(s) in the aggregateFeatures manual page regarding the effects
of missing values on data aggregation.
Aggregated: 1/1
An instance of class QFeatures (type: bulk) with 7 sets:
[1] precursors: SummarizedExperiment with 78411 rows and 9 columns
[2] precursorsPTM: SummarizedExperiment with 34209 rows and 9 columns
[3] precursors_log: SummarizedExperiment with 78411 rows and 9 columns
[4] precursorsPTM_log: SummarizedExperiment with 34209 rows and 9 columns
[5] precursors_norm: SummarizedExperiment with 78411 rows and 9 columns
[6] precursorsPTM_norm: SummarizedExperiment with 34209 rows and 9 columns
[7] proteins: SummarizedExperiment with 4615 rows and 9 columns
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10422741 556.7 17409510 929.8 NA 17409510 929.8
Vcells 27103279 206.8 102851675 784.7 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10422743 556.7 17409510 929.8 NA 17409510 929.8
Vcells 27103316 206.8 82281340 627.8 24576 312076398 2381.0
13.5 Data exploration and QC
Data exploration aims to highlight the main sources of variation in the data prior to data modelling and can pinpoint to outlying or off-behaving samples.
We first performance the QC for the Phospho-enriched runs then for the non-enriched runs.
13.5.1 Phospho-enriched runs
Marginal distribution at precursor level
qf[, , "precursorsPTM_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
theme_minimal() +
labs(subtitle = "Normalised log2 PTM precursor intensities")harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

qf[, , "precursorsPTM_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = sampleId,
y = value,
colour = strain,
group = colname) +
xlab("sample") +
geom_boxplot() +
theme_minimal() +
labs(subtitle = "Normalised log2 PTM precursor intensities")harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10451959 558.2 17409510 929.8 NA 17409510 929.8
Vcells 28836769 220.1 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10451961 558.2 17409510 929.8 NA 17409510 929.8
Vcells 28836806 220.1 82281340 627.8 24576 312076398 2381.0
Charge state
qf[, , "precursorsPTM_norm"] |>
longForm(colvars = colnames(colData(qf)), rowvars = "Precursor.Charge") |>
as.data.frame() |>
filter(!is.na(value)) |>
filter(Precursor.Charge<=4) |>
ggplot(aes(x = sampleId)) +
geom_bar(aes(fill = factor(Precursor.Charge, levels = 4:1)),
colour = "black") +
labs(title = "Number of phospho-ions per sample",
x = "Sample",
fill = "Charge state") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10459470 558.6 17409510 929.8 NA 17409510 929.8
Vcells 28942883 220.9 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10459469 558.6 17409510 929.8 NA 17409510 929.8
Vcells 28942915 220.9 82281340 627.8 24576 312076398 2381.0
Identifications per sample
qf[,,"precursorsPTM_norm"] |>
longForm(colvars = colnames(colData(qf)),
rowvars= c("Precursor.Id",
"Protein.Group")) |>
data.frame() |>
filter(!is.na(value)) |>
group_by(strain, sampleId) |>
summarise(Precursors = length(unique(Precursor.Id)),
`Protein Groups` = length(unique(Protein.Group))) |>
pivot_longer(-(1:2),
names_to = "Feature",
values_to = "IDs") |>
ggplot(aes(x = sampleId, y = IDs, fill = strain)) +
geom_col() +
#scale_fill_observable() +
facet_wrap(~Feature,
scales = "free_y") +
labs(title = "Precursor and protein group identificiations per PTM sample",
x = "Sample",
y = "Identifications") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
`summarise()` has regrouped the output.
ℹ Summaries were computed grouped by strain and sampleId.
ℹ Output is grouped by strain.
ℹ Use `summarise(.groups = "drop_last")` to silence this message.
ℹ Use `summarise(.by = c(strain, sampleId))` for per-operation grouping
(`?dplyr::dplyr_by`) instead.

# Data Modeling (Robust Regression){#sec-modelling}#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10515114 561.6 17409510 929.8 NA 17409510 929.8
Vcells 25721599 196.3 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10515086 561.6 17409510 929.8 NA 17409510 929.8
Vcells 25721586 196.3 82281340 627.8 24576 312076398 2381.0
Dimensionality reduction plot
A common approach for data exploration is to perform dimension reduction, such as Multi Dimensional Scaling (MDS). We will first extract the set to explore along the sample annotations (used for plot colouring).
getWithColData(qf, "precursorsPTM_norm") |>
as("SingleCellExperiment") |>
runMDS(exprs_values = 1) |>
plotMDS(colour_by = "strain") 
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537181 562.8 17409510 929.8 NA 17409510 929.8
Vcells 25787167 196.8 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537162 562.8 17409510 929.8 NA 17409510 929.8
Vcells 25787169 196.8 82281340 627.8 24576 312076398 2381.0
13.5.2 Non-enriched runs
Marginal distribution at precursor and protein level
qf[, , "precursors_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
theme_minimal() +
labs(subtitle = "Normalised log2 precursor intensities")harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

qf[, , "precursors_norm"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = sampleId,
y = value,
colour = strain,
group = colname) +
xlab("sample") +
geom_boxplot() +
theme_minimal() +
labs(subtitle = "Normalised log2 precursor intensities")harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

qf[, , "proteins"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
theme_minimal() +
labs(subtitle = "log2 protein intensities (normalised precursors)")harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

qf[, , "proteins"] |>
longForm(colvars = colnames(colData(qf))) |>
data.frame() |>
filter(!is.na(value)) |>
ggplot() +
aes(x = sampleId,
y = value,
colour = strain,
group = colname) +
xlab("sample") +
geom_boxplot() +
theme_minimal() +
labs(subtitle = "log2 protein intensities (normalised precursors)")harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537661 562.8 17409510 929.8 NA 17409510 929.8
Vcells 26342070 201.0 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537654 562.8 17409510 929.8 NA 17409510 929.8
Vcells 26342092 201.0 82281340 627.8 24576 312076398 2381.0
Charge state
qf[, , "precursors_norm"] |>
longForm(colvars = colnames(colData(qf)), rowvars = "Precursor.Charge") |>
as.data.frame() |>
filter(!is.na(value)) |>
filter(Precursor.Charge<=4) |>
ggplot(aes(x = sampleId)) +
geom_bar(aes(fill = factor(Precursor.Charge, levels = 4:1)),
colour = "black") +
labs(title = "Number of precursor ions per sample",
x = "Sample",
fill = "Charge state") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537654 562.8 17409510 929.8 NA 17409510 929.8
Vcells 34865265 266.1 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537656 562.8 17409510 929.8 NA 17409510 929.8
Vcells 34865302 266.1 82281340 627.8 24576 312076398 2381.0
Identifications per sample
qf[,,"precursors_norm"] |>
longForm(colvars = colnames(colData(qf)),
rowvars= c("Precursor.Id",
"Protein.Group")) |>
data.frame() |>
filter(!is.na(value)) |>
group_by(strain, sampleId) |>
summarise(Precursors = length(unique(Precursor.Id)),
`Protein Groups` = length(unique(Protein.Group))) |>
pivot_longer(-(1:2),
names_to = "Feature",
values_to = "IDs") |>
ggplot(aes(x = sampleId, y = IDs, fill = strain)) +
geom_col() +
#scale_fill_observable() +
facet_wrap(~Feature,
scales = "free_y") +
labs(title = "Precursor and protein group identificiations per sample",
x = "Sample",
y = "Identifications") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90))harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
`summarise()` has regrouped the output.
ℹ Summaries were computed grouped by strain and sampleId.
ℹ Output is grouped by strain.
ℹ Use `summarise(.groups = "drop_last")` to silence this message.
ℹ Use `summarise(.by = c(strain, sampleId))` for per-operation grouping
(`?dplyr::dplyr_by`) instead.

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537790 562.8 17409510 929.8 NA 17409510 929.8
Vcells 25772518 196.7 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537768 562.8 17409510 929.8 NA 17409510 929.8
Vcells 25772515 196.7 82281340 627.8 24576 312076398 2381.0
Dimensionality reduction plot
getWithColData(qf, "proteins") |>
as("SingleCellExperiment") |>
runMDS(exprs_values = 1) |>
plotMDS(colour_by = "strain") 
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537356 562.8 17409510 929.8 NA 17409510 929.8
Vcells 25773716 196.7 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 10537337 562.8 17409510 929.8 NA 17409510 929.8
Vcells 25773718 196.7 82281340 627.8 24576 312076398 2381.0
13.6 Data Modeling at phospho-precursor-level
13.6.1 Model estimation
We first define the model. We only have one source of variability in the experiment that we can model, i.e. the effect of the strain.
We fit the model to each phosphorylated precursor in the
precursorsPTM_normsummarised experiment of the QFeatures objectqfusing the msqrob function.
model <- ~ strain
qf <- msqrob(
qf,
i = "precursorsPTM_norm",
formula = model,
robust = TRUE)We enabled M-estimation (robust = TRUE) for improved robustness against outliers.
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11193384 597.8 17409510 929.8 NA 17409510 929.8
Vcells 27473937 209.7 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11193386 597.8 17409510 929.8 NA 17409510 929.8
Vcells 27473974 209.7 82281340 627.8 24576 312076398 2381.0
13.6.2 Inference
We can now convert the research question “Is the abundance of phospho-precursors different between the strains” into a statistical hypotheses.
In other words, we need to translate this question in a null and alternative hypothesis on a single model parameter or a linear combination of model parameters, which is also referred to with a contrast.
To aid defining contrasts, we will visualise the experimental design using the ExploreModelMatrix package.
vd <- ExploreModelMatrix::VisualizeDesign(
sampleData = colData(qf),
designFormula = model,
textSizeFitted = 4
)
vd$plotlist[[1]]

We have 3 interesting research questions related to the strain.
- Is a precursor DA between ArgSer strain and the WT (control)
- Is a precursor DA between ProAla strain and the WT (control)
- Is a precursor DA between ProAla strain and the ArgSer strain.
Each of these contrasts can be translated in a log2 FC or a difference in log2 FC and can be estimated with one or a linear combination of the model parameters.
Below we define the contrast and construct the corresponding contrast matrix.
We can generate the null-hypothesess for all pairwise comparisons manually. However, here we will use the createPairwiseContrasts function in msqrob2.
(allHypotheses <- createPairwiseContrasts(
model, colData(qf), "strain"
)
)Warning: the 'nobars' function has moved to the reformulas package. Please update your imports, or ask an upstream package maintainter to do so.
This warning is displayed once per session.
[1] "strainCtrl = 0" "strainProAla = 0"
[3] "strainProAla - strainCtrl = 0"
Based on these null hypothesis we now generate the contrast matrix with all three contrasts using the makeContrast function. Note, that experienced users can also define the contrast matrix, directly.
(L <- makeContrast(
allHypotheses,
parameterNames = colnames(vd$designmatrix)
)) strainCtrl strainProAla strainProAla - strainCtrl
(Intercept) 0 0 0
strainCtrl 1 0 -1
strainProAla 0 1 1
We assess the contrast for each precursor.
qf <- hypothesisTest(qf, i = "precursorsPTM_norm", contrast = L, overwrite = TRUE)We extract the results table from the precursorsPTM_norm summarised experiment in the qf object.
inferencesPTM <-
msqrobCollect(qf[["precursorsPTM_norm"]], L)#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11353728 606.4 17409510 929.8 NA 17409510 929.8
Vcells 30131958 229.9 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11353724 606.4 17409510 929.8 NA 17409510 929.8
Vcells 30131985 229.9 82281340 627.8 24576 312076398 2381.0
13.6.3 Report results
We report the results using a results table, volcano plots and heatmaps.
Results table
We use the 5% nominal FDR level to report results
alpha <- 0.05for (j in colnames(L)) {
inference <- inferencesPTM |>
dplyr::filter(adjPval < alpha & contrast == j)
cat("**Median - Contrast:**", j, "= 0 (", nrow(inference),
"significant precursors)\n\n")
cat('<div style="max-height:300px; overflow-y:auto;">')
print(
kable(
inference |>
dplyr::arrange(pval) |>
dplyr::relocate(feature),
row.names = FALSE
)
)
cat('</div>')
cat("\n\n\n---\n\n")
}Median - Contrast: strainCtrl = 0 ( 1 significant precursors)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| IGDS(UniMod:21)LQGSPQR2 | -2.907071 | 0.1393496 | 6.823797 | -20.86171 | 2e-07 | 0.0045824 | strainCtrl |
Median - Contrast: strainProAla = 0 ( 4 significant precursors)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| AVQES(UniMod:21)DS(UniMod:21)TTS(UniMod:21)RIIEEHESPIDAEK3 | -3.860267 | 0.1557383 | 5.826762 | -24.78688 | 4.0e-07 | 0.0092686 | strainProAla |
| S(UniMod:21)DSEVNQEAKPEVK3 | 1.921745 | 0.1343868 | 6.342927 | 14.30010 | 4.6e-06 | 0.0467721 | strainProAla |
| NTSYQES(UniMod:21)PGLQERPK3 | -1.113769 | 0.1049216 | 7.826762 | -10.61525 | 6.4e-06 | 0.0467721 | strainProAla |
| (UniMod:1)SDS(UniMod:21)EVNQEAKPEVK2 | 2.002423 | 0.1433679 | 6.038833 | 13.96702 | 8.0e-06 | 0.0467721 | strainProAla |
Median - Contrast: strainProAla - strainCtrl = 0 ( 19 significant precursors)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| IGDS(UniMod:21)LQGSPQR2 | 3.0414472 | 0.1557899 | 6.823797 | 19.522750 | 3.00e-07 | 0.0054041 | strainProAla - strainCtrl |
| AVQES(UniMod:21)DS(UniMod:21)TTS(UniMod:21)RIIEEHESPIDAEK3 | -3.7465156 | 0.1557383 | 5.826762 | -24.056476 | 5.00e-07 | 0.0054041 | strainProAla - strainCtrl |
| GQVVSEEQRPGT(UniMod:21)PLFTVK3 | 2.2682125 | 0.1336666 | 6.779026 | 16.969186 | 8.00e-07 | 0.0064050 | strainProAla - strainCtrl |
| (UniMod:1)SDS(UniMod:21)EVNQEAKPEVK2 | 2.0997487 | 0.1203906 | 6.038833 | 17.441138 | 2.10e-06 | 0.0123465 | strainProAla - strainCtrl |
| (UniMod:1)SDEEHTFENADAGAS(UniMod:21)ATYPMQC(UniMod:4)SALR4 | -7.2887717 | 0.4604139 | 5.736513 | -15.830911 | 5.90e-06 | 0.0233619 | strainProAla - strainCtrl |
| VDIIANDQGNRT(UniMod:21)TPSFVAFTDTER3 | 1.4613762 | 0.1367441 | 7.826762 | 10.686943 | 6.10e-06 | 0.0233619 | strainProAla - strainCtrl |
| S(UniMod:21)DSEVNQEAKPEVK3 | 1.8128444 | 0.1489254 | 6.342927 | 12.172839 | 1.24e-05 | 0.0392179 | strainProAla - strainCtrl |
| LTNMPFGGNPTANQSGS(UniMod:21)GNSLFGTK3 | -2.4850090 | 0.2265396 | 6.646603 | -10.969426 | 1.68e-05 | 0.0392179 | strainProAla - strainCtrl |
| LIDENATENGLAGS(UniMod:21)PKDEDGIIM(UniMod:35)TNK3 | -3.5131571 | 0.2707854 | 5.765748 | -12.973952 | 1.74e-05 | 0.0392179 | strainProAla - strainCtrl |
| NTSYQES(UniMod:21)PGLQERPK3 | -0.9528266 | 0.1049216 | 7.826762 | -9.081320 | 2.00e-05 | 0.0392179 | strainProAla - strainCtrl |
| PHTPFIS(UniMod:21)KLNTHQDSSYLSPNT(UniMod:21)TST(UniMod:21)TTPSNNNSNSNQAK4 | 3.8675421 | 0.2460948 | 4.826762 | 15.715662 | 2.49e-05 | 0.0392179 | strainProAla - strainCtrl |
| RAT(UniMod:21)YAGFLLADPK3 | 2.5531200 | 0.2564607 | 6.826762 | 9.955209 | 2.60e-05 | 0.0392179 | strainProAla - strainCtrl |
| RGQVVSEEQRPGT(UniMod:21)PLFTVK3 | 1.8197408 | 0.2057852 | 7.695480 | 8.842913 | 2.69e-05 | 0.0392179 | strainProAla - strainCtrl |
| TTPSFVAFT(UniMod:21)DTER2 | 1.2275840 | 0.1377368 | 7.598572 | 8.912537 | 2.75e-05 | 0.0392179 | strainProAla - strainCtrl |
| TT(UniMod:21)PSFVAFTDTER2 | 1.3146921 | 0.1515195 | 7.826293 | 8.676717 | 2.77e-05 | 0.0392179 | strainProAla - strainCtrl |
| S(UniMod:21)DSEVNQEAKPEVK2 | 2.0603552 | 0.2182683 | 7.086695 | 9.439554 | 2.89e-05 | 0.0392179 | strainProAla - strainCtrl |
| NIAPPPT(UniMod:21)T(UniMod:21)SVSAPS(UniMod:21)TPTLSSSSQMANMASPSTDNGDNEEK4 | -5.8974392 | 0.5097645 | 5.826762 | -11.568948 | 3.08e-05 | 0.0392179 | strainProAla - strainCtrl |
| RFS(UniMod:21)QHSS(UniMod:21)MLINNPATPNQK3 | 4.5073908 | 0.2202057 | 4.021668 | 20.469003 | 3.23e-05 | 0.0392179 | strainProAla - strainCtrl |
| ETVES(UniMod:21)ESSQTALSK2 | 0.9869042 | 0.1087065 | 7.253952 | 9.078612 | 3.24e-05 | 0.0392179 | strainProAla - strainCtrl |
Volcanoplots
plotVolcano(inferencesPTM) +
facet_wrap(~contrast) +
labs(title = "Precursor-level Phospho DA")
inferencesPTM |>
filter(adjPval < alpha) |>
mutate(DA = sign(logFC) |> as.factor() |> recode("-1"= "down","1" = "up")) |>
group_by(contrast, DA) |>
ggplot(aes(x = contrast)) +
geom_bar(aes(fill = factor(DA)),
colour = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
Heatmaps
We will make a heatmap for each contrast. Note that we cluster the rows ourselves to avoid issues with missing values.
lapply(colnames(L),
function(contrast, se, alpha)
{
sig <- rowData(se)[[contrast]] |>
filter(adjPval < alpha) |>
rownames()
if (length(sig) > 2)
{
quants <- t(scale(t(assay(se[sig,]))))
colnames(quants) <- se$sampleId #specific to this dataset to get short colnames
rowclushlp <- quants
rowclushlp[is.na(rowclushlp)] <- min(quants,na.rm=TRUE) - 2
rowclus <- hclust(dist(rowclushlp))
annotations <- columnAnnotation(
group = se$strain
) #3.
set.seed(1234) ## annotation colours are randomly generated by default
return(
Heatmap(show_row_names = FALSE,
quants,
name = "log2 intensity",
top_annotation = annotations,
column_title = paste0(contrast, " = 0"),
cluster_rows = rowclus
)
)
} else return(ggplot() + theme_minimal() + ggtitle(paste0(contrast, " = 0")))
},
se = getWithColData(qf, "precursorsPTM_norm"),
alpha = alpha)[[1]]

[[2]]

[[3]]

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11578883 618.4 17409510 929.8 NA 17409510 929.8
Vcells 30591779 233.4 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11578885 618.4 17409510 929.8 NA 17409510 929.8
Vcells 30591816 233.4 82281340 627.8 24576 312076398 2381.0
Detail plots
We can explore the data for a precursor to validate the statistical inference results. For example, let’s explore the precursor and the normalised precursor intensities for the precursor with the most significant log2 fold change.
(target_feature <- inferencesPTM |>
dplyr::slice(which.min(pval)) |>
pull(feature)
)[1] "IGDS(UniMod:21)LQGSPQR2"
inferencesPTM |>
filter(feature == target_feature) logFC se df
strainCtrl.IGDS(UniMod:21)LQGSPQR2 -2.9070713 0.1393496 6.823796
strainProAla.IGDS(UniMod:21)LQGSPQR2 0.1343759 0.1557591 6.823796
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 3.0414472 0.1557899 6.823796
t pval
strainCtrl.IGDS(UniMod:21)LQGSPQR2 -20.861711 1.955531e-07
strainProAla.IGDS(UniMod:21)LQGSPQR2 0.862716 4.175794e-01
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 19.522750 3.054750e-07
adjPval
strainCtrl.IGDS(UniMod:21)LQGSPQR2 0.004582395
strainProAla.IGDS(UniMod:21)LQGSPQR2 0.986318960
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 0.005404076
contrast
strainCtrl.IGDS(UniMod:21)LQGSPQR2 strainCtrl
strainProAla.IGDS(UniMod:21)LQGSPQR2 strainProAla
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 strainProAla - strainCtrl
feature
strainCtrl.IGDS(UniMod:21)LQGSPQR2 IGDS(UniMod:21)LQGSPQR2
strainProAla.IGDS(UniMod:21)LQGSPQR2 IGDS(UniMod:21)LQGSPQR2
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 IGDS(UniMod:21)LQGSPQR2
To obtain the required data, we perform a little data manipulation pipeline:
- We use the QFeatures subsetting functionality to retrieve all data related to ion data used for model fitting.
- We then convert the data with longForm() for plotting.
- Finally, we plot the log2 normalised intensities for each sample for the unnormalised and normalised phospho precursors.
qf[target_feature, , c("precursorsPTM_log","precursorsPTM_norm")] |> #1
longForm(colvars = colnames(colData(qf))) |> #2
data.frame() |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
ggtitle(target_feature) +
theme_minimal() +
theme(axis.text.x = element_blank())Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

qf[target_feature, , c("precursorsPTM_log","precursorsPTM_norm")] |> #1
longForm(colvars = colnames(colData(qf))) |> #2
data.frame() |>
filter(!is.na(value)) %>%
{
ggplot(.) +
aes(x = strain,
y = value) +
geom_boxplot(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
geom_jitter(aes(shape = rowname)) +
scale_shape_manual(values = seq_len(dplyr::n_distinct(.$rowname))) +
ggtitle(target_feature) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
guides(shape = "none")
}Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 45 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

We now compare the intensities for the top 5 DA phospo-precursors for the first contrast to these of their corresponding protein in the non-enriched assay.
contr = colnames(L)[1]
top5 <- inferencesPTM |>
filter(contrast == contr) |>
arrange(pval) |>
head(n = 5) |>
pull(feature)
for (feat in top5)
{
ptm_data <- qf[,,c("precursorsPTM_norm")] |>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(rowname==feat)
feature_protein <- ptm_data |>
pull("Protein.Group") |>
unique()
prot_data <- qf[,,"proteins"]|>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(Protein.Group==feature_protein)
ptm_protein <- rbind(ptm_data, prot_data)
ylims <- ptm_protein |>
group_by(assay) |>
summarise(cent = mean(range(value,na.rm=TRUE)), ampl = diff(range(value,na.rm=TRUE))) |>
mutate(lower = cent - max(ampl)/2,
upper = cent + max(ampl)/2) |>
select(-c(cent, ampl))
comparison_plot <- ptm_protein |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
labs(
title = paste0(feat," / ", feature_protein)
) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
ggh4x::facetted_pos_scales(
y = list(
assay == ylims$assay[1] ~ scale_y_continuous(limits = unlist(ylims[1,c("lower","upper")])),
assay == ylims$assay[2] ~ scale_y_continuous(limits = unlist(ylims[2,c("lower","upper")]))
)
)
print(comparison_plot)
}




We prioritised DA phospho-precursors. However, they might be DA due to the parent proteins on which the PTMs occur that can also change in abundance regardless of the modification, which is clearly illustrated for the top DA phospho-precursor!
Any changes in the abundance of a phospho-peptidoform can thus be confounded with changes in protein abundance.
We therefore first assess differential abundance at the protein level for the non-enriched samples and will then propose an differential usage analysis where the precursor-level log2-FC is corrected for the protein-level log2-FC.
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11593882 619.2 17409510 929.8 NA 17409510 929.8
Vcells 30620154 233.7 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11593878 619.2 17409510 929.8 NA 17409510 929.8
Vcells 30620181 233.7 82281340 627.8 24576 312076398 2381.0
13.7 Data Modeling at protein-level (non-enriched samples)
13.7.1 Model estimation
We can use the same model to model the protein-level data of the non-enriched runs. Indeed, they stem from the same biological samples.
qf <- msqrob(
qf,
i = "proteins",
formula = model,
robust = TRUE)We enabled M-estimation (robust = TRUE) for improved robustness against outliers.
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11695336 624.6 17409510 929.8 NA 17409510 929.8
Vcells 30917593 235.9 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11695338 624.6 17409510 929.8 NA 17409510 929.8
Vcells 30917630 235.9 82281340 627.8 24576 312076398 2381.0
13.7.2 Inference
We assess the contrast for each protein.
qf <- hypothesisTest(qf, i = "proteins", contrast = L, overwrite = TRUE)We extract the results table from the proteins summarised experiment in the qf object.
inferences <-
msqrobCollect(qf[["proteins"]], L)#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11709463 625.4 17409510 929.8 NA 17409510 929.8
Vcells 31205896 238.1 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11709459 625.4 17409510 929.8 NA 17409510 929.8
Vcells 31205923 238.1 82281340 627.8 24576 312076398 2381.0
13.7.3 Report results
We report the results using a results table, volcano plots and heatmaps.
Results table
for (j in colnames(L)) {
inference <- inferences |>
dplyr::filter(adjPval < alpha & contrast == j)
cat("**Median - Contrast:**", j, "= 0 (", nrow(inference),
"significant proteins)\n\n")
cat('<div style="max-height:300px; overflow-y:auto;">')
print(
kable(
inference |>
dplyr::arrange(pval) |>
dplyr::relocate(feature),
row.names = FALSE
)
)
cat('</div>')
cat("\n\n\n---\n\n")
}Median - Contrast: strainCtrl = 0 ( 120 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| Q12068 | -1.2550208 | 0.0460982 | 7.912320 | -27.224954 | 0.0000000 | 0.0000190 | strainCtrl |
| P53912 | -0.7633337 | 0.0426103 | 7.912320 | -17.914315 | 0.0000001 | 0.0002478 | strainCtrl |
| P32642 | -0.7748089 | 0.0554839 | 7.912320 | -13.964565 | 0.0000007 | 0.0009196 | strainCtrl |
| P38113 | 0.5361176 | 0.0402713 | 7.831425 | 13.312661 | 0.0000012 | 0.0009196 | strainCtrl |
| P53834 | -0.4709909 | 0.0359808 | 7.912320 | -13.090065 | 0.0000012 | 0.0009196 | strainCtrl |
| P31539 | -0.4596626 | 0.0352707 | 7.834254 | -13.032425 | 0.0000014 | 0.0009196 | strainCtrl |
| P10591 | -0.6326804 | 0.0442788 | 7.249641 | -14.288555 | 0.0000014 | 0.0009196 | strainCtrl |
| Q03148 | 0.4798082 | 0.0410520 | 7.663536 | 11.687806 | 0.0000037 | 0.0021036 | strainCtrl |
| P34227 | -0.4221975 | 0.0420501 | 7.882979 | -10.040343 | 0.0000092 | 0.0042179 | strainCtrl |
| P32644 | -0.4051631 | 0.0405994 | 7.912320 | -9.979540 | 0.0000093 | 0.0042179 | strainCtrl |
| P47169 | 0.4808276 | 0.0511144 | 7.912320 | 9.406894 | 0.0000144 | 0.0051697 | strainCtrl |
| P47164 | 0.5890143 | 0.0622674 | 7.848219 | 9.459437 | 0.0000146 | 0.0051697 | strainCtrl |
| Q03102 | -0.7380302 | 0.0760724 | 7.619844 | -9.701679 | 0.0000149 | 0.0051697 | strainCtrl |
| P38988 | 0.5035084 | 0.0546816 | 7.898633 | 9.207999 | 0.0000170 | 0.0052389 | strainCtrl |
| P25294 | -0.3476410 | 0.0377920 | 7.883597 | -9.198799 | 0.0000174 | 0.0052389 | strainCtrl |
| P15992 | 0.5676440 | 0.0657404 | 7.784463 | 8.634625 | 0.0000297 | 0.0076233 | strainCtrl |
| P22134 | -0.5084279 | 0.0598458 | 7.912320 | -8.495626 | 0.0000302 | 0.0076233 | strainCtrl |
| P36007 | 0.3994290 | 0.0467162 | 7.751904 | 8.550121 | 0.0000326 | 0.0076233 | strainCtrl |
| P38260 | -0.4728488 | 0.0562286 | 7.880615 | -8.409395 | 0.0000333 | 0.0076233 | strainCtrl |
| P15705 | -0.3219811 | 0.0384805 | 7.912320 | -8.367390 | 0.0000337 | 0.0076233 | strainCtrl |
| Q96VH4 | -0.6128773 | 0.0716386 | 7.473710 | -8.555126 | 0.0000404 | 0.0083838 | strainCtrl |
| Q06150 | 1.1882161 | 0.1255814 | 6.694697 | 9.461721 | 0.0000408 | 0.0083838 | strainCtrl |
| P07264 | 0.5036564 | 0.0513081 | 6.342681 | 9.816310 | 0.0000456 | 0.0086988 | strainCtrl |
| P11972 | 0.6395323 | 0.0798616 | 7.912320 | 8.008008 | 0.0000461 | 0.0086988 | strainCtrl |
| P23337 | 0.6038295 | 0.0755196 | 7.786680 | 7.995660 | 0.0000510 | 0.0089278 | strainCtrl |
| P07258 | 0.3245530 | 0.0405182 | 7.761790 | 8.010053 | 0.0000513 | 0.0089278 | strainCtrl |
| P15108 | -0.4337743 | 0.0567181 | 7.912320 | -7.647892 | 0.0000640 | 0.0107180 | strainCtrl |
| P32643 | 0.5675624 | 0.0758729 | 7.912320 | 7.480431 | 0.0000747 | 0.0120468 | strainCtrl |
| P40897 | 0.8257634 | 0.1108991 | 7.912320 | 7.446076 | 0.0000772 | 0.0120468 | strainCtrl |
| Q04792 | 0.4217111 | 0.0515853 | 6.913936 | 8.175031 | 0.0000849 | 0.0122724 | strainCtrl |
| P38737 | -0.3549085 | 0.0458978 | 7.382064 | -7.732579 | 0.0000857 | 0.0122724 | strainCtrl |
| Q12359 | 0.3775633 | 0.0518620 | 7.912320 | 7.280148 | 0.0000904 | 0.0122724 | strainCtrl |
| P41930 | 0.5071652 | 0.0681844 | 7.678847 | 7.438139 | 0.0000907 | 0.0122724 | strainCtrl |
| P07702 | 0.2641709 | 0.0355694 | 7.627491 | 7.426906 | 0.0000949 | 0.0122724 | strainCtrl |
| P40017 | 0.3241162 | 0.0448450 | 7.912320 | 7.227483 | 0.0000951 | 0.0122724 | strainCtrl |
| P38279 | 1.3271191 | 0.1640118 | 6.791696 | 8.091609 | 0.0000998 | 0.0122724 | strainCtrl |
| P53691 | -0.3176657 | 0.0426231 | 7.509253 | -7.452907 | 0.0001003 | 0.0122724 | strainCtrl |
| P10592 | -0.4019022 | 0.0563163 | 7.912320 | -7.136515 | 0.0001039 | 0.0123695 | strainCtrl |
| P53241 | 0.6445252 | 0.0875748 | 7.530594 | 7.359714 | 0.0001076 | 0.0124431 | strainCtrl |
| P33416 | -0.3051604 | 0.0431141 | 7.912320 | -7.077971 | 0.0001100 | 0.0124431 | strainCtrl |
| P00924 | -0.3025571 | 0.0430671 | 7.912320 | -7.025249 | 0.0001158 | 0.0125763 | strainCtrl |
| P02829 | -0.5585165 | 0.0796896 | 7.912320 | -7.008651 | 0.0001178 | 0.0125763 | strainCtrl |
| P08432 | 0.5558426 | 0.0706083 | 6.779120 | 7.872199 | 0.0001195 | 0.0125763 | strainCtrl |
| P53078 | 0.4272710 | 0.0613209 | 7.912320 | 6.967783 | 0.0001226 | 0.0126104 | strainCtrl |
| P53133 | 0.3739169 | 0.0539455 | 7.883584 | 6.931384 | 0.0001294 | 0.0130069 | strainCtrl |
| P53970 | -1.4406207 | 0.1348759 | 4.912320 | -10.681081 | 0.0001383 | 0.0136061 | strainCtrl |
| P07285 | 0.2761568 | 0.0396862 | 7.645178 | 6.958509 | 0.0001456 | 0.0137455 | strainCtrl |
| P54007 | -1.1091201 | 0.1623732 | 7.851031 | -6.830685 | 0.0001458 | 0.0137455 | strainCtrl |
| P21242 | -0.2646331 | 0.0392475 | 7.912320 | -6.742673 | 0.0001537 | 0.0141961 | strainCtrl |
| Q01217 | 0.3632569 | 0.0457700 | 6.303964 | 7.936576 | 0.0001656 | 0.0149888 | strainCtrl |
| P53303 | -0.3567024 | 0.0516711 | 7.356282 | -6.903327 | 0.0001837 | 0.0162833 | strainCtrl |
| P22768 | 0.3256622 | 0.0497220 | 7.912320 | 6.549661 | 0.0001874 | 0.0162833 | strainCtrl |
| P09368 | -0.8990006 | 0.1378106 | 7.894926 | -6.523451 | 0.0001945 | 0.0162833 | strainCtrl |
| P25491 | -0.3585182 | 0.0503980 | 6.967679 | -7.113740 | 0.0001956 | 0.0162833 | strainCtrl |
| Q12213 | 1.0111936 | 0.1556292 | 7.912320 | 6.497453 | 0.0001979 | 0.0162833 | strainCtrl |
| P39692 | 0.4859742 | 0.0712597 | 7.236287 | 6.819757 | 0.0002142 | 0.0169668 | strainCtrl |
| Q04894 | -0.2962878 | 0.0459462 | 7.849736 | -6.448581 | 0.0002156 | 0.0169668 | strainCtrl |
| P33734 | 0.2668452 | 0.0416446 | 7.912320 | 6.407682 | 0.0002175 | 0.0169668 | strainCtrl |
| Q04728 | 0.2762263 | 0.0429688 | 7.715826 | 6.428538 | 0.0002370 | 0.0181756 | strainCtrl |
| P04076 | 0.3192189 | 0.0508046 | 7.912320 | 6.283265 | 0.0002482 | 0.0187185 | strainCtrl |
| P53915 | 0.3395219 | 0.0542298 | 7.912320 | 6.260798 | 0.0002542 | 0.0188601 | strainCtrl |
| P05374 | 0.3220159 | 0.0521627 | 7.907686 | 6.173297 | 0.0002801 | 0.0204231 | strainCtrl |
| O14467 | -0.3364747 | 0.0539882 | 7.757612 | -6.232370 | 0.0002843 | 0.0204231 | strainCtrl |
| P36152 | -0.2877060 | 0.0452245 | 7.467107 | -6.361730 | 0.0002914 | 0.0206036 | strainCtrl |
| P38840 | -0.2992185 | 0.0474965 | 7.523938 | -6.299808 | 0.0003007 | 0.0209338 | strainCtrl |
| P21954 | 0.2637012 | 0.0433249 | 7.912320 | 6.086595 | 0.0003070 | 0.0210502 | strainCtrl |
| P23180 | 0.2500113 | 0.0412153 | 7.912320 | 6.065982 | 0.0003140 | 0.0210840 | strainCtrl |
| P32775 | 0.4261251 | 0.0693922 | 7.736751 | 6.140822 | 0.0003168 | 0.0210840 | strainCtrl |
| Q08422 | -0.5747735 | 0.0898453 | 7.224849 | -6.397369 | 0.0003228 | 0.0211721 | strainCtrl |
| P18544 | 0.3150823 | 0.0531767 | 7.912320 | 5.925199 | 0.0003669 | 0.0237170 | strainCtrl |
| Q2V2Q1 | -0.7650280 | 0.1260441 | 7.459710 | -6.069527 | 0.0003949 | 0.0250252 | strainCtrl |
| P38790 | 0.4359957 | 0.0614290 | 5.967403 | 7.097550 | 0.0004024 | 0.0250252 | strainCtrl |
| P07172 | 0.2544743 | 0.0435764 | 7.912320 | 5.839731 | 0.0004037 | 0.0250252 | strainCtrl |
| P47112 | -0.3771868 | 0.0633309 | 7.598434 | -5.955813 | 0.0004146 | 0.0252781 | strainCtrl |
| P39522 | 0.2242400 | 0.0380328 | 7.707311 | 5.895964 | 0.0004190 | 0.0252781 | strainCtrl |
| P36141 | -0.3813329 | 0.0651623 | 7.709378 | -5.852051 | 0.0004392 | 0.0258878 | strainCtrl |
| P12866 | 0.3740206 | 0.0615381 | 7.238648 | 6.077871 | 0.0004405 | 0.0258878 | strainCtrl |
| Q04341 | -0.4322525 | 0.0753174 | 7.908442 | -5.739083 | 0.0004532 | 0.0262925 | strainCtrl |
| P53940 | -0.3985612 | 0.0650403 | 7.059729 | -6.127915 | 0.0004621 | 0.0264695 | strainCtrl |
| P03965 | 0.3666642 | 0.0496814 | 5.476991 | 7.380309 | 0.0004810 | 0.0272077 | strainCtrl |
| P32354 | -0.7048556 | 0.0978244 | 5.553064 | -7.205314 | 0.0005109 | 0.0285438 | strainCtrl |
| Q04066 | 0.2692450 | 0.0480575 | 7.872568 | 5.602560 | 0.0005386 | 0.0297032 | strainCtrl |
| P53090 | 0.2581710 | 0.0450790 | 7.547892 | 5.727080 | 0.0005448 | 0.0297032 | strainCtrl |
| Q04432 | -0.4348224 | 0.0764554 | 7.593576 | -5.687267 | 0.0005570 | 0.0297746 | strainCtrl |
| P32327 | 0.2087815 | 0.0375872 | 7.912320 | 5.554596 | 0.0005593 | 0.0297746 | strainCtrl |
| P38084 | 0.2541978 | 0.0462549 | 7.909677 | 5.495586 | 0.0005998 | 0.0315609 | strainCtrl |
| P27614 | -0.2292086 | 0.0419602 | 7.912320 | -5.462530 | 0.0006228 | 0.0322896 | strainCtrl |
| P05626 | 0.2241644 | 0.0411634 | 7.912320 | 5.445724 | 0.0006352 | 0.0322896 | strainCtrl |
| P40989 | -0.7453095 | 0.1369897 | 7.912320 | -5.440624 | 0.0006391 | 0.0322896 | strainCtrl |
| P38631 | 0.2413635 | 0.0443976 | 7.912320 | 5.436413 | 0.0006422 | 0.0322896 | strainCtrl |
| P50278 | -0.2359914 | 0.0431851 | 7.810697 | -5.464650 | 0.0006494 | 0.0322896 | strainCtrl |
| P05150 | 0.2900981 | 0.0535745 | 7.912320 | 5.414858 | 0.0006588 | 0.0324016 | strainCtrl |
| P53044 | -0.2625684 | 0.0489893 | 7.912320 | -5.359710 | 0.0007032 | 0.0342172 | strainCtrl |
| P32795 | -0.1774741 | 0.0333584 | 7.912320 | -5.320218 | 0.0007371 | 0.0351408 | strainCtrl |
| P52489 | 0.2560915 | 0.0482269 | 7.912320 | 5.310140 | 0.0007460 | 0.0351408 | strainCtrl |
| P14843 | 0.2267140 | 0.0415602 | 7.503024 | 5.455070 | 0.0007527 | 0.0351408 | strainCtrl |
| P53885 | -0.2102375 | 0.0393138 | 7.783327 | -5.347681 | 0.0007533 | 0.0351408 | strainCtrl |
| Q06324 | 0.4458845 | 0.0845196 | 7.912320 | 5.275516 | 0.0007776 | 0.0359059 | strainCtrl |
| P05694 | 0.2378218 | 0.0452354 | 7.779541 | 5.257433 | 0.0008393 | 0.0383631 | strainCtrl |
| P32331 | 0.2001792 | 0.0382238 | 7.791226 | 5.237033 | 0.0008558 | 0.0384352 | strainCtrl |
| Q12680 | 0.1700713 | 0.0328092 | 7.912320 | 5.183649 | 0.0008688 | 0.0384352 | strainCtrl |
| Q12100 | 0.3534247 | 0.0669794 | 7.640724 | 5.276620 | 0.0008697 | 0.0384352 | strainCtrl |
| P00815 | 0.3103773 | 0.0600020 | 7.912320 | 5.172779 | 0.0008803 | 0.0384352 | strainCtrl |
| P38219 | -0.2292567 | 0.0441503 | 7.843771 | -5.192643 | 0.0008834 | 0.0384352 | strainCtrl |
| P53101 | 0.3747404 | 0.0727722 | 7.912320 | 5.149503 | 0.0009056 | 0.0386789 | strainCtrl |
| P33297 | -0.2104838 | 0.0404903 | 7.764355 | -5.198370 | 0.0009061 | 0.0386789 | strainCtrl |
| P13663 | 0.2093461 | 0.0407220 | 7.912320 | 5.140854 | 0.0009152 | 0.0387022 | strainCtrl |
| P32496 | -0.2118184 | 0.0414383 | 7.794930 | -5.111657 | 0.0009932 | 0.0416143 | strainCtrl |
| P13090 | -0.4896959 | 0.0981496 | 7.912320 | -4.989279 | 0.0011026 | 0.0457750 | strainCtrl |
| P19097 | 0.1699314 | 0.0345207 | 7.912320 | 4.922588 | 0.0011981 | 0.0483808 | strainCtrl |
| P00635 | -1.0494337 | 0.1569692 | 4.912320 | -6.685601 | 0.0012123 | 0.0483808 | strainCtrl |
| P32379 | -0.2777397 | 0.0565628 | 7.912320 | -4.910285 | 0.0012167 | 0.0483808 | strainCtrl |
| P32781 | 0.7625871 | 0.1528342 | 7.631036 | 4.989637 | 0.0012274 | 0.0483808 | strainCtrl |
| P47988 | 0.4943697 | 0.0951744 | 7.057001 | 5.194355 | 0.0012293 | 0.0483808 | strainCtrl |
| Q12449 | -0.3876477 | 0.0783732 | 7.751548 | -4.946175 | 0.0012356 | 0.0483808 | strainCtrl |
| P06738 | 0.4102688 | 0.0812821 | 7.431425 | 5.047465 | 0.0012403 | 0.0483808 | strainCtrl |
| P25638 | -0.3427329 | 0.0612136 | 6.156628 | -5.598969 | 0.0012675 | 0.0490197 | strainCtrl |
| P00899 | 0.1871999 | 0.0382071 | 7.807960 | 4.899613 | 0.0012813 | 0.0491352 | strainCtrl |
| Q3E842 | 1.2390944 | 0.2384932 | 6.912320 | 5.195512 | 0.0013096 | 0.0497994 | strainCtrl |
| P28778 | -0.4310406 | 0.0885471 | 7.824972 | -4.867925 | 0.0013245 | 0.0499453 | strainCtrl |
Median - Contrast: strainProAla = 0 ( 4 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| Q12068 | -0.6165566 | 0.0460982 | 7.91232 | -13.374858 | 1.0e-06 | 0.0046827 | strainProAla |
| P32642 | -0.6238562 | 0.0554839 | 7.91232 | -11.243908 | 3.8e-06 | 0.0086874 | strainProAla |
| P53912 | -0.4222356 | 0.0426103 | 7.91232 | -9.909246 | 9.8e-06 | 0.0148261 | strainProAla |
| P00924 | -0.3482582 | 0.0430671 | 7.91232 | -8.086409 | 4.3e-05 | 0.0487303 | strainProAla |
Median - Contrast: strainProAla - strainCtrl = 0 ( 77 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| P31539 | 0.7060103 | 0.0352707 | 7.834254 | 20.016913 | 0.0000001 | 0.0002362 | strainProAla - strainCtrl |
| P10591 | 0.9412387 | 0.0471600 | 7.249641 | 19.958398 | 0.0000001 | 0.0003012 | strainProAla - strainCtrl |
| P33416 | 0.6037370 | 0.0431141 | 7.912320 | 14.003237 | 0.0000007 | 0.0008958 | strainProAla - strainCtrl |
| Q12068 | 0.6384642 | 0.0460982 | 7.912320 | 13.850096 | 0.0000008 | 0.0008958 | strainProAla - strainCtrl |
| P53834 | 0.4321425 | 0.0359808 | 7.912320 | 12.010367 | 0.0000023 | 0.0020941 | strainProAla - strainCtrl |
| P15705 | 0.4426148 | 0.0384805 | 7.912320 | 11.502323 | 0.0000032 | 0.0020941 | strainProAla - strainCtrl |
| P25294 | 0.4362851 | 0.0377920 | 7.883597 | 11.544377 | 0.0000032 | 0.0020941 | strainProAla - strainCtrl |
| P53691 | 0.4399509 | 0.0410596 | 7.509253 | 10.714931 | 0.0000081 | 0.0045978 | strainProAla - strainCtrl |
| P02829 | 0.7795300 | 0.0796896 | 7.912320 | 9.782081 | 0.0000108 | 0.0046284 | strainProAla - strainCtrl |
| Q06150 | -1.2627310 | 0.1084222 | 6.694697 | -11.646426 | 0.0000109 | 0.0046284 | strainProAla - strainCtrl |
| P38113 | -0.3928407 | 0.0399951 | 7.831425 | -9.822215 | 0.0000113 | 0.0046284 | strainProAla - strainCtrl |
| P22202 | 0.8645164 | 0.0758880 | 6.561383 | 11.392003 | 0.0000145 | 0.0054772 | strainProAla - strainCtrl |
| P38634 | 0.9600999 | 0.1075703 | 7.912320 | 8.925329 | 0.0000211 | 0.0073498 | strainProAla - strainCtrl |
| P09368 | 1.1930265 | 0.1376101 | 7.894926 | 8.669615 | 0.0000264 | 0.0085409 | strainProAla - strainCtrl |
| P40017 | -0.3689066 | 0.0448450 | 7.912320 | -8.226265 | 0.0000381 | 0.0114767 | strainProAla - strainCtrl |
| P38260 | 0.4608851 | 0.0563786 | 7.880615 | 8.174827 | 0.0000408 | 0.0115171 | strainProAla - strainCtrl |
| P07264 | -0.4609168 | 0.0466780 | 6.342681 | -9.874391 | 0.0000440 | 0.0116211 | strainProAla - strainCtrl |
| P53912 | 0.3410981 | 0.0426103 | 7.912320 | 8.005069 | 0.0000463 | 0.0116211 | strainProAla - strainCtrl |
| P53940 | 0.5349932 | 0.0636790 | 7.059729 | 8.401407 | 0.0000635 | 0.0151041 | strainProAla - strainCtrl |
| P32562 | -0.4918672 | 0.0649603 | 7.912320 | -7.571807 | 0.0000686 | 0.0155165 | strainProAla - strainCtrl |
| Q12680 | -0.2400301 | 0.0328092 | 7.912320 | -7.315942 | 0.0000874 | 0.0176714 | strainProAla - strainCtrl |
| Q01217 | -0.3522307 | 0.0397836 | 6.303964 | -8.853674 | 0.0000874 | 0.0176714 | strainProAla - strainCtrl |
| P15108 | 0.4132614 | 0.0567181 | 7.912320 | 7.286229 | 0.0000899 | 0.0176714 | strainProAla - strainCtrl |
| P10592 | 0.3991219 | 0.0563163 | 7.912320 | 7.087146 | 0.0001090 | 0.0205392 | strainProAla - strainCtrl |
| P40054 | -0.2944867 | 0.0418620 | 7.912320 | -7.034703 | 0.0001148 | 0.0207599 | strainProAla - strainCtrl |
| P38737 | 0.3279518 | 0.0447718 | 7.382064 | 7.324965 | 0.0001227 | 0.0208935 | strainProAla - strainCtrl |
| P23337 | -0.5250673 | 0.0747077 | 7.786680 | -7.028291 | 0.0001248 | 0.0208935 | strainProAla - strainCtrl |
| P16474 | 0.3561169 | 0.0477214 | 7.095150 | 7.462412 | 0.0001325 | 0.0209874 | strainProAla - strainCtrl |
| P40989 | 0.9417151 | 0.1369897 | 7.912320 | 6.874349 | 0.0001346 | 0.0209874 | strainProAla - strainCtrl |
| P19414 | -0.3012047 | 0.0426569 | 7.507790 | -7.061096 | 0.0001440 | 0.0217025 | strainProAla - strainCtrl |
| Q08446 | 0.2700720 | 0.0398675 | 7.912320 | 6.774239 | 0.0001489 | 0.0217172 | strainProAla - strainCtrl |
| Q03148 | -0.2881184 | 0.0419698 | 7.663536 | -6.864892 | 0.0001577 | 0.0222863 | strainProAla - strainCtrl |
| O14467 | 0.3700761 | 0.0547172 | 7.757612 | 6.763433 | 0.0001648 | 0.0225804 | strainProAla - strainCtrl |
| P39522 | -0.2535846 | 0.0380328 | 7.707311 | -6.667526 | 0.0001868 | 0.0248428 | strainProAla - strainCtrl |
| Q03102 | 0.5015183 | 0.0760724 | 7.619844 | 6.592643 | 0.0002118 | 0.0273657 | strainProAla - strainCtrl |
| P34227 | 0.2683759 | 0.0420501 | 7.882979 | 6.382288 | 0.0002270 | 0.0275319 | strainProAla - strainCtrl |
| P00815 | -0.3815024 | 0.0600020 | 7.912320 | -6.358157 | 0.0002292 | 0.0275319 | strainProAla - strainCtrl |
| P47169 | -0.3241307 | 0.0511144 | 7.912320 | -6.341282 | 0.0002333 | 0.0275319 | strainProAla - strainCtrl |
| P40897 | -0.6996143 | 0.1108991 | 7.912320 | -6.308564 | 0.0002416 | 0.0275319 | strainProAla - strainCtrl |
| P07285 | -0.2617630 | 0.0406445 | 7.645178 | -6.440308 | 0.0002435 | 0.0275319 | strainProAla - strainCtrl |
| Q12449 | 0.4906544 | 0.0772868 | 7.751548 | 6.348492 | 0.0002525 | 0.0278487 | strainProAla - strainCtrl |
| P53078 | -0.3821903 | 0.0613209 | 7.912320 | -6.232623 | 0.0002621 | 0.0282149 | strainProAla - strainCtrl |
| Q04792 | -0.3503804 | 0.0515853 | 6.913936 | -6.792258 | 0.0002694 | 0.0283332 | strainProAla - strainCtrl |
| P32528 | -0.3216593 | 0.0467982 | 6.622046 | -6.873320 | 0.0003039 | 0.0305432 | strainProAla - strainCtrl |
| P53101 | -0.4436068 | 0.0727722 | 7.912320 | -6.095832 | 0.0003039 | 0.0305432 | strainProAla - strainCtrl |
| P32476 | 0.2403534 | 0.0364613 | 6.947081 | 6.592017 | 0.0003167 | 0.0305658 | strainProAla - strainCtrl |
| P53128 | -0.3269880 | 0.0519512 | 7.434886 | -6.294136 | 0.0003177 | 0.0305658 | strainProAla - strainCtrl |
| Q96VH4 | 0.4510132 | 0.0725882 | 7.473710 | 6.213317 | 0.0003377 | 0.0317142 | strainProAla - strainCtrl |
| P15992 | -0.3928895 | 0.0650208 | 7.784463 | -6.042521 | 0.0003437 | 0.0317142 | strainProAla - strainCtrl |
| P46367 | 0.2843714 | 0.0457536 | 7.209513 | 6.215280 | 0.0003898 | 0.0352539 | strainProAla - strainCtrl |
| Q12031 | 1.1321548 | 0.1778614 | 6.909355 | 6.365376 | 0.0004005 | 0.0355090 | strainProAla - strainCtrl |
| P38699 | 0.4723628 | 0.0778041 | 7.290098 | 6.071179 | 0.0004314 | 0.0357706 | strainProAla - strainCtrl |
| P26364 | 1.3354185 | 0.2248292 | 7.546286 | 5.939702 | 0.0004331 | 0.0357706 | strainProAla - strainCtrl |
| P47164 | -0.3577568 | 0.0619302 | 7.848219 | -5.776770 | 0.0004467 | 0.0357706 | strainProAla - strainCtrl |
| P07258 | -0.2380199 | 0.0410498 | 7.761790 | -5.798314 | 0.0004544 | 0.0357706 | strainProAla - strainCtrl |
| Q04341 | 0.4320328 | 0.0753174 | 7.908442 | 5.736166 | 0.0004547 | 0.0357706 | strainProAla - strainCtrl |
| P22134 | 0.3430047 | 0.0598458 | 7.912320 | 5.731471 | 0.0004563 | 0.0357706 | strainProAla - strainCtrl |
| Q12498 | 0.8913973 | 0.1479166 | 7.251668 | 6.026351 | 0.0004611 | 0.0357706 | strainProAla - strainCtrl |
| P05694 | -0.2632941 | 0.0456569 | 7.779541 | -5.766792 | 0.0004667 | 0.0357706 | strainProAla - strainCtrl |
| P53946 | -0.2783569 | 0.0481680 | 7.672454 | -5.778880 | 0.0004847 | 0.0360746 | strainProAla - strainCtrl |
| P27809 | -0.2000641 | 0.0352523 | 7.912320 | -5.675204 | 0.0004866 | 0.0360746 | strainProAla - strainCtrl |
| Q12001 | -0.2852686 | 0.0467067 | 6.948009 | -6.107663 | 0.0005018 | 0.0365958 | strainProAla - strainCtrl |
| P19882 | 0.3103866 | 0.0539485 | 7.561603 | 5.753382 | 0.0005259 | 0.0377453 | strainProAla - strainCtrl |
| P53133 | -0.3015024 | 0.0539455 | 7.883584 | -5.589019 | 0.0005444 | 0.0380712 | strainProAla - strainCtrl |
| P40048 | 0.4075567 | 0.0720391 | 7.690333 | 5.657440 | 0.0005502 | 0.0380712 | strainProAla - strainCtrl |
| P24869 | -0.7569742 | 0.1338608 | 7.662168 | -5.654937 | 0.0005590 | 0.0380712 | strainProAla - strainCtrl |
| P25491 | 0.2902356 | 0.0485714 | 6.967679 | 5.975438 | 0.0005656 | 0.0380712 | strainProAla - strainCtrl |
| Q08422 | 0.5614367 | 0.0962914 | 7.224849 | 5.830602 | 0.0005725 | 0.0380712 | strainProAla - strainCtrl |
| P47093 | 0.3475094 | 0.0630101 | 7.894610 | 5.515135 | 0.0005902 | 0.0386782 | strainProAla - strainCtrl |
| P21954 | -0.2371367 | 0.0433249 | 7.912320 | -5.473449 | 0.0006149 | 0.0397212 | strainProAla - strainCtrl |
| Q12329 | 0.4822990 | 0.0867309 | 7.635006 | 5.560869 | 0.0006292 | 0.0400733 | strainProAla - strainCtrl |
| P38891 | -0.3311179 | 0.0616490 | 7.912320 | -5.371016 | 0.0006939 | 0.0435599 | strainProAla - strainCtrl |
| P36007 | -0.2566658 | 0.0473714 | 7.751904 | -5.418163 | 0.0007032 | 0.0435599 | strainProAla - strainCtrl |
| P18544 | -0.2805208 | 0.0531767 | 7.912320 | -5.275261 | 0.0007779 | 0.0475341 | strainProAla - strainCtrl |
| P38910 | 0.2573712 | 0.0438250 | 6.435684 | 5.872701 | 0.0008387 | 0.0495089 | strainProAla - strainCtrl |
| P32332 | -0.2336949 | 0.0444461 | 7.771827 | -5.257942 | 0.0008415 | 0.0495089 | strainProAla - strainCtrl |
| P53885 | 0.2064917 | 0.0393138 | 7.783327 | 5.252401 | 0.0008430 | 0.0495089 | strainProAla - strainCtrl |
Volcanoplots
plotVolcano(inferences) +
facet_wrap(~contrast) +
labs(title = "Non-enriched")
inferences |>
filter(adjPval < alpha) |>
mutate(DA = sign(logFC) |> as.factor() |> recode("-1"= "down","1" = "up")) |>
group_by(contrast, DA) |>
ggplot(aes(x = contrast)) +
geom_bar(aes(fill = factor(DA)),
colour = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
We notice that many proteins are DA at the nominal 5% FDR-level.
Heatmaps
lapply(colnames(L),
function(contrast, se, alpha)
{
sig <- rowData(se)[[contrast]] |>
filter(adjPval < alpha) |>
rownames()
if (length(sig) > 2)
{
quants <- t(scale(t(assay(se[sig,]))))
colnames(quants) <- se$sampleId #specific to this dataset to get short colnames
rowclushlp <- quants
rowclushlp[is.na(rowclushlp)] <- min(quants,na.rm=TRUE) - 2
rowclus <- hclust(dist(rowclushlp))
annotations <- columnAnnotation(
group = se$strain
) #3.
set.seed(1234) ## annotation colours are randomly generated by default
return(
Heatmap(show_row_names = FALSE,
quants,
name = "log2 intensity",
top_annotation = annotations,
column_title = paste0(contrast, " = 0"),
cluster_rows = rowclus
)
)
} else return(ggplot() + theme_minimal() + ggtitle(paste0(contrast, " = 0")))
},
se = getWithColData(qf, "proteins"),
alpha = alpha)[[1]]

[[2]]

[[3]]

Detail plots
We can explore the data for a protein to validate the statistical inference results. For example, let’s explore the normalised precursor and the summarised protein intensities for the protein with the most significant log2 fold change.
(target_feature <- inferences |>
dplyr::slice(which.min(pval)) |>
pull(feature)
)[1] "Q12068"
inferences |>
filter(feature == target_feature) logFC se df t
strainCtrl.Q12068 -1.2550208 0.04609818 7.91232 -27.22495
strainProAla.Q12068 -0.6165566 0.04609818 7.91232 -13.37486
strainProAla - strainCtrl.Q12068 0.6384642 0.04609818 7.91232 13.85010
pval adjPval
strainCtrl.Q12068 4.193834e-09 0.0000189771
strainProAla.Q12068 1.033928e-06 0.0046826604
strainProAla - strainCtrl.Q12068 7.923812e-07 0.0008957869
contrast feature
strainCtrl.Q12068 strainCtrl Q12068
strainProAla.Q12068 strainProAla Q12068
strainProAla - strainCtrl.Q12068 strainProAla - strainCtrl Q12068
To obtain the required data, we perform a little data manipulation pipeline:
- We use the QFeatures subsetting functionality to retrieve all data related to and focusing on the precursors_log and proteins sets that contains the peptide ion data used for model fitting.
- We then convert the data with longForm() for plotting.
- Finally, we plot the log2 normalised intensities for each sample at the protein and at the peptide level. Since multiple peptides are recorded for the protein, we link peptides across samples using a grey line. Samples are colored according to strain.
qf[target_feature, , c("precursors_log","precursors_norm", "proteins")] |> #1
longForm(colvars = colnames(colData(qf))) |> #2
data.frame() |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
ggtitle(target_feature) +
theme_minimal() +
theme(axis.text.x = element_blank())Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 36 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
Warning: Removed 16 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 58 rows containing missing values or values outside the scale range
(`geom_point()`).

qf[target_feature, , c("precursors_log","precursors_norm", "proteins")] |> #1
longForm(colvars = colnames(colData(qf))) |> #2
data.frame() |>
filter(!is.na(value)) %>%
{
ggplot(.) +
aes(x = strain,
y = value) +
geom_boxplot(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
geom_jitter(aes(shape = rowname)) +
scale_shape_manual(values = seq_len(dplyr::n_distinct(.$rowname))) +
ggtitle(target_feature) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
guides(shape = "none")
}harmonizing input:
removing 36 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

Compare phospo-precursor level intensities to protein level intensities
We compare the intensities for the top 5 DA phospo-precursors for the first contrast to these of their corresponding protein in the non-enriched assay.
contr = colnames(L)[1]
top5 <- inferencesPTM |>
filter(contrast == contr) |>
arrange(pval) |>
head(n = 5) |>
pull(feature)
for (feat in top5)
{
ptm_data <- qf[,,c("precursorsPTM_norm")] |>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(rowname==feat)
feature_protein <- ptm_data |>
pull("Protein.Group") |>
unique()
prot_data <- qf[,,"proteins"]|>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(Protein.Group==feature_protein)
ptm_protein <- rbind(ptm_data, prot_data)
ylims <- ptm_protein |>
group_by(assay) |>
summarise(cent = mean(range(value,na.rm=TRUE)), ampl = diff(range(value,na.rm=TRUE))) |>
mutate(lower = cent - max(ampl)/2,
upper = cent + max(ampl)/2) |>
select(-c(cent, ampl))
comparison_plot <- ptm_protein |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
labs(
title = paste0(feat," / ", feature_protein),
subtitle = paste("padj PTM =",
inferencesPTM |>
filter(feature == feat & contrast == contr) |>
pull(adjPval) |>
round(digits = 3))
) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
ggh4x::facetted_pos_scales(
y = list(
assay == ylims$assay[1] ~ scale_y_continuous(limits = unlist(ylims[1,c("lower","upper")])),
assay == ylims$assay[2] ~ scale_y_continuous(limits = unlist(ylims[2,c("lower","upper")])),
assay == ylims$assay[3] ~ scale_y_continuous(limits = unlist(ylims[3,c("lower","upper")]))
)
)
print(comparison_plot)
}




#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11711072 625.5 17409510 929.8 NA 17409510 929.8
Vcells 31212893 238.2 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11711071 625.5 17409510 929.8 NA 17409510 929.8
Vcells 31212925 238.2 82281340 627.8 24576 312076398 2381.0
We see for multiple top DA phospho-precursors, that the reason for DA seems to be associated with the similar changes in overall protein abundance. Interestingly for other phospho-precursors the abundance of the precursor changes between strains while the abundances at the protein-level remains stable between strains. Hence, there is a strong need to correct the phospho-precursor level fold changes/ abundances for corresponding changes at the protein level.
13.8 Data Modeling of phospho-peptidoform usages
13.8.1 Background
Differentially abundant PTMs can arise either from changes in PTM regulation or simply from changes in the abundance of the underlying protein.
To distinguish PTM-specific regulation from protein-level changes, we extend the concept of differential usage (DU) to PTMs:
\[ \log_2 DU = \log_2 FC^\text{PTM} - \log_2 FC^\text{protein}, \]
where \(log_2 FC^\text{PTM}\) is the fold change of the modified precursor/PTM and \(log_2 FC^\text{protein}\) is the fold change of the corresponding protein.
In other words, differential usage represents the differential abundance (DA) of a PTM after correcting for changes in protein abundance.
This metric highlights PTMs whose abundance changes cannot be explained solely by changes in the total protein level, thereby prioritising PTMs that may be under specific regulatory control.
Two experimental designs are commonly used to quantify precursor-level (PTM-level) and protein-level abundances:
Paired design: PTM and protein abundances are measured in the same biological replicate (sample).
Unpaired design: PTM and protein abundances are measured in independent biological replicates (different samples).
Paired samples
Perform regular data processing at the protein-level:
- log2-transformation,
- normalisation and
- summarisation
Calculate relative log2 transformed abundances at peptidoform level \[y^\text{pep,rel}_i = y^\text{pep}_i - y^\text{prot}_i\]
Perform conventional differential msqrob2 analysis on the relative peptidoform level. \(\rightarrow\) Immediately gives us diferential usages!
Optionally
- aggregate relative log2 transformed abundances at peptidoform level to PTM-level
- perform a differential msqrob2 analysis at PTM-level
- The paired design has the advantage that both metrics are estimated on the same biorepeat/sample, which enables a better estimation of the standard error on the usages.
Unpaired PTM-level and protein samples
This is the default MSstatsTMT approach, which they also use for paired designs?!
- Perform regular msqrob2 differential analysis at the protein-level
- Perform regular msqrob2 differential analysis at the peptidoform (PTM-level)
- Calculate usages
\[ \begin{array}{ccc} \log_2 \widehat{DU} & =& \log_2 \widehat{FC}^\text{PTM} - log_2 \widehat{FC}^\text{protein}\\ SE_\widehat{DU} &=& \sqrt{(SE_\widehat{FC}^{PTM})^2 + (SE_\widehat{FC}^{protein})^2} \end{array} \]
Our study
For this dataset we have phospho-enriched and non-enriched runs on the same biological samples. So we have a paired design!
13.8.2 Paired samples: calculate usages
We can calculate the usages for every precursor ion by subtracting the corresponding log-transformed protein level abundance values from the log2 transformed and normalized precursor ion intensities.
We first develop a function that will extract the corresponding protein level abundance values.
It has following inputs:
se: summarised experiment of the log2-transformed and normalized precursor ionseNE: summarised experiment of the log2-transformed, normalized and aggregated protein level abundances.fcol: annotation column with protein name for the precursor ionsfcolNE: annotation column with protein name for the protein level abundancesmatchColsSeToseNE: a vector that matches the columns ofseNEto these inse.
The function
- Initiates the matrix with log2 tranformed correction factors at NA.
- Matches the rows of se with these of seNE using the annotation columns
- Extracts the protein abundance values for the precursors from the matching rows and columns in the protein level assay.
calculate_usage_nfLog <- function(se,
seNE,
fcol = "Protein.Group",
fcolNE = "Protein.Group",
matchColsSeToseNE = 1:ncol(se)
)
{
nf <- assay(se); nf[,] <- NA #1.
matchSeRowsToSeNERows <- match(
rowData(se)[[fcol]],
rowData(seNE)[[fcolNE]]) #2.
idsMatch <- which(!is.na(matchSeRowsToSeNERows))
nf[idsMatch, ] <- assay(seNE)[matchSeRowsToSeNERows[idsMatch], matchColsSeToseNE] #3.
return(nf)
}Here we
- Match the samples of the phosho enriched samples to these of the non-enriched assay.
- Calculate the correction factors
- Make a new assay with usages by sweeping out the log2 protein abundances from from the log2 precursor abundances.
matchedSamps <- match(
getWithColData(qf,"precursorsPTM_norm")$sampleId,
getWithColData(qf,"proteins")$sampleId
) ## 1.
nfLog <- calculate_usage_nfLog(qf[["precursorsPTM_norm"]],
qf[["proteins"]],
fcol= "Protein.Group",
fcolNE = "Protein.Group",
matchColsSeToseNE = matchedSamps) ## 2.
qf <- sweep( #4. Subtract log2 norm factor elementwise (MARGIN = 1:2)
qf,
MARGIN = 1:2,
STATS = nfLog,
i = "precursorsPTM_norm",
name = "precursorsPTM_usage"
) ## 3. We only keep precursors for which the usage can be calculated in at least 6 samples.
n <- ncol(qf[["precursorsPTM_usage"]])
qf <- filterNA(qf, i = "precursorsPTM_usage", pNA = (n - nObs) / n)We assess the distribution of the usages.
qf[, , "precursorsPTM_usage"] |> #1.
longForm(colvars = colnames(colData(qf))) |> #2.
data.frame() |>
filter(!is.na(value)) |>
ggplot() + #3.
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
labs(subtitle = "Usage") +
theme_minimal()harmonizing input:
removing 63 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11726629 626.3 17409510 929.8 NA 17409510 929.8
Vcells 36282732 276.9 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 11726634 626.3 17409510 929.8 NA 17409510 929.8
Vcells 36282774 276.9 82281340 627.8 24576 312076398 2381.0
13.8.3 Model estimation
Again we can estimate the same model but now for the usages.
qf <- msqrob(
qf,
i = "precursorsPTM_usage",
formula = model,
robust = TRUE,
overwrite = TRUE)We enabled M-estimation (robust = TRUE) for improved robustness against outliers.
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12347223 659.5 22260250 1188.9 NA 17409510 929.8
Vcells 37829028 288.7 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12347225 659.5 22260250 1188.9 NA 17409510 929.8
Vcells 37829065 288.7 82281340 627.8 24576 312076398 2381.0
13.8.4 Inference
Contrasts remain the same. We assess the contrast for each precursor.
qf <- hypothesisTest(qf, i = "precursorsPTM_usage", contrast = L, overwrite = TRUE)We extract the results table from the proteins summarised experiment in the qf object.
inferencesUsage <-
msqrobCollect(qf[["precursorsPTM_usage"]], L)#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12347304 659.5 22260250 1188.9 NA 22260250 1188.9
Vcells 38667439 295.1 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12347300 659.5 22260250 1188.9 NA 22260250 1188.9
Vcells 38667466 295.1 82281340 627.8 24576 312076398 2381.0
13.8.5 Report results
We report the results using a results table, volcano plots and heatmaps.
Results table
for (j in colnames(L)) {
inference <- inferencesUsage |>
dplyr::filter(adjPval < alpha & contrast == j)
cat("**Median - Contrast:**", j, "= 0 (", nrow(inference),
"significant proteins)\n\n")
cat('<div style="max-height:300px; overflow-y:auto;">')
print(
kable(
inference |>
dplyr::arrange(pval) |>
dplyr::relocate(feature),
row.names = FALSE
)
)
cat('</div>')
cat("\n\n\n---\n\n")
}Median - Contrast: strainCtrl = 0 ( 1 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| IGDS(UniMod:21)LQGSPQR2 | -3.08032 | 0.1606909 | 6.847071 | -19.16922 | 3e-07 | 0.0075445 | strainCtrl |
Median - Contrast: strainProAla = 0 ( 3 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| AVQES(UniMod:21)DS(UniMod:21)TTS(UniMod:21)RIIEEHESPIDAEK3 | -4.050533 | 0.1888981 | 6.011634 | -21.44295 | 7.0e-07 | 0.0149318 | strainProAla |
| (UniMod:1)SDS(UniMod:21)EVNQEAKPEVK2 | 1.977204 | 0.1246825 | 6.131480 | 15.85790 | 3.3e-06 | 0.0373874 | strainProAla |
| S(UniMod:21)DSEVNQEAKPEVK3 | 1.902436 | 0.1575761 | 7.011634 | 12.07312 | 6.0e-06 | 0.0456029 | strainProAla |
Median - Contrast: strainProAla - strainCtrl = 0 ( 9 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|---|---|---|---|---|---|---|
| IGDS(UniMod:21)LQGSPQR2 | 3.033408 | 0.1791169 | 6.847071 | 16.935351 | 8.00e-07 | 0.0084048 | strainProAla - strainCtrl |
| AVQES(UniMod:21)DS(UniMod:21)TTS(UniMod:21)RIIEEHESPIDAEK3 | -3.757376 | 0.1888981 | 6.011634 | -19.891019 | 1.00e-06 | 0.0084048 | strainProAla - strainCtrl |
| GQVVSEEQRPGT(UniMod:21)PLFTVK3 | 2.422281 | 0.1512500 | 6.834792 | 16.015077 | 1.10e-06 | 0.0084048 | strainProAla - strainCtrl |
| (UniMod:1)SDEEHTFENADAGAS(UniMod:21)ATYPMQC(UniMod:4)SALR4 | -7.050728 | 0.5441540 | 6.011634 | -12.957229 | 1.28e-05 | 0.0423083 | strainProAla - strainCtrl |
| EIVFAS(UniMod:21)PPRK2 | 1.515572 | 0.1633213 | 8.011634 | 9.279692 | 1.46e-05 | 0.0423083 | strainProAla - strainCtrl |
| LIDENATENGLAGS(UniMod:21)PKDEDGIIM(UniMod:35)TNK3 | -3.627986 | 0.2824747 | 5.932809 | -12.843578 | 1.49e-05 | 0.0423083 | strainProAla - strainCtrl |
| HSRPLS(UniMod:21)IS(UniMod:21)ST(UniMod:21)TPLDLQR3 | 5.012028 | 0.3049740 | 5.011634 | 16.434281 | 1.49e-05 | 0.0423083 | strainProAla - strainCtrl |
| (UniMod:1)SDS(UniMod:21)EVNQEAKPEVK2 | 2.056253 | 0.1691033 | 6.131480 | 12.159746 | 1.60e-05 | 0.0423083 | strainProAla - strainCtrl |
| RAT(UniMod:21)YAGFLLADPK3 | 2.699517 | 0.2614041 | 7.011634 | 10.326987 | 1.71e-05 | 0.0423083 | strainProAla - strainCtrl |
Volcanoplots
plotVolcano(inferencesUsage) +
facet_wrap(~contrast) +
labs(title = "Usage")
inferencesUsage |>
filter(adjPval < alpha) |>
mutate(DA = sign(logFC) |> as.factor() |> recode("-1"= "down","1" = "up")) |>
group_by(contrast, DA) |>
ggplot(aes(x = contrast)) +
geom_bar(aes(fill = factor(DA)),
colour = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
We observe that the number of significant precursors reduced drastically.
Heatmaps
lapply(colnames(L),
function(contrast, se, alpha)
{
sig <- rowData(se)[[contrast]] |>
filter(adjPval < alpha) |>
rownames()
if (length(sig) > 2)
{
quants <- t(scale(t(assay(se[sig,]))))
colnames(quants) <- se$sampleId #specific to this dataset to get short colnames
rowclushlp <- quants
rowclushlp[is.na(rowclushlp)] <- min(quants,na.rm=TRUE) - 2
rowclus <- hclust(dist(rowclushlp))
annotations <- columnAnnotation(
group = se$strain
) #3.
set.seed(1234) ## annotation colours are randomly generated by default
return(
Heatmap(show_row_names = FALSE,
quants,
name = "log2 intensity",
top_annotation = annotations,
column_title = paste0(contrast, " = 0"),
cluster_rows = rowclus
)
)
} else return(ggplot() + theme_minimal() + ggtitle(paste0(contrast, " = 0")))
},
se = getWithColData(qf, "precursorsPTM_usage"),
alpha = alpha)Detail plots
We can explore the data for a protein to validate the statistical inference results. For example, let’s explore the normalised precursor abundance, usages as well as the summarised protein intensities for the precursor with the most significant log2 usage.
(target_feature <- inferencesUsage |>
dplyr::slice(which.min(pval)) |>
pull(feature)
)[1] "IGDS(UniMod:21)LQGSPQR2"
inferencesUsage |>
filter(feature == target_feature) logFC se df
strainCtrl.IGDS(UniMod:21)LQGSPQR2 -3.080320 0.1606909 6.847071
strainProAla.IGDS(UniMod:21)LQGSPQR2 -0.046912 0.1771466 6.847071
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 3.033408 0.1791169 6.847071
t pval
strainCtrl.IGDS(UniMod:21)LQGSPQR2 -19.1692177 3.329132e-07
strainProAla.IGDS(UniMod:21)LQGSPQR2 -0.2648202 7.989425e-01
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 16.9353506 7.656034e-07
adjPval
strainCtrl.IGDS(UniMod:21)LQGSPQR2 0.007544480
strainProAla.IGDS(UniMod:21)LQGSPQR2 0.999480357
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 0.008404769
contrast
strainCtrl.IGDS(UniMod:21)LQGSPQR2 strainCtrl
strainProAla.IGDS(UniMod:21)LQGSPQR2 strainProAla
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 strainProAla - strainCtrl
feature
strainCtrl.IGDS(UniMod:21)LQGSPQR2 IGDS(UniMod:21)LQGSPQR2
strainProAla.IGDS(UniMod:21)LQGSPQR2 IGDS(UniMod:21)LQGSPQR2
strainProAla - strainCtrl.IGDS(UniMod:21)LQGSPQR2 IGDS(UniMod:21)LQGSPQR2
We again make a similar plot as before.
ptm <- qf[target_feature, , c("precursorsPTM_norm", "precursorsPTM_usage")] |> #1
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |> #2
data.frame()Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 54 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
protein <- qf[unique(ptm$Protein.Group), , c("proteins")] |> #1
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |> #2
data.frame()Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 63 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
ptm_protein <- rbind(ptm, protein)
ylims <- ptm_protein |>
group_by(assay) |>
summarise(cent = mean(range(value,na.rm=TRUE)), ampl = diff(range(value,na.rm=TRUE))) |>
mutate(lower = cent - max(ampl)/2,
upper = cent + max(ampl)/2) |>
select(-c(cent, ampl))
outplot <- ptm_protein |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
ggh4x::facetted_pos_scales(
y = list(
assay == ylims$assay[1] ~ scale_y_continuous(limits = unlist(ylims[1,c("lower","upper")])),
assay == ylims$assay[2] ~ scale_y_continuous(limits = unlist(ylims[2,c("lower","upper")])),
assay == ylims$assay[3] ~ scale_y_continuous(limits = unlist(ylims[3,c("lower","upper")]))
)
) +
labs(subtitle = target_feature) +
theme_minimal() +
theme(axis.text.x = element_blank())
print(outplot)Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

rbind(ptm, protein) %>%
{
ggplot(.) +
aes(x = strain,
y = value) +
geom_boxplot(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
geom_jitter(aes(shape = rowname)) +
scale_shape_manual(values = seq_len(dplyr::n_distinct(.$rowname))) +
ggtitle(target_feature) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
guides(shape = "none")
}Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_point()`).

We compare the intensities for the top 5 DA phospo-precursors for the first contrast to these of their corresponding protein in the non-enriched assay and to the usages.
contr = colnames(L)[1]
top5 <- inferencesPTM |>
filter(contrast == contr) |>
arrange(pval) |>
head(n = 5) |>
pull(feature)
for (feat in top5)
{
ptm_data <- qf[,,c("precursorsPTM_norm","precursorsPTM_usage")] |>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(rowname==feat)
feature_protein <- ptm_data |>
pull("Protein.Group") |>
unique()
prot_data <- qf[,,"proteins"]|>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(Protein.Group==feature_protein)
ptm_protein <- rbind(ptm_data, prot_data)
ylims <- ptm_protein |>
group_by(assay) |>
summarise(cent = mean(range(value,na.rm=TRUE)), ampl = diff(range(value,na.rm=TRUE))) |>
mutate(lower = cent - max(ampl)/2,
upper = cent + max(ampl)/2) |>
select(-c(cent, ampl))
comparison_plot <- ptm_protein |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
labs(
title = paste0(feat," / ", feature_protein),
subtitle = paste("padj PTM =",
inferencesPTM |>
filter(feature == feat & contrast == contr) |>
pull(adjPval) |>
round(digits = 3),
"padj usage =",
inferencesUsage |>
filter(feature == feat & contrast == contr) |>
pull(adjPval) |>
round(digits = 3))
) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
ggh4x::facetted_pos_scales(
y = list(
assay == ylims$assay[1] ~ scale_y_continuous(limits = unlist(ylims[1,c("lower","upper")])),
assay == ylims$assay[2] ~ scale_y_continuous(limits = unlist(ylims[2,c("lower","upper")])),
assay == ylims$assay[3] ~ scale_y_continuous(limits = unlist(ylims[3,c("lower","upper")]))
)
)
print(comparison_plot)
}




We compare the intensities for the top 5 DU phospo-precursors for the first contrast to these of their corresponding protein in the non-enriched assay and to their unadjusted abundances.
contr = colnames(L)[1]
top5 <- inferencesUsage |>
filter(contrast == contr) |>
arrange(pval) |>
head(n = 5) |>
pull(feature)
for (feat in top5)
{
ptm_data <- qf[,,c("precursorsPTM_norm","precursorsPTM_usage")] |>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(rowname==feat)
feature_protein <- ptm_data |>
pull("Protein.Group") |>
unique()
prot_data <- qf[,,"proteins"]|>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |>
data.frame() |>
filter(Protein.Group==feature_protein)
ptm_protein <- rbind(ptm_data, prot_data)
ylims <- ptm_protein |>
group_by(assay) |>
summarise(cent = mean(range(value,na.rm=TRUE)), ampl = diff(range(value,na.rm=TRUE))) |>
mutate(lower = cent - max(ampl)/2,
upper = cent + max(ampl)/2) |>
select(-c(cent, ampl))
comparison_plot <- ptm_protein |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
labs(
title = paste0(feat," / ", feature_protein),
subtitle = paste("padj PTM =",
inferencesPTM |>
filter(feature == feat & contrast == contr) |>
pull(adjPval) |>
round(digits = 3),
"padj usage =",
inferencesUsage |>
filter(feature == feat & contrast == contr) |>
pull(adjPval) |>
round(digits = 3))
) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
ggh4x::facetted_pos_scales(
y = list(
assay == ylims$assay[1] ~ scale_y_continuous(limits = unlist(ylims[1,c("lower","upper")])),
assay == ylims$assay[2] ~ scale_y_continuous(limits = unlist(ylims[2,c("lower","upper")])),
assay == ylims$assay[3] ~ scale_y_continuous(limits = unlist(ylims[3,c("lower","upper")]))
)
)
print(comparison_plot)
}




#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12348541 659.5 22260250 1188.9 NA 22260250 1188.9
Vcells 35544537 271.2 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12348543 659.5 22260250 1188.9 NA 22260250 1188.9
Vcells 35544574 271.2 82281340 627.8 24576 312076398 2381.0
13.9 Differential usage analysis at the PTM level
Multiple precursors are carrying the same PTM (cover the PTM at a certain position in the protein). We can aggregate them together.
13.9.1 Data processing
The variable Protein.Sites in the DIA-NN parquet file contains information on the exact modified residue position in the full protein sequence.
We first will make a new assay with duplicates for the precursors that contain multiple phospho sites.
- We extract the usage assay
- We extract the row data with the original variables (drop msqrob2 colums)
- Add a new variable with the modification type, which can derived from the variable
Modified.Sequence. Indeed, all modifications start with string “UniMod:” followed by a number, i.e. 21 is a phoshorylation. So we extract all patterns “UniMod:[0-9]+” (UniMod:to match the literal text “UniMod:”,[0-9]to match any digit (0–9) and+to match one or more digits. We then replace patternUniprot:with an empty string. - Add a new variable with the location of the modification on the original protein, which can be extracted from the variable
Protein.Sites. Indeed, this variables contains the residu and its position in the protein sequence. If the sequence contains multiple modifications, the sites are separated with a,. Regex(?<=:)[^\]]+extracts all characters immediately after the colon without returning the colon and ignores], e.g. for “[P52871:A2,S4]” it will return “A2,S4”. The resulting string is then splitted and subsequently the residue (first character) is removed. - Replace the rowData of se
- Make vector of the row numbers with duplicates for the sequences containing multiple PTMs
- Make a corresponding vector with the positions of the individual PTM sites
- Make a corresponding vector with the modification type
- Extract the ids of rows that correspond to phospho sites
- Make a new summarised experiment with duplicate entries for precursors with multiple phospho sites.
- Add the position of the phospho site
- Add a variable with the Protein.Group name and the phospho site position attached to it.
- Make the rownames unique by appending the phospho site position to it.
- Add the assay to the QFeatures object.
se <- qf[["precursorsPTM_usage"]] #1.
rd <- rowData(se)[,names(rowData(qf[["precursorsPTM"]]))] |> ##2.
as.data.frame() |>
mutate(
mods = str_extract_all(Modified.Sequence,
"UniMod:[0-9]+") |>
lapply(FUN = gsub, pattern="UniMod:",replacement=""), ##3.
pos = str_extract(Protein.Sites,
"(?<=:)[^\\]]+") |>
strsplit(split = ",") |>
lapply(FUN=substr,start=2,stop=10000) |>
lapply(FUN=as.integer) ##4.
)
rowData(se) <- rd ##5.
ids <- rep(rd |>
nrow() |>
seq_len(),
rd |>
pull(pos) |>
sapply(FUN = length))##6.
pos <- unlist(rd$pos) ##7.
mod <- unlist(rd$mods) ##8.
phosIds <- ids[mod == "21"] ##9.
seDup <- se[phosIds,] ##10.
rowData(seDup)$pos <- pos[mod == "21"] ##11.
rowData(seDup)$Protein.Group.Mod <- paste(
rowData(seDup)$Protein.Group,
rowData(seDup)$pos, sep = "_") ##12.
rownames(seDup) <- paste(rownames(seDup), rowData(seDup)$pos, sep = "_") ##13.
qf <- addAssay(qf, seDup, "precursorsPTM_usage_unnested") ##14.We now aggregate the data of precursors that map to the same PTM position on a protein in one protein expression value.
qf <- aggregateFeatures(qf,
i = "precursorsPTM_usage_unnested",
fcol = "Protein.Group.Mod",
name = "ptm_usage",
fun = function(X, ...) iq::maxLFQ(X)$estimate
)Your quantitative data contain missing values. Please read the relevant
section(s) in the aggregateFeatures manual page regarding the effects
of missing values on data aggregation.
Aggregated: 1/1
We assess the distribution of the PTM usages.
qf[, , "ptm_usage"] |> #1.
longForm(colvars = colnames(colData(qf))) |> #2.
data.frame() |>
filter(!is.na(value)) |>
ggplot() + #3.
aes(x = value,
colour = strain,
group = colname) +
geom_density() +
labs(subtitle = "Usage") +
theme_minimal()harmonizing input:
removing 81 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'

We remove temporarily objects
rm(se, seDup, mod, pos, ids, phosIds)#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12539248 669.7 22260250 1188.9 NA 22260250 1188.9
Vcells 40182702 306.6 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12539238 669.7 22260250 1188.9 NA 22260250 1188.9
Vcells 40182719 306.6 82281340 627.8 24576 312076398 2381.0
13.9.2 Model estimation
Again we can estimate the same model but now for the usages.
qf <- msqrob(
qf,
i = "ptm_usage",
formula = model,
robust = TRUE)We enabled M-estimation (robust = TRUE) for improved robustness against outliers.
#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12960520 692.2 22260250 1188.9 NA 22260250 1188.9
Vcells 41311074 315.2 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 12960522 692.2 22260250 1188.9 NA 22260250 1188.9
Vcells 41311111 315.2 82281340 627.8 24576 312076398 2381.0
13.9.3 Inference
Contrasts remain the same. We assess the contrast for each precursor.
qf <- hypothesisTest(qf, i = "ptm_usage", contrast = L)We extract the results table from the proteins summarised experiment in the qf object.
inferencesPtmUsage <-
msqrobCollect(qf[["ptm_usage"]], L)#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 13025284 695.7 22260250 1188.9 NA 22260250 1188.9
Vcells 42651685 325.5 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 13025280 695.7 22260250 1188.9 NA 22260250 1188.9
Vcells 42651712 325.5 82281340 627.8 24576 312076398 2381.0
13.9.4 Report results
We report the results using a results table, volcano plots and heatmaps.
Results table
for (j in colnames(L)) {
inference <- inferencesPtmUsage |>
dplyr::filter(adjPval < alpha & contrast == j)
cat("**Median - Contrast:**", j, "= 0 (", nrow(inference),
"significant proteins)\n\n")
cat('<div style="max-height:300px; overflow-y:auto;">')
print(
kable(
inference |>
dplyr::arrange(pval) |>
dplyr::relocate(feature),
row.names = FALSE
)
)
cat('</div>')
cat("\n\n\n---\n\n")
}Median - Contrast: strainCtrl = 0 ( 0 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|
Median - Contrast: strainProAla = 0 ( 0 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|
Median - Contrast: strainProAla - strainCtrl = 0 ( 0 significant proteins)
| feature | logFC | se | df | t | pval | adjPval | contrast |
|---|
Volcanoplots
plotVolcano(inferencesPtmUsage) +
facet_wrap(~contrast) +
labs(title = "PTM Usage")
inferencesPtmUsage |>
filter(adjPval < alpha) |>
mutate(DA = sign(logFC) |> as.factor() |> recode("-1"= "down","1" = "up")) |>
group_by(contrast, DA) |>
ggplot(aes(x = contrast)) +
geom_bar(aes(fill = factor(DA)),
colour = "black") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90))
Heatmaps
lapply(colnames(L),
function(contrast, se, alpha)
{
sig <- rowData(se)[[contrast]] |>
filter(adjPval < alpha) |>
rownames()
if (length(sig) > 2)
{
quants <- t(scale(t(assay(se[sig,]))))
colnames(quants) <- se$sampleId #specific to this dataset to get short colnames
rowclushlp <- quants
rowclushlp[is.na(rowclushlp)] <- min(quants,na.rm=TRUE) - 2
rowclus <- hclust(dist(rowclushlp))
annotations <- columnAnnotation(
group = se$strain
) #3.
set.seed(1234) ## annotation colours are randomly generated by default
return(
Heatmap(show_row_names = FALSE,
quants,
name = "log2 intensity",
top_annotation = annotations,
column_title = paste0(contrast, " = 0"),
cluster_rows = rowclus
)
)
} else return(ggplot() + theme_minimal() + ggtitle(paste0(contrast, " = 0")))
},
se = getWithColData(qf, "ptm_usage"),
alpha = alpha)Detail plots
We can explore the data for a protein to validate the statistical inference results. For example, let’s explore the precursor, summarized PTM and summarised protein intensities for the PTM with the most significant log2 fold change.
(target_feature <- inferencesPtmUsage |>
dplyr::slice(which.min(pval)) |>
pull(feature)
)[1] "P32324_763"
inferencesPtmUsage |>
filter(feature == target_feature) logFC se df t
strainCtrl.P32324_763 -1.7844067 0.1620717 6.440118 -11.009983
strainProAla.P32324_763 0.4608437 0.1695649 6.440118 2.717802
strainProAla - strainCtrl.P32324_763 2.2452504 0.1784255 6.440118 12.583687
pval adjPval
strainCtrl.P32324_763 2.052609e-05 0.33882423
strainProAla.P32324_763 3.238282e-02 0.97196697
strainProAla - strainCtrl.P32324_763 8.981049e-06 0.06377349
contrast feature
strainCtrl.P32324_763 strainCtrl P32324_763
strainProAla.P32324_763 strainProAla P32324_763
strainProAla - strainCtrl.P32324_763 strainProAla - strainCtrl P32324_763
ptm <- qf[target_feature, , c("precursorsPTM_usage_unnested", "ptm_usage")] |> #1
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |> #2
data.frame()Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 72 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
precursor <- qf[ptm |>
filter(assay =="precursorsPTM_usage_unnested") |>
pull(rowname) |>
sub(pattern = "_.*", replacement = "") |>
unique(), , "precursorsPTM_norm"] |>
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |> #2
data.frame()Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 81 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
protein <- qf[unique(ptm$Protein.Group), , c("proteins")] |> #1
longForm(colvars = colnames(colData(qf)), rowvars = "Protein.Group") |> #2
data.frame()Warning: 'experiments' dropped; see 'drops()'
harmonizing input:
removing 81 sampleMap rows not in names(experiments)
removing 9 colData rownames not in sampleMap 'primary'
ptm_protein <- rbind(ptm, precursor, protein)
ylims <- ptm_protein |>
group_by(assay) |>
summarise(cent = mean(range(value,na.rm=TRUE)), ampl = diff(range(value,na.rm=TRUE))) |>
mutate(lower = cent - max(ampl)/2,
upper = cent + max(ampl)/2) |>
select(-c(cent, ampl))
ptm_protein |>
filter(!is.na(value)) |>
ggplot() +
aes(x = sampleId,
y = value) +
geom_line(aes(group = rowname), linewidth = 0.1) +
geom_point(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
ggh4x::facetted_pos_scales(
y = list(
assay == ylims$assay[1] ~ scale_y_continuous(limits = unlist(ylims[1,c("lower","upper")])),
assay == ylims$assay[2] ~ scale_y_continuous(limits = unlist(ylims[2,c("lower","upper")])),
assay == ylims$assay[3] ~ scale_y_continuous(limits = unlist(ylims[3,c("lower","upper")])),
assay == ylims$assay[4] ~ scale_y_continuous(limits = unlist(ylims[4,c("lower","upper")]))
)
) +
labs(subtitle = paste0("Protein:", sub("_"," / phosho-position: ", target_feature))) +
theme_minimal() +
theme(axis.text.x = element_blank())
rbind(ptm, precursor, protein) %>%
{
ggplot(.) +
aes(x = strain,
y = value) +
geom_boxplot(aes(colour = strain)) +
facet_wrap(~ assay, scales = "free") +
geom_jitter(aes(shape = rowname)) +
scale_shape_manual(values = seq_len(dplyr::n_distinct(.$rowname))) +
labs(subtitle = paste0("Protein:", sub("_"," / phosho-position: ", target_feature))) +
theme_minimal() +
theme(axis.text.x = element_blank()) +
guides(shape = "none")
}Warning: Removed 10 rows containing non-finite outside the scale range
(`stat_boxplot()`).
Warning: Removed 10 rows containing missing values or values outside the scale range
(`geom_point()`).

#Garbage collection to free space
gc(); gc() used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 13026111 695.7 22260250 1188.9 NA 22260250 1188.9
Vcells 40478472 308.9 82281340 627.8 24576 312076398 2381.0
used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
Ncells 13026110 695.7 22260250 1188.9 NA 22260250 1188.9
Vcells 40478504 308.9 82281340 627.8 24576 312076398 2381.0
13.9.5 Along Protein Mapping
In this section we will make Along Protein graphs. Therefore we need sequence information on the protein sequence, which we can find in the fasta file from uniprot that was used for mapping.
Note, that it is recommended to use the fasta that was used for the mapping. However, this file was not uploaded by the authors so we use the version of the current release for illustration purposes.
- Read fasta file
fastaFile <- bfcrpath(bfc,"https://rest.uniprot.org/uniprotkb/stream?compressed=true&download=true&format=fasta&query=%28%28proteome%3AUP000002311%29+AND+reviewed%3Dtrue%29")
fasta <- seqinr::read.fasta(fastaFile, seqtype = "AA", as.string = T) #2.- Extract the proteins from the fasta file.
# read in fasta file for protein sequence & length -> to calculate the coverage
mapping <- data.frame(fullName = fasta |> names(),
accession = fasta |> names() |> stringr::str_split_i("\\|", 2),
sequence = unlist(fasta)) |>
mutate(length = nchar(sequence))
rownames(mapping) <- mapping$accessionPTM barcode plot
We make a barplot for all phosphorylated residues on protein P32324
We first gather the data:
- We extract the protein name
- we filter all PTMs for that protein
- we add the phospho-position in the rowData to it
- we make a new variable type to indicate if the PTM usage is up, down, non-significant or missing
prot <- sub("_.*","", target_feature) ## 1.
ptmListProt <- inferencesPtmUsage |>
filter(grepl(prot, feature)) |> ## 2.
left_join(rowData(qf[["ptm_usage"]]) |>
as.data.frame() |>
select(Protein.Group.Mod, pos),
by = c("feature" = "Protein.Group.Mod")) |> ## 3.
mutate(type = ifelse(is.na(adjPval),
"missing",
ifelse(adjPval < 0.05,
ifelse(logFC < 0,
"down",
"up"),
"non-sig")) ## 4.
)We make the barplot by adding a geom_point layer and a vertical line (geom_vline) to indicate the residue position.
ptmListProt |>
ggplot(aes(x=pos,y=0,col=type)) +
geom_point() +
geom_vline(aes(xintercept=pos,col=type)) +
xlab("residue") +
ylab("") +
scale_color_manual(values = c("black","grey","blue","red"),
breaks = c("missing","non-sig","down", "up")) +
theme(
axis.ticks.y = element_blank(),
axis.text.y = element_blank(),
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(color = "black", fill = NA),
strip.background = element_rect(color = "black")
) +
xlim(0, mapping[prot,"length"]) +
ggtitle(paste0(prot, ": Phospho residues")) +
facet_grid(contrast~.) 
Along protein plot for all precursors
We first gather the data from all precursors that map to protein P32324
prot <- sub("_.*","", target_feature) ## 1.
precursorListProt <- inferencesUsage |>
left_join(rowData(qf[["precursorsPTM_usage"]]) |>
as.data.frame() |>
select(Precursor.Id, Protein.Group, Stripped.Sequence, Protein.Sites, Modified.Sequence),
by = c("feature" = "Precursor.Id")) |> ## 2.
filter(Protein.Group == prot) |> ## 3.
mutate(seqOrder = as.factor(feature) |> as.numeric(),
type = ifelse(is.na(adjPval),
"missing",
ifelse(adjPval < 0.05,
ifelse(logFC < 0,
"down",
"up"),
"non-sig")), ## 4.
tmp = str_locate_all(mapping[prot,"sequence"], Stripped.Sequence),
start = purrr::map(tmp, ~ .x[, "start"]),
end = purrr::map(tmp, ~ .x[, "end"])) |>
unnest(c(start, end))Add location of Phospo residues.
- Add a new variable with the modification type (21 is phosho)
- Add a new variable with the location of the modification on the original protein
- Make vector of the row numbers with duplicates for the sequences containing multiple PTMs
- Make a corresponding vector with the positions of the individual PTM sites
- Make a corresponding vector with the modification type
- Extract the ids of rows that correspond to phospho sites
- Make a data.frame with duplicate entries for precursors with multiple phospho sites with variables seqOrder, type and contrast
- Add the position of the phospho site
precursorListProt <- precursorListProt |> mutate(
mods = str_extract_all(Modified.Sequence,
"UniMod:[0-9]+") |>
lapply(FUN = gsub, pattern="UniMod:",replacement=""), ##1.
pos = str_extract(Protein.Sites,
"(?<=:)[^\\]]+") |>
strsplit(split = ",") |>
lapply(FUN=substr,start=2,stop=10000) |>
lapply(FUN=as.integer)) ## 2.
ids <- rep(precursorListProt |>
nrow() |>
seq_len(),
precursorListProt |>
pull(pos) |>
sapply(FUN = length))## 3.
pos <- unlist(precursorListProt$pos) ## 4.
mod <- unlist(precursorListProt$mods) ## 5.
phosIds <- ids[mod == "21"] ## 6.
precursorPosListProt <- precursorListProt[phosIds,] |> ## 7.
select(seqOrder,type, contrast) |>
mutate(pos = pos[mod == "21"]) ## 8.Make along protein plot.
precursorListProt |>
ggplot(aes(y=seqOrder)) +
geom_segment(aes(x = start, xend = end, yend = seqOrder, color = type),
linewidth = 1) +
xlim(0, mapping[prot,"length"]) +
ylab("Precursors") + xlab("Residue") + #6.
scale_y_continuous(breaks = precursorListProt$seqOrder, labels = precursorListProt$Stripped.Sequence) +
scale_color_manual(values = c("black","grey","blue","red", "black"),
breaks = c("missing","non-sig","down", "up", "black")) +
geom_segment(data = precursorPosListProt,
aes(x = pos, xend = pos, y = seqOrder - 0.5, yend = seqOrder + 0.5,
color = "black"), linewidth = 1) +
facet_grid(contrast~.) +
theme(
axis.ticks.y = element_blank(),
axis.text.y = element_blank(), #We suppress the feature sequence here because there are too many sequences
panel.grid = element_blank(),
panel.background = element_blank(),
panel.border = element_rect(color = "black", fill = NA),
strip.background = element_rect(color = "black")
) +
labs(title = paste0(prot, " along with precursors"),
subtitle = "(phospho residue in black vertical line)") 