The tidyproteomics data-object • tidyproteomics

The underlying data is simply a list object with the structure detailed below. Fundamentally, quantitative data requires only a minimum of identifying information: the observation (protein_accession), the sample it was observed in, and the sample replicate. All underlying analyses and transformations need only this minimum of information; the rest are simply annotations to the data and are aptly stored in annotations or accounting.

.
├── origin                     string[ProteomeDiscoverer, MaxQuant]
├── analyte                    string[proteins, peptides]
├── identifier                 string[protein_accession]
├── quantitative_source        string[raw, ...]
├── operations                 list
├── experiments                tibble (lcms runs / samples / replicates) 
│   ├── sample_id              string (crc32 hash of import_file and sample_file)
│   ├── import_file
│   ├── sample_file
│   ├── sample
│   └── replicate
├── quantitative               tibble (quantitative values / normalized)
│   ├── sample_id
│   ├── identifier(...)        protein | protein, peptide, modification
│   ├── sample
│   ├── replicate
│   └── abundance_[raw, ...]
├── accounting                 tibble (summary and qualitative values)
│   ├── sample_id
│   ├── identifier(...) 
│   ├── match_between_runs
│   ├── [protein, ...]_group
│   └── num_[peptides, ...]
└── annotations                tibble (annotation terms and groups)
    ├── identifier(...) 
    ├── term
    └── annotation

library(tidyproteomics)
#> 
#> Attaching package: 'tidyproteomics'
#> The following objects are masked from 'package:base':
#> 
#>     expression, merge
str(hela_proteins)
#> List of 9
#>  $ origin             : chr "ProteomeDiscoverer"
#>  $ analyte            : chr "proteins"
#>  $ identifier         : chr "protein"
#>  $ quantitative_source: chr "raw"
#>  $ operations         :List of 1
#>   ..$ : 'glue' chr "Data files (p97KD_HCT116_proteins.xlsx) were imported as proteins from ProteomeDiscoverer"
#>  $ experiments        :Classes 'tbl_df', 'tbl' and 'data.frame': 6 obs. of  5 variables:
#>   ..$ sample_id  : chr [1:6] "9e6ed3ba" "cc56fc1d" "6a21f7a9" "966be57f" ...
#>   ..$ import_file: chr [1:6] "p97KD_HCT116_proteins.xlsx" "p97KD_HCT116_proteins.xlsx" "p97KD_HCT116_proteins.xlsx" "p97KD_HCT116_proteins.xlsx" ...
#>   ..$ sample_file: chr [1:6] "F1" "F4" "F5" "F2" ...
#>   ..$ sample     : chr [1:6] "control" "control" "control" "knockdown" ...
#>   ..$ replicate  : chr [1:6] "1" "2" "3" "1" ...
#>  $ quantitative       :Classes 'tbl_df', 'tbl' and 'data.frame': 42330 obs. of  5 variables:
#>   ..$ sample_id    : chr [1:42330] "9e6ed3ba" "cc56fc1d" "6a21f7a9" "966be57f" ...
#>   ..$ sample       : chr [1:42330] "control" "control" "control" "knockdown" ...
#>   ..$ replicate    : chr [1:42330] "1" "2" "3" "1" ...
#>   ..$ protein      : chr [1:42330] "Q15149" "Q15149" "Q15149" "Q15149" ...
#>   ..$ abundance_raw: num [1:42330] 1.01e+09 1.09e+09 9.81e+08 1.41e+09 1.07e+09 ...
#>  $ accounting         :Classes 'tbl_df', 'tbl' and 'data.frame': 42330 obs. of  7 variables:
#>   ..$ sample_id          : chr [1:42330] "6a21f7a9" "6a21f7a9" "6a21f7a9" "6a21f7a9" ...
#>   ..$ protein            : chr [1:42330] "A0A024R161" "A0A024R1R8" "A0A024R4E5" "A0A024R571" ...
#>   ..$ num_peptides       : num [1:42330] 2 1 43 22 2 10 2 1 1 14 ...
#>   ..$ num_psms           : num [1:42330] 7 5 248 106 7 54 2 4 1 61 ...
#>   ..$ num_unique_peptides: num [1:42330] 2 1 43 20 2 1 2 1 1 14 ...
#>   ..$ protein_group      : chr [1:42330] "A0A024R161" "A0A024R1R8" "A0A024R4E5" "A0A024R571" ...
#>   ..$ imputed            : num [1:42330] 0 0 0 0 0 0 0 1 0 0 ...
#>  $ annotations        :Classes 'tbl_df', 'tbl' and 'data.frame': 52583 obs. of  3 variables:
#>   ..$ protein   : chr [1:52583] "Q15149" "Q15149" "Q15149" "Q15149" ...
#>   ..$ term      : chr [1:52583] "description" "biological_process" "cellular_component" "molecular_function" ...
#>   ..$ annotation: chr [1:52583] "Plectin OS=Homo sapiens OX=9606 GN=PLEC PE=1 SV=3" "cell differentiation;cell growth;cellular homeostasis;coagulation;defense response;metabolic process" "cytoplasm;cytoskeleton;cytosol;endoplasmic reticulum;endosome;nucleus" "motor activity;structural molecule activity" ...
#>  - attr(*, "class")= chr "tidyproteomics"