The tidyproteomics data-object
data-object.Rmd
The underlying data is just a list object with the structure detailed below. Fundamentally, quantitative data requires only a minimum of identifying data - mainly, the observation (protein_accession), the sample it was observed in, and the sample replicate. All underlying analyses and transformations need only this minimum of information; the rest are simply annotations to the data and are aptly stored in annotations or accounting.
.
├── origin string[ProteomeDiscoverer, MaxQuant]
├── analyte string[proteins, peptides]
├── identifier string[protein_accession]
├── quantitative_source string[raw, ...]
├── operations list
├── experiments tibble (lcms runs / samples / replicates)
│ ├── sample_id string (crc32 hash of import_file and sample_file)
│ ├── import_file
│ ├── sample_file
│ ├── sample
│ └── replicate
├── quantitative tibble (quantitative values / normalized)
│ ├── sample_id
│ ├── identifier(...) protein | protein, peptide, modification
│ ├── sample
│ ├── replicate
│ └── abundance_[raw, ...]
├── accounting tibble (summary and qualitative values)
│ ├── sample_id
│ ├── identifier(...)
│ ├── match_between_runs
│ ├── [protein, ...]_group
│ └── num_[peptides, ...]
└── annotations tibble (annotation terms and groups)
  ├── identifier(...)
  ├── term
  └── annotation
library(tidyproteomics)
#>
#> Attaching package: 'tidyproteomics'
#> The following objects are masked from 'package:base':
#>
#> expression, merge
str(hela_proteins)
#> List of 9
#> $ origin : chr "ProteomeDiscoverer"
#> $ analyte : chr "proteins"
#> $ identifier : chr "protein"
#> $ quantitative_source: chr "raw"
#> $ operations :List of 1
#> ..$ : 'glue' chr "Data files (p97KD_HCT116_proteins.xlsx) were imported as proteins from ProteomeDiscoverer"
#> $ experiments :Classes 'tbl_df', 'tbl' and 'data.frame': 6 obs. of 5 variables:
#> ..$ sample_id : chr [1:6] "9e6ed3ba" "cc56fc1d" "6a21f7a9" "966be57f" ...
#> ..$ import_file: chr [1:6] "p97KD_HCT116_proteins.xlsx" "p97KD_HCT116_proteins.xlsx" "p97KD_HCT116_proteins.xlsx" "p97KD_HCT116_proteins.xlsx" ...
#> ..$ sample_file: chr [1:6] "F1" "F4" "F5" "F2" ...
#> ..$ sample : chr [1:6] "control" "control" "control" "knockdown" ...
#> ..$ replicate : chr [1:6] "1" "2" "3" "1" ...
#> $ quantitative :Classes 'tbl_df', 'tbl' and 'data.frame': 42330 obs. of 5 variables:
#> ..$ sample_id : chr [1:42330] "9e6ed3ba" "cc56fc1d" "6a21f7a9" "966be57f" ...
#> ..$ sample : chr [1:42330] "control" "control" "control" "knockdown" ...
#> ..$ replicate : chr [1:42330] "1" "2" "3" "1" ...
#> ..$ protein : chr [1:42330] "Q15149" "Q15149" "Q15149" "Q15149" ...
#> ..$ abundance_raw: num [1:42330] 1.01e+09 1.09e+09 9.81e+08 1.41e+09 1.07e+09 ...
#> $ accounting :Classes 'tbl_df', 'tbl' and 'data.frame': 42330 obs. of 7 variables:
#> ..$ sample_id : chr [1:42330] "6a21f7a9" "6a21f7a9" "6a21f7a9" "6a21f7a9" ...
#> ..$ protein : chr [1:42330] "A0A024R161" "A0A024R1R8" "A0A024R4E5" "A0A024R571" ...
#> ..$ num_peptides : num [1:42330] 2 1 43 22 2 10 2 1 1 14 ...
#> ..$ num_psms : num [1:42330] 7 5 248 106 7 54 2 4 1 61 ...
#> ..$ num_unique_peptides: num [1:42330] 2 1 43 20 2 1 2 1 1 14 ...
#> ..$ protein_group : chr [1:42330] "A0A024R161" "A0A024R1R8" "A0A024R4E5" "A0A024R571" ...
#> ..$ imputed : num [1:42330] 0 0 0 0 0 0 0 1 0 0 ...
#> $ annotations :Classes 'tbl_df', 'tbl' and 'data.frame': 52583 obs. of 3 variables:
#> ..$ protein : chr [1:52583] "Q15149" "Q15149" "Q15149" "Q15149" ...
#> ..$ term : chr [1:52583] "description" "biological_process" "cellular_component" "molecular_function" ...
#> ..$ annotation: chr [1:52583] "Plectin OS=Homo sapiens OX=9606 GN=PLEC PE=1 SV=3" "cell differentiation;cell growth;cellular homeostasis;coagulation;defense response;metabolic process" "cytoplasm;cytoskeleton;cytosol;endoplasmic reticulum;endosome;nucleus" "motor activity;structural molecule activity" ...
#> - attr(*, "class")= chr "tidyproteomics"