Answers to Exercises
4 R Syntax
# John Doe
# 2023-06-02
# Institution Inc.
#
# Some basic R practice
# 3. Calculate the sum of 2 and 3.
sum(2, 3)
# 4. Evaluate if 0.5 is equal to 1 divided by 2.
1 / 2 == 0.5
# 5. Test if 3 is an even number. Hint, use the `round()` or `floor()`
# functions and a comparison operator.
# A number is even when halving it leaves nothing after the decimal point.
floor(3 / 2) == 3 / 2
# 6. Create a function to test if a value is even resulting in `TRUE` or `FALSE`.
# Test whether a value is even.
#   x: a numeric value (a vector is compared element-wise).
# Returns TRUE where half of `x` is a whole number, FALSE otherwise.
even <- function(x) {
  half <- x / 2
  # A whole-number half means x divides by 2 without remainder.
  return(floor(half) == half)
}
# Try the function from #6 on the number three.
even(3)
# 7. Construct an if-else statement to test if the number three is odd or even.
x <- 3
# A non-zero remainder after dividing by 2 marks an odd number.
if (x %% 2 != 0) {
  print('odd')
} else {
  print('even')
}
# 8. Create a function to test for *even* or *odd* by returning a string.
# Classify a single number as odd or even.
#   x: a single numeric value (the result is used as an `if` condition).
# Returns the string 'even' when half of `x` is a whole number, else 'odd'.
oddeven <- function(x) {
  half <- x / 2
  # Guard clause: a fractional half means x is not divisible by 2.
  if (half != floor(half)) {
    return('odd')
  }
  return('even')
}
# Try the function from #8 on the number three.
oddeven(3)
# 9. Construct a for-loop to test, using the function from #8, if the numbers
# between 1 and 9 are odd or even, printing the number and string for each
# on a new line.
x <- 1:9
for (i in x) {
  # One line per number: the number, a tab, then its 'odd'/'even' label.
  cat(
    i,
    "\t",
    oddeven(i),
    "\n"
  )
}
5 R Objects
# John Doe
# 2023-06-02
# Institution Inc.
#
# Data Object Exercises
# 1. Construct the following vector and store as a variable.
str_gbu <- c(
  'red',
  'green',
  'blue'
)
# 2. Extract the 2nd element in the variable.
str_gbu[2]
# 3. Construct a numerical vector of length 5, containing the AREA of circles
# with integer RADIUS 1 to 5. Remember PEMDAS.
# The exponent binds tighter than multiplication, so this is pi * r^2.
area <- pi * (1:5)^2
# 4. Extract all AREA greater than 50.
# A logical mask selects the matching elements.
area[area > 50]
# 5. Create a data.frame consisting of circles with integer RADIUS 1 to 5, and their AREA.
radius <- 1:5
df <- data.frame(radius = radius, area = pi * radius^2)
df
# 6. Extract all AREA greater than 50 from the data.frame.
# `w` holds the row positions whose area exceeds 50.
w <- which(df$area > 50)
df[w, ]
6 Tidyverse
# John Doe
# 2023-06-02
# Institution Inc.
#
# Tidyverse Exercises
# Load the tidyverse before anything else: it attaches the `%>%` pipe as well
# as `tibble()`, `mutate()` and `filter()` used below. Without it, exercise 2
# stops with 'could not find function "%>%"'.
library(tidyverse)
# 1. Calculate the mean of following vector.
vec_data <- c(7.48, 14.15, 6.23, 10.21, 15.13, 8.19, 8.58, 8.09, 9.14, 10.41)
mean(vec_data)
# 2. Pipeline (eg. `%>%`) a data operation that provides the mean of following vector.
vec_data %>% mean()
# 3. Employing a pipeline (eg. `%>%`), construct a tibble with columns named
# `radi` and `area` which contains the AREA of circles with integer RADII 1
# to 5. Remember PEMDAS.
tbl_cir <- tibble(
  radi = 1:5
) %>%
  # `^` is evaluated before `*`, so this is pi * radi^2.
  mutate(area = radi^2 * pi)
tbl_cir
# 4. Extract all AREAs greater than 50.
tbl_cir %>% filter(area > 50)
# 5. Add a column named `circ_type` where you assign the string *odd* or *even*
# depending on the column `radi`. Attempt to use the `purrr::map` function,
# along with the `oddeven()` function from the previous chapter, then compute
# the mean, standard deviation, and coefficient of variation of the AREA for
# each `circ_type`.
# Classify a single number as odd or even using the modulo operator.
#   x: a single numeric value (the result is used as an `if` condition).
# Returns 'even' when `x` leaves no remainder after division by 2, else 'odd'.
oddeven <- function(x) {
  # Guard clause: any non-zero remainder means the value is odd.
  if (x %% 2 != 0) {
    return('odd')
  }
  return('even')
}
# Label every radius as 'odd' or 'even', then summarise AREA per label.
tbl_cir %>%
  # `map_chr()` (same purrr family as `map()`) calls `oddeven()` once per
  # radius and returns a character vector directly. This replaces the
  # list-column + `unlist()` round trip and errors if a call ever returns
  # anything other than a single string, instead of silently misaligning rows.
  mutate(circ_type = map_chr(radi, oddeven)) %>%
  group_by(circ_type) %>%
  summarise(
    area_mean = mean(area),
    area_sd = sd(area),
    # Coefficient of variation: spread relative to the mean. It reuses the two
    # summary columns created just above.
    area_cv = area_sd / area_mean
  )
7 Data Wrangling
# John Doe
# 2023-06-02
# Institution Inc.
#
# Data Wrangling Exercises
# 1. Download the data.
url <- "https://raw.githubusercontent.com/jeffsocal/ASMS_R_Basics/main/data/bacterial-Metabolites_dose-simicillin_messy.xlsx"
# One path for both the download and the read, so the two cannot drift apart.
file_xlsx <- "./data/bacterial-Metabolites_dose-simicillin_messy.xlsx"
# download.file() does not create missing folders, so make sure `data/` exists.
if (!dir.exists("data")) {
  dir.create("data")
}
# An .xlsx workbook is a binary file: `mode = "wb"` requests a binary transfer
# so the file is not altered by text-mode line-ending conversion on Windows.
download.file(url, destfile = file_xlsx, mode = "wb")
# 2. Read in the messy bacteria data and store it as a variable.
library(tidyverse)
library(readxl)
tbl_bac <- file_xlsx %>% read_excel(col_names = TRUE)
# *In all following exercises, pipe the results from the previous exercise into
# the current exercise, creating a single long pipe for data processing.*
# 3. Separate `Culture` column containing culture and dose into `culture` and
# `dose_mg_ml` columns.
tbl_bac %>%
  separate(
    col = Culture,
    into = c("culture", "dose_mg_ml"),
    sep = " dose--"
  )
# 4. Make `dose_mg_ml` column numeric by removing the text and change the column
# data type from character to numeric.
tbl_bac %>%
  separate(
    col = Culture,
    into = c("culture", "dose_mg_ml"),
    sep = " dose--"
  ) %>%
  # Strip the unit suffix, then convert what is left to a number.
  mutate(dose_mg_ml = as.numeric(gsub("-mg/ml", "", dose_mg_ml)))
# 5. Pivot the table from wide to long creating `metabolite`, `time_hr` & `abundance` columns.
tbl_bac %>%
  separate(
    col = Culture,
    into = c("culture", "dose_mg_ml"),
    sep = " dose--"
  ) %>%
  mutate(dose_mg_ml = as.numeric(gsub("-mg/ml", "", dose_mg_ml))) %>%
  # Columns 4 to 13 are stacked into name/value pairs ...
  pivot_longer(
    cols = 4:13,
    names_to = "metabolite_time",
    values_to = "abundance"
  ) %>%
  # ... and each former column name is split at "_runtime_".
  separate(
    col = metabolite_time,
    into = c("metabolite", "time_hr"),
    sep = "_runtime_"
  )
# 6. Make sure `time_hr` contains just hours and not a mixture of days and hours.
# `case_when()` evaluates every right-hand side on ALL rows, so converting with
# `as.numeric(gsub("hr", ...))` raised "NAs introduced by coercion" warnings for
# the day rows (and vice versa). `parse_number()` reads the leading number
# whatever the unit text is, which also keeps the conversion in step with the
# case-insensitive unit detection done by `grepl()`.
tbl_bac %>%
  separate(Culture, c("culture", "dose_mg_ml"), sep = " dose--") %>%
  mutate(dose_mg_ml = gsub("-mg/ml", "", dose_mg_ml)) %>%
  mutate(dose_mg_ml = as.numeric(dose_mg_ml)) %>%
  pivot_longer(cols = 4:13, names_to = "metabolite_time", values_to = "abundance") %>%
  separate(metabolite_time, c("metabolite", "time_hr"), sep = "_runtime_") %>%
  mutate(
    time_hr = case_when(
      # Already in hours: keep the number as is.
      grepl("hr", time_hr, ignore.case = TRUE) ~ parse_number(time_hr),
      # Given in days: convert to hours.
      grepl("day", time_hr, ignore.case = TRUE) ~ parse_number(time_hr) * 24
      # Anything else stays NA, as before.
    )
  )
# 7. Remove the `User` column. See the cheat-sheet here:
# https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
tbl_bac %>%
  separate(Culture, c("culture", "dose_mg_ml"), sep = " dose--") %>%
  mutate(dose_mg_ml = gsub("-mg/ml", "", dose_mg_ml)) %>%
  mutate(dose_mg_ml = as.numeric(dose_mg_ml)) %>%
  pivot_longer(cols = 4:13, names_to = "metabolite_time", values_to = "abundance") %>%
  separate(metabolite_time, c("metabolite", "time_hr"), sep = "_runtime_") %>%
  mutate(
    time_hr = case_when(
      grepl("hr", time_hr, ignore.case = TRUE) ~ parse_number(time_hr),
      grepl("day", time_hr, ignore.case = TRUE) ~ parse_number(time_hr) * 24
    )
  ) %>%
  # A leading minus drops the named column and keeps all others.
  select(-User)
8 Data Visualization
# John Doe
# 2023-06-02
# Institution Inc.
#
# Data Visualization Exercises
# 1. If not already done, download *Bacterial Metabolite Data (tidy)* to use as
# an example data file.
url <- "https://raw.githubusercontent.com/jeffsocal/ASMS_R_Basics/main/data/bacterial-Metabolites_dose-simicillin_tidy.csv"
# One path for both the download and the read.
file_csv <- "./data/bacterial-Metabolites_dose-simicillin_tidy.csv"
# download.file() does not create missing folders, so make sure `data/` exists.
if (!dir.exists("data")) {
  dir.create("data")
}
download.file(url, destfile = file_csv)
# 2. Read in the dataset .csv using the `tidyverse` set of packages.
library(tidyverse)
tbl_bac <- file_csv %>% read_csv()
# 3. Create a Metabolite `Abundance` by `Time_min` ...
ggplot(tbl_bac, aes(x = Time_min, y = Abundance)) +
  geom_point()
# 4. ... facet by `Organism` and `Metabolite`...
# Rows of the grid are metabolites, columns are organisms.
ggplot(tbl_bac, aes(x = Time_min, y = Abundance)) +
  geom_point() +
  facet_grid(Metabolite ~ Organism)
# 4. ... adjust the y-axis to log10, color by `Dose_mg`, and add a 50% transparent line ...
tbl_bac %>%
  # Treat dose as a category so each dose gets its own discrete color.
  mutate(Dose_mg = as.factor(Dose_mg)) %>%
  # Mapping color once here applies it to both the points and the lines.
  ggplot(aes(x = Time_min, y = Abundance, color = Dose_mg)) +
  geom_point() +
  geom_line(alpha = 0.5) +
  scale_y_log10() +
  facet_grid(Metabolite ~ Organism)
# 5. ... change the theme to something publishable, add a title, modify the x-
# and y-axis label, modify the legend title, adjust the x-axis ticks to show
# the actually measured time values, and pick a color scheme that highlights
# the dose value...
tbl_bac %>%
  mutate(Dose_mg = as.factor(Dose_mg)) %>%
  ggplot(aes(x = Time_min, y = Abundance, color = Dose_mg)) +
  geom_point() +
  geom_line(alpha = 0.5) +
  facet_grid(Metabolite ~ Organism) +
  scale_y_log10() +
  # Place a tick at every time point that occurs in the data.
  scale_x_continuous(breaks = unique(tbl_bac$Time_min)) +
  # One color per dose level, in factor-level order.
  scale_color_manual(values = c("grey", "orange", "red")) +
  labs(
    title = 'Bacterial Metabolite monitoring by LCMS in response to antibiotic',
    subtitle = 'Conditions: metered dose of similicillin',
    x = "Time (min)",
    y = "LCMS Abundance",
    color = "Dose (mg)"
  ) +
  theme_classic()
More 5 R Objects
# John Doe
# 2023-06-02
# Institution Inc.
#
# More Data Object Exercises
# Exercise #1 -- Working with Variables
# You are running an LC-MS experiment using a 60 min LC gradient.
# 1.1 Create a variable called gradient_min to hold the length of the gradient
# in minutes.
gradient_min <- 60
# 1.2 Using the gradient length variable you just created, convert it to seconds
# and assign it to a new variable with a meaningful name.
# 60 seconds per minute.
gradient_sec <- 60 * gradient_min
# Exercise #2 -- Working with Vectors
# Continuing from Exercise #1...
# 2.1 Imagine you conducted additional experiments, one with a 15 minute gradient
# and one with a 30 min gradient. Create a vector to hold all three gradient
# times in minutes, and assign it to a new variable.
gradients_min <- c(
  15,
  30,
  60
)
# 2.2 Convert the vector of gradient times to seconds. How does this conversion
# compare to how you did the conversion in Exercise 1?
# Same expression as in 1.2: the multiplication is applied to every element.
gradients_sec <- 60 * gradients_min
# Exercise #3 -- More Practice with Vectors
# 3.1 The following vector represents precursor m/z values for detected features
# from your experiment:
prec_mz <- c(
  968.4759, 812.1599, 887.9829, 338.5294, 510.2720,
  775.3455, 409.2369, 944.0385, 584.7687, 1041.9523
)
# - How many values are there?
length(prec_mz)
# - What is the minimum value? The maximum?
min(prec_mz)
max(prec_mz)
# Exercise #4 -- Vectors and Conditional Expressions
# 4.1 Using the above vector of precursor values, write a conditional expression
# to find the values with m/z < 600. What is returned by this expression? A
# single value or multiple values? A number or something else?
# The comparison yields one TRUE/FALSE per element of the vector.
600 > prec_mz
# 4.2 Use this conditional expression to get the precursor values with m/z < 600
prec_mz[600 > prec_mz]
# 4.3 Consider a new vector of data that contains the charge states of the same
# detected features from above:
prec_z <- c(
  2, 4, 2, 3, 2,
  2, 2, 2, 2, 2
)
# - Write a conditional expression to find which detected features have
# a charge state of 2.
2 == prec_z
# 4.4 Write an expression to get the precursor m/z values for features having
# charge states of 2.
# The mask built from `prec_z` indexes `prec_mz` position by position.
prec_mz[2 == prec_z]
More 6 Tidyverse
# John Doe
# 2023-06-02
# Institution Inc.
#
# More Tidyverse Exercises
library(tidyverse)
# Download the data.
url <- "https://raw.githubusercontent.com/jeffsocal/ASMS_R_Basics/main/data/Choi2017_DDA_Skyline_input.csv.zip"
# One path for both the download and the read.
file_zip <- "./data/Choi2017_DDA_Skyline_input.csv.zip"
# download.file() does not create missing folders, so make sure `data/` exists.
if (!dir.exists("data")) {
  dir.create("data")
}
# A zip archive is binary: request a binary transfer explicitly.
download.file(url, destfile = file_zip, mode = "wb")
# Exercise #1 -- Reading data
# 1.1 Read the example data from a proteomics experiment NOTE: file is a zipped
# .csv file -- R knows how to read it!
# `guess_max` sets how many rows are inspected when guessing column types.
tbl_dda <- read_csv(file_zip, guess_max = 10000)
# Exercise #2 -- Reviewing data frames
# 2.1 Review some basic properties of the data frame
# - How many rows?
nrow(tbl_dda)
# - How many columns?
ncol(tbl_dda)
# - How many rows & columns (use one expression)
dim(tbl_dda)
# - What are the column names?
colnames(tbl_dda)
# - What are the data types stored in each column?
str(tbl_dda)
# What kind of data is present? What is the structure of the data?
# - Use the View function to review the data in RStudio.
# - It appears that some of the data is duplicated across many rows? Look at
# the data column by column and see if you can understand why.
View(tbl_dda)
# Exercise #3 -- Working with data frames
# 3.1 Retrieve the data from the column called "FileName". How many values do
# you expect to get? Write an expression using the data you retrieved to see if
# your guess is correct.
length(tbl_dda[["FileName"]])
# - you'd expect as many values as there are rows
nrow(tbl_dda)
# 3.2 How many unique values of the data from "FileName" are there? What are
# these values and what do they correspond to?
unique(tbl_dda[["FileName"]])
# 3.3 Using data frame indexing syntax, subset the data to rows for the protein
# "sp|P33399|LHP1_YEAST"
# The logical vector before the comma picks rows; the empty slot after it keeps
# every column.
tbl_dda[tbl_dda[["ProteinName"]] == "sp|P33399|LHP1_YEAST", ]
# 3.4 How many unique peptides are present in the data for the above protein?
# - first store the subset in a variable
tbl_dda_subset <- tbl_dda[tbl_dda[["ProteinName"]] == "sp|P33399|LHP1_YEAST", ]
# - now calculate the number of unique peptides
peptides <- tbl_dda_subset[["PeptideSequence"]]
unique_peptides <- unique(peptides)
length(unique_peptides)
# - or you can do it with one expression
length(unique(tbl_dda_subset[["PeptideSequence"]]))