Answers to Exercises

4 R Syntax

# John Doe
# 2023-06-02
# Institution Inc.
#
# Some basic R practice

# 3.  Calculate the sum of 2 and 3.

sum(2, 3)


# 4.  Check whether 0.5 equals 1 divided by 2.

1 / 2 == 0.5


# 5.  Check whether 3 is an even number. Hint: combine the `round()` or
#     `floor()` functions with a comparison operator.

# A number is even exactly when halving it leaves no fractional part.
floor(3 / 2) == 3 / 2


# 6.  Create a function to test if a value is even resulting in `TRUE` or `FALSE`.

# Report whether `x` is even.
#
# x: numeric value(s) to test.
# Returns TRUE where halving `x` leaves no fractional part, FALSE otherwise.
even <- function(x) {
  half <- x / 2
  floor(half) == half
}

even(3)


# 7.  Construct an if-else statement to test if the number three is odd or even.

x <- 3

# The remainder after dividing by 2 is non-zero only for odd numbers.
if (x %% 2 != 0) {
  print('odd')
} else {
  print('even')
}


# 8.  Create a function to test for *even* or *odd* by returning a string.

# Classify a number as odd or even.
#
# x: a single numeric value.
# Returns the string 'even' when halving `x` leaves no fractional part,
# otherwise 'odd'.
oddeven <- function(x) {
  half <- x / 2
  if (half != floor(half)) {
    return('odd')
  }
  'even'
}

oddeven(3)


# 9.  Construct a for-loop to test, using the function from #8, if the numbers 
#     between 1 and 9 are odd or even, printing the number and string for each 
#     on a new line.

x <- seq_len(9)

# Print one line per value: the number, a tab, then its odd/even label.
for (i in x) {
  cat(i, "\t", oddeven(i), "\n", sep = " ")
}

5 R Objects

# John Doe
# 2023-06-02
# Institution Inc.
#
# Data Object Exercises

# 1.  Construct the following vector and store as a variable.

# Three colour names stored together as a character vector.
str_gbu <- c(
  'red',
  'green',
  'blue'
)


# 2.  Pull out the 2nd element of the variable.

# Single brackets with a position return the element at that position.
str_gbu[2L]

# 3.  Construct a numerical vector of length 5, containing the AREA of circles 
#     with integer RADIUS 1 to 5. Remember PEMDAS.

# Area = pi * r^2. The parentheses make the sequence 1:5 form before the
# exponent is applied; `^` is then evaluated ahead of `*`.
area <- pi * (1:5)^2


# 4.  Keep only the AREA values greater than 50.

# A logical mask selects the matching elements directly.
area[area > 50]


# 5. Create a data.frame consisting of circles with integer RADIUS 1 to 5, and their AREA.

radius <- 1:5

# Build the table one column at a time: the radii first, then the areas
# derived from them.
df <- data.frame(radius = radius)
df$area <- pi * df$radius^2

df


# 6.  Keep only the rows of the data.frame whose AREA is greater than 50.

# `which()` converts the comparison into row positions used to index the rows.
w <- which(df[["area"]] > 50)
df[w, ]

6 Tidyverse

# John Doe
# 2023-06-02
# Institution Inc.
#
# Tidyverse Exercises

# 1. Calculate the mean of following vector.

vec_data <- c(7.48, 14.15, 6.23, 10.21, 15.13, 8.19, 8.58, 8.09, 9.14, 10.41)
mean(vec_data)


# 2. Pipeline (eg. `%>%`) a data operation that provides the mean of following vector.

# `%>%` is not part of base R, so the tidyverse has to be attached before the
# first pipe is evaluated; otherwise R stops with 'could not find function'.
library(tidyverse)

vec_data %>% mean()


# 3.  Employing a pipeline (eg. `%>%`), construct a tibble with columns named 
#     `radi` and `area` which contains the AREA of circles with integer RADII 1 
#     to 5. Remember PEMDAS.

library(tidyverse)

# Start from the integer radii, then derive each circle's area (pi * r^2).
tbl_cir <- tibble(radi = seq_len(5)) %>%
  mutate(area = pi * radi^2)

tbl_cir


# 4.  Keep only the rows whose AREA is greater than 50.

filter(tbl_cir, area > 50)


# 5.  Add a column named `circ_type` where you assign the string *odd* or *even* 
#     depending on the column `radi`. Attempt to use the `purrr::map` function, 
#     along with the `oddeven()` function from the previous chapter, then compute 
#     the mean, standard deviation, and coefficient of variation of the AREA for 
#     each `circ_type`.

# Label a single number as 'odd' or 'even' from its remainder modulo 2.
#
# x: a single numeric value.
# Returns 'even' when `x %% 2` is zero, otherwise 'odd'.
oddeven <- function(x) {
  if (x %% 2 != 0) {
    return('odd')
  }
  'even'
}

# `map_chr()` (the character-returning member of the purrr `map` family)
# yields a character vector directly, so the list column produced by `map()`
# never has to be flattened with `unlist()`, and a non-string result raises
# an error instead of silently changing the column type.
tbl_cir %>%
  mutate(circ_type = map_chr(radi, oddeven)) %>%
  group_by(circ_type) %>%
  summarise(
    area_mean = mean(area),
    area_sd = sd(area),
    # coefficient of variation: spread relative to the mean
    area_cv = area_sd / area_mean
  )

7 Data Wrangling

# John Doe
# 2023-06-02
# Institution Inc.
#
# Data Wrangling Exercises

# 1. Download the data.

url <- "https://raw.githubusercontent.com/jeffsocal/ASMS_R_Basics/main/data/bacterial-Metabolites_dose-simicillin_messy.xlsx"

# `download.file()` does not create missing directories, so make sure the
# target folder exists first (silently does nothing when it already does).
dir.create("./data", showWarnings = FALSE, recursive = TRUE)

# An .xlsx workbook is a binary file: request a binary transfer so it is not
# corrupted by text-mode line-ending translation on Windows.
download.file(
  url,
  destfile = "./data/bacterial-Metabolites_dose-simicillin_messy.xlsx",
  mode = "wb"
)


# 2. Read in the messy bacteria data and store it as a variable.

library(tidyverse)
library(readxl)

# Read the workbook, taking the column names from its first row.
tbl_bac <- read_excel(
  "data/bacterial-Metabolites_dose-simicillin_messy.xlsx",
  col_names = TRUE
)


# *In all following exercises, pipe the result of the previous exercise into the 
#    current exercise, creating a single long pipe for data processing*

# 3. Separate `Culture` column containing culture and dose into `culture` and 
#    `dose_mg_ml` columns.

# Split each `Culture` entry at the " dose--" marker into its two parts.
tbl_bac %>%
  separate(
    col = Culture,
    into = c("culture", "dose_mg_ml"),
    sep = " dose--"
  )


# 4. Make `dose_mg_ml` column numeric by removing the text and change the column 
#    data type from character to numeric.

tbl_bac %>%
  separate(
    col = Culture,
    into = c("culture", "dose_mg_ml"),
    sep = " dose--"
  ) %>%
  # strip the unit suffix and convert what remains to a number in one step
  mutate(dose_mg_ml = as.numeric(gsub("-mg/ml", "", dose_mg_ml)))


# 5. Pivot the table from wide to long creating `metabolite`, `time_hr` & `abundance` columns.

tbl_bac %>%
  separate(
    col = Culture,
    into = c("culture", "dose_mg_ml"),
    sep = " dose--"
  ) %>%
  mutate(dose_mg_ml = as.numeric(gsub("-mg/ml", "", dose_mg_ml))) %>%
  # stack columns 4 to 13: their headers go to `metabolite_time`, their
  # cell values to `abundance`
  pivot_longer(
    cols = 4:13,
    names_to = "metabolite_time",
    values_to = "abundance"
  ) %>%
  # split each former header at "_runtime_" into metabolite and time
  separate(
    col = metabolite_time,
    into = c("metabolite", "time_hr"),
    sep = "_runtime_"
  )


# 6. Make sure `time_hr` contains just hours and not a mixture of days and hours.

tbl_bac %>%
  separate(Culture, c("culture", "dose_mg_ml"), sep = " dose--") %>%
  mutate(dose_mg_ml = gsub("-mg/ml","", dose_mg_ml)) %>%
  mutate(dose_mg_ml = as.numeric(dose_mg_ml)) %>%
  pivot_longer(cols = 4:13, names_to = "metabolite_time", values_to = "abundance") %>%
  separate(metabolite_time, c("metabolite","time_hr"), sep="_runtime_") %>%
  mutate(
    # Strip either unit (case-insensitively, matching the `grepl()` tests
    # below) before converting, then scale by the hours each unit stands for.
    # Converting once on the unit-free text avoids the "NAs introduced by
    # coercion" warnings raised when `case_when()` evaluates a per-unit
    # `as.numeric()` on rows that carry the other unit. Entries matching
    # neither unit still end up as NA.
    time_hr = as.numeric(gsub("hr|day", "", time_hr, ignore.case = TRUE)) *
      case_when(
        grepl("hr", time_hr, ignore.case = TRUE) ~ 1,
        grepl("day", time_hr, ignore.case = TRUE) ~ 24
      )
  )


# 7. Remove the `User` column. See the cheat-sheet here: 
#    https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf

tbl_bac %>%
  separate(Culture, c("culture", "dose_mg_ml"), sep = " dose--") %>%
  mutate(dose_mg_ml = gsub("-mg/ml","", dose_mg_ml)) %>%
  mutate(dose_mg_ml = as.numeric(dose_mg_ml)) %>%
  pivot_longer(cols = 4:13, names_to = "metabolite_time", values_to = "abundance") %>%
  separate(metabolite_time, c("metabolite","time_hr"), sep="_runtime_") %>%
  mutate(
    # Strip either unit (case-insensitively, matching the `grepl()` tests
    # below) before converting, then scale by the hours each unit stands for.
    # Converting once on the unit-free text avoids the "NAs introduced by
    # coercion" warnings raised when `case_when()` evaluates a per-unit
    # `as.numeric()` on rows that carry the other unit. Entries matching
    # neither unit still end up as NA.
    time_hr = as.numeric(gsub("hr|day", "", time_hr, ignore.case = TRUE)) *
      case_when(
        grepl("hr", time_hr, ignore.case = TRUE) ~ 1,
        grepl("day", time_hr, ignore.case = TRUE) ~ 24
      )
  ) %>%
  # a leading minus in `select()` drops the named column and keeps the rest
  select(-User)

8 Data Visualization

# John Doe
# 2023-06-02
# Institution Inc.
#
# Data Visualization Exercises

# 1. If not already done, download *Bacterial Metabolite Data (tidy)* to use as 
#    an example data file.

url <- "https://raw.githubusercontent.com/jeffsocal/ASMS_R_Basics/main/data/bacterial-Metabolites_dose-simicillin_tidy.csv"

# Honour the "if not already done" part of the exercise: fetch the file only
# when no local copy exists. `download.file()` does not create missing
# directories, so the target folder is created first when needed.
if (!file.exists("./data/bacterial-Metabolites_dose-simicillin_tidy.csv")) {
  dir.create("./data", showWarnings = FALSE, recursive = TRUE)
  download.file(url, destfile = "./data/bacterial-Metabolites_dose-simicillin_tidy.csv")
}


# 2. Read in the dataset .csv using the `tidyverse` set of packages.

library(tidyverse)

# Load the tidy dataset with readr's CSV reader (attached via tidyverse).
tbl_bac <- read_csv("./data/bacterial-Metabolites_dose-simicillin_tidy.csv")


# 3. Create a Metabolite `Abundance` by `Time_min` ...

# Scatter plot with time on the x-axis and abundance on the y-axis.
ggplot(tbl_bac, aes(x = Time_min, y = Abundance)) +
  geom_point()


# 4. ... facet by `Organism` and `Metabolite`...

ggplot(tbl_bac, aes(x = Time_min, y = Abundance)) +
  geom_point() +
  # one panel per combination: metabolites as rows, organisms as columns
  facet_grid(rows = vars(Metabolite), cols = vars(Organism))


# 4. ... adjust the y-axis to log10, color by `Dose_mg`, and add a 50% transparent line ...

tbl_bac %>%
  # treat the dose as a categorical variable so colour maps to its levels
  mutate(Dose_mg = as.factor(Dose_mg)) %>%
  # colour is set once here and inherited by both the points and the lines
  ggplot(aes(x = Time_min, y = Abundance, color = Dose_mg)) +
  geom_point() +
  geom_line(alpha = 0.5) +
  scale_y_log10() +
  facet_grid(rows = vars(Metabolite), cols = vars(Organism))


# 5. ... change the theme to something publishable, add a title, modify the x- 
#    and y-axis labels, modify the legend title, adjust the x-axis ticks to show 
#    the actual measured time values, and pick a color scheme that highlights 
#    the dose value...

tbl_bac %>% 
  mutate(Dose_mg = Dose_mg %>% as.factor()) %>%
  ggplot(aes(Time_min, Abundance)) + 
  geom_point(aes(color = Dose_mg)) +
  geom_line(aes(color = Dose_mg), alpha = .5) +
  facet_grid(Metabolite ~ Organism) +
  # one manual colour per dose level -- assumes the data holds exactly three
  # dose levels (TODO confirm against the dataset)
  scale_color_manual(values = c("grey", "orange", "red")) +
  scale_y_log10() +
  # place tick marks only at the time values that occur in the data
  scale_x_continuous(breaks = unique(tbl_bac$Time_min)) +
  # subtitle spelling follows the compound name used in the data file names
  labs(title = 'Bacterial Metabolite monitoring by LCMS in response to antibiotic',
       subtitle = 'Conditions: metered dose of simicillin',
       x = "Time (min)", y = "LCMS Abundance",
       color = "Dose (mg)") +
  theme_classic()

More 5 R Objects

# John Doe
# 2023-06-02
# Institution Inc.
#
# More Data Object Exercises

# Exercise #1 -- Working with Variables. You are running an LC-MS experiment 
#     using a 60 min LC gradient.

# 1.1 Create a variable called gradient_min to hold the length of the gradient 
#     in minutes.

gradient_min <- 60


# 1.2 Convert the gradient length variable you just created to seconds and
#     assign the result to a new variable with a meaningful name.

# 60 seconds per minute
gradient_sec <- 60 * gradient_min


# Exercise #2 -- Working with Vectors

# Continuing from Exercise #1...

# 2.1 Imagine you conducted additional experiments, one with a 15 minute
#     gradient and one with a 30 min gradient. Create a vector to hold all
#     three gradient times in minutes, and assign it to a new variable.

gradients_min <- c(15, 30, 60)


# 2.2 Convert the vector of gradient times to seconds. How does this conversion
#     compare to how you did the conversion in Exercise 1?

# The expression is unchanged: arithmetic is applied to every element.
gradients_sec <- 60 * gradients_min


# Exercise #3 -- More Practice with Vectors

# 3.1 The following vector represents precursor m/z values for detected features
#     from your experiment:

prec_mz <- c(
  968.4759, 812.1599, 887.9829, 338.5294, 510.2720,
  775.3455, 409.2369, 944.0385, 584.7687, 1041.9523
)


# -   How many values are there?

length(x = prec_mz)


# -   What is the minimum value? The maximum?

min(prec_mz)
max(prec_mz)


# Exercise #4 -- Vectors and Conditional Expressions

# 4.1 Using the above vector of precursor values, write a conditional expression
#     to find the values with m/z < 600. What is returned by this expression? A
#     single value or multiple values? A number or something else?

# The comparison yields one TRUE/FALSE per element, not the values themselves.
600 > prec_mz


# 4.2 Use this conditional expression to get the precursor values with m/z < 600

prec_mz[600 > prec_mz]


# 4.3 Consider a new vector of data that contains the charge states of the same
#     detected features from above:

prec_z <- c(2, 4, 2, 3, 2, 2, 2, 2, 2, 2)


# -   Write a conditional expression to find which detected features have
#     a charge state of 2.

2 == prec_z


# 4.4 Write an expression to get the precursor m/z values for features having
#     charge states of 2.

# The mask built from `prec_z` indexes `prec_mz` position by position.
prec_mz[2 == prec_z]

More 6 Tidyverse

# John Doe
# 2023-06-02
# Institution Inc.
#
# More Tidyverse Exercises

library(tidyverse)

# Download the data.

url <- "https://raw.githubusercontent.com/jeffsocal/ASMS_R_Basics/main/data/Choi2017_DDA_Skyline_input.csv.zip"

# `download.file()` does not create missing directories, so make sure the
# target folder exists first (silently does nothing when it already does).
dir.create("./data", showWarnings = FALSE, recursive = TRUE)

# A zip archive is binary; state the binary transfer mode explicitly rather
# than relying on it being inferred from the file extension.
download.file(
  url,
  destfile = "./data/Choi2017_DDA_Skyline_input.csv.zip",
  mode = "wb"
)


# Exercise #1 -- Reading data

# 1.1 Read the example data from a proteomics experiment. NOTE: the file is a 
#     zipped .csv file -- R knows how to read it!

# `guess_max` raises the number of rows inspected when guessing column types.
tbl_dda <- "data/Choi2017_DDA_Skyline_input.csv.zip" %>%
  read_csv(guess_max = 10000)


# Exercise #2 -- Reviewing data Frames

# 2.1 Review some basic properties of the data frame

# -   How many rows?

nrow(x = tbl_dda)

# -   How many columns?

ncol(x = tbl_dda)

# -   How many rows & columns (use one expression)

# `dim()` reports the row count followed by the column count.
dim(tbl_dda)

# -   What are the column names?

colnames(tbl_dda)

# -   What are the data types stored in each column?

str(object = tbl_dda)


# What kind of data is present? What is the structure of the data?

# -   Use the View function to review the data in RStudio.

# -   It appears that some of the data is duplicated across many rows. Look at
#     the data column by column and see if you can understand why.

View(tbl_dda)


# Exercise #3 -- Working with data Frames

# 3.1 Retrieve the data from the column called "FileName". How many values do 
#     you expect to get? Write an expression using the data you retrieved to 
#     see if your guess is correct.

length(tbl_dda[["FileName"]])

# -   you'd expect to have the same number of values as there are rows

nrow(x = tbl_dda)


# 3.2 How many unique values of the data from "FileName" are there? What are
#     these values and what do they correspond to?

unique(tbl_dda[["FileName"]])


# 3.3 Using data frame indexing syntax, subset the data to rows for the protein 
#     "sp|P33399|LHP1_YEAST"

# `which()` keeps only the rows where the comparison is TRUE. Indexing with
# the raw logical vector would add an all-NA row for every missing
# ProteinName, because `NA == "..."` is NA rather than FALSE.
tbl_dda[which(tbl_dda$ProteinName == "sp|P33399|LHP1_YEAST"), ]


# 3.4 How many unique peptides are present in the data for the above protein?

# -   first store the subset in a variable

tbl_dda_subset <- tbl_dda[which(tbl_dda$ProteinName == "sp|P33399|LHP1_YEAST"), ]

# -   now calculate the number of unique peptides

peptides <- tbl_dda_subset$PeptideSequence
unique_peptides <- unique(peptides)
length(unique_peptides)

# -   or you can do it with one expression

length(unique(tbl_dda_subset$PeptideSequence))