This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Print the per-column summary of the built-in `cars` data set.
summary(object = cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent
printing of the R code that generated the plot.

# Question 2: Load the readr and tidyverse libraries (Question 1 is already done)
library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ purrr 1.0.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#Question 3: Load the OT and TS data sets.
# Read the OT expression table from the shared course directory into a tibble.
OT_StickleGene <- read_csv(
  file = "/stor/work/FRI_321G_JD_Spring2026/Exercises/OT_StickleGene.csv"
)
## Rows: 76 Columns: 6901
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_id, population, sex, turb_combined
## dbl (6897): ENSGACG00000000009, ENSGACG00000000013, ENSGACG00000000014, ENSG...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the TS expression table from the shared course directory into a tibble.
TS_StickleGene <- read_csv(
  file = "/stor/work/FRI_321G_JD_Spring2026/Exercises/TS_StickleGene.csv"
)
## Rows: 58 Columns: 5923
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): sample_id, population, sex, turb_combined
## dbl (5919): ENSGACG00000000009, ENSGACG00000000016, ENSGACG00000000024, ENSG...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Question 4: Calculate the number of genes in each data set.
# Number of genes in the OT data: count the columns whose names start with
# the "ENSG" prefix.
ncol(select(OT_StickleGene, starts_with("ENSG")))
## [1] 6897
# Number of genes in the TS data: count the columns whose names start with
# the "ENSG" prefix.
ncol(select(TS_StickleGene, starts_with("ENSG")))
## [1] 5919
# Number of samples (rows) in the OT data.
# NOTE(review): presumably this answers Question 5, which has no header in
# this document; confirm against the assignment.
# Selecting a column first does not change the row count, so the table is
# counted directly.
nrow(OT_StickleGene)
## [1] 76
# Number of samples (rows) in the TS data.
# NOTE(review): presumably this answers Question 5, which has no header in
# this document; confirm against the assignment.
# Selecting a column first does not change the row count, so the table is
# counted directly.
nrow(TS_StickleGene)
## [1] 58
#Question 6: Make a new data frame (with a new name) where gene expression is mean summarized by population and sex.
# Mean expression of every numeric (gene) column per population/sex group in
# the OT data, ignoring missing values.
# `across(where(is.numeric), ...)` replaces the superseded `summarise_if()`,
# and `.groups = "drop"` returns an ungrouped tibble so later verbs are not
# silently applied per population.
OTGene <- OT_StickleGene %>%
  group_by(population, sex) %>%
  summarise(
    across(where(is.numeric), \(x) mean(x, na.rm = TRUE)),
    .groups = "drop"
  )
# Mean expression of every numeric (gene) column per population/sex group in
# the TS data, ignoring missing values.
# `across(where(is.numeric), ...)` replaces the superseded `summarise_if()`,
# and `.groups = "drop"` returns an ungrouped tibble so later verbs are not
# silently applied per population.
TSGene <- TS_StickleGene %>%
  group_by(population, sex) %>%
  summarise(
    across(where(is.numeric), \(x) mean(x, na.rm = TRUE)),
    .groups = "drop"
  )
#Question 7: Make one data frame each, OT_metadata and TS_metadata, that does not contain any gene expression columns (i.e., only population, sex, id …).
# Sample-level metadata for the OT data: keep only the identifier and
# grouping columns, dropping every gene expression column.
# `select()` already returns a data frame, so wrapping the single result in
# `bind_rows()` added nothing and has been removed.
OT_metadata <- OT_StickleGene %>%
  select(sample_id, population, sex, turb_combined)
# Sample-level metadata for the TS data: keep only the identifier and
# grouping columns, dropping every gene expression column.
# `select()` already returns a data frame, so wrapping the single result in
# `bind_rows()` added nothing and has been removed.
TS_metadata <- TS_StickleGene %>%
  select(sample_id, population, sex, turb_combined)
#Question 8: Filter the summarized data set (#6) for the target gene you were assigned for Problem Set 1. You will need to have the Ensembl ID that you found in Problem Set 1.
# Keep only the grouping columns and the assigned target gene from the
# summarized OT table.
target_gene1 <- select(
  OTGene,
  population,
  sex,
  ENSGACG00000010827
)
# Keep only the grouping columns and the assigned target gene from the
# summarized TS table.
target_gene2 <- select(
  TSGene,
  population,
  sex,
  ENSGACG00000010827
)
#Question 9: Using a paired boxplot, plot your gene’s expression for each population paired by sex.
# Paired boxplot for the OT data: target-gene expression per population,
# with boxes split by sex. Uses the per-sample (unsummarized) table.
ggplot(
  data = OT_StickleGene,
  mapping = aes(x = population, y = ENSGACG00000010827, fill = sex)
) +
  geom_boxplot(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Expression of ENSGACG00000010827 by Population and Sex for OTStickleGene",
    x = "Population",
    y = "Gene Expression",
    fill = "Sex"
  )
# Paired boxplot for the TS data: target-gene expression per population,
# with boxes split by sex. Uses the per-sample (unsummarized) table.
ggplot(
  data = TS_StickleGene,
  mapping = aes(x = population, y = ENSGACG00000010827, fill = sex)
) +
  geom_boxplot(position = "dodge") +
  theme_minimal() +
  labs(
    title = "Expression of ENSGACG00000010827 by Population and Sex for TSStickleGene",
    x = "Population",
    y = "Gene Expression",
    fill = "Sex"
  )
#Questions 10.1 and 10.2: Summarize your target gene expression by sex and population. Pivot the data frame wider such that the expression is separated into two columns, one for males and one for females. (The target gene expression was already summarized in Question 8.)
# Reshape the summarized OT target-gene table so each value of `sex` becomes
# its own column holding that sex's mean expression.
target_gene1_wide <- pivot_wider(
  target_gene1,
  names_from = sex,
  values_from = ENSGACG00000010827
)
# Reshape the summarized TS target-gene table so each value of `sex` becomes
# its own column holding that sex's mean expression.
target_gene2_wide <- pivot_wider(
  target_gene2,
  names_from = sex,
  values_from = ENSGACG00000010827
)
#Question 10.3: Plot a scatter plot of female versus male expression and color the points by population.
# Scatter plot of female vs. male target-gene expression for the OT data,
# with points colored by population.
# The sex columns are addressed through the `.data` pronoun: a bare `F` would
# silently fall back to base R's `F` (FALSE) if that column were ever missing,
# whereas `.data[["F"]]` raises an error instead.
# NOTE(review): assumes `sex` is coded as "M"/"F" in the data — confirm.
# The title names the data set so this plot can be told apart from the TS one.
target_gene1_wide %>%
  ggplot(aes(x = .data[["M"]], y = .data[["F"]], color = population)) +
  geom_point(size = 3) +
  labs(
    x = "Male Gene Expression",
    y = "Female Gene Expression",
    color = "Population",
    title = "Female vs. Male Expression of ENSGACG00000010827 for OTStickleGene"
  ) +
  theme_minimal()
# Scatter plot of female vs. male target-gene expression for the TS data,
# with points colored by population.
# The sex columns are addressed through the `.data` pronoun: a bare `F` would
# silently fall back to base R's `F` (FALSE) if that column were ever missing,
# whereas `.data[["F"]]` raises an error instead.
# NOTE(review): assumes `sex` is coded as "M"/"F" in the data — confirm.
# The title names the data set so this plot can be told apart from the OT one.
target_gene2_wide %>%
  ggplot(aes(x = .data[["M"]], y = .data[["F"]], color = population)) +
  geom_point(size = 3) +
  labs(
    x = "Male Gene Expression",
    y = "Female Gene Expression",
    color = "Population",
    title = "Female vs. Male Expression of ENSGACG00000010827 for TSStickleGene"
  ) +
  theme_minimal()