## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the sample metadata and the normalised gene-level counts.
sampleinfo <- read.csv(file = "data/GSE60450_filtered_metadata-1.csv")
counts <- read.csv(
  file = "data/GSE60450_GeneLevel_NormalizedCPM.and_.TMM_data.csv"
)
## Questions for Cesar
### How can you make the plots/graphs you make show up in the Plots section
### in the bottom right corner?
## X characteristics
## 1 GSM1480291 mammary gland, luminal cells, virgin
## 2 GSM1480292 mammary gland, luminal cells, virgin
## 3 GSM1480293 mammary gland, luminal cells, 18.5 day pregnancy
## 4 GSM1480294 mammary gland, luminal cells, 18.5 day pregnancy
## 5 GSM1480295 mammary gland, luminal cells, 2 day lactation
## 6 GSM1480296 mammary gland, luminal cells, 2 day lactation
## 7 GSM1480297 mammary gland, basal cells, virgin
## 8 GSM1480298 mammary gland, basal cells, virgin
## 9 GSM1480299 mammary gland, basal cells, 18.5 day pregnancy
## 10 GSM1480300 mammary gland, basal cells, 18.5 day pregnancy
## 11 GSM1480301 mammary gland, basal cells, 2 day lactation
## 12 GSM1480302 mammary gland, basal cells, 2 day lactation
## immunophenotype developmental.stage
## 1 luminal cell population virgin
## 2 luminal cell population virgin
## 3 luminal cell population 18.5 day pregnancy
## 4 luminal cell population 18.5 day pregnancy
## 5 luminal cell population 2 day lactation
## 6 luminal cell population 2 day lactation
## 7 basal cell population virgin
## 8 basal cell population virgin
## 9 basal cell population 18.5 day pregnancy
## 10 basal cell population 18.5 day pregnancy
## 11 basal cell population 2 day lactation
## 12 basal cell population 2 day lactation
## X characteristics
## 1 GSM1480291 mammary gland, luminal cells, virgin
## 2 GSM1480292 mammary gland, luminal cells, virgin
## 3 GSM1480293 mammary gland, luminal cells, 18.5 day pregnancy
## 4 GSM1480294 mammary gland, luminal cells, 18.5 day pregnancy
## 5 GSM1480295 mammary gland, luminal cells, 2 day lactation
## 6 GSM1480296 mammary gland, luminal cells, 2 day lactation
## immunophenotype developmental.stage
## 1 luminal cell population virgin
## 2 luminal cell population virgin
## 3 luminal cell population 18.5 day pregnancy
## 4 luminal cell population 18.5 day pregnancy
## 5 luminal cell population 2 day lactation
## 6 luminal cell population 2 day lactation
## X characteristics
## 7 GSM1480297 mammary gland, basal cells, virgin
## 8 GSM1480298 mammary gland, basal cells, virgin
## 9 GSM1480299 mammary gland, basal cells, 18.5 day pregnancy
## 10 GSM1480300 mammary gland, basal cells, 18.5 day pregnancy
## 11 GSM1480301 mammary gland, basal cells, 2 day lactation
## 12 GSM1480302 mammary gland, basal cells, 2 day lactation
## immunophenotype developmental.stage
## 7 basal cell population virgin
## 8 basal cell population virgin
## 9 basal cell population 18.5 day pregnancy
## 10 basal cell population 18.5 day pregnancy
## 11 basal cell population 2 day lactation
## 12 basal cell population 2 day lactation
## [1] 12 4
## [1] 23735 14
## [1] "mammary gland, luminal cells, virgin"
## [2] "mammary gland, luminal cells, virgin"
## [3] "mammary gland, luminal cells, 18.5 day pregnancy"
# Rename a column by position: the number inside [ ] is the column index.
# Here the first column of each table gets a meaningful name.
names(sampleinfo)[1] <- "sample_id"
names(counts)[1] <- "gene_id"
## 'data.frame': 12 obs. of 4 variables:
## $ sample_id : chr "GSM1480291" "GSM1480292" "GSM1480293" "GSM1480294" ...
## $ characteristics : chr "mammary gland, luminal cells, virgin" "mammary gland, luminal cells, virgin" "mammary gland, luminal cells, 18.5 day pregnancy" "mammary gland, luminal cells, 18.5 day pregnancy" ...
## $ immunophenotype : chr "luminal cell population" "luminal cell population" "luminal cell population" "luminal cell population" ...
## $ developmental.stage: chr "virgin" "virgin" "18.5 day pregnancy" "18.5 day pregnancy" ...
## sample_id characteristics immunophenotype developmental.stage
## Length:12 Length:12 Length:12 Length:12
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
## 'data.frame': 23735 obs. of 14 variables:
## $ gene_id : chr "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
## $ gene_symbol: chr "Gnai3" "Pbsn" "Cdc45" "H19" ...
## $ GSM1480291 : num 243.29 0 11.18 6.31 2.19 ...
## $ GSM1480292 : num 255.66 0 13.78 8.53 4.66 ...
## $ GSM1480293 : num 239.74 0 11.6 7.09 2.8 ...
## $ GSM1480294 : num 217.1 0 4.27 11.04 2.5 ...
## $ GSM1480295 : num 84.744 0 8.35 0.194 1.243 ...
## $ GSM1480296 : num 84.599 0 8.199 0 0.855 ...
## $ GSM1480297 : num 175.04 0 12.11 2.12 5.79 ...
## $ GSM1480298 : num 187.49 0 11.1 1.19 8.8 ...
## $ GSM1480299 : num 176.66 0 7.53 1.55 9.81 ...
## $ GSM1480300 : num 169.094 0 7.099 0.867 7.47 ...
## $ GSM1480301 : num 158.45 0 1.98 10.83 7.57 ...
## $ GSM1480302 : num 133.59 0 2.88 5.77 9.88 ...
## gene_id gene_symbol GSM1480291 GSM1480292
## Length:23735 Length:23735 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 1.745 Median : 1.891
## Mean : 42.132 Mean : 42.132
## 3rd Qu.: 29.840 3rd Qu.: 29.604
## Max. :12525.066 Max. :12416.211
## GSM1480293 GSM1480294 GSM1480295
## Min. : 0.000 Min. : 0.000 Min. :0.000e+00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:0.000e+00
## Median : 0.918 Median : 0.888 Median :5.830e-01
## Mean : 42.132 Mean : 42.132 Mean :4.213e+01
## 3rd Qu.: 21.908 3rd Qu.: 19.921 3rd Qu.:1.227e+01
## Max. :49191.148 Max. :55692.086 Max. :1.119e+05
## GSM1480296 GSM1480297 GSM1480298
## Min. :0.000e+00 Min. : 0.000 Min. : 0.000
## 1st Qu.:0.000e+00 1st Qu.: 0.000 1st Qu.: 0.000
## Median :5.440e-01 Median : 2.158 Median : 2.254
## Mean :4.213e+01 Mean : 42.132 Mean : 42.132
## 3rd Qu.:1.228e+01 3rd Qu.: 27.414 3rd Qu.: 26.450
## Max. :1.087e+05 Max. :10489.311 Max. :10662.486
## GSM1480299 GSM1480300 GSM1480301
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 1.854 Median : 1.816 Median : 1.629
## Mean : 42.132 Mean : 42.132 Mean : 42.132
## 3rd Qu.: 24.860 3rd Qu.: 23.443 3rd Qu.: 23.444
## Max. :15194.048 Max. :17434.935 Max. :19152.728
## GSM1480302
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.749
## Mean : 42.132
## 3rd Qu.: 24.818
## Max. :15997.193
# Pivot the wide counts table (one column per sample) into long format so it
# can be plotted with the ggplot2 package.
# General pattern:
#   variable_name <- pivot_longer(dataset_to_read,
#                                 cols = starts_with("common beginning of the column names"),
#                                 names_to = "new column name",
#                                 values_to = "count column name")
# Equivalent ways of selecting the sample columns:
#   seqdata <- pivot_longer(counts, cols = GSM1480291:GSM1480302, names_to = "Sample", values_to = "Count")
#   seqdata <- pivot_longer(counts, cols = -c("gene_id", "gene_symbol"), names_to = "Sample", values_to = "Count")
seqdata <- counts %>%
  pivot_longer(
    cols = starts_with("GSM"),
    names_to = "Sample",
    values_to = "Count"
  )
seqdata
## # A tibble: 284,820 × 4
## gene_id gene_symbol Sample Count
## <chr> <chr> <chr> <dbl>
## 1 ENSMUSG00000000001 Gnai3 GSM1480291 243.
## 2 ENSMUSG00000000001 Gnai3 GSM1480292 256.
## 3 ENSMUSG00000000001 Gnai3 GSM1480293 240.
## 4 ENSMUSG00000000001 Gnai3 GSM1480294 217.
## 5 ENSMUSG00000000001 Gnai3 GSM1480295 84.7
## 6 ENSMUSG00000000001 Gnai3 GSM1480296 84.6
## 7 ENSMUSG00000000001 Gnai3 GSM1480297 175.
## 8 ENSMUSG00000000001 Gnai3 GSM1480298 187.
## 9 ENSMUSG00000000001 Gnai3 GSM1480299 177.
## 10 ENSMUSG00000000001 Gnai3 GSM1480300 169.
## # ℹ 284,810 more rows
# If Count is very large or has outliers, plot log2(Count) instead.
# WARNING: a count of 0 gives log2(0) = -Inf, so add 1 to every count first.
# colour = Sample (outline) or fill = Sample (interior) can be mapped to help
# tell the samples apart.
# NOTE(review): `allinfo` is not defined in the visible part of this file;
# presumably it is `seqdata` joined to `sampleinfo` -- confirm it is created
# before these plots run.

# Boxplot of the raw counts per sample.
ggplot(data = allinfo, mapping = aes(x = Sample, y = Count)) +
  geom_boxplot()

# Same boxplot on the log2 scale, samples distinguished by outline colour.
ggplot(
  data = allinfo,
  mapping = aes(x = Sample, y = log2(Count + 1), colour = Sample)
) +
  geom_boxplot()

# Same boxplot, samples distinguished by fill colour.
ggplot(
  data = allinfo,
  mapping = aes(x = Sample, y = log2(Count + 1), fill = Sample)
) +
  geom_boxplot()

# Violin plot of the same data.
ggplot(
  data = allinfo,
  mapping = aes(x = Sample, y = log2(Count + 1), colour = Sample)
) +
  geom_violin()

# Why is y replaced with weight? geom_bar() counts the rows for each x value
# itself and does not take a y aesthetic; `weight` makes each bar the sum of
# the weights instead of the number of rows.
ggplot(
  data = allinfo,
  mapping = aes(x = Sample, weight = log2(Count + 1), fill = Sample)
) +
  geom_bar()

# A y input is not needed for a density plot.
# Inputs are always case sensitive, including for Count.
# fill can also be used to help identify specific sample densities.
ggplot(data = allinfo, mapping = aes(x = log2(Count + 1))) +
  geom_density()

# pdf() can also be used to export plots. DO NOT FORGET the .pdf extension.
# Write the per-sample density plot to a PDF file.
pdf("filled.density.map.tumor.pdf")
# print() the plot explicitly: a ggplot object is only drawn when it is
# printed, and auto-printing does not happen when the script is run through
# source(), which would leave the PDF empty.
print(
  ggplot(
    data = allinfo,
    mapping = aes(x = log2(Count + 1), colour = Sample)
  ) +
    geom_density()
)
# Close the PDF device so the file is flushed to disk.
dev.off()
## png
## 2
## tidyverse more practice
## [1] "sample_id" "characteristics" "immunophenotype"
## [4] "developmental.stage"
# mutate() creates, modifies, or deletes columns of a table.
# Here it condenses the long `characteristics` text into a short, easy-to-read
# Group label.
# In a pattern string, ".*" between two words means "first word, then
# anything, then second word" (an AND-style match); "|" means OR.
# The most common faults are misspellings and typing "," where "." is meant.
# Do not leave an extra space between the "*" and the following word: the
# space would become part of the pattern.
allinfo <- allinfo %>%
  mutate(
    Group = case_when(
      str_detect(characteristics, "basal.*virgin") ~ "bvirg",
      str_detect(characteristics, "basal.*pregn") ~ "bpreg",
      str_detect(characteristics, "basal.*lact") ~ "blact",
      str_detect(characteristics, "luminal.*virgin") ~ "lvirg",
      str_detect(characteristics, "luminal.*pregnancy") ~ "lpreg",
      str_detect(characteristics, "luminal.*lact") ~ "llact"
    )
  )
# Original code from the YouTube tutorial:
# Tutorial version of the Group labelling; it overwrites the Group column
# using the shorter "preg" pattern for both pregnancy groups.
allinfo <- allinfo %>%
  mutate(
    Group = case_when(
      str_detect(characteristics, "basal.*virgin") ~ "bvirg",
      str_detect(characteristics, "basal.*preg") ~ "bpreg",
      str_detect(characteristics, "basal.*lact") ~ "blact",
      str_detect(characteristics, "luminal.*virgin") ~ "lvirg",
      str_detect(characteristics, "luminal.*preg") ~ "lpreg",
      str_detect(characteristics, "luminal.*lact") ~ "llact"
    )
  )
# mutate() practice on the women data: BMI calculation
# Add a BMI column computed from weight and height.
# NOTE(review): the factor 703 presumably assumes weight in pounds and height
# in inches -- confirm against the units of `women`.
women.M <- women %>%
  mutate(BMI = weight / height^2 * 703)
# ggplot: filtering genes of interest
# Build the list of genes of interest with a pipe (%>%); do not forget the
# %>% between steps.
# Steps: total the counts per gene, sort from highest to lowest total, keep
# the first 8 rows, and extract the gene symbols as a vector.
my_genes <- allinfo %>%
  group_by(gene_symbol) %>%
  summarise(Total_count = sum(Count)) %>%
  arrange(desc(Total_count)) %>%
  slice_head(n = 8) %>%
  pull(gene_symbol)
# Use facet_wrap() to account for an additional confounding factor.
# One panel per gene (facet_wrap), counts on the log2 scale, grouped and
# coloured by Group.
# NOTE(review): `my_genes_count` is not defined in the visible part of this
# file; presumably it is `allinfo` restricted to the genes in `my_genes` --
# confirm it is created before these plots run.
ggplot(
  my_genes_count,
  mapping = aes(x = Group, y = log2(Count + 1), colour = Group)
) +
  geom_boxplot() +
  facet_wrap(~ gene_symbol)

ggplot(
  my_genes_count,
  mapping = aes(x = Group, y = log2(Count + 1), colour = Group)
) +
  geom_point() +
  facet_wrap(~ gene_symbol)

# When the points overlap heavily, geom_jitter() can be used instead: it adds
# a small amount of random noise to each position so that all points show.
# Verify with Cesar about this.
ggplot(
  my_genes_count,
  mapping = aes(x = Group, y = log2(Count + 1), colour = Group)
) +
  geom_jitter() +
  facet_wrap(~ gene_symbol)

# Export the labelled jitter plot to a PDF file.
pdf("count_based_characteristics&gene_symbol.pdf")
# print() the plot explicitly: a ggplot object is only drawn when it is
# printed, and auto-printing does not happen when the script is run through
# source(), which would leave the PDF empty.
print(
  ggplot(
    data = my_genes_count,
    mapping = aes(x = Group, y = log2(Count + 1), colour = Group)
  ) +
    geom_jitter() +
    facet_wrap(~ gene_symbol) +
    labs(
      x = "Cells type and stage",
      y = "count",
      title = "Mammary gland RNA-seq data"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90))
)
# Close the PDF device so the file is flushed to disk.
dev.off()
## png
## 2
# Titles and axis labels.
# A complete theme such as theme_bw() overwrites any custom theme() settings
# added before it, so add theme_bw() first and the custom settings after it.
ggplot(
  data = my_genes_count,
  mapping = aes(x = Group, y = log2(Count + 1), colour = Group)
) +
  geom_jitter() +
  facet_wrap(~ gene_symbol) +
  labs(
    x = "Cells type and stage",
    y = "count",
    title = "Mammary gland RNA-seq data"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))