Practice

data upload

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the sample metadata (one row per sample) and the gene-level
# expression table (one row per gene) from the local data/ directory.
sampleinfo <- read.csv(file = "data/GSE60450_filtered_metadata-1.csv")
counts <- read.csv(
  file = "data/GSE60450_GeneLevel_NormalizedCPM.and_.TMM_data.csv"
)

##Questions for Cesar ### How can you make the plot/graphs you make show up in the plots section in the bottom right corner?

table observation

# Typing an object's name on its own prints the whole table.
sampleinfo 
##             X                                  characteristics
## 1  GSM1480291             mammary gland, luminal cells, virgin
## 2  GSM1480292             mammary gland, luminal cells, virgin
## 3  GSM1480293 mammary gland, luminal cells, 18.5 day pregnancy
## 4  GSM1480294 mammary gland, luminal cells, 18.5 day pregnancy
## 5  GSM1480295    mammary gland, luminal cells, 2 day lactation
## 6  GSM1480296    mammary gland, luminal cells, 2 day lactation
## 7  GSM1480297               mammary gland, basal cells, virgin
## 8  GSM1480298               mammary gland, basal cells, virgin
## 9  GSM1480299   mammary gland, basal cells, 18.5 day pregnancy
## 10 GSM1480300   mammary gland, basal cells, 18.5 day pregnancy
## 11 GSM1480301      mammary gland, basal cells, 2 day lactation
## 12 GSM1480302      mammary gland, basal cells, 2 day lactation
##            immunophenotype developmental.stage
## 1  luminal cell population              virgin
## 2  luminal cell population              virgin
## 3  luminal cell population  18.5 day pregnancy
## 4  luminal cell population  18.5 day pregnancy
## 5  luminal cell population     2 day lactation
## 6  luminal cell population     2 day lactation
## 7    basal cell population              virgin
## 8    basal cell population              virgin
## 9    basal cell population  18.5 day pregnancy
## 10   basal cell population  18.5 day pregnancy
## 11   basal cell population     2 day lactation
## 12   basal cell population     2 day lactation
# head() returns the first rows of a table; n = 6 is its default.
head(sampleinfo, n = 6)
##            X                                  characteristics
## 1 GSM1480291             mammary gland, luminal cells, virgin
## 2 GSM1480292             mammary gland, luminal cells, virgin
## 3 GSM1480293 mammary gland, luminal cells, 18.5 day pregnancy
## 4 GSM1480294 mammary gland, luminal cells, 18.5 day pregnancy
## 5 GSM1480295    mammary gland, luminal cells, 2 day lactation
## 6 GSM1480296    mammary gland, luminal cells, 2 day lactation
##           immunophenotype developmental.stage
## 1 luminal cell population              virgin
## 2 luminal cell population              virgin
## 3 luminal cell population  18.5 day pregnancy
## 4 luminal cell population  18.5 day pregnancy
## 5 luminal cell population     2 day lactation
## 6 luminal cell population     2 day lactation
# tail() returns the last rows of a table; n = 6 is its default.
tail(sampleinfo, n = 6)
##             X                                characteristics
## 7  GSM1480297             mammary gland, basal cells, virgin
## 8  GSM1480298             mammary gland, basal cells, virgin
## 9  GSM1480299 mammary gland, basal cells, 18.5 day pregnancy
## 10 GSM1480300 mammary gland, basal cells, 18.5 day pregnancy
## 11 GSM1480301    mammary gland, basal cells, 2 day lactation
## 12 GSM1480302    mammary gland, basal cells, 2 day lactation
##          immunophenotype developmental.stage
## 7  basal cell population              virgin
## 8  basal cell population              virgin
## 9  basal cell population  18.5 day pregnancy
## 10 basal cell population  18.5 day pregnancy
## 11 basal cell population     2 day lactation
## 12 basal cell population     2 day lactation
# dim() reports c(number of rows, number of columns) of a table.
dim(sampleinfo)
## [1] 12  4
dim(counts)
## [1] 23735    14
# [a:b] after a vector keeps elements a through b; here the first three
# entries of the characteristics column.
sampleinfo[["characteristics"]][1:3]
## [1] "mammary gland, luminal cells, virgin"            
## [2] "mammary gland, luminal cells, virgin"            
## [3] "mammary gland, luminal cells, 18.5 day pregnancy"

table manipulation

# Rename the first column of each table. The index inside [ ] selects which
# column name is replaced.
names(sampleinfo)[1] <- "sample_id"
names(counts)[1] <- "gene_id"
# str() shows the structure of an object (column names, types, first values);
# summary() shows summary statistics for each column.
str(sampleinfo)
## 'data.frame':    12 obs. of  4 variables:
##  $ sample_id          : chr  "GSM1480291" "GSM1480292" "GSM1480293" "GSM1480294" ...
##  $ characteristics    : chr  "mammary gland, luminal cells, virgin" "mammary gland, luminal cells, virgin" "mammary gland, luminal cells, 18.5 day pregnancy" "mammary gland, luminal cells, 18.5 day pregnancy" ...
##  $ immunophenotype    : chr  "luminal cell population" "luminal cell population" "luminal cell population" "luminal cell population" ...
##  $ developmental.stage: chr  "virgin" "virgin" "18.5 day pregnancy" "18.5 day pregnancy" ...
# All four columns are character, so summary() only reports length/class/mode.
summary(sampleinfo)
##   sample_id         characteristics    immunophenotype    developmental.stage
##  Length:12          Length:12          Length:12          Length:12          
##  Class :character   Class :character   Class :character   Class :character   
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character
# counts holds two identifier columns (gene_id, gene_symbol) followed by one
# numeric column per sample.
str(counts)
## 'data.frame':    23735 obs. of  14 variables:
##  $ gene_id    : chr  "ENSMUSG00000000001" "ENSMUSG00000000003" "ENSMUSG00000000028" "ENSMUSG00000000031" ...
##  $ gene_symbol: chr  "Gnai3" "Pbsn" "Cdc45" "H19" ...
##  $ GSM1480291 : num  243.29 0 11.18 6.31 2.19 ...
##  $ GSM1480292 : num  255.66 0 13.78 8.53 4.66 ...
##  $ GSM1480293 : num  239.74 0 11.6 7.09 2.8 ...
##  $ GSM1480294 : num  217.1 0 4.27 11.04 2.5 ...
##  $ GSM1480295 : num  84.744 0 8.35 0.194 1.243 ...
##  $ GSM1480296 : num  84.599 0 8.199 0 0.855 ...
##  $ GSM1480297 : num  175.04 0 12.11 2.12 5.79 ...
##  $ GSM1480298 : num  187.49 0 11.1 1.19 8.8 ...
##  $ GSM1480299 : num  176.66 0 7.53 1.55 9.81 ...
##  $ GSM1480300 : num  169.094 0 7.099 0.867 7.47 ...
##  $ GSM1480301 : num  158.45 0 1.98 10.83 7.57 ...
##  $ GSM1480302 : num  133.59 0 2.88 5.77 9.88 ...
# Per-sample distribution of the expression values (min, quartiles, mean, max).
# NOTE: every sample has the same mean (42.132 = 1e6 / 23735 genes), which is
# consistent with the values being scaled to counts per million.
summary(counts)
##    gene_id          gene_symbol          GSM1480291          GSM1480292       
##  Length:23735       Length:23735       Min.   :    0.000   Min.   :    0.000  
##  Class :character   Class :character   1st Qu.:    0.000   1st Qu.:    0.000  
##  Mode  :character   Mode  :character   Median :    1.745   Median :    1.891  
##                                        Mean   :   42.132   Mean   :   42.132  
##                                        3rd Qu.:   29.840   3rd Qu.:   29.604  
##                                        Max.   :12525.066   Max.   :12416.211  
##    GSM1480293          GSM1480294          GSM1480295       
##  Min.   :    0.000   Min.   :    0.000   Min.   :0.000e+00  
##  1st Qu.:    0.000   1st Qu.:    0.000   1st Qu.:0.000e+00  
##  Median :    0.918   Median :    0.888   Median :5.830e-01  
##  Mean   :   42.132   Mean   :   42.132   Mean   :4.213e+01  
##  3rd Qu.:   21.908   3rd Qu.:   19.921   3rd Qu.:1.227e+01  
##  Max.   :49191.148   Max.   :55692.086   Max.   :1.119e+05  
##    GSM1480296          GSM1480297          GSM1480298       
##  Min.   :0.000e+00   Min.   :    0.000   Min.   :    0.000  
##  1st Qu.:0.000e+00   1st Qu.:    0.000   1st Qu.:    0.000  
##  Median :5.440e-01   Median :    2.158   Median :    2.254  
##  Mean   :4.213e+01   Mean   :   42.132   Mean   :   42.132  
##  3rd Qu.:1.228e+01   3rd Qu.:   27.414   3rd Qu.:   26.450  
##  Max.   :1.087e+05   Max.   :10489.311   Max.   :10662.486  
##    GSM1480299          GSM1480300          GSM1480301       
##  Min.   :    0.000   Min.   :    0.000   Min.   :    0.000  
##  1st Qu.:    0.000   1st Qu.:    0.000   1st Qu.:    0.000  
##  Median :    1.854   Median :    1.816   Median :    1.629  
##  Mean   :   42.132   Mean   :   42.132   Mean   :   42.132  
##  3rd Qu.:   24.860   3rd Qu.:   23.443   3rd Qu.:   23.444  
##  Max.   :15194.048   Max.   :17434.935   Max.   :19152.728  
##    GSM1480302       
##  Min.   :    0.000  
##  1st Qu.:    0.000  
##  Median :    1.749  
##  Mean   :   42.132  
##  3rd Qu.:   24.818  
##  Max.   :15997.193
# Reshape the wide counts table (one column per sample) into long format,
# the layout the ggplot2 package works with: one row per gene/sample pair.
# General form:
#   new_table <- pivot_longer(table, cols = <columns to stack>,
#                             names_to = "<column for the old column names>",
#                             values_to = "<column for the values>")
# Equivalent ways to choose the sample columns:
# seqdata <- pivot_longer(counts, cols = GSM1480291:GSM1480302, names_to = "Sample", values_to = "Count")
# seqdata <- pivot_longer(counts, cols = -c("gene_id", "gene_symbol"), names_to = "Sample", values_to = "Count")
seqdata <- counts %>%
  pivot_longer(
    cols = starts_with("GSM"),
    names_to = "Sample",
    values_to = "Count"
  )

seqdata
## # A tibble: 284,820 × 4
##    gene_id            gene_symbol Sample     Count
##    <chr>              <chr>       <chr>      <dbl>
##  1 ENSMUSG00000000001 Gnai3       GSM1480291 243. 
##  2 ENSMUSG00000000001 Gnai3       GSM1480292 256. 
##  3 ENSMUSG00000000001 Gnai3       GSM1480293 240. 
##  4 ENSMUSG00000000001 Gnai3       GSM1480294 217. 
##  5 ENSMUSG00000000001 Gnai3       GSM1480295  84.7
##  6 ENSMUSG00000000001 Gnai3       GSM1480296  84.6
##  7 ENSMUSG00000000001 Gnai3       GSM1480297 175. 
##  8 ENSMUSG00000000001 Gnai3       GSM1480298 187. 
##  9 ENSMUSG00000000001 Gnai3       GSM1480299 177. 
## 10 ENSMUSG00000000001 Gnai3       GSM1480300 169. 
## # ℹ 284,810 more rows
# Attach the sample metadata to every count row by matching the Sample
# column of seqdata to the sample_id column of sampleinfo.
allinfo <- seqdata %>%
  full_join(sampleinfo, by = c("Sample" = "sample_id"))

ggplot2 data

library(ggplot2)
# When counts are very large or have outliers, plot them on a log scale.
# WARNING: log(0) is undefined (-Inf in R), so add 1 to every count first.
# Mapping colour (outline) or fill (interior) to Sample helps tell the
# samples apart.
ggplot(allinfo, aes(x = Sample, y = Count)) +
  geom_boxplot()

ggplot(allinfo, aes(x = Sample, y = log2(Count + 1), colour = Sample)) +
  geom_boxplot()

ggplot(allinfo, aes(x = Sample, y = log2(Count + 1), fill = Sample)) +
  geom_boxplot()

ggplot(allinfo, aes(x = Sample, y = log2(Count + 1), colour = Sample)) +
  geom_violin()

# Y is replaced with weight, why?
# geom_bar() works out the bar heights itself: it counts the rows for each x,
# or sums `weight` when that aesthetic is supplied, so no y is given.
ggplot(allinfo, aes(x = Sample, weight = log2(Count + 1), fill = Sample)) +
  geom_bar()

# Load the built-in `women` data set and open it in the data viewer.
data("women")
view(women)
# The same height/weight data drawn first as separate points, then as a
# path that connects the observations.
ggplot(women, aes(x = height, y = weight)) +
  geom_point()

ggplot(women, aes(x = height, y = weight)) +
  geom_path()

# A density plot only needs x; the y values (densities) are computed for you.
# Names are always case-sensitive, including Count.
# fill or colour can be mapped to Sample to show one density curve per sample.
ggplot(allinfo, aes(x = log2(Count + 1))) +
  geom_density()

ggplot(allinfo, aes(x = log2(Count + 1), fill = Sample)) +
  geom_density()

ggplot(allinfo, aes(x = log2(Count + 1), colour = Sample)) +
  geom_density()

# Export a plot to a PDF file: open the PDF device, draw the plot, then close
# the device with dev.off(). DO NOT FORGET the .pdf extension in the file name.
# The plot is wrapped in print() so that it is drawn into the file even when
# this code is not typed line by line at the console (for example when run
# with source(), or inside a function or loop), where ggplot objects are not
# printed automatically and the PDF would otherwise come out empty.
pdf("filled.density.map.tumor.pdf")
print(
  ggplot(allinfo, aes(x = log2(Count + 1), colour = Sample)) +
    geom_density()
)
dev.off()
## png 
##   2
# Density of the weight column in the women data set.
ggplot(women, aes(x = weight)) +
  geom_density()

## tidyverse more practice

# Look at the raw characteristics strings, then list the column names.
view(sampleinfo$characteristics)
names(sampleinfo)
## [1] "sample_id"           "characteristics"     "immunophenotype"    
## [4] "developmental.stage"
# mutate() creates, modifies, or deletes columns of a table.
# Here it adds a Group column that condenses the long characteristics string
# into a short, easy-to-read label.
# case_when() checks the conditions from top to bottom and uses the label of
# the first one that matches.
# In a regular expression, ".*" between two words means "first word, then
# anything, then second word" (both words, in that order); "|" means "or".
# The most common mistakes are misspellings, typing "," instead of ".", and
# leaving a space between the word and ".*" (the space becomes part of the
# pattern and must then be present in the text).
# Any unambiguous stem works as a pattern: "preg", "pregn" and "pregnancy"
# all select the same rows here, so the column only needs to be computed once.
# The patterns below are the ones from the original YouTube tutorial.
allinfo <- allinfo %>%
  mutate(
    Group = case_when(
      str_detect(characteristics, "basal.*virgin") ~ "bvirg",
      str_detect(characteristics, "basal.*preg") ~ "bpreg",
      str_detect(characteristics, "basal.*lact") ~ "blact",
      str_detect(characteristics, "luminal.*virgin") ~ "lvirg",
      str_detect(characteristics, "luminal.*preg") ~ "lpreg",
      str_detect(characteristics, "luminal.*lact") ~ "llact"
    )
  )
# mutate() practice on the women data: add a BMI column.
# BMI = weight / height^2 * 703 (703 is the factor used when weight is in
# pounds and height is in inches).
women.M <- women %>%
  mutate(BMI = weight / height^2 * 703)
# Pick the genes of interest: the 8 genes with the largest total count summed
# over all samples. Each step is chained to the next with the pipe (%>%);
# do not forget the %>% between steps.
my_genes <- allinfo %>%
  group_by(gene_symbol) %>%
  summarise(Total_count = sum(Count)) %>%
  arrange(desc(Total_count)) %>%
  slice_head(n = 8) %>%
  pull(gene_symbol)
# Keep only the rows of allinfo that belong to those genes.
my_genes_count <- allinfo %>%
  filter(gene_symbol %in% my_genes)
# facet_wrap() draws one panel per value of an additional variable (here
# gene_symbol), which accounts for that extra confounding factor.
ggplot(my_genes_count, aes(x = Group, y = log2(Count + 1), colour = Group)) +
  geom_boxplot() +
  facet_wrap(~ gene_symbol)

ggplot(my_genes_count, aes(x = Group, y = log2(Count + 1), colour = Group)) +
  geom_point() +
  facet_wrap(~ gene_symbol)

# When many points overlap, geom_jitter() adds a small random offset to each
# point so that all of them become visible.
#   Verify with Cesar about this
ggplot(my_genes_count, aes(x = Group, y = log2(Count + 1), colour = Group)) +
  geom_jitter() +
  facet_wrap(~ gene_symbol)

# Save the labelled, faceted jitter plot to a PDF file.
# The plot is wrapped in print() so that it is drawn into the file even when
# this code is run with source() or inside a function or loop, where ggplot
# objects are not printed automatically.
# The y axis shows log2(Count + 1), so it is labelled that way rather than
# as a plain "count".
pdf("count_based_characteristics&gene_symbol.pdf")
print(
  ggplot(my_genes_count, aes(x = Group, y = log2(Count + 1), colour = Group)) +
    geom_jitter() +
    facet_wrap(~ gene_symbol) +
    labs(
      x = "Cells type and stage",
      y = "log2(count + 1)",
      title = "Mammary gland RNA-seq data"
    ) +
    theme_bw() +
    theme(axis.text.x = element_text(angle = 90))
)
dev.off()
## png 
##   2
# Axis labels and the plot title are set with labs().
# A complete theme such as theme_bw() resets every theme setting, so any
# theme() customisation placed before it is overwritten. To avoid this, put
# theme_bw() first and the custom theme() call after it.
# The y axis shows log2(Count + 1), so it is labelled that way rather than
# as a plain "count".
ggplot(my_genes_count, aes(x = Group, y = log2(Count + 1), colour = Group)) +
  geom_jitter() +
  facet_wrap(~ gene_symbol) +
  labs(
    x = "Cells type and stage",
    y = "log2(count + 1)",
    title = "Mammary gland RNA-seq data"
  ) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90))