library('datasets')
#install.packages("ggplot2")
remotes::install_github("const-ae/ggupset")
## Skipping install of 'ggupset' from a github remote, the SHA1 (2a03c58d) has not changed since last install.
## Use `force = TRUE` to force installation
#install.packages('remotes')
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("BoutrosLab.utilities_1.9.10.tar.gz", type = "source", dependencies = TRUE, repos = NULL);
#install.packages("BoutrosLab.statistics.general_2.1.3.tar.gz", type = "source", dependencies = TRUE, repos = NULL);
if(!require('BoutrosLab.plotting.general')) {
install.packages('BoutrosLab.plotting.general')
library('BoutrosLab.plotting.general')
}
## Loading required package: BoutrosLab.plotting.general
## Loading required package: lattice
## Loading required package: latticeExtra
##
## Attaching package: 'latticeExtra'
## The following object is masked from 'package:ggplot2':
##
## layer
## Loading required package: cluster
## Loading required package: hexbin
## Loading required package: grid
##
## Attaching package: 'BoutrosLab.plotting.general'
## The following object is masked from 'package:stats':
##
## dist
#install.packages("BoutrosLab.plotting.general")
#library(tidyverse)
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
AHR_file <- read.table("/Users/anvigaur/DEsktop/AHR-test-file.txt")
| 2. Perform a t-test between control and treated |
t.test(AHR_file$Control,AHR_file$Treated, alternative = "two.sided", paired = TRUE)
##
## Paired t-test
##
## data: AHR_file$Control and AHR_file$Treated
## t = -1.6917, df = 7, p-value = 0.1345
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -0.8841939 0.1466939
## sample estimates:
## mean difference
## -0.36875
#paired because using same control and treatment not different mice
| 3. Perform a wilcoxon test between control and treated |
wilcox.test(AHR_file$Control, AHR_file$Treated, paired = TRUE)
##
## Wilcoxon signed rank exact test
##
## data: AHR_file$Control and AHR_file$Treated
## V = 8, p-value = 0.1953
## alternative hypothesis: true location shift is not equal to 0
| 4. Calculate a fold-change between control and treated |
# B − A)/A
control <- AHR_file$Control
treated <- AHR_file$Treated
mouse_foldchange <- (treated - control)/control
AHR_file$foldchange <- mouse_foldchange
print(mouse_foldchange)
## [1] 0.10079576 0.31270358 -0.24250000 0.09350649 0.28295820 0.23364486
## [7] 0.15160350 0.02310231 NA
input_1 <- read.table("/Users/anvigaur/Downloads/r-training-boutroslab/1. statistical-analysis/input-files/input1.txt")
colnames(input_1) <- c("GeneID", "Patient1", "Patient2", "Patient3")
input_2 <- read.table("/Users/anvigaur/Downloads/r-training-boutroslab/1. statistical-analysis/input-files/input2.txt")
colnames(input_2) <- c("GeneID", "Patient4", "Patient5", "Patient6", "Patient7", "Patient8", "Patient9", "Patient10", "Patient11", "Patient12")
input_1 <- input_1[-(1), ]
input_2 <- input_2[-(1), ]
#2A
all_tumors1 <- cbind(input_1$V1, input_1$V2, input_1$V3, input_1$V4, input_2$V1, input_2$V2, input_2$V3, input_2$V4, input_2$V5, input_2$V6, input_2$V7, input_2$V8, input_2$V9, input_2$V10)
#2B
all_tumors2 <- merge(input_1, input_2, by = "GeneID")
| 3. Perform a t-test comparing the first three tumours to the last nine tumours for each gene using a for-loop |
patients_1_3 <- all_tumors2[1,2:4]
patients_4_12 <- all_tumors2[1, 5:13]
class(patients_1_3) # convert both to numberic vectors
## [1] "data.frame"
patients_1_3 <- as.numeric(all_tumors2[1,2:4])
patients_4_12 <- as.numeric(all_tumors2[1,2:4])
t.test(x = patients_1_3, y= patients_4_12, alternative = "two.sided", paired = FALSE)
##
## Welch Two Sample t-test
##
## data: patients_1_3 and patients_4_12
## t = 0, df = 4, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.1882439 0.1882439
## sample estimates:
## mean of x mean of y
## 5.867216 5.867216
p_values <- c()
p_values_df <- c()
for (tup in 1:500) {
first_three <- as.numeric((all_tumors2[tup, 2:4]))
last_nine <- as.numeric((all_tumors2[tup, 5:13]))
#print(t.test(first_three,last_nine, alternative = "two.sided", paired = FALSE))
p_values[tup] <- t.test(first_three,last_nine, alternative = "two.sided", paired = FALSE)$p.value
}
p_values_df <- data.frame(all_tumors2$GeneID,p_values)
colnames(p_values_df) <- c("GENE ID", "p-Values")
#print(p_values)
#(why important + why are you doing it this way) why do we care about the GOF/LOF variants
#put VEP data in a plot instead of table(histogram with mutation on x and score on y, seperate ones for SIFT and POLYPHEN)
#use ggplot to do all plotting in Rtrainig
#dont do everything by hand
#scores<-scores[nonZeros,]
ggplot(p_values_df, aes(x = p_values)) +
geom_histogram( fill = "blue", color = "black") +
labs(title = "Histogram of P-Values",
x = "P-Values",
y = "Frequency") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
| 7. What does this distribution tell you? |
| This distribution tells me that the total number of p values are distrubuted fairly evenly among the p value scores. There is no skewness to the histogram however there does seem to be a particularly high frequency around the p value score of .55 or .70 |
#print(class(p_values_df$`p-Values`))
#apply function to run wilcoxon test
#b<-apply(p_values_df,2,function(x) wilcox.test(p_values_df$`p-Values`, alternative = "two.sided",paired = FALSE))
#wilcox test:
p_values_wilcox <- c()
p_values_df_wilcox <- c()
for (tup in 1:500) {
first_three <- as.numeric((all_tumors2[tup, 2:4]))
last_nine <- as.numeric((all_tumors2[tup, 5:13]))
#print(t.test(first_three,last_nine, alternative = "two.sided", paired = FALSE))
p_values_wilcox[tup] <- wilcox.test(x=first_three,y=last_nine, alternative = "two.sided", paired = FALSE)$p.value
}
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties
p_values_df_wilcox <- data.frame(all_tumors2$GeneID,p_values_wilcox)
colnames(p_values_df_wilcox) <- c("GENE ID", "P_Values")
#fold change
control <- p_values_df$`P-Values`
treated <- p_values_df_wilcox$`P-Values`
df_foldchange <- (treated - control)/control
fc <-apply(p_values_df,2, function(x) (treated - length(x))/length(x))
ggplot(p_values_df_wilcox, aes(x = P_Values)) +
geom_histogram( fill = "blue", color = "black") +
labs(title = "Histogram of P-Values Wilcox Test",
x = "P-Values",
y = "Frequency") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# change the p_values_wilcox dataframe to to fdr
p_values_df_wilcox_fdr <- p_values_df_wilcox
colnames(p_values_df_wilcox_fdr) <- c("GENE ID", "pvalues")
p_values_df_wilcox_fdr$pvalues <- p.adjust(p_values_df_wilcox$P_Values, method = "fdr")
ggplot(p_values_df_wilcox_fdr, aes(x = pvalues)) +
geom_histogram( fill = "blue", color = "black") +
labs(title = "Histogram of P-Values Wilcox Test with FDR adjust",
x = "P-Values",
y = "Frequency") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# change the p_values_wilcox dataframe to to bonferroni correction
p_values_df_wilcox_BN <- p_values_df_wilcox
colnames(p_values_df_wilcox_BN) <- c("GENE ID", "pvalues")
p_values_df_wilcox_BN$pvalues <- p.adjust(p_values_df_wilcox$P_Values, method = "bonferroni")
ggplot(p_values_df_wilcox_BN, aes(x = pvalues)) +
geom_histogram( fill = "pink", color = "orange") +
labs(title = "Histogram of P-Values Wilcox Test with bonferroni adjust",
x = "P-Values",
y = "Frequency") +
theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#BoutrosLab.plotting.general
require(stats); require(graphics)
plot(cars, xlab = "Speed (mph)", ylab = "Stopping distance (ft)",
las = 1)
title(main="Car speed vs stopping data")
#plot heatmap of loblolly data:
plot(height ~ age, data = Loblolly, subset = Seed == 329,
xlab = "Tree age (yr)", las = 1,
ylab = "Tree height (ft)",
main = "Loblolly data")
x <- Loblolly$age
y <- Loblolly$height
data <- expand.grid(X=x, Y=y)
data$count <- runif(7056, 0, 6)
z <- Loblolly$Seed
ggplot(Loblolly, aes(x = Seed, y = height, fill = height)) +
geom_tile(width = 0.5, height = 0.5)+
scale_fill_gradient(low = "red", high = "blue") +
labs( title = "Loblolly data of tree age vs height")+
xlab("Tree Age(yr)")+
ylab("Tree height(ft")
#ChickWeight dataset
require(graphics)
coplot(weight ~ Time | Chick, data = ChickWeight,
type = "b", show.given = FALSE)
ggplot(ChickWeight, aes(x = Time, y = weight, color = Diet)) +
geom_line() +
labs(title = "Chick Growth Over Time by Diet", x = "Time", y = "Weight") +
theme_minimal()
#Q3 of 2. boutrosLab-plotting-general
#read in Q3_SeqControl_data.tsv
df<-read.delim("/Users/anvigaur/Downloads/r-training-boutroslab/2. boutrosLab-plotting-general/Q3/Q3_SeqControl_data.tsv",sep="\t")
yes_decreasing <- df[order( -rank(df$yes.votes), decreasing = TRUE), ]
#Create a heatmap displaying the cpcgene sample names
sample.colour.scheme <- c('rosybrown1', 'rosybrown4', 'red', 'darkred', 'darkorange', 'gold', 'darkolivegreen3', 'darkgreen', 'aquamarine', 'cyan4', 'dodgerblue', 'darkblue');
sample.names <- c('CPCG0003P', 'CPCG0005P', 'CPCG0007P', 'CPCG0040P', 'CPCG0047P', 'CPCG0603P', 'CPCG0098P', 'CPCG0102P', 'CPCG0103P', 'CPCG0123P', 'CPCG0183P', 'CPCG0184');
legend <- legend.grob(
list(
legend = list(
title = expression(underline('Sample')),
colours = sample.colour.scheme,
labels = sample.names,
size = 1.5,
border = 'black'
),
legend = list(
title = expression(underline('Sample Preparation')),
colours = c('white','darkslategrey'),
labels = c('Frozen', 'FFPE'),
size = 1.5,
border = 'black'
),
legend = list(
colours = c('white', 'darkorange'),
labels = c('83.0','97.0'),
border = 'black',
title = expression(underline('% Bases > 0 quality')),
continuous = TRUE
),
legend = list(
colours = c('white', 'darkblue'),
labels = scientific.notation(c(100000000, 360000000)),
border = 'black',
title = expression(underline('Unique start points')),
continuous = TRUE
),
legend = list(
colours = c('white','deeppink'),
labels = c('1.070', '1.190'),
border = 'black',
title = expression(underline('Average reads/start')),
continuous = TRUE
)
),
title.just = 'left',
label.cex = 0.7,
title.cex = 0.7
);
#average reads start
x <- yes_decreasing$CPCG
y <- yes_decreasing$Average.reads.start
u <- yes_decreasing$Unique.start.points
x_bases <- yes_decreasing$X..Bases...0.quality
data <- expand.grid(X=x, Y=y)
#data$count <- yes_decreasing$yes.votes
ggplot(yes_decreasing, aes(CPCG, Average.reads.start, fill= Average.reads.start)) +
geom_tile(width = 0.5, height = 0.5)+
scale_fill_gradient(low = "orange", high = "green") +
labs( title = "CPCG samples vs Average.reads.start")+
xlab("CPCG samples")+
ylab("Average.reads.start")+
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
##Unique.start.points" heat map
#data <- expand.grid(X=x, U=u)
#data$count <- yes_decreasing$yes.votes
ggplot(yes_decreasing, aes(CPCG, 1, fill= Unique.start.points)) +
geom_tile(width = 0.5, height = 0.5)+
scale_fill_gradient(low = "pink", high = "lightblue") +
labs( title = "CPCG samples vs Unique.start.points")+
xlab("CPCG samples")+
ylab("Unique.start.points")+
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
#"X..Bases...0.quality" heatmap
data <- expand.grid(X=x, X_bases=x_bases)
data$count <- yes_decreasing$yes.votes
ggplot(data, aes(X, X_bases, fill= count)) +
geom_tile()+
labs( title = "CPCG samples vs X..Bases...0.quality")+
xlab("CPCG samples")+
ylab("X..Bases...0.quality")
library(BoutrosLab.plotting.general)
# Example Data (replace it with your actual data)
cpcgene_data <- yes_decreasing
ffpe_samples <- c("CPCG0102P", "CPCG0103P")
#your_data$FFPE <- ifelse(your_data$Sample %in% ffpe_samples, "FFPE", "Not FFPE")
cpcgene_data$FFPE <- "NOT FFPE"
cpcgene_data$FFPE[cpcgene_data$CPCG%in%ffpe_samples] <- "FFPE"
#Create covariate bar for FFPE
ggplot(cpcgene_data, aes(x=cpcgene_data$CPCG, fill = cpcgene_data$FFPE))+
geom_bar(stat = "count", position = "stack", color = "black") +
scale_fill_manual(values = c("white", "darkslategrey")) +
labs(title = "FFPE Covariate Bar", x = "Sample", y = "Count") +
theme_minimal()
#Plot the outcome barplot using ggplot2
outcome_samples <- c("<50x")
cpcgene_data$outcome <- ">=50x"
cpcgene_data$outcome[cpcgene_data$outcome == 0] <- "<50x"
ggplot(data =cpcgene_data, aes(x=cpcgene_data$CPCG, fill= cpcgene_data$outcome))+
geom_bar(stat = "count", position = "stack", color = "black") +
scale_fill_manual(values = c("grey", "black")) +
labs(title = "Outcome Barplot", x = "Sample", y = "Count") +
theme_minimal()
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.