R_tutorial

library('datasets')
#install.packages("ggplot2")
remotes::install_github("const-ae/ggupset")

## Skipping install of 'ggupset' from a github remote, the SHA1 (2a03c58d) has not changed since last install.
##   Use `force = TRUE` to force installation

#install.packages('remotes')
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#install.packages("BoutrosLab.utilities_1.9.10.tar.gz", type = "source", dependencies = TRUE, repos = NULL);
#install.packages("BoutrosLab.statistics.general_2.1.3.tar.gz", type = "source", dependencies = TRUE, repos = NULL);
if(!require('BoutrosLab.plotting.general')) {
  install.packages('BoutrosLab.plotting.general')
  library('BoutrosLab.plotting.general')
}

## Loading required package: BoutrosLab.plotting.general

## Loading required package: lattice

## Loading required package: latticeExtra

## 
## Attaching package: 'latticeExtra'

## The following object is masked from 'package:ggplot2':
## 
##     layer

## Loading required package: cluster

## Loading required package: hexbin

## Loading required package: grid

## 
## Attaching package: 'BoutrosLab.plotting.general'

## The following object is masked from 'package:stats':
## 
##     dist

#install.packages("BoutrosLab.plotting.general")
#library(tidyverse)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

##Q1

1. Read the file AHR-test-file.txt

AHR_file <- read.table("/Users/anvigaur/DEsktop/AHR-test-file.txt")

2. Perform a t-test between control and treated

t.test(AHR_file$Control,AHR_file$Treated, alternative = "two.sided", paired = TRUE)

## 
##  Paired t-test
## 
## data:  AHR_file$Control and AHR_file$Treated
## t = -1.6917, df = 7, p-value = 0.1345
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -0.8841939  0.1466939
## sample estimates:
## mean difference 
##        -0.36875

#paired because using same control and treatment not different mice

3. Perform a wilcoxon test between control and treated

wilcox.test(AHR_file$Control, AHR_file$Treated, paired = TRUE)

## 
##  Wilcoxon signed rank exact test
## 
## data:  AHR_file$Control and AHR_file$Treated
## V = 8, p-value = 0.1953
## alternative hypothesis: true location shift is not equal to 0

4. Calculate a fold-change between control and treated

# B − A)/A
control <- AHR_file$Control
treated <- AHR_file$Treated
mouse_foldchange <- (treated - control)/control
AHR_file$foldchange <- mouse_foldchange
print(mouse_foldchange)

## [1]  0.10079576  0.31270358 -0.24250000  0.09350649  0.28295820  0.23364486
## [7]  0.15160350  0.02310231          NA

## Q2

1. Read the two input files

input_1 <- read.table("/Users/anvigaur/Downloads/r-training-boutroslab/1. statistical-analysis/input-files/input1.txt")
colnames(input_1) <- c("GeneID", "Patient1", "Patient2", "Patient3")
input_2 <- read.table("/Users/anvigaur/Downloads/r-training-boutroslab/1. statistical-analysis/input-files/input2.txt")
colnames(input_2) <- c("GeneID", "Patient4", "Patient5", "Patient6", "Patient7", "Patient8", "Patient9", "Patient10", "Patient11", "Patient12")
input_1 <- input_1[-(1), ]
input_2 <- input_2[-(1), ]

#2A
all_tumors1 <- cbind(input_1$V1, input_1$V2, input_1$V3, input_1$V4, input_2$V1, input_2$V2, input_2$V3, input_2$V4, input_2$V5, input_2$V6, input_2$V7, input_2$V8, input_2$V9, input_2$V10)

#2B
all_tumors2 <- merge(input_1, input_2, by = "GeneID")

3. Perform a t-test comparing the first three tumours to the last nine tumours for each gene using a for-loop

patients_1_3 <- all_tumors2[1,2:4]
patients_4_12 <- all_tumors2[1, 5:13]
class(patients_1_3) # convert both to numberic vectors

## [1] "data.frame"

patients_1_3 <- as.numeric(all_tumors2[1,2:4])
patients_4_12 <- as.numeric(all_tumors2[1,2:4])
t.test(x = patients_1_3, y= patients_4_12, alternative = "two.sided", paired = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  patients_1_3 and patients_4_12
## t = 0, df = 4, p-value = 1
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.1882439  0.1882439
## sample estimates:
## mean of x mean of y 
##  5.867216  5.867216

p_values <- c()
p_values_df <- c()
for (tup in 1:500) {
  first_three <- as.numeric((all_tumors2[tup, 2:4]))
  last_nine <- as.numeric((all_tumors2[tup, 5:13]))
  
  #print(t.test(first_three,last_nine, alternative = "two.sided", paired = FALSE))
  p_values[tup] <-  t.test(first_three,last_nine, alternative = "two.sided", paired = FALSE)$p.value
  
}
p_values_df <- data.frame(all_tumors2$GeneID,p_values)
colnames(p_values_df) <- c("GENE ID", "p-Values")
#print(p_values)
#(why important + why are you doing it this way) why do we care about the GOF/LOF variants
#put VEP data in a plot instead of table(histogram with mutation on x and score on y, seperate ones for SIFT and POLYPHEN)
#use ggplot to do all plotting in Rtrainig
#dont do everything by hand

## Including Plots

4. Plot a histogram of the p-values

#scores<-scores[nonZeros,]


ggplot(p_values_df, aes(x = p_values)) +
  geom_histogram( fill = "blue", color = "black") +
  labs(title = "Histogram of P-Values",
       x = "P-Values",
       y = "Frequency") +
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

7. What does this distribution tell you?

This distribution tells me that the total number of p values are distrubuted fairly evenly among the p value scores. There is no skewness to the histogram however there does seem to be a particularly high frequency around the p value score of .55 or .70

##Q3

1. Repeat the above comparison using a Wilcoxon test and fold-changes. Create suitable plots to compare the results and write a paragraph describing the similarities/differences. Use the R apply() function

#print(class(p_values_df$`p-Values`))
#apply function to run wilcoxon test
#b<-apply(p_values_df,2,function(x) wilcox.test(p_values_df$`p-Values`, alternative = "two.sided",paired = FALSE))


#wilcox test:

p_values_wilcox <- c()
p_values_df_wilcox <- c()
for (tup in 1:500) {
  first_three <- as.numeric((all_tumors2[tup, 2:4]))
  last_nine <- as.numeric((all_tumors2[tup, 5:13]))
  
  #print(t.test(first_three,last_nine, alternative = "two.sided", paired = FALSE))
  p_values_wilcox[tup] <-  wilcox.test(x=first_three,y=last_nine, alternative = "two.sided", paired = FALSE)$p.value
  
}

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

## Warning in wilcox.test.default(x = first_three, y = last_nine, alternative =
## "two.sided", : cannot compute exact p-value with ties

p_values_df_wilcox <- data.frame(all_tumors2$GeneID,p_values_wilcox)
colnames(p_values_df_wilcox) <- c("GENE ID", "P_Values")

#fold change
control <- p_values_df$`P-Values`
treated <- p_values_df_wilcox$`P-Values`
df_foldchange <- (treated - control)/control
fc <-apply(p_values_df,2, function(x) (treated - length(x))/length(x))




ggplot(p_values_df_wilcox, aes(x = P_Values)) +
  geom_histogram( fill = "blue", color = "black") +
  labs(title = "Histogram of P-Values Wilcox Test",
       x = "P-Values",
       y = "Frequency") +
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# change the p_values_wilcox dataframe to to fdr
p_values_df_wilcox_fdr <- p_values_df_wilcox
colnames(p_values_df_wilcox_fdr) <- c("GENE ID", "pvalues")


p_values_df_wilcox_fdr$pvalues <- p.adjust(p_values_df_wilcox$P_Values, method = "fdr")
ggplot(p_values_df_wilcox_fdr, aes(x = pvalues)) +
  geom_histogram( fill = "blue", color = "black") +
  labs(title = "Histogram of P-Values Wilcox Test with FDR adjust",
       x = "P-Values",
       y = "Frequency") +
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# change the p_values_wilcox dataframe to to bonferroni correction
p_values_df_wilcox_BN <- p_values_df_wilcox
colnames(p_values_df_wilcox_BN) <- c("GENE ID", "pvalues")
p_values_df_wilcox_BN$pvalues <- p.adjust(p_values_df_wilcox$P_Values, method = "bonferroni")
ggplot(p_values_df_wilcox_BN, aes(x = pvalues)) +
  geom_histogram( fill = "pink", color = "orange") +
  labs(title = "Histogram of P-Values Wilcox Test with bonferroni adjust",
       x = "P-Values",
       y = "Frequency") +
  theme_minimal()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#BoutrosLab.plotting.general

require(stats); require(graphics)
plot(cars, xlab = "Speed (mph)", ylab = "Stopping distance (ft)",
     las = 1)
title(main="Car speed vs stopping data")

#plot heatmap of loblolly data:
plot(height ~ age, data = Loblolly, subset = Seed == 329,
     xlab = "Tree age (yr)", las = 1,
     ylab = "Tree height (ft)",
     main = "Loblolly data")

x <- Loblolly$age
y <- Loblolly$height
data <- expand.grid(X=x, Y=y)
data$count <- runif(7056, 0, 6)
z <- Loblolly$Seed 

ggplot(Loblolly, aes(x = Seed, y = height, fill = height)) + 
  geom_tile(width = 0.5, height = 0.5)+
  scale_fill_gradient(low = "red", high = "blue") +
  labs( title = "Loblolly data of tree age vs height")+
  xlab("Tree Age(yr)")+
  ylab("Tree height(ft")

#ChickWeight dataset
require(graphics)
coplot(weight ~ Time | Chick, data = ChickWeight,
       type = "b", show.given = FALSE)

ggplot(ChickWeight, aes(x = Time, y = weight, color = Diet)) +
  geom_line() +
  labs(title = "Chick Growth Over Time by Diet", x = "Time", y = "Weight") +
  theme_minimal()

#Q3 of 2. boutrosLab-plotting-general

#read in Q3_SeqControl_data.tsv
df<-read.delim("/Users/anvigaur/Downloads/r-training-boutroslab/2. boutrosLab-plotting-general/Q3/Q3_SeqControl_data.tsv",sep="\t")

yes_decreasing <- df[order( -rank(df$yes.votes), decreasing = TRUE), ]

#Create a heatmap displaying the cpcgene sample names
sample.colour.scheme <- c('rosybrown1', 'rosybrown4', 'red', 'darkred', 'darkorange', 'gold', 'darkolivegreen3', 'darkgreen', 'aquamarine', 'cyan4', 'dodgerblue', 'darkblue');
sample.names <- c('CPCG0003P', 'CPCG0005P', 'CPCG0007P', 'CPCG0040P', 'CPCG0047P', 'CPCG0603P', 'CPCG0098P', 'CPCG0102P', 'CPCG0103P', 'CPCG0123P', 'CPCG0183P', 'CPCG0184');

legend <- legend.grob(
    list(
        legend = list(
            title = expression(underline('Sample')),
            colours = sample.colour.scheme,
            labels = sample.names,
            size = 1.5,
            border = 'black'
            ),
        legend = list(
            title = expression(underline('Sample Preparation')),
            colours = c('white','darkslategrey'),
            labels = c('Frozen', 'FFPE'),
            size = 1.5,
            border = 'black'
            ),
        legend = list(
            colours = c('white', 'darkorange'),
            labels = c('83.0','97.0'),
            border = 'black',
            title = expression(underline('% Bases > 0 quality')),
            continuous = TRUE
            ),
        legend = list(
            colours = c('white', 'darkblue'),
            labels = scientific.notation(c(100000000, 360000000)),
            border = 'black',
            title = expression(underline('Unique start points')),
            continuous = TRUE
            ),
        legend = list(
            colours = c('white','deeppink'),
            labels = c('1.070', '1.190'),
            border = 'black',
            title = expression(underline('Average reads/start')),
            continuous = TRUE
            )
        ),
    title.just = 'left',
    label.cex = 0.7,
    title.cex = 0.7
    );
#average reads start
x <- yes_decreasing$CPCG
y <- yes_decreasing$Average.reads.start

u <- yes_decreasing$Unique.start.points
x_bases <- yes_decreasing$X..Bases...0.quality
data <- expand.grid(X=x, Y=y)

#data$count <- yes_decreasing$yes.votes
 

ggplot(yes_decreasing, aes(CPCG, Average.reads.start, fill= Average.reads.start)) + 
  geom_tile(width = 0.5, height = 0.5)+
  scale_fill_gradient(low = "orange", high = "green") +
  labs( title = "CPCG samples vs Average.reads.start")+
  xlab("CPCG samples")+
  ylab("Average.reads.start")+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##Unique.start.points" heat map
#data <- expand.grid(X=x, U=u)
#data$count <- yes_decreasing$yes.votes
ggplot(yes_decreasing, aes(CPCG, 1, fill= Unique.start.points)) + 
  geom_tile(width = 0.5, height = 0.5)+
  scale_fill_gradient(low = "pink", high = "lightblue") +
  labs( title = "CPCG samples vs Unique.start.points")+
  xlab("CPCG samples")+
  ylab("Unique.start.points")+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#"X..Bases...0.quality" heatmap
data <- expand.grid(X=x, X_bases=x_bases)
data$count <- yes_decreasing$yes.votes
ggplot(data, aes(X, X_bases, fill= count)) + 
  geom_tile()+
  labs( title = "CPCG samples vs X..Bases...0.quality")+
  xlab("CPCG samples")+
  ylab("X..Bases...0.quality")

library(BoutrosLab.plotting.general)


# Example Data (replace it with your actual data)
cpcgene_data <- yes_decreasing
ffpe_samples <- c("CPCG0102P", "CPCG0103P")
#your_data$FFPE <- ifelse(your_data$Sample %in% ffpe_samples, "FFPE", "Not FFPE")
cpcgene_data$FFPE <- "NOT FFPE"
cpcgene_data$FFPE[cpcgene_data$CPCG%in%ffpe_samples] <- "FFPE"

#Create covariate bar for FFPE
ggplot(cpcgene_data, aes(x=cpcgene_data$CPCG, fill = cpcgene_data$FFPE))+
  geom_bar(stat = "count", position = "stack", color = "black") +
  scale_fill_manual(values = c("white", "darkslategrey")) +
  labs(title = "FFPE Covariate Bar", x = "Sample", y = "Count") +
  theme_minimal()

#Plot the outcome barplot using ggplot2
outcome_samples <- c("<50x")
cpcgene_data$outcome <- ">=50x"
cpcgene_data$outcome[cpcgene_data$outcome == 0] <- "<50x"


ggplot(data =cpcgene_data, aes(x=cpcgene_data$CPCG, fill= cpcgene_data$outcome))+
  geom_bar(stat = "count", position = "stack", color = "black") +
  scale_fill_manual(values = c("grey", "black")) +
  labs(title = "Outcome Barplot", x = "Sample", y = "Count") +
  theme_minimal()

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.