#Central Tendency and Variability

#Compute Mean and Median
#Use R to calculate the mean and median of: [12, 15, 22, 13, 10, 25, 18, 14].(5 pts)

m <- c(12, 15, 22, 13, 10, 25, 18, 14)
mean_m <- mean(m)
print(mean_m)
## [1] 16.125
median_m <- median(m)
print(median_m)
## [1] 14.5
#Variance and Standard Deviation
#Calculate variance and standard deviation for the same dataset.(5 pts)
var_m <- var(m)
print(var_m)
## [1] 26.69643
sd_m <- sd(m)
print(sd_m)
## [1] 5.166859

#Statistical Distributions

#Normal Distribution Probability
#Use R to calculate the probability of a value below 45 for a normal distribution (mean = 50, SD = 5).(5 pts)

prob = pnorm(45, mean = 50, sd = 5)
print(prob)
## [1] 0.1586553
#Z-Score Interpretation
#What is the significance of a Z-score of -1.96? Write your answer in R.(5 pts)

#A z-score of -1.96 represents a point 1.96 standard deviations below the mean of a normal distribution. This correlates to ~2.5% of the data falling below the z-score. When hypothesis testing a z-score beyond 1.96 or -1.96 indicates statistically significant results at the 5% significance level.

#Confidence Intervals and Bootstrapping

#Bootstrap Confidence Interval
#Generate a histogram of bootstrapped means for [3, 5, 7, 9, 11, 13] with 1000 resamples in R. (10 pts)

#Install library, define samples
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
samples <- c(3, 5, 7, 9, 11, 13)

#Do bootstrap resampling 1000 times with sampling replacement
boot.means=do(1000) * mean(resample(samples))

#Get percentiles from bootstrap means
q = quantile(boot.means[,1], p = c(0.025, 0.975))

#Plot the histogram
hist(boot.means[,1], col = "blue", border = "white", xlab= "sample means")
abline(v=c(q[1], q[2] ), col = "red")
text(x=q[1], y = 200, round(q[1], 2), adj=c(1, 0))
text(x=q[2], y = 200, round(q[2], 2), adj=c(0, 0))

#CLT vs. Bootstrapping
#Compare confidence intervals obtained using bootstrapping and the Central Limit Theorem. (5 pts)
  
  #Bootstrapping is more flexible, it has no distribution assumptions of normality or minimum sample sizes, though, outliers can skew results. Bootstrapping is better for non-normal data and/or smaller sample sizes. CLT assumes normality and requires large sample sizes to be accurate, it is also less computationally demanding and is less sensitive to outliers due to requiring larger sample sizes.

#Hypothesis Testing and t-Distribution

#t-Test Comparison
#Use R to perform a Welch’s t-test on these datasets:
#Dataset 1: [45, 47, 50, 52, 54]
#Dataset 2: [43, 49, 51, 53, 55]
#(10 pts)

#Define datasets
Dataset1 <- c(45, 47, 50, 52, 54)
Dataset2 <- c(43, 49, 51, 53, 55)

#Welch's t-test:

org.diff = mean(Dataset1)-mean(Dataset2)
stats::t.test(Dataset1, Dataset2)
## 
##  Welch Two Sample t-test
## 
## data:  Dataset1 and Dataset2
## t = -0.22842, df = 7.6014, p-value = 0.8253
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.71311  5.51311
## sample estimates:
## mean of x mean of y 
##      49.6      50.2
#Plot t-Distribution
#Create a t-distribution plot (5 degrees of freedom) in R and compare it to a normal distribution. (5 pts)
#Choose degrees of freedom
df <- 5

#create a sequence of numbers
x <- seq(-5, 5, length = 100)

#Calculate density of t-distribution with 5 df
density <- dt(x, df)

#Calculate density of standard normal distribution
norm <- dnorm(x)

#Plot t-distribution
plot(x, density, type = "l", col = "blue", xlab = "x", ylab = "Density", main = "t-distribution (df = 5), vs. Normal Distribution")

#Plot normal distribution
lines(x, norm, col = "red")

#Add a legend
legend("topleft", legend = c("t-distribution (df = 5)", "Normal Distribution"), col = c("blue", "red"), lwd = 2)

#the t-distribution looks to be normally distributed with a bell shaped curve around the central mean

#Linear Models and Correlation

#Linear Regression in R
#Fit a linear model using this dataset:
#data <- data.frame(age = c(20, 25, 30, 35, 40), expression = c(100, 110, 115, 120, 125))
#lm(expression ~ age, data = data)
#(10 pts)

data <- data.frame(age = c(20, 25, 30, 35, 40), expression = c(100, 110, 115, 120, 125))
mod = lm(expression ~ age, data = data)
plot(data, pch = 20, xlab = "Age", ylab = "Gene Expression")
abline(mod, col = "blue")

#Correlation Coefficient
#Compute and interpret the correlation coefficient for:
#x <- c(2, 4, 6, 8, 10)
#y <- c(3, 6, 9, 12, 15)
#cor(x, y)
#(5 pts)

#Define data
x <- c(2, 4, 6, 8, 10)
y <- c(3, 6, 9, 12, 15)

#Calculate correlation
correlation = cor(x, y)

#print
print(correlation)
## [1] 1
#Use the following code to simulate 50 values for x and y and then answer the following questions: 
 
set.seed(32)
x <- runif(50, 1, 100)
b0 <- 10
b1 <- 2
sigma <- 20
eps <- rnorm(50, 0, sigma)
y <- b0 + b1 * x + eps

#a. Fit a linear model to predict Y based on X.

mod1 <- lm (y ~ x)
summary(mod1)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -38.50 -12.62   1.66  14.26  37.20 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.40951    6.43208   0.064     0.95    
## x            2.08742    0.09775  21.355   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16.96 on 48 degrees of freedom
## Multiple R-squared:  0.9048, Adjusted R-squared:  0.9028 
## F-statistic:   456 on 1 and 48 DF,  p-value: < 2.2e-16
#b. Plot the scatter plot of Y vs.  X and add the fitted regression line.

plot(x, y, pch = 20, ylab = "Dependent variable", xlab = "Independent Varaible")
abline(mod1, col = "blue")

#c. Calculate and interpret the correlation coefficient and R2.

cor_coef <- cor(x, y)
cat("Correlation Coefficient (r):", cor_coef, "\n")
## Correlation Coefficient (r): 0.9511939
r_squ <- summary(mod1)$r.squared
cat("R-squared (R2):", r_squ, "\n")
## R-squared (R2): 0.9047699
#(10 pt) 

#Using the linear model from the previous question: 
#a. Plot the residuals vs. fitted values using the following code:  
    #plot(model, which = 1)
plot(mod1, which = 1)

#b. Paste the plot below and inspect the plot for any patterns. What does this tell you about the assumptions of linear regression (e.g., homoscedasticity)? 
#(10 pt) 

#The residuals show no clear pattern around the line, this indicates constant variance. This supports the assumption of homoscedasticity.