#Loading dataset using read.csv

file.choose()
## [1] "C:\\Users\\dcg5438\\Desktop\\BBH 505\\Gilliland_Lab4.Rmd"
pop_gre <- read.csv("C:\\Users\\dcg5438\\Downloads\\pop_gre (1).csv")


library(rafalib)

#Plot histogram and get descriptives

hist(pop_gre$gre,
     main = "Distribution of the Population of GRE scores",
     xlab = "gre")

sum(!is.na(pop_gre$gre))   # N
## [1] 1551107
mean(pop_gre$gre, na.rm=TRUE) 
## [1] 153.6515
sd(pop_gre$gre, na.rm=TRUE)
## [1] 9.441971
#population
sqrt(sum((pop_gre$gre - mean(pop_gre$gre, na.rm=TRUE))^2, na.rm=TRUE) /
       sum(!is.na(pop_gre$gre)))
## [1] 9.441968

#Calculating the standard error with a given sample size

``` r
# with a sample size of 100
n <- 100
(se <- rafalib::popsd(pop_gre$gre) / sqrt(n))
## [1] 0.9441968

#Using a loop to draw from 10,000 random samples

n <- 100
reps <- 10000
means_of_samples <- vector("numeric", reps)

set.seed(1234) 
# the for loop
for(i in 1:reps){
  samp <- sample(pop_gre$gre, size = 100)
  means_of_samples[i] <- mean(samp)
} 
hist(means_of_samples,main = "Distribution of 10,000 Sample Means")

mean(means_of_samples) 
## [1] 153.6598
sd(means_of_samples) 
## [1] 0.9396674

#Testing whether a sample is different from your known population

treated.mean <- 156.7

# calculate mean difference (sample mean - population mean)
(meandiff <- treated.mean - mean(pop_gre$gre))
## [1] 3.048496
# Z statistic for this difference (meandiff / se)
(z <- meandiff / se)
## [1] 3.228665

#calculate the two-tailed p

(p <- 2 * pnorm(-abs(z)))   
## [1] 0.001243693
p * 100                     
## [1] 0.1243693
cbind(meandiff, se, z, p)
##      meandiff        se        z           p
## [1,] 3.048496 0.9441968 3.228665 0.001243693

#Calculating cohen’s d

(d <- meandiff / rafalib::popsd(pop_gre$gre))
## [1] 0.3228665
lower <- meandiff - qnorm(0.975)*se
upper <- meandiff + qnorm(0.975)*se

# print
cbind(meandiff, lower, upper)
##      meandiff    lower    upper
## [1,] 3.048496 1.197904 4.899087