#Loading dataset using read.csv
file.choose()
## [1] "C:\\Users\\dcg5438\\Desktop\\BBH 505\\Gilliland_Lab4.Rmd"
pop_gre <- read.csv("C:\\Users\\dcg5438\\Downloads\\pop_gre (1).csv")
library(rafalib)
#Plot histogram and get descriptives
hist(pop_gre$gre,
main = "Distribution of the Population of GRE scores",
xlab = "gre")
sum(!is.na(pop_gre$gre)) # N
## [1] 1551107
mean(pop_gre$gre, na.rm=TRUE)
## [1] 153.6515
sd(pop_gre$gre, na.rm=TRUE)
## [1] 9.441971
#population
sqrt(sum((pop_gre$gre - mean(pop_gre$gre, na.rm=TRUE))^2, na.rm=TRUE) /
sum(!is.na(pop_gre$gre)))
## [1] 9.441968
#Calculating the standard error with a given sample size
``` r
# with a sample size of 100
n <- 100
(se <- rafalib::popsd(pop_gre$gre) / sqrt(n))
## [1] 0.9441968
#Using a loop to draw from 10,000 random samples
n <- 100
reps <- 10000
means_of_samples <- vector("numeric", reps)
set.seed(1234)
# the for loop
for(i in 1:reps){
samp <- sample(pop_gre$gre, size = 100)
means_of_samples[i] <- mean(samp)
}
hist(means_of_samples,main = "Distribution of 10,000 Sample Means")
mean(means_of_samples)
## [1] 153.6598
sd(means_of_samples)
## [1] 0.9396674
#Testing whether a sample is different from your known population
treated.mean <- 156.7
# calculate mean difference (sample mean - population mean)
(meandiff <- treated.mean - mean(pop_gre$gre))
## [1] 3.048496
# Z statistic for this difference (meandiff / se)
(z <- meandiff / se)
## [1] 3.228665
#calculate the two-tailed p
(p <- 2 * pnorm(-abs(z)))
## [1] 0.001243693
p * 100
## [1] 0.1243693
cbind(meandiff, se, z, p)
## meandiff se z p
## [1,] 3.048496 0.9441968 3.228665 0.001243693
#Calculating cohen’s d
(d <- meandiff / rafalib::popsd(pop_gre$gre))
## [1] 0.3228665
lower <- meandiff - qnorm(0.975)*se
upper <- meandiff + qnorm(0.975)*se
# print
cbind(meandiff, lower, upper)
## meandiff lower upper
## [1,] 3.048496 1.197904 4.899087