BBH 597 Lab 4

#Loading dataset using read.csv

file.choose()

## [1] "C:\\Users\\dcg5438\\Desktop\\BBH 505\\Gilliland_Lab4.Rmd"

pop_gre <- read.csv("C:\\Users\\dcg5438\\Downloads\\pop_gre (1).csv")


library(rafalib)

#Plot histogram and get descriptives

hist(pop_gre$gre,
     main = "Distribution of the Population of GRE scores",
     xlab = "gre")

sum(!is.na(pop_gre$gre))   # N

## [1] 1551107

mean(pop_gre$gre, na.rm=TRUE)

## [1] 153.6515

sd(pop_gre$gre, na.rm=TRUE)

## [1] 9.441971

#population
sqrt(sum((pop_gre$gre - mean(pop_gre$gre, na.rm=TRUE))^2, na.rm=TRUE) /
       sum(!is.na(pop_gre$gre)))

## [1] 9.441968


#Calculating the standard error with a given sample size

``` r
# with a sample size of 100
n <- 100
(se <- rafalib::popsd(pop_gre$gre) / sqrt(n))

## [1] 0.9441968

#Using a loop to draw from 10,000 random samples

n <- 100
reps <- 10000
means_of_samples <- vector("numeric", reps)

set.seed(1234) 
# the for loop
for(i in 1:reps){
  samp <- sample(pop_gre$gre, size = 100)
  means_of_samples[i] <- mean(samp)
}

hist(means_of_samples,main = "Distribution of 10,000 Sample Means")

mean(means_of_samples)

## [1] 153.6598

sd(means_of_samples)

## [1] 0.9396674

#Testing whether a sample is different from your known population

treated.mean <- 156.7

# calculate mean difference (sample mean - population mean)
(meandiff <- treated.mean - mean(pop_gre$gre))

## [1] 3.048496

# Z statistic for this difference (meandiff / se)
(z <- meandiff / se)

## [1] 3.228665

#calculate the two-tailed p

(p <- 2 * pnorm(-abs(z)))

## [1] 0.001243693

p * 100

## [1] 0.1243693

cbind(meandiff, se, z, p)

##      meandiff        se        z           p
## [1,] 3.048496 0.9441968 3.228665 0.001243693

#Calculating cohen’s d

(d <- meandiff / rafalib::popsd(pop_gre$gre))

## [1] 0.3228665

lower <- meandiff - qnorm(0.975)*se
upper <- meandiff + qnorm(0.975)*se

# print
cbind(meandiff, lower, upper)

##      meandiff    lower    upper
## [1,] 3.048496 1.197904 4.899087

BBH 597 Lab 4

Destiny Gilliland

2025-10-02