# Show the current working directory; the relative paths "ames.csv" and
# "samp.csv" used later in this script are resolved against it.
print(getwd())
## [1] "C:/Users/Jerome/Documents/0000_Work_Files/0000_Coursera/Statistics_with_R_Specialization/Course_2_Inferential_Stats/Week2"
library(statsr)
## Warning: package 'statsr' was built under R version 4.0.3
library(ggplot2)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the full data set from ames.csv and draw one random sample of n rows
# from it (sample_n samples without replacement by default).
ames <- read.csv("ames.csv", header = TRUE)
n <- 60
samp <- sample_n(ames, n)

# Save the sample to disk and read it back, so the rest of the script works
# from exactly what is stored in samp.csv.
# NOTE(review): no seed is set, so every run draws a different sample and
# overwrites samp.csv -- confirm whether a set.seed() call is wanted here.
write.csv(samp, file = "samp.csv", row.names = FALSE)
samp <- read.csv("samp.csv", header = TRUE)
# One-row table of descriptive statistics for `area` in the sample:
# mean, median, standard deviation, IQR, extremes and the two quartiles.
summarise(
  samp,
  samp_mu = mean(area),
  samp_med = median(area),
  samp_sigma = sd(area),
  samp_pop_iqr = IQR(area),
  samp_min = min(area),
  samp_max = max(area),
  samp_q1 = quantile(area, probs = 0.25), # first quartile, 25th percentile
  samp_q3 = quantile(area, probs = 0.75)  # third quartile, 75th percentile
)
##   samp_mu samp_med samp_sigma samp_pop_iqr samp_min samp_max samp_q1 samp_q3
## 1  1543.7     1508   561.0919       715.25      720     3194  1080.5 1795.75
# Histogram of `area` in the sample, with bins 50 units wide.
ggplot(samp, aes(x = area)) +
  geom_histogram(binwidth = 50)

# Critical value for a two-sided 95% interval: the standard-normal quantile
# that leaves 2.5% in the upper tail (cumulative probability 0.975).
z_star_95 <- qnorm(p = 0.975, mean = 0, sd = 1)
print(z_star_95)
## [1] 1.959964
# Interval for the mean of `area` based on the single sample:
# sample mean -/+ z_star_95 * (sample sd / sqrt(n)).
samp %>%
  summarise(
    se = sd(area) / sqrt(n), # standard error of the sample mean
    lower = mean(area) - z_star_95 * se,
    upper = mean(area) + z_star_95 * se
  ) %>%
  select(lower, upper)
##      lower    upper
## 1 1401.727 1685.673
# Mean of `area` over the full data set, kept in `params$pop_mean` for the
# interval-coverage checks further down, then printed directly as well.
params <- summarise(ames, pop_mean = mean(area))
mean(ames[["area"]])
## [1] 1499.69
# Draw 50 samples of size n (with replacement) from the full data and compute
# one interval per replicate: mean -/+ z_star_95 * (sd / sqrt(n)).
ci <- rep_sample_n(ames, size = n, reps = 50, replace = TRUE) %>%
  summarise(
    lower = mean(area) - z_star_95 * (sd(area) / sqrt(n)),
    upper = mean(area) + z_star_95 * (sd(area) / sqrt(n))
  )
## `summarise()` ungrouping output (override with `.groups` argument)

# Peek at the first five intervals.
slice(ci, 1:5)
## # A tibble: 5 x 3
##   replicate lower upper
##       <int> <dbl> <dbl>
## 1         1 1351. 1698.
## 2         2 1403. 1610.
## 3         3 1318. 1617.
## 4         4 1463. 1707.
## 5         5 1406. 1652.
# Flag each interval "yes"/"no" according to whether the population mean lies
# strictly between its lower and upper bound.
ci <- mutate(
  ci,
  capture_mu = ifelse(
    params$pop_mean > lower & params$pop_mean < upper,
    "yes",
    "no"
  )
)

# Long-format copy for plotting: every interval contributes two rows (its
# lower bound, then its upper bound) that share the same ci_id.
ci_data <- data.frame(
  ci_id = rep(1:50, times = 2),
  ci_bounds = c(ci$lower, ci$upper),
  capture_mu = rep(ci$capture_mu, times = 2)
)
# Plot every interval as a horizontal segment (one row per ci_id), coloured by
# whether it captured the population mean, with a vertical reference line at
# that mean. The reference line reads `params$pop_mean`, the only column that
# `params` holds.
ggplot(
  data = ci_data,
  aes(x = ci_bounds, y = ci_id, group = ci_id, color = capture_mu)
) +
  geom_point(size = 2) + # add points at the ends, size = 2
  geom_line() + # connect with lines
  geom_vline(xintercept = params$pop_mean, color = "darkgray") # draw vertical line

# Critical value for a two-sided 99% interval: 0.5% must be left in each tail,
# so the quantile is taken at cumulative probability 0.995 (not 0.99, which
# would correspond to a 98% interval).
z_star_99 <- qnorm(0.995)
z_star_99
## [1] 2.575829
# Interval for the mean of `area` from the single sample `samp`, using the
# z_star_99 critical value: sample mean -/+ z_star_99 * (sample sd / sqrt(n)).
# The sample (not the full `ames` data) is the right input here, because the
# standard error divides by sqrt(n) with n being the sample size.
samp %>%
  summarise(
    lower = mean(area) - z_star_99 * (sd(area) / sqrt(n)),
    upper = mean(area) + z_star_99 * (sd(area) / sqrt(n))
  )
# (Printed bounds from the earlier run are omitted: they had been computed
# from the full `ames` data rather than from the sample.)
# Draw another 50 samples of size n (with replacement) from the full data and
# compute one interval per replicate using the z_star_99 critical value.
ci2 <- rep_sample_n(ames, size = n, reps = 50, replace = TRUE) %>%
  summarise(
    lower = mean(area) - z_star_99 * (sd(area) / sqrt(n)),
    upper = mean(area) + z_star_99 * (sd(area) / sqrt(n))
  )
## `summarise()` ungrouping output (override with `.groups` argument)

# Peek at the first five intervals.
slice(ci2, 1:5)
## # A tibble: 5 x 3
##   replicate lower upper
##       <int> <dbl> <dbl>
## 1         1 1305. 1595.
## 2         2 1432. 1739.
## 3         3 1299. 1603.
## 4         4 1347. 1607.
## 5         5 1274. 1593.
# Flag each z_star_99 interval "yes"/"no" according to whether the population
# mean lies strictly inside it. The pipeline starts from `ci2` itself so the
# intervals built with z_star_99 are kept, rather than being replaced by the
# z_star_95 table `ci`.
ci2 <- ci2 %>%
  mutate(
    capture_mu2 = ifelse(
      lower < params$pop_mean & upper > params$pop_mean,
      "yes",
      "no"
    )
  )

# Long-format copy for plotting: every interval contributes two rows (its
# lower bound, then its upper bound) that share the same ci2_id.
ci2_data <- data.frame(
  ci2_id = c(1:50, 1:50),
  ci2_bounds = c(ci2$lower, ci2$upper),
  capture_mu2 = c(ci2$capture_mu2, ci2$capture_mu2)
)
# Plot every interval in ci2_data as a horizontal segment (one row per
# ci2_id), coloured by whether it captured the population mean, with a
# vertical reference line at that mean. The reference line reads
# `params$pop_mean`, the only column that `params` holds.
ggplot(
  data = ci2_data,
  aes(x = ci2_bounds, y = ci2_id, group = ci2_id, color = capture_mu2)
) +
  geom_point(size = 2) + # add points at the ends, size = 2
  geom_line() + # connect with lines
  geom_vline(xintercept = params$pop_mean, color = "darkgray") # draw vertical line