# Margin of error, in percentage points, of a 95% confidence interval
# (z = 1.96) for a sample proportion of 0.48 with n = 1259.
margin_of_error <- 1.96 * sqrt(0.48 * 0.52 / 1259) * 100
# Interval bounds: 48% minus / plus the margin of error.
round(48 + c(-1, 1) * margin_of_error, digits = 2)
## [1] 45.24 50.76
# Solve 1.96 * sqrt((.48 * .52) / n) = .02 for n: the sample size at which the
# margin of error of the 95% interval equals 2 percentage points.
n <- (0.48 * 0.52 * 1.96^2) / (0.02^2)
# Sanity check of the algebra. Floating-point results are compared with a
# tolerance (all.equal) rather than `==`, which only passes by luck of rounding.
isTRUE(all.equal(1.96 * sqrt((0.48 * 0.52) / n), 0.02))
## [1] TRUE
# The sample size must be a whole number that is at least n, i.e. the ceiling.
# (round(n) + 1 overshoots whenever the fractional part of n is >= .5 or n is
# already whole.)
ceiling(n)
## [1] 2398
If we wanted to limit the margin of error of a 95% confidence interval to 2 percentage points, we would need to survey at least 2,398 Americans.
# Pooled proportion of two samples: the average of prop1 and prop2, each
# weighted by its sample's share of the combined sample size (n1 + n2).
pooled_proportion <- function(prop1, n1, prop2, n2) {
  share1 <- n1 / (n1 + n2)
  share2 <- 1 - share1
  share1 * prop1 + share2 * prop2
}
# Standard error computed from the pooled proportion of the two samples:
# sqrt(p * (1 - p) / n1 + p * (1 - p) / n2), where p is the value returned by
# pooled_proportion() for the same four arguments.
SE_mean_using_pooled_proportion <- function(prop1, n1, prop2, n2) {
  p_pooled <- pooled_proportion(prop1, n1, prop2, n2)
  spread <- p_pooled * (1 - p_pooled)
  sqrt(spread / n1 + spread / n2)
}
# Standard error computed from each sample's own proportion:
# sqrt(prop1 * (1 - prop1) / n1 + prop2 * (1 - prop2) / n2).
SE_mean_using_separated_proportions <- function(prop1, n1, prop2, n2) {
  term1 <- (prop1 * (1 - prop1)) / n1
  term2 <- (prop2 * (1 - prop2)) / n2
  sqrt(term1 + term2)
}
# Standard error for proportions 0.08 (n = 11545) and 0.088 (n = 4691),
# first with the pooled proportion, then with the separate proportions.
SE_mean_using_pooled_proportion(prop1 = 0.08, n1 = 11545, prop2 = 0.088, n2 = 4691)
## [1] 0.004758691
SE_mean_using_separated_proportions(prop1 = 0.08, n1 = 11545, prop2 = 0.088, n2 = 4691)
## [1] 0.004845984
# Either way (pooled or separated proportions) the standard error comes out at
# about .0048, so that rounded value is used for the interval below.
# 95% interval for the difference in proportions (.088 - .08 = .008),
# expressed in percentage points.
100 * (0.008 + c(-1, 1) * (1.96 * 0.0048))
## [1] -0.1408 1.7408
The 95% confidence interval for the difference in sleep-deprivation rates (Oregon minus California) runs from -0.14 to 1.74 percentage points: at one end the percentage of Californians who are sleep deprived is 0.14 points higher, and at the other the percentage of Oregonians who are sleep deprived is 1.74 points higher. Thus, while the data lean toward Oregonians being sleep deprived more frequently, the result is not statistically significant. The confidence interval includes zero, the value specified by the null hypothesis (that the percentages in the two populations are equal), so we fail to reject the null hypothesis that the sleep deprivation rates in the two states are equal.
# Chi-squared goodness-of-fit test: observed counts in four categories compared
# with the counts expected under a set of hypothesized proportions.
found <- c(4, 16, 67, 345)
hypothesized_props <- c(0.048, 0.147, 0.396, 0.409)
# Expected counts are scaled by the observed total so that they sum to the same
# number of observations as `found`. The counts above sum to 432; a hard-coded
# total of 426 makes the hand calculation disagree with chisq.test() below.
# NOTE(review): if 426 is the true sample size, one of the counts in `found`
# is mistyped -- verify against the data source.
expected <- hypothesized_props * sum(found)
expected
## [1]  20.736  63.504 171.072 176.688
differences_squared <- (found - expected)^2
X_squared <- sum(differences_squared / expected)
X_squared
## [1] 272.6883
# Upper-tail probability requested directly: 1 - pchisq(...) cancels to exactly
# 0 for a statistic this large. Degrees of freedom = categories - 1.
pchisq(X_squared, df = length(found) - 1, lower.tail = FALSE)
# (prints a value far below 2.2e-16, in line with the chisq.test() report below)
chisq.test(found, p = hypothesized_props)
##
## Chi-squared test for given probabilities
##
## data: found
## X-squared = 272.69, df = 3, p-value < 2.2e-16
# Chi-squared test of independence on a 2 x 5 table of Yes/No counts across
# five frequency categories.
found <- data.frame(
  Yes = c(670, 373, 905, 564, 95),
  No = c(11545, 6244, 16329, 11726, 2288),
  row.names = c(
    "Lt.1.a.wk", "Btwn.2.and.6.a.wk", "1.a.day",
    "Btwn.2.and.3.a.day", "4.or.more.a.day"
  )
)
# Transpose so the rows are Yes/No and the columns are the five categories;
# t() also converts the data frame to a matrix.
found <- t(found)
# Expected count for cell (i, j) = row total i * column total j / grand total.
# outer() builds the whole table at once instead of filling it cell by cell.
# The totals are unnamed and the result is wrapped in data.frame() so that
# `expected` keeps the default X1..X5 / 1..2 labels shown in the output below.
expected <- data.frame(
  outer(unname(rowSums(found)), unname(colSums(found))) / sum(found)
)
expected
## X1 X2 X3 X4 X5
## 1 627.614 339.9854 885.4932 631.4675 122.44
## 2 11587.386 6277.0146 16348.5068 11658.5325 2260.56
# Per-cell contributions to the statistic: (observed - expected)^2 / expected.
X_squared <- (found - expected)^2 / expected
X_squared
## X1 X2 X3 X4 X5
## 1 2.8625493 3.2059144 0.4297225 7.2083913 6.1495551
## 2 0.1550458 0.1736437 0.0232753 0.3904321 0.3330817
# The test statistic is the sum of all cell contributions.
X_squared <- sum(X_squared)
X_squared
## [1] 20.93161
# Upper-tail p-value with (rows - 1) * (columns - 1) degrees of freedom.
pchisq(X_squared, df = (nrow(found) - 1) * (ncol(found) - 1), lower.tail = FALSE)
## [1] 0.0003267104
chisq.test(found)
##
## Pearson's Chi-squared test
##
## data: found
## X-squared = 20.932, df = 4, p-value = 0.0003267
# Overall share of "Yes" responses across all categories.
depression_rate <- sum(found["Yes", ]) / sum(found)
depression_rate
## [1] 0.05138059