#6.6
#a) False, CI is for population, not samples
#b) True, the CI is 46% +- 3% = (43%, 49%), and a confidence interval is a statement about the population proportion, not about the sample.
#c) True, if we take many random samples and build a 95% confidence interval from each one, about 95% of those intervals will contain the population proportion.
#d) False, at the 90% confidence level the z-score is 1.645, which is less than the 1.96 used for a 95% CI. Since the margin of error at 95% is 3%, the margin of error at 90% must be smaller than 3%: a smaller z-value gives a smaller margin of error.
#6.12
#a) It is a sample statistic: 48% was computed from the 1,259 survey respondents (the sample), not from the whole population.
#b)
# 95% confidence interval for a single proportion (normal approximation):
#   p +/- z * sqrt(p * (1 - p) / n)
n <- 1259   # sample size
z <- 1.96   # critical value used for a 95% interval
p <- 0.48   # sample proportion
se <- sqrt(p * (1 - p) / n)
CI_low <- p - z * se
CI_high <- p + z * se
CI_low
## [1] 0.4524028
CI_high
## [1] 0.5075972
#c)
#Independence: assuming the respondents are a random sample, and since the sample size (1259) is less than 10% of the U.S. population, the independence condition is met.
#Success-failure: the numbers of successes (1259 * 0.48, about 604) and failures (1259 * 0.52, about 655) are both well above 10, so the success-failure condition is met.
#As both conditions are satisfied, the sampling distribution of the sample proportion is approximately normal in this case.
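#d) No. Although CI_high is above 50%, the interval (45.2%, 50.8%) also contains values below 50%, so the data do not justify claiming that a majority of the population holds this view.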
#6.20
#a)
# Minimum sample size for a 95% CI on a proportion with margin of error `me`:
#   me >= z * sqrt(p * (1 - p) / n)   =>   n >= z^2 * p * (1 - p) / me^2
p <- 0.48             # proportion estimate used for planning
z <- qnorm(0.975)     # two-sided 95% critical value
me <- 0.02            # target margin of error
n <- (z^2 * p * (1 - p)) / me^2
n
## [1] 2397.07
# A sample size must be a whole number, and rounding down would leave the
# margin of error slightly above the target, so always round up.
n_required <- ceiling(n)
n_required
## [1] 2398
#At least 2398 samples are needed.
#6.28
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
# Sleep-deprivation survey summary, one row per state. Every cell is kept as
# text because the table is only used for display.
# The frame is built in one step rather than seeding an all-NA row, appending
# with rbind() and stripping the seed with na.omit(), which left a stray
# "na.action" attribute on the result.
sleep <- data.frame(
  state = c("CA", "OR"),
  sample = c("11545", "4691"),
  Yes = c("8.0%", "8.8%"),
  No = c("92.0%", "91.2%"),
  stringsAsFactors = FALSE
)
# Render the sleep table as a left-aligned pandoc table.
kable(
  x = sleep,
  format = "pandoc",
  align = "l",
  row.names = NA,
  caption = "Sleep Deprivation"
)
## Table: Sleep Deprivation
##
## state   sample   Yes    No
## ------  -------  -----  ------
## CA      11545    8.0%   92.0%
## OR      4691     8.8%   91.2%
# 95% confidence interval for the difference of two proportions (CA - OR),
# using the unpooled standard error:
#   sqrt(p1 * (1 - p1) / n1 + p2 * (1 - p2) / n2)
n_ca <- 11545
n_or <- 4691
P_ca <- 0.08
P_or <- 0.088
P_diff <- P_ca - P_or
se <- sqrt(P_ca * (1 - P_ca) / n_ca + P_or * (1 - P_or) / n_or)
z <- 1.96   # critical value used for a 95% interval
CI_low <- P_diff - z * se
CI_high <- P_diff + z * se
CI_low
## [1] -0.01749813
CI_high
## [1] 0.001498128
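# We are 95% confident that the proportion of Californians who suffer sleep deprivation is between 1.75 percentage points lower and 0.15 percentage points higher than the proportion of Oregonians.
# Because the interval contains 0, the data do not show a statistically significant difference between the two states.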
#6.44
# Observed barking-deer foraging sites per habitat: one row of counts and one
# row of the same counts as percentages of all 426 sites. Cells are text
# because the table is only used for display.
# The frame is built in one step rather than seeding an all-NA row, appending
# with rbind() and stripping the seed with na.omit(), which left a stray
# "na.action" attribute on the result.
deer <- local({
  total_sites <- 426
  known <- c(4, 16, 61)                          # woods, grassplot, forests
  counts <- c(known, total_sites - sum(known))   # "other" is the remainder

  count_row <- as.character(c(counts, total_sites))
  pct_row <- c(paste0(round(counts * 100 / total_sites, 4), "%"), "100%")

  # Pair the count and percentage cells column by column.
  columns <- Map(c, count_row, pct_row)
  names(columns) <- c("woods", "grassplot", "forests", "other", "total")

  data.frame(
    type = c("Deer forage observed", "Deer forage observed%"),
    columns,
    stringsAsFactors = FALSE
  )
})
# Render the observed-forage table as a left-aligned pandoc table.
kable(
  x = deer,
  format = "pandoc",
  align = "l",
  row.names = NA,
  caption = "Barking Deer"
)
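## Table: Barking Deer
##
## type                    woods    grassplot   forests    other      total
## ----------------------  -------  ----------  ---------  ---------  ------
## Deer forage observed    4        16          61         345        426
## Deer forage observed%   0.939%   3.7559%     14.3192%   80.9859%   100%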
#a) H_0: barking deer do not prefer certain habitats over others; their foraging sites follow the regional habitat proportions.
# H_a: barking deer prefer to forage in certain habitats over others; their foraging sites do not follow the regional habitat proportions.
#b) We may use a chi-square goodness-of-fit test.
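#c) Independence: each observed site falls in exactly one habitat, and we assume the 426 sites are independent of one another (the data alone cannot confirm this).
#   Sample size: every expected count is at least 5; the smallest is woods, 426 * 4.8% = 20.4. So the conditions for the chi-square test are reasonably met.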
#d)
# Print numbers in fixed notation with 4 significant digits from here on.
options("scipen"=100, "digits"=4)

# Expected share of each habitat in the region next to the observed share of
# deer foraging sites, both as percentage strings for display.
# The frame is built in one step rather than seeding an all-NA row, appending
# with rbind() and stripping the seed with na.omit(), which left a stray
# "na.action" attribute on the result.
region <- local({
  total_sites <- 426
  known <- c(4, 16, 61)                          # woods, grassplot, forests
  counts <- c(known, total_sites - sum(known))   # "other" is the remainder

  # "other" is whatever share the three listed habitats leave over.
  expected_row <- c("4.8%", "14.7%", "39.6%",
                    paste0(100 - (4.8 + 14.7 + 39.6), "%"), "100%")
  observed_row <- c(paste0(round(counts * 100 / total_sites, 4), "%"), "100%")

  # Pair the expected and observed cells column by column.
  columns <- Map(c, expected_row, observed_row)
  names(columns) <- c("woods", "grassplot", "forests", "other", "total")

  data.frame(
    type = c("Region Expected%", "Deer forage Observed%"),
    columns,
    stringsAsFactors = FALSE
  )
})
# Render the expected-vs-observed percentage table as a left-aligned pandoc table.
kable(
  x = region,
  format = "pandoc",
  align = "l",
  row.names = NA,
  caption = "Region - Deer forage Data"
)
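## Table: Region - Deer forage Data
##
## type                    woods    grassplot   forests    other      total
## ----------------------  -------  ----------  ---------  ---------  ------
## Region Expected%        4.8%     14.7%       39.6%      40.9%      100%
## Deer forage Observed%   0.939%   3.7559%     14.3192%   80.9859%   100%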
# Numeric copy of `region`: the "%" signs are stripped from the row labels and
# from the four habitat columns, and the habitat columns are converted to
# numbers. The `total` column is not carried over.
region2 <- local({
  strip_percent <- function(txt) gsub("\\%", "", txt)
  habitat_cols <- c("woods", "grassplot", "forests", "other")
  as_number <- lapply(region[habitat_cols], function(col) as.numeric(strip_percent(col)))
  data.frame(type = strip_percent(region$type), as_number, stringsAsFactors = FALSE)
})
# Standardised difference between an observed value `x` and an expected value
# `y`: (x - y) / sqrt(y). Works element-wise on vectors.
fn_zValue <- function(x, y) {
  deviation <- x - y
  deviation / sqrt(y)
}
# Chi-square goodness-of-fit test for 6.44(d).
#
# The statistic sum((O - E)^2 / E) only follows a chi-square distribution when
# O and E are COUNTS. Feeding percentages into fn_zValue() shrinks every term
# by a factor of 100 / n (n = 426 sites), so the expected percentages are
# converted into expected counts before the residuals are computed.
habitats <- c("woods", "grassplot", "forests", "other")

# Observed counts come from the `deer` table, whose cells are stored as text.
deer_counts <- deer[deer$type == "Deer forage observed", ]
observed_n <- as.numeric(unlist(deer_counts[habitats]))
total_n <- as.numeric(deer_counts$total)

# Expected count = total number of sites * expected share of each habitat.
expected_pct <- as.numeric(unlist(region2[region2$type == "Region Expected", habitats]))
expected_n <- total_n * expected_pct / 100

# Fail loudly if a lookup above matched nothing or an expected count is unusable.
stopifnot(
  length(observed_n) == length(habitats),
  length(expected_n) == length(habitats),
  all(is.finite(expected_n)),
  all(expected_n > 0)
)

# One standardised residual per habitat, tabulated together with its square.
z_values <- fn_zValue(observed_n, expected_n)
region.z <- data.frame(
  calc = c("zValue", "zValue-square"),
  lapply(setNames(z_values, habitats), function(z) c(z, z^2)),
  stringsAsFactors = FALSE
)
kable(region.z, format="pandoc", align="l", row.names = NA, caption = "Region - Deer forage Z-Value Data")
## Table: Region - Deer forage Z-Value Data
## (values recomputed from counts, shown rounded to 3 decimals)
##
## calc            woods    grassplot   forests   other
## --------------  -------  ----------  --------  --------
## zValue          -3.637   -5.892      -8.292    12.937
## zValue-square   13.230   34.710      68.753    167.367

# The test statistic is the sum of the squared residuals.
chisqvalue <- sum(z_values^2)
df <- length(habitats) - 1
# Ask for the upper tail directly: with a statistic this large,
# 1 - pchisq(...) cancels to exactly 0 in floating point.
p_value <- pchisq(chisqvalue, df, lower.tail = FALSE)
p_value
# chisqvalue is about 284.06 on 3 degrees of freedom; the p-value is on the
# order of 1e-61, i.e. effectively zero.
#The p-value is far below 0.05, so we reject the null hypothesis: the data provide strong evidence that barking deer prefer to forage in certain habitats over others.
#6.48
#a) chi-square test can be used.
#b) H_0: there is no relationship between caffeinated coffee consumption and risk of depression in women
### H_A: there is a relationship between caffeinated coffee consumption and risk of depression in women
#c)
# Overall proportions: 48132 of the 50739 observations fall in the
# "no depression" group; the remainder is the "depression" group.
p_nodep <- 48132 / 50739
p_dep <- 1 - p_nodep
p_nodep
## [1] 0.9486
p_dep
## [1] 0.05138
#d)
# Expected count for one cell under independence:
#   (row total * column total) / grand total.
# 2607 is the depression total (50739 - 48132); 6617 is presumably the total
# for the 2-6 cups group - TODO confirm against the exercise table.
expected <- (2607 * 6617) / 50739
expected
## [1] 340
# Contribution of this cell to the chi-square statistic, (O - E)^2 / E.
# The unrounded `expected` is used rather than a hand-copied 340, so the
# result is not affected by rounding.
observed_2to6 <- 373
chi_2to6 <- (observed_2to6 - expected)^2 / expected
chi_2to6
## [1] 3.206
#e)
# Upper-tail p-value for a chi-square statistic of 20.93 on 4 degrees of
# freedom. lower.tail = FALSE returns the tail area directly instead of
# subtracting the CDF from 1, which loses precision for small p-values.
p_value <- pchisq(20.93, df = 4, lower.tail = FALSE)
p_value
## [1] 0.000327
#f)
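#Since the p-value is less than 0.05, we reject the null hypothesis: the data provide evidence of a relationship between caffeinated coffee consumption and risk of depression in women.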
#g)
#Since there could be confounding variables that affect the results, we should be cautious when interpreting them. An association does not show that coffee itself changes the risk of depression; it could be, for example, that someone who is depressed drinks more coffee to begin with.