data606.hw6

#6.6
#a) False, CI is for population, not samples
#b) True, the CI is 46% +- 3%, i.e. (43%, 49%), and it makes an inference about the population, not the sample.
#c) True, if we take many random samples and build a 95% confidence interval from each one, about 95% of those intervals will contain the population proportion.
#d) False, at the 90% confidence level the z-score is 1.645, which is less than the 1.96 used for a 95% CI. Given that the margin of error at 95% is 3%, it cannot be higher than 3% at 90%, because a smaller z-value reduces the margin of error.

#6.12
#a) It is sample statistic since respondents are samples.
#b) 
# 95% confidence interval for a single proportion: p +/- z * SE.
p <- 0.48   # sample proportion
n <- 1259   # sample size
z <- 1.96   # critical value used for the 95% confidence level
se <- sqrt(p * (1 - p) / n)

CI_low <- p - z * se
CI_high <- p + z * se

print(CI_low)

## [1] 0.4524028

print(CI_high)

## [1] 0.5075972

#c)
#Since sample size (1259) is less than 10 % of U.S population, it meets the Condition of independence.
#Not only that, number of successes and failures are all higher than 10 which fulfills success-failure condition.
#As the model satisfies both conditions, we can say that the sampling distribution is approximately normal in this case.
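#d) No. Although CI_high is above 50%, the interval (45.2%, 50.8%) also includes values below 50%, so we cannot conclude that a majority of Americans hold this view.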

#6.20
#a) 
# Sample size needed for a target margin of error:
#   n = z^2 * p * (1 - p) / me^2
me <- 0.02          # target margin of error
p <- 0.48           # proportion estimate
z <- qnorm(0.975)   # two-sided 95% critical value

n <- z^2 * p * (1 - p) / me^2
print(n)

## [1] 2397.07

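#At least 2398 respondents are needed: 2397.07 must be rounded up, because a sample of 2397 would give a margin of error slightly above 2%.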

#6.28
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)

# Sleep-deprivation survey results for California and Oregon.
# Every column is stored as a character string (the percentages are kept in
# their display form). The table is built directly rather than by seeding an
# all-NA row, appending with rbind() and stripping it again with na.omit(),
# which left a stray "na.action" attribute on the data frame.
sleep <- data.frame(
  state = c("CA", "OR"),
  sample = c("11545", "4691"),
  Yes = c("8.0%", "8.8%"),
  No = c("92.0%", "91.2%"),
  stringsAsFactors = FALSE  # explicit so R < 4.0 does not create factors
)

# Render the sleep table as a left-aligned pandoc table.
kable(
  sleep,
  format = "pandoc",
  caption = "Sleep Deprivation",
  align = "l",
  row.names = NA
)

Sleep Deprivation
state	sample	Yes	No
CA	11545	8.0%	92.0%
OR	4691	8.8%	91.2%

# 95% confidence interval for the difference of two proportions (CA - OR).
n_ca <- 11545
P_ca <- 0.08
n_or <- 4691
P_or <- 0.088

P_diff <- P_ca - P_or

# Standard error of the difference: the two sample variances add.
se <- sqrt(P_ca * (1 - P_ca) / n_ca + P_or * (1 - P_or) / n_or)
z <- 1.96
CI_low <- P_diff - z * se
CI_high <- P_diff + z * se

print(CI_low)

## [1] -0.01749813

print(CI_high)

## [1] 0.001498128

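# We are 95% confident that the proportion of Californians who suffer from sleep deprivation is between 1.75 percentage points lower and 0.15 percentage points higher than the proportion of Oregonians. Since this interval contains 0, the difference is not statistically significant.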

#6.44
# Barking deer forage observations per habitat: one row of raw counts and one
# row with each count as a percentage of all observations. All cells are
# stored as character strings, as the table is only used for display.
# The counts and the total are defined once and everything else is derived
# from them, instead of repeating the literals in every expression and
# building the frame through an NA seed row + rbind() + na.omit().
forage_counts <- c(woods = 4, grassplot = 16, forests = 61)
forage_total <- 426
# "other" is whatever remains of the total after the three named habitats.
forage_counts <- c(forage_counts, other = forage_total - sum(forage_counts))
forage_pct <- paste0(round(forage_counts * 100 / forage_total, 4), "%")

deer <- as.data.frame(
  rbind(
    c("Deer forage observed", unname(forage_counts), forage_total),
    c("Deer forage observed%", forage_pct, "100%")
  ),
  stringsAsFactors = FALSE
)
names(deer) <- c("type", names(forage_counts), "total")
# Render the barking deer table as a left-aligned pandoc table.
kable(
  deer,
  format = "pandoc",
  caption = "Barking Deer",
  align = "l",
  row.names = NA
)

Barking Deer
type	woods	grassplot	forests	other	total
Deer forage observed	4	16	61	345	426
Deer forage observed%	0.939%	3.7559%	14.3192%	80.9859%	100%

#a) H_0: no difference in barking deer preferring to forage in certain habitats over others. 
# H_a: difference in barking deer preferring to forage in certain habitats over others. 

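#b) We can use a chi-square goodness-of-fit test.
#c) The independence condition is assumed to hold, as each observed forage site falls in exactly one habitat type. The sample-size condition also holds, since every expected count (the smallest is 426 * 4.8% = 20.4) is at least 5.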
#d) 
# Print numbers in fixed notation (scipen = 100) with 4 significant digits.
# This is a global setting and affects all output printed after this point.
options(scipen = 100, digits = 4)

# Expected share of each habitat in the region versus the observed share of
# deer forage sites, both as display strings ("4.8%"). The table is built
# directly instead of via an NA seed row + rbind() + na.omit().
# "other" is the remainder after the three named habitats.
region_expected_pct <- paste0(c(4.8, 14.7, 39.6, 100 - (4.8 + 14.7 + 39.6)), "%")
region_observed_n <- c(4, 16, 61, 426 - (4 + 16 + 61))
region_observed_pct <- paste0(round(region_observed_n * 100 / 426, 4), "%")

region <- as.data.frame(
  rbind(
    c("Region Expected%", region_expected_pct, "100%"),
    c("Deer forage Observed%", region_observed_pct, "100%")
  ),
  stringsAsFactors = FALSE
)
names(region) <- c("type", "woods", "grassplot", "forests", "other", "total")
# Render the expected-vs-observed table as a left-aligned pandoc table.
kable(
  region,
  format = "pandoc",
  caption = "Region - Deer forage Data",
  align = "l",
  row.names = NA
)

Region - Deer forage Data
type	woods	grassplot	forests	other	total
Region Expected%	4.8%	14.7%	39.6%	40.9%	100%
Deer forage Observed%	0.939%	3.7559%	14.3192%	80.9859%	100%

# Numeric copy of `region`: the "%" sign is stripped from the row labels and
# from every habitat column, and the habitat columns are converted to numbers.
# The `total` column is not carried over.
region2 <- data.frame(
  type = gsub("%", "", region$type, fixed = TRUE),
  lapply(
    region[c("woods", "grassplot", "forests", "other")],
    function(column) as.numeric(gsub("%", "", column, fixed = TRUE))
  ),
  stringsAsFactors = FALSE
)

#' Standardized residual of an observed value against an expected value.
#'
#' @param x Observed value(s).
#' @param y Expected value(s); the square root of `y` is the denominator.
#' @return `(x - y) / sqrt(y)`, computed element-wise.
fn_zValue <- function(x, y) {
  deviation <- x - y
  deviation / sqrt(y)
}

# Standardized residuals (and their squares) for the chi-square
# goodness-of-fit test, one column per habitat.
#
# The chi-square statistic must be built from counts, not percentages:
# (O - E) / sqrt(E) evaluated on raw percentages implicitly assumes n = 100
# and understates the statistic by a factor of n / 100. The percentages in
# `region2` are therefore converted to counts, using the total number of
# forage observations, before being passed to fn_zValue().
habitat_cols <- c("woods", "grassplot", "forests", "other")
n_observations <- 426  # total number of deer forage observations

observed_pct <- unlist(
  region2[region2$type == "Deer forage Observed", habitat_cols]
)
expected_pct <- unlist(
  region2[region2$type == "Region Expected", habitat_cols]
)

observed_count <- observed_pct / 100 * n_observations
expected_count <- expected_pct / 100 * n_observations

# fn_zValue() is vectorized, so all four habitats are handled in one call.
z_scores <- fn_zValue(observed_count, expected_count)

# Row 1: residuals; row 2: squared residuals (the chi-square contributions).
z_table <- rbind(z_scores, z_scores^2)
rownames(z_table) <- NULL

region.z <- data.frame(
  calc = c("zValue", "zValue-square"),
  z_table,
  stringsAsFactors = FALSE
)

kable(
  region.z,
  format = "pandoc",
  align = "l",
  row.names = NA,
  caption = "Region - Deer forage Z-Value Data"
)

Region - Deer forage Z-Value Data
calc	woods	grassplot	forests	other
zValue	-1.762	-2.854	-4.017	6.268
zValue-square	3.106	8.148	16.139	39.288

# Chi-square statistic: the sum of the squared standardized residuals over
# the four habitat columns of the "zValue-square" row.
squared_row <- region.z$calc == "zValue-square"
chisqvalue <- sum(
  unlist(region.z[squared_row, c("woods", "grassplot", "forests", "other")])
)

# Four habitat categories give k - 1 = 3 degrees of freedom.
df <- 4 - 1

# Upper-tail probability. lower.tail = FALSE is used instead of
# 1 - pchisq(...), which loses all precision (or returns exactly 0) when the
# p-value is very small. `df` is passed through rather than repeating 3.
p_value <- pchisq(chisqvalue, df = df, lower.tail = FALSE)
p_value

## [1] 0.00000000000002187

#The p-value is less than 0.05, so we reject the null hypothesis. There is evidence that barking deer prefer to forage in certain habitats over others.

#6.48
#a) chi-square test can be used.
#b)H_0: no relationship between caffeinated coffee consumption and risk of depression in women
### H_A: there is relationship between caffeinated coffee consumption and risk of depression in women
#c)
# Overall proportions of women without depression and with depression.
# The two proportions are complements of each other.
p_nodep <- 48132 / 50739
p_dep <- 1 - p_nodep

print(p_nodep)

## [1] 0.9486

print(p_dep)

## [1] 0.05138

#d)
# Expected count for the highlighted cell under independence:
# (row total * column total) / table total.
# 2607 = 50739 - 48132 is the number of women with depression; 6617 is
# presumably the column total for the 2-6 cups/week group -- confirm against
# the exercise table.
expected <- (2607 * 6617) / 50739
expected

## [1] 340

# Contribution of this cell to the chi-square statistic:
# (observed - expected)^2 / expected, with an observed count of 373.
# The unrounded `expected` is used instead of the rounded literal 340, which
# shifted the result slightly (3.203 instead of 3.206).
chi_2to6 <- (373 - expected)^2 / expected
chi_2to6

## [1] 3.203

#e)
# p-value for the test statistic 20.93 with 4 degrees of freedom.
# The upper tail is requested directly (lower.tail = FALSE) rather than via
# 1 - pchisq(...), which is the numerically safer form for small p-values.
p_value <- pchisq(20.93, df = 4, lower.tail = FALSE)
p_value

## [1] 0.000327

#f)
#since p-value is less than 0.05, we reject null hypothesis.

#g)
#Since there could be confounding variables that affect the results, we should be cautious when interpreting the final results. It might not be coffee that influences depression; it could be, for example, that someone who is depressed drinks more or less coffee to begin with.

data606.hw6

Sang Yoon (Andy) Hwang

2018-04-08