In the Flight Delays Case Study in Section 1.1,
\(H_0: \mu_{UA} - \mu_{AA} = 0\)
\(H_A: \mu_{UA} - \mu_{AA} \neq 0\)
# Flight delay data; the Carrier, Month and Delay columns are used below.
FD <- read.csv("http://www1.appstate.edu/~arnholta/Data/FlightDelays.csv")
# a.
# Observed difference in mean delay between carriers. The grouped means come
# out in the order AA, UA, so diff(Mean) is mean(UA) - mean(AA).
FDA <- FD %>%
  group_by(Carrier) %>%
  summarize(Mean = mean(Delay), n = n()) %>%
  summarize(obs.diff = diff(Mean))
# Take the group sizes from the data instead of typing them in by hand, so
# the resampling stays correct if the data set changes.
n.flights <- nrow(FD)
n.UA <- sum(FD$Carrier == "UA")
sims <- 10^4 - 1
md <- numeric(sims)
for (i in seq_len(sims)) {
  # Randomly relabel n.UA flights as "UA"; the remaining flights act as "AA".
  index <- sample(n.flights, n.UA, replace = FALSE)
  md[i] <- mean(FD$Delay[index]) - mean(FD$Delay[-index])
}
# null distribution
hist(md)
# Upper-tail proportion (the observed statistic counts as one resample),
# doubled to give a two-sided p-value.
pvalue <- ((sum(md >= FDA$obs.diff) + 1)/(sims + 1)) * 2
pvalue
[1] 2e-04
# We find evidence that the mean delay for United Airlines differs from the mean delay for American Airlines.
# b. Your code here
# Mean delay and flight count for each month, then the difference between
# the two monthly means (second group minus first group).
month.means <- summarize(group_by(FD, Month), Mean = mean(Delay), n = n())
FDA <- summarize(month.means, obs.diff = diff(Mean))
FDA
# A tibble: 1 x 1
obs.diff
<dbl>
1 -5.663341
# Permutation test for the difference in mean delay between months.
# FDA$obs.diff is May minus June (the grouped means are ordered June, May),
# so the relabelled group must have the size of the May group, not the
# size of a carrier group.
n.flights <- nrow(FD)
n.May <- sum(FD$Month == "May")
sims <- 10^4 - 1
md <- numeric(sims)
for (i in seq_len(sims)) {
  # Randomly relabel n.May flights as "May"; the rest act as "June".
  index <- sample(n.flights, n.May, replace = FALSE)
  md[i] <- mean(FD$Delay[index]) - mean(FD$Delay[-index])
}
hist(md)
# The observed difference is negative, so the lower tail is counted and
# then doubled for a two-sided p-value.
pvalue <- ((sum(md <= FDA$obs.diff) + 1)/(sims + 1)) * 2
pvalue
[1] 2e-04
# We find evidence that the difference in mean delay times between the 2 months is statistically significant.
In the Flight Delays Case Study in Section 1.1, the data contain flight delays for two airlines, American Airlines and United Airlines.
# a. Your code here
# Observed statistic: proportion of flights delayed more than 20 minutes,
# United Airlines minus American Airlines. (FD has no obs.diff column, so
# the statistic has to be computed here before it can be compared.)
late <- FD$Delay > 20
is.UA <- FD$Carrier == "UA"
n.UA <- sum(is.UA)
obs.diff <- mean(late[is.UA]) - mean(late[!is.UA])
N <- 10^3 - 1
result <- numeric(N)
set.seed(5)
for (i in seq_len(N)) {
  # Randomly relabel n.UA flights as "UA"; the rest act as "AA".
  index <- sample(nrow(FD), size = n.UA, replace = FALSE)
  result[i] <- mean(late[index]) - mean(late[-index])
}
# Upper-tail proportion; doubled on the next line for a two-sided p-value.
pvalue <- (sum(result >= obs.diff) + 1)/ (N + 1)
pvalue * 2
[1] 0.002
# We find evidence that the difference between the two carriers in the proportion of flights delayed more than 20 minutes is statistically significant.
# b. Your code here
# Sample variance of the delay times within each carrier.
VAR <- with(FD, tapply(Delay, Carrier, var))
VAR
AA UA
1606.457 2037.525
# Difference of the two carrier variances (first entry minus second entry),
# with the carrier label stripped so that only the number is printed.
VarTest <- unname(VAR[1] - VAR[2])
VarTest
[1] -431.0677
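# The observed variance of the delay times is larger for United Airlines than for American Airlines (AA minus UA = -431.07). No permutation test is run above, so this observed difference alone is not evidence of a statistically significant difference.
Use a for loop to find the three p-values (using the sum, the mean, and the difference in means as test statistics); they should be the same.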
# One permutation run, three test statistics: the sum of the relabelled UA
# delays, their mean, and the difference in means. With the group size m and
# the overall total fixed, each statistic is an increasing function of the
# others, so the three one-sided p-values should agree.
N <- 10^4 - 1
UA.Delay <- subset(FD, select = Delay, Carrier == "UA", drop = TRUE)
AA.Delay <- subset(FD, select = Delay, Carrier == "AA", drop = TRUE)
observedSumUA <- sum(UA.Delay)
observedmeanUA <- mean(UA.Delay)
observedmeanDiff <- mean(UA.Delay) - mean(AA.Delay)
m <- length(UA.Delay) #number of UA observations
n.flights <- nrow(FD) #total number of flights
sumUA <- numeric(N)
meanUA <- numeric(N)
meanDiff <- numeric(N)
set.seed(0)
for (i in seq_len(N)) {
  # Randomly relabel m flights as "UA"; all three statistics use this draw.
  index <- sample(n.flights, m, replace = FALSE)
  relabelled.UA <- FD$Delay[index]
  sumUA[i] <- sum(relabelled.UA)
  meanUA[i] <- mean(relabelled.UA)
  meanDiff[i] <- meanUA[i] - mean(FD$Delay[-index])
}
# One-sided p-value based on the sum.
(sum(sumUA >= observedSumUA) + 1)/(N + 1)
[1] 1e-04
# One-sided p-value based on the mean of the relabelled UA group.
(sum(meanUA >= observedmeanUA) + 1)/(N + 1)
[1] 1e-04
# One-sided p-value based on the difference in means.
(sum(meanDiff >= observedmeanDiff) + 1)/(N + 1)
[1] 1e-04
In the Flight Delays Case Study in Section 1.1,
Find the 25% trimmed mean of the delay times for United Airlines and American Airlines.
Conduct a two-sided test to see if the difference in trimmed means is statistically significant.
# Your code here
# 25% trimmed mean of the delay times for each carrier.
ANS <- tapply(FD$Delay, FD$Carrier, mean, trim = 0.25)
ANS
AA UA
-2.5701513 -0.7957371
# Observed difference in trimmed means, United Airlines minus American Airlines.
obsMeanDiffCarrier <- ANS["UA"] - ANS["AA"]
obsMeanDiffCarrier
UA
1.774414
# Number of flights per carrier; these counts give the resample size used below.
with(data = FD, table(Carrier))
Carrier
AA UA
2906 1123
N <- 10^4 - 1
MeanDiffCarrier <- numeric(N)
for (i in seq_len(N)) {
  # relabel 1123 of the 4029 flights (the number of UA flights) as "UA"
  index <- sample(4029, size = 1123, replace = FALSE)
  trimmed.in <- mean(FD$Delay[index], trim = 0.25)
  trimmed.out <- mean(FD$Delay[-index], trim = 0.25)
  MeanDiffCarrier[i] <- trimmed.in - trimmed.out
}
# Two-sided p-value: twice the smaller of the two tail proportions.
upper.tail <- sum(MeanDiffCarrier >= obsMeanDiffCarrier)
lower.tail <- sum(MeanDiffCarrier <= obsMeanDiffCarrier)
pvalue <- 2 * ((min(upper.tail, lower.tail) + 1)/(N + 1))
pvalue
[1] 2e-04
# We find evidence to suggest that the difference in 25% trimmed means of the delay times for United Airlines and American Airlines is statistically significant.
In the Flight Delays Case Study in Section 1.1,
Compute the proportion of times the flights in May and in June were delayed more than 20 min, and conduct a two-sided test of whether the difference between months is statistically significant.
Compute the variance of the flight delay times in May and June and then conduct a two-sided test of whether the ratio of variances is statistically significantly different from 1.
# a. Your code here
# Proportion of flights delayed more than 20 minutes in each month.
ANS <- tapply(FD$Delay > 20, FD$Month, mean)
ANS
June May
0.1960591 0.1665833
# Observed difference in proportions, May minus June.
obsMeanDiffMonth <- ANS["May"] - ANS["June"]
obsMeanDiffMonth
May
-0.02947582
# Number of flights per month.
with(data = FD, table(Month))
Month
June May
2030 1999
# Number of permutation resamples.
N <- 10^4 - 1
# Count of flights delayed more than 20 minutes for each carrier.
# NOTE(review): this tabulates by Carrier although the exercise compares
# months - confirm whether a by-Month count was intended.
with(data = FD, tapply(Delay > 20, Carrier, sum))
AA UA
492 239
# Permutation distribution of the May - June difference in the proportion of
# flights delayed more than 20 minutes.
# NOTE: an earlier version drew 239 indices from 1:731 and compared mean
# delay times, which does not match the observed statistic; any p-value
# printed from that version must be regenerated by re-running this code.
late <- FD$Delay > 20
n.May <- sum(FD$Month == "May")
MeanDiffMonth <- numeric(N)
for (i in seq_len(N)) {
  # Randomly relabel n.May of all the flights as "May"; the rest act as "June".
  index <- sample(nrow(FD), size = n.May, replace = FALSE)
  MeanDiffMonth[i] <- mean(late[index]) - mean(late[-index])
}
# Two-sided p-value: twice the smaller of the two tail proportions.
pvalue <- ((min(sum(MeanDiffMonth >= obsMeanDiffMonth), sum(MeanDiffMonth <=
  obsMeanDiffMonth)) + 1)/(N + 1)) * 2
pvalue
[1] 0.9266
# We fail to find evidence that the difference between May and June in the proportion of flights delayed more than 20 minutes is statistically significant.
# b. Your code here
# Sample variance of the delay times in each month.
ANS <- tapply(FD$Delay, FD$Month, var)
ANS
June May
2069.884 1375.786
# Observed ratio of variances, June over May.
obsMeanDiffMonth <- ANS["June"]/ANS["May"]
obsMeanDiffMonth
June
1.50451
# Number of flights per month.
with(data = FD, table(Month))
Month
June May
2030 1999
# Permutation distribution of the ratio of variances. The observed ratio is
# June over May, so the relabelled group in the numerator must have the size
# of the June group to match it.
N <- 10^4 - 1
n.June <- sum(FD$Month == "June")
MeanDiffMonth <- numeric(N)
for (i in seq_len(N)) {
  # Randomly relabel n.June of all the flights as "June"; the rest act as "May".
  index <- sample(nrow(FD), size = n.June, replace = FALSE)
  MeanDiffMonth[i] <- var(FD$Delay[index])/var(FD$Delay[-index])
}
# Two-sided p-value: twice the smaller of the two tail proportions.
pvalue <- ((min(sum(MeanDiffMonth >= obsMeanDiffMonth), sum(MeanDiffMonth <=
  obsMeanDiffMonth)) + 1)/(N + 1)) * 2
pvalue
[1] 0.039
# We find evidence that the ratio of the variances of flight delay times in June and May is statistically significantly different from 1.
Research at the University of Nebraska conducted a study to investigate sex differences in dieting trends among a group of Midwestern college students (Davy et al. (2006)). Students were recruited from an introductory nutrition course during one term. Below are data from one question asked to 286 participants.
\(H_{0}\): Gender does not affect dieting trends among Midwestern college students; they are independent.
\(H_{A}\): Gender affects dieting trends among Midwestern college students.
LowFatDiet
Gender Yes No
Women 35 146
Men 8 97
# a. We reject the null hypothesis. Evidence suggests that gender and diet are dependent.
# Chi-square test of independence on DT, which is built outside the code
# shown here (presumably the Gender x LowFatDiet table - confirm).
# correct = FALSE requests the test without the continuity correction.
chisq.test(DT, correct = FALSE)
Pearson's Chi-squared test
data: DT
X-squared = 7.1427, df = 1, p-value = 0.007527
The results for college students in the nutrition class in Nebraska do not generalize to any larger population.
A national polling company conducted a survey in 2001 asking a randomly selected group of Americans of 18 years of age or older whether they supported limited use of marijuana for medicinal purposes. Here is a summary of the data:
Write down the appropriate hypothesis to test whether there is a relationship between age and support for medicinal marijuana and carry out the test.
Support
Age Against For
18-29 years old 52 172
30-49 years old 103 313
50 years or older 119 258
# Null Hypothesis: Age and support for medicinal marijuana are independent.
# Alternative Hypothesis: Age and support for medicinal marijuana are not independent.
# Chi-square test of independence on T1, which is built outside the code
# shown here (presumably the Age x Support table - confirm).
chisq.test(T1)
Pearson's Chi-squared test
data: T1
X-squared = 6.6814, df = 2, p-value = 0.03541
Two students went to a local supermarket and collected data on cereals; they classified by their target consumer (children versus adults) and the placement of the cereal on the shelf (bottom, middle, and top). The data are given in Cereals.
Create a table to summarize the relationship between age of target consumer and shelf location.
Conduct a chi-square test using R’s chisq.test command.
R returns a warning message. Compute the expected counts for each cell to see why.
Conduct a permutation test for independence.
# Read the cereal data and cross-tabulate target consumer (Age) by shelf
# location (Shelf).
Cereals <- read.csv("http://www1.appstate.edu/~arnholta/Data/Cereals.csv")
T1 <- xtabs(~Age + Shelf, data = Cereals)
T1
Shelf
Age bottom middle top
adult 2 1 14
children 7 18 1
# Chi-square test of independence between Age and Shelf.
chisq.test(T1)
Pearson's Chi-squared test
data: T1
X-squared = 28.625, df = 2, p-value = 6.083e-07
# Observed chi-square statistic for the Age x Shelf table. The component is
# named in full ($statistic) rather than relying on partial matching of $stat.
obs.stat <- chisq.test(T1)$statistic
obs.stat
X-squared
28.62525
# Permutation test for independence of Age and Shelf: shuffle the Shelf
# labels, rebuild the table, and record the chi-square statistic each time.
# (A grouped mean of Shelf is not computed here: Shelf is a category, so
# its mean is not a meaningful statistic and was never used by the test.)
sims <- 10^4 - 1
cc <- numeric(sims)
for (i in seq_len(sims)) {
  T2 <- xtabs(~Age + sample(Shelf), data = Cereals)
  # Only the statistic is kept, so the warning about the chi-square
  # approximation is silenced for this single call.
  cc[i] <- suppressWarnings(chisq.test(T2))$statistic
}
# Proportion of resampled statistics at least as large as the observed one
# (the observed statistic counts as one resample).
pvalue <- (sum(cc >= obs.stat)+1)/(sims +1)
pvalue
[1] 1e-04
# We reject the hypothesis that age of target consumer and shelf location are independent.
From GSS 2002 Case Study in Section 1.6,
Create a table to summarize the relationship between gender and the person’s choice for president in the 2000 election.
Test to see if a person’s choice for president in the 2000 election is independent of gender (use chisq.test in R).
Repeat the test but use the permutation test for independence. Does your conclusion change? (Be sure to remove observations with missing values)
# Read the GSS 2002 data and cross-tabulate Gender by choice for president
# in the 2000 election (Pres00).
GSS2002 <- read.csv("http://www1.appstate.edu/~arnholta/Data/GSS2002.csv")
T1 <- xtabs(~Gender + Pres00, data = GSS2002)
T1
Pres00
Gender Bush Didnt vote Gore Nader Other
Female 459 5 492 26 3
Male 426 5 289 31 13
# Chi-square test of independence between Gender and Pres00.
# NOTE(review): the exercise also asks for a permutation test for
# independence; no permutation code appears in the visible source.
chisq.test(T1)
Pearson's Chi-squared test
data: T1
X-squared = 33.29, df = 4, p-value = 1.042e-06
From GSS 2002 Case Study in Section 1.6,
Create a table to summarize the relationship between gender and the person’s general level of happiness (Happy).
Conduct a permutation test to see if gender and level of happiness are independent (Be sure to remove the observations with missing values).
# Your code here
# Cross-tabulate Gender by general level of happiness (Happy).
T1 <- xtabs(~Gender + Happy, data = GSS2002)
T1
Happy
Gender Not too happy Pretty happy Very happy
Female 109 406 205
Male 61 378 210
# Chi-square test of independence between Gender and Happy.
# NOTE(review): the exercise asks for a permutation test; this call runs
# chisq.test only - confirm whether a permutation test should be added.
chisq.test(T1)
Pearson's Chi-squared test
data: T1
X-squared = 10.96, df = 2, p-value = 0.004168
# We reject that gender and level of happiness are independent.
From GSS 2002 Case Study in Section 1.6,
Create a table to summarize the relationship between support for gun laws (GunLaw) and views on government spending on the military (SpendMilitary).
Conduct a permutation test to see if support for gun laws and views on government spending on the military are independent (Be sure to remove observations with missing values).
# Your code here
# Cross-tabulate support for gun laws (GunLaw) by views on military
# spending (SpendMilitary).
T1 <- xtabs(~GunLaw + SpendMilitary, data = GSS2002)
T1
SpendMilitary
GunLaw About right Too little Too much
Favor 168 101 72
Oppose 34 33 19
# Chi-square test of independence between GunLaw and SpendMilitary.
# NOTE(review): the exercise asks for a permutation test; this call runs
# chisq.test only - confirm whether a permutation test should be added.
chisq.test(T1)
Pearson's Chi-squared test
data: T1
X-squared = 3.0827, df = 2, p-value = 0.2141
# We fail to reject that support for gun laws and views on government spending on the military are independent.