In the Flight Delays Case Study in Section 1.1,
# Flight-delay data used by all of the Flight Delays exercises below.
FD <- read.csv(file = "http://www1.appstate.edu/~arnholta/Data/FlightDelays.csv")
# a.
library(tidyverse)
# Observed test statistic: difference between the two carriers' mean delays,
# taken as diff() of the per-carrier means (second carrier minus first).
m_delays <- FD %>%
  group_by(Carrier) %>%
  summarize(Mean = mean(Delay), n = n()) %>%
  summarize(obs_diff = diff(Mean))
m_delays
# A tibble: 1 x 1
obs_diff
<dbl>
1 5.885696
# Permutation distribution of the difference in means: each replicate draws
# 1123 of the 4029 row positions as one group and uses the rest as the other.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    picked <- sample(4029, 1123, replace = FALSE)
    mean(FD$Delay[picked]) - mean(FD$Delay[-picked])
  },
  numeric(1)
)
hist(ts)
# Two-sided p-value: twice the upper-tail proportion; the "+ 1" terms count the
# observed statistic as one of the resamples.
pvalue <- 2 * ((1 + sum(ts >= m_delays$obs_diff)) / (1 + sims))
pvalue
[1] 4e-04
\(H_0: \mu_U - \mu_A = 0\)
\(H_A: \mu_U - \mu_A\neq 0\)
There is evidence to support the claim that the average delay times between American Airlines and United Airlines are different in some way.
# b.
# Observed test statistic: difference between the two months' mean delays,
# taken as diff() of the per-month means.
m_delays2 <- FD %>%
  group_by(Month) %>%
  summarize(Mean = mean(Delay), n = n()) %>%
  summarize(obs_diff = diff(Mean))
m_delays2
# A tibble: 1 x 1
obs_diff
<dbl>
1 -5.663341
# Permutation distribution of the difference in means: each replicate draws
# 1999 of the 4029 row positions as one group and uses the rest as the other.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    picked <- sample(4029, 1999, replace = FALSE)
    mean(FD$Delay[picked]) - mean(FD$Delay[-picked])
  },
  numeric(1)
)
hist(ts)
# The observed difference is negative, so the lower tail is doubled.
pvalue <- 2 * ((1 + sum(ts <= m_delays2$obs_diff)) / (1 + sims))
pvalue
[1] 2e-04
\(H_0: \mu_M - \mu_J = 0\)
\(H_A: \mu_M - \mu_J\neq 0\)
There is evidence to suggest that the average delay times in May and June are different.
In the Flight Delays Case Study in Section 1.1, the data contain flight delays for two airlines, American Airlines and United Airlines.
# a.
# Observed test statistic: difference between carriers in the proportion of
# flights delayed by more than 20 minutes.
Prop <- FD %>%
  group_by(Carrier) %>%
  summarize(Proportion = mean(Delay > 20), n = n()) %>%
  summarize(obs_diff = diff(Proportion))
Prop
# A tibble: 1 x 1
obs_diff
<dbl>
1 0.04351791
# Permutation distribution of the difference in proportions of delays longer
# than 20 minutes, using random groups of 1123 and 4029 - 1123 rows.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    picked <- sample(4029, 1123, replace = FALSE)
    mean(FD$Delay[picked] > 20) - mean(FD$Delay[-picked] > 20)
  },
  numeric(1)
)
hist(ts)
# Two-sided p-value: twice the upper-tail proportion.
pvalue <- 2 * ((1 + sum(ts >= Prop$obs_diff)) / (1 + sims))
pvalue
[1] 0.0026
\(H_0: P_U - P_A = 0\)
\(H_A: P_U - P_A\neq 0\)
There is evidence to suggest that the proportions of flights being delayed for more than 20 minutes are different based on carrier.
# b.
# Observed test statistic: difference between the carriers' delay variances.
Var <- FD %>%
  group_by(Carrier) %>%
  summarize(Variance = var(Delay), n = n()) %>%
  summarize(obs_diff = diff(Variance))
Var
# A tibble: 1 x 1
obs_diff
<dbl>
1 431.0677
# Permutation distribution of the difference in variances, using random groups
# of 1123 and 4029 - 1123 rows.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    picked <- sample(4029, 1123, replace = FALSE)
    var(FD$Delay[picked]) - var(FD$Delay[-picked])
  },
  numeric(1)
)
hist(ts)
# One-sided (upper-tail) p-value, so no doubling here.
pvalue <- (1 + sum(ts >= Var$obs_diff)) / (1 + sims)
pvalue
[1] 0.1479
\(H_0: V_U - V_A = 0\)
\(H_A: V_U - V_A > 0\)
There is no evidence to suggest that the variance in the flight delay lengths for United Airlines is greater than the variance for American Airlines.
# NOTE: the exercise statement that preceded this code appears to be truncated
# in this document; only its final words ("for loop.") survive. They were fused
# onto the assignment below, which made the line unparseable.
#
# Mean delay, total delay and number of flights, split by whether the carrier
# is United Airlines (TRUE row) or not (FALSE row).
FDUA <- FD %>%
  group_by(Carrier == "UA") %>%
  summarize(Mean = mean(Delay), Sum = sum(Delay), n())
FDUA
# A tibble: 2 x 4
`Carrier == "UA"` Mean Sum `n()`
<lgl> <dbl> <int> <int>
1 FALSE 10.09738 29343 2906
2 TRUE 15.98308 17949 1123
# Observed difference in mean delay between the two carriers
# (diff() of the per-carrier means).
obs <- FD %>%
  group_by(Carrier) %>%
  summarize(Mean = mean(Delay), n = n()) %>%
  summarize(obs_diff = diff(Mean))
obs
# A tibble: 1 x 1
obs_diff
<dbl>
1 5.885696
# Compare three test statistics computed on the SAME permutations:
#   ts1 = mean delay of the resampled "UA"-sized group,
#   ts2 = total delay of the resampled "UA"-sized group,
#   ts3 = difference in mean delay (resampled group minus the remaining rows).
# With fixed group sizes and a fixed overall total, all three are increasing
# functions of the group's total delay, so they should give the same p-value.
n_total <- nrow(FD)
n_ua <- sum(FD$Carrier == "UA")

# FDUA has one row per value of `Carrier == "UA"`; its first column is that
# logical key. Select the United Airlines row explicitly: comparing against the
# whole two-element column would silently recycle the non-UA value as well.
is_ua_row <- FDUA[[1]]
obs_mean_ua <- FDUA$Mean[is_ua_row]
obs_sum_ua <- FDUA$Sum[is_ua_row]

sims <- 10^4 - 1
ts1 <- numeric(sims)
ts2 <- numeric(sims)
ts3 <- numeric(sims)
for (i in seq_len(sims)) {
  index <- sample(n_total, n_ua, replace = FALSE)
  ts1[i] <- mean(FD$Delay[index])
  ts2[i] <- sum(FD$Delay[index])
  ts3[i] <- mean(FD$Delay[index]) - mean(FD$Delay[-index])
}
pvalue1 <- (sum(ts1 >= obs_mean_ua) + 1) / (sims + 1)
pvalue1
# The original run printed 0.4737 here, an artifact of comparing ts1 against
# the recycled two-element vector FDUA$Mean. With the UA-only mean this value
# should agree with pvalue3 below.
pvalue2 <- (sum(ts2 >= obs_sum_ua) + 1) / (sims + 1)
pvalue2
# [1] 3e-04  (original run; the exact value varies between runs)
pvalue3 <- (sum(ts3 >= obs$obs_diff) + 1) / (sims + 1)
pvalue3
# [1] 3e-04  (original run; the exact value varies between runs)
There is enough evidence to suggest that the average delay time for United Airlines is significantly greater than the average delay time for American Airlines.
In the Flight Delays Case Study in Section 1.1,
Find the 25% trimmed mean of the delay times for United Airlines and American Airlines.
Conduct a two-sided test to see if the difference in trimmed means is statistically significant.
# a.
# 25% trimmed mean of the delay times, one row per carrier.
FD %>%
  group_by(Carrier) %>%
  summarize(TrimMean = mean(x = Delay, trim = 0.25))
# A tibble: 2 x 2
Carrier TrimMean
<fctr> <dbl>
1 AA -2.5701513
2 UA -0.7957371
# b.
# Observed test statistic: difference between the carriers' 25% trimmed mean
# delays (diff() of the per-carrier trimmed means).
Trim <- FD %>%
  group_by(Carrier) %>%
  summarize(TrimMean = mean(Delay, trim = 0.25), n()) %>%
  summarize(obs_diff = diff(TrimMean))

# Permutation distribution of the SAME statistic. The resampled groups must be
# summarized with the 25% trimmed mean too; using the ordinary mean here would
# compare the observed trimmed difference against the wrong null distribution.
sims <- 10^4 - 1
ts <- numeric(sims)
for (i in seq_len(sims)) {
  index <- sample(4029, 1123, replace = FALSE)
  ts[i] <- mean(FD$Delay[index], trim = 0.25) -
    mean(FD$Delay[-index], trim = 0.25)
}
hist(ts)
# Two-sided p-value: twice the upper-tail proportion, capped at 1 because
# doubling a one-sided proportion above 0.5 would otherwise exceed 1.
pvalue <- min(1, ((sum(ts >= Trim$obs_diff) + 1) / (sims + 1)) * 2)
pvalue
# The original run printed "[1] 0.2312", but that value came from permuting the
# untrimmed mean. Re-run this chunk and revisit the conclusion below.
\(H_0: TM_U - TM_A = 0\)
\(H_A: TM_U - TM_A\neq 0\)
There is no evidence to suggest that the difference in 25% trimmed means between United Airlines and American Airlines are different in some way.
In the Flight Delays Case Study in Section 1.1,
# a.
# Observed test statistic: difference between months in the proportion of
# flights delayed by more than 20 minutes.
Prop <- FD %>%
  group_by(Month) %>%
  summarize(Proportion = mean(Delay > 20), n = n()) %>%
  summarize(obs_diff = diff(Proportion))
Prop
# A tibble: 1 x 1
obs_diff
<dbl>
1 -0.02947582
# Permutation distribution of the difference in proportions of delays longer
# than 20 minutes, using random groups of 1999 and 4029 - 1999 rows.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    picked <- sample(4029, 1999, replace = FALSE)
    mean(FD$Delay[picked] > 20) - mean(FD$Delay[-picked] > 20)
  },
  numeric(1)
)
hist(ts)
# The observed difference is negative, so the lower tail is doubled.
pvalue <- 2 * ((1 + sum(ts <= Prop$obs_diff)) / (1 + sims))
pvalue
[1] 0.017
\(H_0: P_M - P_J = 0\)
\(H_A: P_M - P_J\neq 0\)
There is enough evidence to suggest that the proportions of times the flights in May and June were delayed more than 20 minutes are different in some way.
# b.
# Observed test statistic: difference between the months' delay variances.
Var <- FD %>%
  group_by(Month) %>%
  summarize(Variance = var(Delay), n = n()) %>%
  summarize(obs_diff = diff(Variance))
Var
# A tibble: 1 x 1
obs_diff
<dbl>
1 -694.0982
# Permutation distribution of the difference in variances, using random groups
# of 1999 and 4029 - 1999 rows.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    picked <- sample(4029, 1999, replace = FALSE)
    var(FD$Delay[picked]) - var(FD$Delay[-picked])
  },
  numeric(1)
)
hist(ts)
# The observed difference is negative, so the lower tail is doubled.
pvalue <- 2 * ((1 + sum(ts <= Var$obs_diff)) / (1 + sims))
pvalue
[1] 0.0358
\(H_0: V_M - V_J = 0\)
\(H_A: V_M - V_J\neq 0\)
There is evidence to suggest that the variance of flight delay times in May and June are different in some way.
Researchers at the University of Nebraska conducted a study to investigate sex differences in dieting trends among a group of Midwestern college students (Davy et al. (2006)). Students were recruited from an introductory nutrition course during one term. Below are data from one question asked to 286 participants.
Write down the appropriate hypothesis to test to see if there is a relationship between gender and diet and then carry out the test.
Can the results be generalized to a population? Explain.
LowFatDiet
Gender Yes No
Women 35 146
Men 8 97
\(H_0\): Gender and LowFatDiet are independent \(H_A\): Gender and LowFatDiet are dependent
# DT is not defined anywhere in the visible document, so it is reconstructed
# here from the table of counts displayed above (rows = Gender,
# columns = LowFatDiet). These counts reproduce the X-squared value of 7.1427
# reported below.
DT <- matrix(
  c(35, 8, 146, 97),
  nrow = 2,
  dimnames = list(Gender = c("Women", "Men"), LowFatDiet = c("Yes", "No"))
)
# Convert the counts to one row per participant so the LowFatDiet column can be
# permuted: table -> (Gender, LowFatDiet, Freq) data frame -> expanded cases.
DTT <- as.table(DT)
DTTDF <- as.data.frame(DTT)
DDF <- vcdExtra::expand.dft(DTTDF)
dim(DDF)
[1] 286 2
# Observed Pearson chi-square statistic (no continuity correction) for the
# Gender x LowFatDiet table built from the case-level data.
obs.stat <- chisq.test(
  x = xtabs(formula = ~Gender + LowFatDiet, data = DDF),
  correct = FALSE
)[["statistic"]]
obs.stat
X-squared
7.142724
# Permutation distribution of the chi-square statistic: shuffle LowFatDiet
# against Gender and recompute the statistic for each shuffle.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    shuffled <- xtabs(~Gender + sample(LowFatDiet), data = DDF)
    unname(chisq.test(shuffled, correct = FALSE)[["statistic"]])
  },
  numeric(1)
)
hist(ts)
# Upper-tail p-value: large chi-square values are evidence against independence.
pvalue <- (1 + sum(ts >= obs.stat)) / (1 + sims)
pvalue
[1] 0.0081
There is evidence to suggest that gender and following a low-fat diet are not independent. However, these results cannot safely be generalized to a larger population: the participants were recruited from a single introductory nutrition course during one term rather than drawn as a random sample, so they may not be representative of college students (or of adults) in general.
A national polling company conducted a survey in 2001 asking a randomly selected group of Americans of 18 years of age or older whether they supported limited use of marijuana for medicinal purposes. Here is a summary of the data:
Write down the appropriate hypothesis to test whether there is a relationship between age and support for medicinal marijuana and carry out the test.
Support
Age Against For
18-29 years old 52 172
30-49 years old 103 313
50 years or older 119 258
\(H_0\): Age and Support are independent \(H_A\): Age and Support are dependent
# Neither T1 nor DF is defined in the visible document before this point, so
# both are reconstructed from the table of counts displayed above
# (rows = Age, columns = Support). These counts reproduce the reported
# X-squared = 6.6814 on 2 degrees of freedom.
T1 <- as.table(matrix(
  c(52, 103, 119, 172, 313, 258),
  nrow = 3,
  dimnames = list(
    Age = c("18-29 years old", "30-49 years old", "50 years or older"),
    Support = c("Against", "For")
  )
))
chisq.test(T1, correct = FALSE)
# Pearson's Chi-squared test
# data: T1
# X-squared = 6.6814, df = 2, p-value = 0.03541

# One row per respondent, so that the Support column can be permuted.
age_support_counts <- as.data.frame(T1)
DF <- age_support_counts[
  rep(seq_len(nrow(age_support_counts)), age_support_counts$Freq),
  c("Age", "Support")
]

# Observed statistic, then its permutation distribution under independence.
obs <- chisq.test(T1, correct = FALSE)$statistic
sims <- 10^4 - 1
ts <- numeric(sims)
for (i in seq_len(sims)) {
  ts[i] <- chisq.test(
    xtabs(~Age + sample(Support), data = DF),
    correct = FALSE
  )$statistic
}
hist(ts)
# Upper-tail p-value; the "+ 1" terms count the observed statistic itself.
pvalue <- (sum(ts >= obs) + 1) / (sims + 1)
pvalue
[1] 0.0352
There is enough evidence to suggest that age and support for the medicinal use of marijuana are not independent.
Two students went to a local supermarket and collected data on cereals; they classified by their target consumer (children versus adults) and the placement of the cereal on the shelf (bottom, middle, and top). The data are given in Cereals.
Create a table to summarize the relationship between age of target consumer and shelf location.
Conduct a chi-square test using R’s chisq.test command.
R returns a warning message. Compute the expected counts for each cell to see why.
Conduct a permutation test for independence.
# Cereal data: target consumer (Age) and shelf placement (Shelf) per cereal.
Cereals <- read.csv(file = "http://www1.appstate.edu/~arnholta/Data/Cereals.csv")
# a.
# Two-way table of target consumer by shelf location.
T1 <- xtabs(formula = ~Age + Shelf, data = Cereals)
T1
Shelf
Age bottom middle top
adult 2 1 14
children 7 18 1
# b.
# Pearson chi-square test of independence, without continuity correction.
chisq.test(x = T1, correct = FALSE)
Warning in chisq.test(T1, correct = FALSE): Chi-squared approximation may
be incorrect
Pearson's Chi-squared test
data: T1
X-squared = 28.625, df = 2, p-value = 6.083e-07
# c.
# Expected cell counts under independence, taken from the test object.
chisq.test(x = T1, correct = FALSE)[["expected"]]
Warning in chisq.test(T1, correct = FALSE): Chi-squared approximation may
be incorrect
Shelf
Age bottom middle top
adult 3.55814 7.511628 5.930233
children 5.44186 11.488372 9.069767
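R returns a warning because some of the expected cell counts are less than 5 (for example, 3.56 for adult cereals on the bottom shelf), so the chi-squared approximation may be unreliable.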
# d.
# Observed chi-square statistic for the Age x Shelf table.
obs <- chisq.test(x = T1, correct = FALSE)[["statistic"]]
obs
X-squared
28.62525
# Permutation distribution of the chi-square statistic: shuffle Shelf against
# Age and recompute the statistic for each shuffle.
sims <- 10^4 - 1
ts <- vapply(
  seq_len(sims),
  function(rep_id) {
    shuffled <- xtabs(~Age + sample(Shelf), data = Cereals)
    unname(chisq.test(shuffled, correct = FALSE)[["statistic"]])
  },
  numeric(1)
)
hist(ts)
# Upper-tail p-value; the "+ 1" terms count the observed statistic itself.
pvalue <- (1 + sum(ts >= obs)) / (1 + sims)
pvalue
[1] 1e-04
There is enough evidence to suggest that Age and Shelf are not independent.
From GSS 2002 Case Study in Section 1.6,
Create a table to summarize the relationship between gender and the person’s choice for president in the 2000 election.
Test to see if a person’s choice for president in the 2000 election is independent of gender (use chisq.test in R).
Repeat the test but use the permutation test for independence. Does your conclusion change? (Be sure to remove observations with missing values)
# General Social Survey 2002 data used by the remaining exercises.
GSS2002 <- read.csv(file = "http://www1.appstate.edu/~arnholta/Data/GSS2002.csv")
# a.
# Two-way table of gender by choice for president in the 2000 election.
T1 <- xtabs(formula = ~Gender + Pres00, data = GSS2002)
T1
Pres00
Gender Bush Didnt vote Gore Nader Other
Female 459 5 492 26 3
Male 426 5 289 31 13
# b.
# Pearson chi-square test of independence, without continuity correction.
chisq.test(x = T1, correct = FALSE)
Pearson's Chi-squared test
data: T1
X-squared = 33.29, df = 4, p-value = 1.042e-06
# c.
# Observed chi-square statistic for the Gender x Pres00 table.
obs <- chisq.test(T1, correct = FALSE)$statistic

# Remove respondents with a missing value in either variable BEFORE permuting.
# Shuffling the full Pres00 column would move its NAs onto different rows, so
# the permuted tables would no longer share the observed table's margins.
keep <- complete.cases(GSS2002[, c("Gender", "Pres00")])
gender_complete <- GSS2002$Gender[keep]
pres00_complete <- GSS2002$Pres00[keep]

sims <- 10^4 - 1
ts <- numeric(sims)
for (i in seq_len(sims)) {
  ts[i] <- chisq.test(
    table(gender_complete, sample(pres00_complete)),
    correct = FALSE
  )$statistic
}
hist(ts)
# Upper-tail p-value; the "+ 1" terms count the observed statistic itself.
pvalue <- (sum(ts >= obs) + 1) / (sims + 1)
pvalue
[1] 1e-04
The conclusion doesn’t change: we still have enough evidence to suggest that gender and the choice for president in the 2000 election are not independent.
From GSS 2002 Case Study in Section 1.6,
Create a table to summarize the relationship between gender and the person’s general level of happiness (Happy).
Conduct a permutation test to see if gender and level of happiness are independent (Be sure to remove the observations with missing values).
# a.
# Two-way table of gender by general level of happiness.
T1 <- xtabs(formula = ~Gender + Happy, data = GSS2002)
T1
Happy
Gender Not too happy Pretty happy Very happy
Female 109 406 205
Male 61 378 210
# b.
# Observed chi-square statistic for the Gender x Happy table.
obs <- chisq.test(T1, correct = FALSE)$statistic

# Remove respondents with a missing value in either variable BEFORE permuting.
# Shuffling the full Happy column would move its NAs onto different rows, so
# the permuted tables would no longer share the observed table's margins.
keep <- complete.cases(GSS2002[, c("Gender", "Happy")])
gender_complete <- GSS2002$Gender[keep]
happy_complete <- GSS2002$Happy[keep]

sims <- 10^4 - 1
ts <- numeric(sims)
for (i in seq_len(sims)) {
  ts[i] <- chisq.test(
    table(gender_complete, sample(happy_complete)),
    correct = FALSE
  )$statistic
}
hist(ts)
# Upper-tail p-value; the "+ 1" terms count the observed statistic itself.
pvalue <- (sum(ts >= obs) + 1) / (sims + 1)
pvalue
[1] 0.0041
There is enough evidence to suggest that gender and level of happiness are not independent.
From GSS 2002 Case Study in Section 1.6,
Create a table to summarize the relationship between support for gun laws (GunLaw) and views on government spending on the military (SpendMilitary).
Conduct a permutation test to see if support for gun laws and views on government spending on the military are independent (Be sure to remove observations with missing values).
# a.
# Two-way table of support for gun laws by view on military spending.
T1 <- xtabs(formula = ~GunLaw + SpendMilitary, data = GSS2002)
T1
SpendMilitary
GunLaw About right Too little Too much
Favor 168 101 72
Oppose 34 33 19
# b.
# Observed chi-square statistic for the GunLaw x SpendMilitary table.
obs <- chisq.test(T1, correct = FALSE)$statistic

# Remove respondents with a missing value in either variable BEFORE permuting.
# Shuffling the full SpendMilitary column would pair its NAs with different
# GunLaw rows each time, so the permuted tables would differ from the observed
# table in their margins and even in their total count.
keep <- complete.cases(GSS2002[, c("GunLaw", "SpendMilitary")])
gunlaw_complete <- GSS2002$GunLaw[keep]
spend_complete <- GSS2002$SpendMilitary[keep]

sims <- 10^4 - 1
ts <- numeric(sims)
for (i in seq_len(sims)) {
  ts[i] <- chisq.test(
    table(gunlaw_complete, sample(spend_complete)),
    correct = FALSE
  )$statistic
}
hist(ts)
# Upper-tail p-value; the "+ 1" terms count the observed statistic itself.
pvalue <- (sum(ts >= obs) + 1) / (sims + 1)
pvalue
[1] 0.2145
There isn’t sufficient evidence to suggest that support for gun laws and views on government spending on the military are dependent on each other.