# Module 09 exercises
# This document uses base R only, so no extra packages are required.
set.seed(5680)
options(digits = 4)
alpha <- 0.05
interpret_p <- function(p_value) {
if (p_value < alpha) {
"significant at alpha = 0.05"
} else {
"not significant at alpha = 0.05"
}
}
deer_candidates <- c(
"C:/Users/yolom/OneDrive/Desktop/GEOG5680/Module 9/Deer.csv",
"Deer.csv",
"./data/Deer.csv"
)
deer_file <- deer_candidates[file.exists(deer_candidates)][1]
if (is.na(deer_file)) {
stop("Deer.csv was not found. Update deer_candidates with the correct file path.")
}
deer <- read.csv(deer_file, na.strings = c("", "NA"))GEOG 5680 Module 09: Simple Inference Tests
Exercise 1: t-tests for actor heights
Run a t-test to compare the Legolas actors to the set of Aragorns and then the set of Gimlis. Do you find evidence for significant differences?
# Recreate the simulated actor data from the lab.
aragorn <- rnorm(50, mean = 180, sd = 10)
gimli <- rnorm(50, mean = 132, sd = 15)
legolas <- rnorm(50, mean = 195, sd = 15)
summary(data.frame(aragorn, gimli, legolas)) aragorn gimli legolas
Min. :154 Min. :112 Min. :167
1st Qu.:169 1st Qu.:127 1st Qu.:188
Median :176 Median :134 Median :197
Mean :178 Mean :136 Mean :196
3rd Qu.:185 3rd Qu.:145 3rd Qu.:205
Max. :210 Max. :164 Max. :223
hist(aragorn,
main = "Aragorn actor heights",
xlab = "Height (cm)",
col = "gray85")hist(gimli,
main = "Gimli actor heights",
xlab = "Height (cm)",
col = "gray85")hist(legolas,
main = "Legolas actor heights",
xlab = "Height (cm)",
col = "gray85")t_legolas_aragorn <- t.test(legolas, aragorn, alternative = "two.sided")
t_legolas_gimli <- t.test(legolas, gimli, alternative = "two.sided")
t_legolas_aragorn
Welch Two Sample t-test
data: legolas and aragorn
t = 7.1, df = 98, p-value = 2e-10
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
13.34 23.62
sample estimates:
mean of x mean of y
196.1 177.6
t_legolas_gimli
Welch Two Sample t-test
data: legolas and gimli
t = 22, df = 98, p-value <2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
55.18 65.91
sample estimates:
mean of x mean of y
196.1 135.6
cat("Legolas vs Aragorn: p =", t_legolas_aragorn$p.value,
"so the difference is", interpret_p(t_legolas_aragorn$p.value), ".\n")Legolas vs Aragorn: p = 1.713e-10 so the difference is significant at alpha = 0.05 .
cat("Legolas vs Gimli: p =", t_legolas_gimli$p.value,
"so the difference is", interpret_p(t_legolas_gimli$p.value), ".\n")Legolas vs Gimli: p = 2.836e-40 so the difference is significant at alpha = 0.05 .
Answer: With the fixed seed used here, both comparisons provide evidence for significant differences in mean height if their p-values are below 0.05.
Exercise 2: F-test for Gimli and Legolas variances
Re-run the variance test to compare the group of Gimli and Legolas actors. Do these groups have different variance?
var_gimli_legolas <- var.test(gimli, legolas)
var_gimli_legolas
F test to compare two variances
data: gimli and legolas
F = 1.1, num df = 49, denom df = 49, p-value = 0.8
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
0.611 1.897
sample estimates:
ratio of variances
1.077
cat("Gimli vs Legolas variance test: p =", var_gimli_legolas$p.value,
"so the variance difference is", interpret_p(var_gimli_legolas$p.value), ".\n")Gimli vs Legolas variance test: p = 0.797 so the variance difference is not significant at alpha = 0.05 .
Answer: If the p-value is below 0.05, reject the null hypothesis that the two variances are equal. If it is above 0.05, there is not enough evidence to say the variances are different.
Exercise 3: Iris correlations by species
Redo the correlation for Sepal Length and Sepal Width for the Iris dataset, but for the three individual species. Are these correlated?
data(iris)
iris_cor_tests <- lapply(
split(iris, iris$Species),
function(x) cor.test(x$Sepal.Length, x$Sepal.Width)
)
iris_cor_summary <- data.frame(
Species = names(iris_cor_tests),
correlation = sapply(iris_cor_tests, function(x) unname(x$estimate)),
t_value = sapply(iris_cor_tests, function(x) unname(x$statistic)),
df = sapply(iris_cor_tests, function(x) unname(x$parameter)),
p_value = sapply(iris_cor_tests, function(x) x$p.value),
conf_low = sapply(iris_cor_tests, function(x) x$conf.int[1]),
conf_high = sapply(iris_cor_tests, function(x) x$conf.int[2])
)
iris_cor_summary Species correlation t_value df p_value conf_low conf_high
setosa setosa 0.7425 7.681 48 6.710e-10 0.5851 0.8460
versicolor versicolor 0.5259 4.284 48 8.772e-05 0.2900 0.7016
virginica virginica 0.4572 3.562 48 8.435e-04 0.2050 0.6525
iris_cor_tests$setosa
Pearson's product-moment correlation
data: x$Sepal.Length and x$Sepal.Width
t = 7.7, df = 48, p-value = 7e-10
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5851 0.8460
sample estimates:
cor
0.7425
iris_cor_tests$versicolor
Pearson's product-moment correlation
data: x$Sepal.Length and x$Sepal.Width
t = 4.3, df = 48, p-value = 9e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2900 0.7016
sample estimates:
cor
0.5259
iris_cor_tests$virginica
Pearson's product-moment correlation
data: x$Sepal.Length and x$Sepal.Width
t = 3.6, df = 48, p-value = 8e-04
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.2050 0.6525
sample estimates:
cor
0.4572
for (i in seq_len(nrow(iris_cor_summary))) {
cat(iris_cor_summary$Species[i], ": r =",
iris_cor_summary$correlation[i],
", p = ", iris_cor_summary$p_value[i],
" so the correlation is ",
interpret_p(iris_cor_summary$p_value[i]), ".\n", sep = "")
}setosa: r =0.7425, p = 6.71e-10 so the correlation is significant at alpha = 0.05.
versicolor: r =0.5259, p = 8.772e-05 so the correlation is significant at alpha = 0.05.
virginica: r =0.4572, p = 0.0008435 so the correlation is significant at alpha = 0.05.
Answer: Species with p-values below 0.05 show significant correlation between Sepal Length and Sepal Width. The sign of correlation shows whether the relationship is positive or negative.
Exercise 4: Deer chi-squared tests
Using the deer dataset and chisq.test(), test:
- If there are significant differences in the number of deer caught per month.
- If the cases of tuberculosis are uniformly distributed across all farms.
Deer caught per month
deer_month <- deer[!is.na(deer$Month), ]
month_counts <- table(factor(deer_month$Month, levels = 1:12))
month_counts
1 2 3 4 5 6 7 8 9 10 11 12
256 165 27 3 2 35 11 19 58 168 189 188
chisq_month <- chisq.test(month_counts)
chisq_month
Chi-squared test for given probabilities
data: month_counts
X-squared = 997, df = 11, p-value <2e-16
cat("Deer caught by month: p =", chisq_month$p.value,
"so monthly catch totals are", interpret_p(chisq_month$p.value), ".\n")Deer caught by month: p = 8.2e-207 so monthly catch totals are significant at alpha = 0.05 .
Answer: A significant result means deer captures are not uniformly distributed across months.
Tuberculosis cases across farms
all_farms <- sort(unique(deer$Farm[!is.na(deer$Farm)]))
tb_cases <- deer[!is.na(deer$Tb) & deer$Tb == 1 & !is.na(deer$Farm), ]
tb_farm_counts <- table(factor(tb_cases$Farm, levels = all_farms))
tb_farm_counts
AL AU BA BE CB CRC HB LCV LN MAN MB MO NC NV PA PN
3 0 5 0 3 0 1 1 6 24 5 31 4 1 0 0
QM RF RN RO SAL SAU SE TI TN VISO VY
7 1 0 0 1 0 10 0 2 1 4
chisq_tb_farm <- chisq.test(tb_farm_counts)Warning in chisq.test(tb_farm_counts): Chi-squared approximation may be
incorrect
chisq_tb_farm
Chi-squared test for given probabilities
data: tb_farm_counts
X-squared = 340, df = 26, p-value <2e-16
cat("Tuberculosis cases by farm: p =", chisq_tb_farm$p.value,
"so TB cases across farms are", interpret_p(chisq_tb_farm$p.value), ".\n")Tuberculosis cases by farm: p = 2.252e-56 so TB cases across farms are significant at alpha = 0.05 .
Answer: A significant result means tuberculosis cases are not uniformly distributed across farms.