GEOG 5680 Module 09: Simple Inference Tests

Author

Zach Grube

# Module 09 exercises
# This document uses base R only, so no extra packages are required.

set.seed(5680)
options(digits = 4)

alpha <- 0.05

interpret_p <- function(p_value) {
  if (p_value < alpha) {
    "significant at alpha = 0.05"
  } else {
    "not significant at alpha = 0.05"
  }
}

deer_candidates <- c(
  "C:/Users/yolom/OneDrive/Desktop/GEOG5680/Module 9/Deer.csv",
  "Deer.csv",
  "./data/Deer.csv"
)

deer_file <- deer_candidates[file.exists(deer_candidates)][1]
if (is.na(deer_file)) {
  stop("Deer.csv was not found. Update deer_candidates with the correct file path.")
}

deer <- read.csv(deer_file, na.strings = c("", "NA"))

Exercise 1: t-tests for actor heights

Run a t-test to compare the Legolas actors to the set of Aragorns and then the set of Gimlis. Do you find evidence for significant differences?

# Recreate the simulated actor data from the lab.
aragorn <- rnorm(50, mean = 180, sd = 10)
gimli <- rnorm(50, mean = 132, sd = 15)
legolas <- rnorm(50, mean = 195, sd = 15)

summary(data.frame(aragorn, gimli, legolas))

    aragorn        gimli        legolas   
 Min.   :154   Min.   :112   Min.   :167  
 1st Qu.:169   1st Qu.:127   1st Qu.:188  
 Median :176   Median :134   Median :197  
 Mean   :178   Mean   :136   Mean   :196  
 3rd Qu.:185   3rd Qu.:145   3rd Qu.:205  
 Max.   :210   Max.   :164   Max.   :223

hist(aragorn,
     main = "Aragorn actor heights",
     xlab = "Height (cm)",
     col = "gray85")

hist(gimli,
     main = "Gimli actor heights",
     xlab = "Height (cm)",
     col = "gray85")

hist(legolas,
     main = "Legolas actor heights",
     xlab = "Height (cm)",
     col = "gray85")

t_legolas_aragorn <- t.test(legolas, aragorn, alternative = "two.sided")
t_legolas_gimli <- t.test(legolas, gimli, alternative = "two.sided")

t_legolas_aragorn


    Welch Two Sample t-test

data:  legolas and aragorn
t = 7.1, df = 98, p-value = 2e-10
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 13.34 23.62
sample estimates:
mean of x mean of y 
    196.1     177.6

t_legolas_gimli


    Welch Two Sample t-test

data:  legolas and gimli
t = 22, df = 98, p-value <2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 55.18 65.91
sample estimates:
mean of x mean of y 
    196.1     135.6

cat("Legolas vs Aragorn: p =", t_legolas_aragorn$p.value,
    "so the difference is", interpret_p(t_legolas_aragorn$p.value), ".\n")

Legolas vs Aragorn: p = 1.713e-10 so the difference is significant at alpha = 0.05 .

cat("Legolas vs Gimli: p =", t_legolas_gimli$p.value,
    "so the difference is", interpret_p(t_legolas_gimli$p.value), ".\n")

Legolas vs Gimli: p = 2.836e-40 so the difference is significant at alpha = 0.05 .

Answer: With the fixed seed used here, both comparisons provide evidence for significant differences in mean height if their p-values are below 0.05.

Exercise 2: F-test for Gimli and Legolas variances

Re-run the variance test to compare the group of Gimli and Legolas actors. Do these groups have different variance?

var_gimli_legolas <- var.test(gimli, legolas)
var_gimli_legolas


    F test to compare two variances

data:  gimli and legolas
F = 1.1, num df = 49, denom df = 49, p-value = 0.8
alternative hypothesis: true ratio of variances is not equal to 1
95 percent confidence interval:
 0.611 1.897
sample estimates:
ratio of variances 
             1.077

cat("Gimli vs Legolas variance test: p =", var_gimli_legolas$p.value,
    "so the variance difference is", interpret_p(var_gimli_legolas$p.value), ".\n")

Gimli vs Legolas variance test: p = 0.797 so the variance difference is not significant at alpha = 0.05 .

Answer: If the p-value is below 0.05, reject the null hypothesis that the two variances are equal. If it is above 0.05, there is not enough evidence to say the variances are different.

Exercise 3: Iris correlations by species

Redo the correlation for Sepal Length and Sepal Width for the Iris dataset, but for the three individual species. Are these correlated?

data(iris)

iris_cor_tests <- lapply(
  split(iris, iris$Species),
  function(x) cor.test(x$Sepal.Length, x$Sepal.Width)
)

iris_cor_summary <- data.frame(
  Species = names(iris_cor_tests),
  correlation = sapply(iris_cor_tests, function(x) unname(x$estimate)),
  t_value = sapply(iris_cor_tests, function(x) unname(x$statistic)),
  df = sapply(iris_cor_tests, function(x) unname(x$parameter)),
  p_value = sapply(iris_cor_tests, function(x) x$p.value),
  conf_low = sapply(iris_cor_tests, function(x) x$conf.int[1]),
  conf_high = sapply(iris_cor_tests, function(x) x$conf.int[2])
)

iris_cor_summary

              Species correlation t_value df   p_value conf_low conf_high
setosa         setosa      0.7425   7.681 48 6.710e-10   0.5851    0.8460
versicolor versicolor      0.5259   4.284 48 8.772e-05   0.2900    0.7016
virginica   virginica      0.4572   3.562 48 8.435e-04   0.2050    0.6525

iris_cor_tests$setosa


    Pearson's product-moment correlation

data:  x$Sepal.Length and x$Sepal.Width
t = 7.7, df = 48, p-value = 7e-10
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.5851 0.8460
sample estimates:
   cor 
0.7425

iris_cor_tests$versicolor


    Pearson's product-moment correlation

data:  x$Sepal.Length and x$Sepal.Width
t = 4.3, df = 48, p-value = 9e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.2900 0.7016
sample estimates:
   cor 
0.5259

iris_cor_tests$virginica


    Pearson's product-moment correlation

data:  x$Sepal.Length and x$Sepal.Width
t = 3.6, df = 48, p-value = 8e-04
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.2050 0.6525
sample estimates:
   cor 
0.4572

for (i in seq_len(nrow(iris_cor_summary))) {
  cat(iris_cor_summary$Species[i], ": r =",
      iris_cor_summary$correlation[i],
      ", p = ", iris_cor_summary$p_value[i],
      " so the correlation is ",
      interpret_p(iris_cor_summary$p_value[i]), ".\n", sep = "")
}

setosa: r =0.7425, p = 6.71e-10 so the correlation is significant at alpha = 0.05.
versicolor: r =0.5259, p = 8.772e-05 so the correlation is significant at alpha = 0.05.
virginica: r =0.4572, p = 0.0008435 so the correlation is significant at alpha = 0.05.

Answer: Species with p-values below 0.05 show significant correlation between Sepal Length and Sepal Width. The sign of correlation shows whether the relationship is positive or negative.

Exercise 4: Deer chi-squared tests

Using the deer dataset and chisq.test(), test:

If there are significant differences in the number of deer caught per month.
If the cases of tuberculosis are uniformly distributed across all farms.

Deer caught per month

deer_month <- deer[!is.na(deer$Month), ]
month_counts <- table(factor(deer_month$Month, levels = 1:12))
month_counts


  1   2   3   4   5   6   7   8   9  10  11  12 
256 165  27   3   2  35  11  19  58 168 189 188

chisq_month <- chisq.test(month_counts)
chisq_month


    Chi-squared test for given probabilities

data:  month_counts
X-squared = 997, df = 11, p-value <2e-16

cat("Deer caught by month: p =", chisq_month$p.value,
    "so monthly catch totals are", interpret_p(chisq_month$p.value), ".\n")

Deer caught by month: p = 8.2e-207 so monthly catch totals are significant at alpha = 0.05 .

Answer: A significant result means deer captures are not uniformly distributed across months.

Tuberculosis cases across farms

all_farms <- sort(unique(deer$Farm[!is.na(deer$Farm)]))
tb_cases <- deer[!is.na(deer$Tb) & deer$Tb == 1 & !is.na(deer$Farm), ]
tb_farm_counts <- table(factor(tb_cases$Farm, levels = all_farms))
tb_farm_counts


  AL   AU   BA   BE   CB  CRC   HB  LCV   LN  MAN   MB   MO   NC   NV   PA   PN 
   3    0    5    0    3    0    1    1    6   24    5   31    4    1    0    0 
  QM   RF   RN   RO  SAL  SAU   SE   TI   TN VISO   VY 
   7    1    0    0    1    0   10    0    2    1    4

chisq_tb_farm <- chisq.test(tb_farm_counts)

Warning in chisq.test(tb_farm_counts): Chi-squared approximation may be
incorrect

chisq_tb_farm


    Chi-squared test for given probabilities

data:  tb_farm_counts
X-squared = 340, df = 26, p-value <2e-16

cat("Tuberculosis cases by farm: p =", chisq_tb_farm$p.value,
    "so TB cases across farms are", interpret_p(chisq_tb_farm$p.value), ".\n")

Tuberculosis cases by farm: p = 2.252e-56 so TB cases across farms are significant at alpha = 0.05 .

Answer: A significant result means tuberculosis cases are not uniformly distributed across farms.