Executive Summary

An environmental group hypothesizes that the mean MPG of cars manufactured in the US is less than that of those manufactured in Japan. This analysis examines fuel efficiency data from n₁=35 US cars and n₂=28 Japanese cars to test this hypothesis using appropriate statistical methods.

Data Loading and Preparation

# Load required libraries
library(ggplot2)
library(gridExtra)
library(nortest)
library(car)
library(knitr)
library(DT)
# Read the MPG dataset from its hosted CSV and preview the first ten rows.
url <- "https://raw.githubusercontent.com/tmatis12/datafiles/main/US_Japanese_Cars.csv"
data <- read.csv(file = url)
preview_rows <- head(data, n = 10)
kable(preview_rows, caption = "First 10 rows of the dataset")
First 10 rows of the dataset
USCars JapaneseCars
18 24
15 27
18 27
16 25
17 31
15 35
14 24
14 19
14 28
15 23
# The US and Japanese samples are independent and of different sizes, so the
# shorter column is padded with NA in the CSV. Filtering with complete.cases()
# on the whole data frame would discard every US observation that sits in a
# padded row, so missing values are removed from each column separately.
# NOTE(review): rendered output elsewhere in this report was produced with the
# row-wise filter (n = 28 for both groups); re-knit to refresh it.
data_clean <- data[complete.cases(data), ]  # row-wise complete subset, kept for reference only
us_mpg <- data$USCars[!is.na(data$USCars)]
japanese_mpg <- data$JapaneseCars[!is.na(data$JapaneseCars)]

# Sample information
cat("Sample Sizes after removing missing values:\n")
## Sample Sizes after removing missing values:
cat("US Cars: n₁ =", length(us_mpg), "\n")
## US Cars: n₁ = 28
cat("Japanese Cars: n₂ =", length(japanese_mpg), "\n")
## Japanese Cars: n₂ = 28
# Summarise each sample (size, centre, spread, range) in a single table.
desc_stats <- local({
  # Apply one summary function to both samples, US first, then Japanese.
  both <- function(stat) c(stat(us_mpg), stat(japanese_mpg))

  data.frame(
    Country = c("US Cars", "Japanese Cars"),
    n = both(length),
    Mean = both(mean),
    Median = both(median),
    SD = both(sd),
    Min = both(min),
    Max = both(max)
  )
})

kable(
  desc_stats,
  caption = "Descriptive Statistics for Original MPG Data",
  digits = 3
)
Descriptive Statistics for Original MPG Data
Country n Mean Median SD Min Max
US Cars 28 16.429 15.5 4.341 9 28
Japanese Cars 28 26.750 27.0 4.703 18 35

Question 1: Normality Assessment (Original Data)

Does the MPG of both US cars and Japanese cars appear to be normally distributed?

# Run a Shapiro-Wilk normality test on each sample.
us_shapiro <- shapiro.test(us_mpg)
japanese_shapiro <- shapiro.test(japanese_mpg)

# Tabulate the W statistics and p-values; a group is flagged TRUE when its
# p-value exceeds 0.05.
normality_results <- local({
  p_values <- c(us_shapiro$p.value, japanese_shapiro$p.value)

  data.frame(
    Group = c("US Cars", "Japanese Cars"),
    W_statistic = c(us_shapiro$statistic, japanese_shapiro$statistic),
    p_value = p_values,
    Normal_at_0.05 = p_values > 0.05
  )
})

kable(
  normality_results,
  caption = "Shapiro-Wilk Normality Tests (Original Data)",
  digits = 4
)
Shapiro-Wilk Normality Tests (Original Data)
Group W_statistic p_value Normal_at_0.05
US Cars 0.9501 0.1988 TRUE
Japanese Cars 0.9604 0.3564 TRUE
# Normal probability (Q-Q) plots, one panel per sample.

# Draw a Q-Q plot of `values` with points in `point_col` and a red
# reference line on top.
qq_panel <- function(values, title, point_col) {
  qqnorm(values, main = title, col = point_col)
  qqline(values, col = "red", lwd = 2)
  invisible(NULL)
}

par(mfrow = c(1, 2))  # two panels side by side
qq_panel(us_mpg, "Q-Q Plot: US Cars MPG", "blue")
qq_panel(japanese_mpg, "Q-Q Plot: Japanese Cars MPG", "darkgreen")
par(mfrow = c(1, 1))  # back to a single-panel layout

Interpretation: Based on the Normal Probability Plots, both US and Japanese car MPG data appear to be normally distributed. The Japanese data shows excellent adherence to normality, while the US data shows reasonable adherence with only minor deviations in the tails that do not appear severe enough to violate the normality assumption.

Question 2: Variance Assessment (Original Data)

Does the variance appear to be constant using side-by-side boxplots?

# Side-by-side boxplots of the two samples on the original MPG scale.
boxplot(
  list(us_mpg, japanese_mpg),
  names = c("US Cars", "Japanese Cars"),
  main = "Side-by-Side Boxplots: Original MPG Data",
  ylab = "Miles Per Gallon (MPG)",
  col = c("lightblue", "lightcoral"),
  border = c("blue", "red")
)

# Annotate each box with its sample size, placed just below the group maximum.
text(
  x = c(1, 2),
  y = c(max(us_mpg), max(japanese_mpg)),
  labels = paste("n =", c(length(us_mpg), length(japanese_mpg))),
  pos = 1
)

# Compare the two sample variances with an F-test.
var_test <- var.test(us_mpg, japanese_mpg)

# One-row summary of the test; the statistic keeps its name ("F"), which
# becomes the row label of the rendered table.
variance_results <- with(
  var_test,
  data.frame(
    Test = "F-test for Equal Variances",
    F_statistic = statistic,
    p_value = p.value,
    Equal_variances_at_0.05 = p.value > 0.05
  )
)

kable(
  variance_results,
  caption = "Variance Equality Test (Original Data)",
  digits = 4
)
Variance Equality Test (Original Data)
Test F_statistic p_value Equal_variances_at_0.05
F F-test for Equal Variances 0.852 0.6803 TRUE
cat("Variance ratio (US/Japanese):", round(var(us_mpg)/var(japanese_mpg), 3))
## Variance ratio (US/Japanese): 0.852

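Interpretation: In the side-by-side boxplots, Japanese cars appear to have more variability in MPG than US cars, which visually suggests that the variance may not be constant. Note, however, that the F-test above does not confirm this impression (variance ratio US/Japanese = 0.852, p = 0.6803), so the difference in variances is not statistically significant at the 0.05 level.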

Question 3: Log Transformation Analysis

Transform the data using a log transform and repeat questions 1 and 2. Comment on the differences between the plots.

# Move both samples to the natural-log scale.
us_log <- log(us_mpg)
japanese_log <- log(japanese_mpg)

# Same descriptive summary as for the raw data, now on the log scale.
log_desc_stats <- local({
  # Apply one summary function to both log samples, US first, then Japanese.
  both <- function(stat) c(stat(us_log), stat(japanese_log))

  data.frame(
    Country = c("US Cars (log)", "Japanese Cars (log)"),
    n = both(length),
    Mean = both(mean),
    Median = both(median),
    SD = both(sd),
    Min = both(min),
    Max = both(max)
  )
})

kable(
  log_desc_stats,
  caption = "Descriptive Statistics for Log-Transformed Data",
  digits = 4
)
Descriptive Statistics for Log-Transformed Data
Country n Mean Median SD Min Max
US Cars (log) 28 2.7657 2.7403 0.2647 2.1972 3.3322
Japanese Cars (log) 28 3.2710 3.2958 0.1820 2.8904 3.5553

Normality Assessment (Log-Transformed Data)

# Run a Shapiro-Wilk normality test on each log-transformed sample.
us_log_shapiro <- shapiro.test(us_log)
japanese_log_shapiro <- shapiro.test(japanese_log)

# Tabulate the W statistics and p-values; a group is flagged TRUE when its
# p-value exceeds 0.05.
log_normality_results <- local({
  p_values <- c(us_log_shapiro$p.value, japanese_log_shapiro$p.value)

  data.frame(
    Group = c("US Cars (log)", "Japanese Cars (log)"),
    W_statistic = c(us_log_shapiro$statistic, japanese_log_shapiro$statistic),
    p_value = p_values,
    Normal_at_0.05 = p_values > 0.05
  )
})

kable(
  log_normality_results,
  caption = "Shapiro-Wilk Normality Tests (Log-Transformed Data)",
  digits = 4
)
Shapiro-Wilk Normality Tests (Log-Transformed Data)
Group W_statistic p_value Normal_at_0.05
US Cars (log) 0.9658 0.4731 TRUE
Japanese Cars (log) 0.9516 0.2177 TRUE
# Normal probability (Q-Q) plots for the log-transformed samples.

# Draw a Q-Q plot of `values` with points in `point_col` and a red
# reference line on top.
qq_panel <- function(values, title, point_col) {
  qqnorm(values, main = title, col = point_col)
  qqline(values, col = "red", lwd = 2)
  invisible(NULL)
}

par(mfrow = c(1, 2))  # two panels side by side
qq_panel(us_log, "Q-Q Plot: US Cars (log MPG)", "blue")
qq_panel(japanese_log, "Q-Q Plot: Japanese Cars (log MPG)", "darkgreen")
par(mfrow = c(1, 1))  # back to a single-panel layout

Variance Assessment (Log-Transformed Data)

# Side-by-side boxplots of the two samples on the log scale.
boxplot(
  list(us_log, japanese_log),
  names = c("US Cars", "Japanese Cars"),
  main = "Side-by-Side Boxplots: Log-Transformed MPG Data",
  ylab = "Log(Miles Per Gallon)",
  col = c("lightblue", "lightcoral"),
  border = c("blue", "red")
)

# Compare the two log-scale sample variances with an F-test.
var_test_log <- var.test(us_log, japanese_log)

# One-row summary of the test; the statistic keeps its name ("F"), which
# becomes the row label of the rendered table.
log_variance_results <- with(
  var_test_log,
  data.frame(
    Test = "F-test for Equal Variances (Log Data)",
    F_statistic = statistic,
    p_value = p.value,
    Equal_variances_at_0.05 = p.value > 0.05
  )
)

kable(
  log_variance_results,
  caption = "Variance Equality Test (Log-Transformed Data)",
  digits = 4
)
Variance Equality Test (Log-Transformed Data)
Test F_statistic p_value Equal_variances_at_0.05
F F-test for Equal Variances (Log Data) 2.1146 0.0566 TRUE

Comparison: Original vs Log-Transformed Data

# Put the assumption checks for the raw and log-transformed data side by side.
comparison_table <- local({
  # p-values in row order: US normality, Japanese normality, equal variances.
  p_original <- c(
    us_shapiro$p.value, japanese_shapiro$p.value, var_test$p.value
  )
  p_log <- c(
    us_log_shapiro$p.value, japanese_log_shapiro$p.value, var_test_log$p.value
  )

  data.frame(
    Aspect = c(
      "US Normality (p-value)", "Japanese Normality (p-value)",
      "Equal Variances (p-value)", "US Variance", "Japanese Variance"
    ),
    Original_Data = c(p_original, var(us_mpg), var(japanese_mpg)),
    Log_Transformed = c(p_log, var(us_log), var(japanese_log)),
    # A larger p-value after the transform counts as an improvement; the two
    # variance rows have no such comparison, so the column ends up as text.
    Improvement = c(p_log > p_original, "N/A", "N/A")
  )
})

kable(
  comparison_table,
  caption = "Comparison: Original vs Log-Transformed Data",
  digits = 4
)
Comparison: Original vs Log-Transformed Data
Aspect Original_Data Log_Transformed Improvement
US Normality (p-value) 0.1988 0.4731 TRUE
Japanese Normality (p-value) 0.3564 0.2177 FALSE
Equal Variances (p-value) 0.6803 0.0566 FALSE
US Variance 18.8466 0.0701 N/A
Japanese Variance 22.1204 0.0331 N/A

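Comments on Differences: The log transformation improved normality for US cars (Shapiro-Wilk p-value rose from 0.1988 to 0.4731), where the Q-Q plot points now follow the reference line more closely, but not for Japanese cars (p-value fell from 0.3564 to 0.2177); both groups remain consistent with normality on either scale. The transformation did not make the variances more equal: on the log scale US cars show greater variability than Japanese cars (variance 0.0701 vs 0.0331), and the F-test p-value dropped from 0.6803 to 0.0566.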

Question 4: Hypothesis Testing

State the null and alternative hypothesis and test using a 0.05 level of significance.

Hypothesis Statement

  • H₀: μ_US ≥ μ_Japanese (Mean MPG of US cars is greater than or equal to Japanese cars)
  • H₁: μ_US < μ_Japanese (Mean MPG of US cars is less than Japanese cars)
  • Significance level: α = 0.05
  • Test type: One-tailed (left-tailed) t-test

Using log-transformed data for the hypothesis test based on improved assumptions.

# Pick the pooled or Welch form of the left-tailed two-sample t-test depending
# on whether the log-scale F-test p-value exceeds 0.05.
equal_var <- var_test_log$p.value > 0.05

test_type <- if (equal_var) {
  "Two-sample t-test (equal variances assumed)"
} else {
  "Welch's two-sample t-test (unequal variances)"
}

t_test <- t.test(
  us_log, japanese_log,
  alternative = "less",
  var.equal = equal_var,
  conf.level = 0.95
)

# Key numbers from the test, plus the left-tail critical value at the 0.05
# level for the test's degrees of freedom.
test_results <- data.frame(
  Statistic = c("t-statistic", "Degrees of Freedom", "p-value", "Critical Value"),
  Value = c(
    t_test$statistic,
    t_test$parameter,
    t_test$p.value,
    qt(0.05, t_test$parameter)
  )
)

kable(
  test_results,
  caption = paste("Hypothesis Test Results:", test_type),
  digits = 6
)
Hypothesis Test Results: Two-sample t-test (equal variances assumed)
Statistic Value
t-statistic -8.323508
Degrees of Freedom 54.000000
p-value 0.000000
Critical Value -1.673565

4a. Sample Averages for Log-Transformed Data

# Group means on the log scale and their difference (US minus Japanese).
averages_table <- local({
  mean_us <- mean(us_log)
  mean_japanese <- mean(japanese_log)

  data.frame(
    Group = c(
      "US Cars (log MPG)",
      "Japanese Cars (log MPG)",
      "Difference (US - Japanese)"
    ),
    Mean = c(mean_us, mean_japanese, mean_us - mean_japanese)
  )
})

kable(averages_table, caption = "Sample Averages (Log Scale)", digits = 4)
Sample Averages (Log Scale)
Group Mean
US Cars (log MPG) 2.7657
Japanese Cars (log MPG) 3.2710
Difference (US - Japanese) -0.5053

4b. Statistical Conclusions

# Significance level and the matching left-tail critical t value for the
# degrees of freedom reported by the test.
alpha <- 0.05
critical_value <- qt(p = alpha, df = t_test$parameter)

cat("DECISION CRITERIA:\n")
## DECISION CRITERIA:
cat("α =", alpha, "\n")
## α = 0.05
cat("Critical value =", round(critical_value, 4), "\n")
## Critical value = -1.6736
cat("Test statistic =", round(t_test$statistic, 4), "\n")
## Test statistic = -8.3235
cat("P-value =", round(t_test$p.value, 6), "\n\n")
## P-value = 0
# Reject H0 when the p-value falls below alpha, and word the three report
# lines accordingly.
reject_null <- t_test$p.value < alpha

decision <- if (reject_null) "REJECT H₀" else "FAIL TO REJECT H₀"

conclusion <- if (reject_null) {
  "There is significant evidence that the mean MPG of US cars is less than Japanese cars."
} else {
  "There is NOT sufficient evidence that the mean MPG of US cars is less than Japanese cars."
}

support <- if (reject_null) {
  "The environmental group's hypothesis is supported."
} else {
  "The environmental group's hypothesis is not supported."
}

cat("DECISION:", decision, "\n")
## DECISION: REJECT H₀
cat("CONCLUSION:", conclusion, "\n")
## CONCLUSION: There is significant evidence that the mean MPG of US cars is less than Japanese cars.
cat("PRACTICAL INTERPRETATION:", support, "\n\n")
## PRACTICAL INTERPRETATION: The environmental group's hypothesis is supported.
# Effect size (Cohen's d) on the log scale.
# The pooled standard deviation weights each group's variance by its degrees
# of freedom (n - 1), so it remains valid when the group sizes differ.
pooled_sd_log <- sqrt(
  ((length(us_log) - 1) * var(us_log) +
     (length(japanese_log) - 1) * var(japanese_log)) /
    (length(us_log) + length(japanese_log) - 2)
)
cohens_d <- (mean(us_log) - mean(japanese_log)) / pooled_sd_log

# Label the magnitude using Cohen's conventional cutoffs:
# |d| < 0.2 negligible, 0.2-0.5 small, 0.5-0.8 medium, >= 0.8 large.
abs_d <- abs(cohens_d)
effect_size <- if (abs_d < 0.2) {
  "Negligible"
} else if (abs_d < 0.5) {
  "Small"
} else if (abs_d < 0.8) {
  "Medium"
} else {
  "Large"
}

cat("EFFECT SIZE:\n")
## EFFECT SIZE:
cat("Cohen's d =", round(cohens_d, 4), "(", effect_size, "effect )\n")
## Cohen's d = -2.2246 ( Large effect )

Summary Visualizations

# 2 x 2 overview: boxplots on both scales (top row) and histograms of the
# original MPG values (bottom row).

# Side-by-side boxplot of a US / Japanese pair of samples.
box_pair <- function(us_values, japanese_values, title, axis_label) {
  boxplot(
    list(us_values, japanese_values),
    names = c("US", "Japanese"),
    main = title,
    ylab = axis_label,
    col = c("lightblue", "lightcoral")
  )
  invisible(NULL)
}

# Histogram of MPG values with a requested 8 breaks and the given fill colour.
mpg_hist <- function(values, title, fill) {
  hist(values, main = title, xlab = "MPG", col = fill, breaks = 8)
  invisible(NULL)
}

par(mfrow = c(2, 2))

box_pair(us_mpg, japanese_mpg, "Original MPG Data", "MPG")
box_pair(us_log, japanese_log, "Log-Transformed MPG Data", "log(MPG)")

mpg_hist(us_mpg, "US Cars: Original MPG", "lightblue")
mpg_hist(japanese_mpg, "Japanese Cars: Original MPG", "lightcoral")

par(mfrow = c(1, 1))  # back to a single-panel layout

Final Summary

# One-line answer per question, assembled from the test objects computed above.
summary_results <- local({
  # "Normal" when the Shapiro-Wilk p-value exceeds 0.05, otherwise "Not Normal".
  verdict <- function(test) {
    if (test$p.value > 0.05) "Normal" else "Not Normal"
  }

  # Combined normality statement for a US / Japanese pair of tests.
  normality_line <- function(us_test, japanese_test) {
    paste("US:", verdict(us_test), "| Japanese:", verdict(japanese_test))
  }

  data.frame(
    Question = c(
      "1. Normality (Original)", "1. Normality (Log)",
      "2. Equal Variances (Original)", "2. Equal Variances (Log)",
      "4. Hypothesis Test"
    ),
    Result = c(
      normality_line(us_shapiro, japanese_shapiro),
      normality_line(us_log_shapiro, japanese_log_shapiro),
      # The variance rows are fixed text recording the visual (boxplot) call.
      "Unequal variances (visual assessment)",
      "Unequal variances (visual assessment)",
      paste(decision, "(p =", round(t_test$p.value, 4), ")")
    )
  )
})

kable(summary_results, caption = "Analysis Summary")
Analysis Summary
Question Result
1. Normality (Original) US: Normal | Japanese: Normal
1. Normality (Log) US: Normal | Japanese: Normal
2. Equal Variances (Original) Unequal variances (visual assessment)
2. Equal Variances (Log) Unequal variances (visual assessment)
4. Hypothesis Test REJECT H₀ (p = 0 )

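Note: The F-tests do not reject equal variances at the 0.05 level (p = 0.6803 for the original data, p = 0.0566 for the log-transformed data). The "Unequal variances" entries above reflect only the visual assessment of the boxplots, in which Japanese cars appear to have greater variability on the original scale.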