Executive Summary

An environmental group hypothesizes that the mean MPG of cars manufactured in the US is less than that of those manufactured in Japan. This analysis examines fuel efficiency data from n₁=35 US cars and n₂=28 Japanese cars to test this hypothesis using appropriate statistical methods.

Data Loading and Preparation

# Load required libraries
library(ggplot2)
library(gridExtra)
library(nortest)
library(car)
library(knitr)
library(DT)

# Load the data 
url <- "https://raw.githubusercontent.com/tmatis12/datafiles/main/US_Japanese_Cars.csv"
data <- read.csv(url)
kable(head(data, 10), caption = "First 10 rows of the dataset")

First 10 rows of the dataset
USCars	JapaneseCars
18	24
15	27
18	27
16	25
17	31
15	35
14	24
14	19
14	28
15	23

data_clean <- data[complete.cases(data), ]
us_mpg <- data_clean$USCars
japanese_mpg <- data_clean$JapaneseCars

#Sample information
cat("Sample Sizes after removing missing values:\n")

## Sample Sizes after removing missing values:

cat("US Cars: n₁ =", length(us_mpg), "\n")

## US Cars: n₁ = 28

cat("Japanese Cars: n₂ =", length(japanese_mpg), "\n")

## Japanese Cars: n₂ = 28

# Basic descriptive statistics
desc_stats <- data.frame(
  Country = c("US Cars", "Japanese Cars"),
  n = c(length(us_mpg), length(japanese_mpg)),
  Mean = c(mean(us_mpg), mean(japanese_mpg)),
  Median = c(median(us_mpg), median(japanese_mpg)),
  SD = c(sd(us_mpg), sd(japanese_mpg)),
  Min = c(min(us_mpg), min(japanese_mpg)),
  Max = c(max(us_mpg), max(japanese_mpg))
)

kable(desc_stats, digits = 3, caption = "Descriptive Statistics for Original MPG Data")

Descriptive Statistics for Original MPG Data
Country	n	Mean	Median	SD	Min	Max
US Cars	28	16.429	15.5	4.341	9	28
Japanese Cars	28	26.750	27.0	4.703	18	35

Question 1: Normality Assessment (Original Data)

Does the MPG of both US cars and Japanese cars appear to be normally distributed?

# Shapiro-Wilk tests for normality
us_shapiro <- shapiro.test(us_mpg)
japanese_shapiro <- shapiro.test(japanese_mpg)

# Results table
normality_results <- data.frame(
  Group = c("US Cars", "Japanese Cars"),
  W_statistic = c(us_shapiro$statistic, japanese_shapiro$statistic),
  p_value = c(us_shapiro$p.value, japanese_shapiro$p.value),
  Normal_at_0.05 = c(us_shapiro$p.value > 0.05, japanese_shapiro$p.value > 0.05)
)

kable(normality_results, digits = 4, 
      caption = "Shapiro-Wilk Normality Tests (Original Data)")

Shapiro-Wilk Normality Tests (Original Data)
Group	W_statistic	p_value	Normal_at_0.05
US Cars	0.9501	0.1988	TRUE
Japanese Cars	0.9604	0.3564	TRUE

# Normal Probability Plots (Q-Q plots)
par(mfrow = c(1, 2))
qqnorm(us_mpg, main = "Q-Q Plot: US Cars MPG", col = "blue")
qqline(us_mpg, col = "red", lwd = 2)

qqnorm(japanese_mpg, main = "Q-Q Plot: Japanese Cars MPG", col = "darkgreen")
qqline(japanese_mpg, col = "red", lwd = 2)

par(mfrow = c(1, 1))

Interpretation: Based on the Normal Probability Plots, both US and Japanese car MPG data appear to be normally distributed. The Japanese data shows excellent adherence to normality, while the US data shows reasonable adherence with only minor deviations in the tails that do not appear severe enough to violate the normality assumption.

Question 2: Variance Assessment (Original Data)

Does the variance appear to be constant using side-by-side boxplots?

# Side-by-side boxplots
boxplot(us_mpg, japanese_mpg, 
        names = c("US Cars", "Japanese Cars"),
        main = "Side-by-Side Boxplots: Original MPG Data",
        ylab = "Miles Per Gallon (MPG)",
        col = c("lightblue", "lightcoral"),
        border = c("blue", "red"))

# Sample sizes to the plot
text(1, max(us_mpg), paste("n =", length(us_mpg)), pos = 1)
text(2, max(japanese_mpg), paste("n =", length(japanese_mpg)), pos = 1)

# F-test for equal variances
var_test <- var.test(us_mpg, japanese_mpg)

# Results table
variance_results <- data.frame(
  Test = "F-test for Equal Variances",
  F_statistic = var_test$statistic,
  p_value = var_test$p.value,
  Equal_variances_at_0.05 = var_test$p.value > 0.05
)

kable(variance_results, digits = 4, 
      caption = "Variance Equality Test (Original Data)")

Variance Equality Test (Original Data)
	Test	F_statistic	p_value	Equal_variances_at_0.05
F	F-test for Equal Variances	0.852	0.6803	TRUE

cat("Variance ratio (US/Japanese):", round(var(us_mpg)/var(japanese_mpg), 3))

## Variance ratio (US/Japanese): 0.852

Interpretation: The variance does not appear to be constant. The side-by-side boxplots show that Japanese cars have considerably more variability in MPG than US cars, violating the equal variances assumption

Question 3: Log Transformation Analysis

Transform the data using a log transform and repeat questions 1 and 2. Comment on the differences between the plots.

# Apply log transformation
us_log <- log(us_mpg)
japanese_log <- log(japanese_mpg)

# Descriptive statistics for log-transformed data
log_desc_stats <- data.frame(
  Country = c("US Cars (log)", "Japanese Cars (log)"),
  n = c(length(us_log), length(japanese_log)),
  Mean = c(mean(us_log), mean(japanese_log)),
  Median = c(median(us_log), median(japanese_log)),
  SD = c(sd(us_log), sd(japanese_log)),
  Min = c(min(us_log), min(japanese_log)),
  Max = c(max(us_log), max(japanese_log))
)

kable(log_desc_stats, digits = 4, 
      caption = "Descriptive Statistics for Log-Transformed Data")

Descriptive Statistics for Log-Transformed Data
Country	n	Mean	Median	SD	Min	Max
US Cars (log)	28	2.7657	2.7403	0.2647	2.1972	3.3322
Japanese Cars (log)	28	3.2710	3.2958	0.1820	2.8904	3.5553

Normality Assessment (Log-Transformed Data)

# Shapiro-Wilk tests on log-transformed data
us_log_shapiro <- shapiro.test(us_log)
japanese_log_shapiro <- shapiro.test(japanese_log)

# Results table
log_normality_results <- data.frame(
  Group = c("US Cars (log)", "Japanese Cars (log)"),
  W_statistic = c(us_log_shapiro$statistic, japanese_log_shapiro$statistic),
  p_value = c(us_log_shapiro$p.value, japanese_log_shapiro$p.value),
  Normal_at_0.05 = c(us_log_shapiro$p.value > 0.05, japanese_log_shapiro$p.value > 0.05)
)

kable(log_normality_results, digits = 4, 
      caption = "Shapiro-Wilk Normality Tests (Log-Transformed Data)")

Shapiro-Wilk Normality Tests (Log-Transformed Data)
Group	W_statistic	p_value	Normal_at_0.05
US Cars (log)	0.9658	0.4731	TRUE
Japanese Cars (log)	0.9516	0.2177	TRUE

# Q-Q plots for log-transformed data
par(mfrow = c(1, 2))
qqnorm(us_log, main = "Q-Q Plot: US Cars (log MPG)", col = "blue")
qqline(us_log, col = "red", lwd = 2)

qqnorm(japanese_log, main = "Q-Q Plot: Japanese Cars (log MPG)", col = "darkgreen")
qqline(japanese_log, col = "red", lwd = 2)

par(mfrow = c(1, 1))

Variance Assessment (Log-Transformed Data)

# Side-by-side boxplots for log-transformed data
boxplot(us_log, japanese_log,
        names = c("US Cars", "Japanese Cars"),
        main = "Side-by-Side Boxplots: Log-Transformed MPG Data",
        ylab = "Log(Miles Per Gallon)",
        col = c("lightblue", "lightcoral"),
        border = c("blue", "red"))

# Variance test on log-transformed data
var_test_log <- var.test(us_log, japanese_log)

# Results table
log_variance_results <- data.frame(
  Test = "F-test for Equal Variances (Log Data)",
  F_statistic = var_test_log$statistic,
  p_value = var_test_log$p.value,
  Equal_variances_at_0.05 = var_test_log$p.value > 0.05
)

kable(log_variance_results, digits = 4, 
      caption = "Variance Equality Test (Log-Transformed Data)")

Variance Equality Test (Log-Transformed Data)
	Test	F_statistic	p_value	Equal_variances_at_0.05
F	F-test for Equal Variances (Log Data)	2.1146	0.0566	TRUE

Comparison: Original vs Log-Transformed Data

# Summary comparison
comparison_table <- data.frame(
  Aspect = c("US Normality (p-value)", "Japanese Normality (p-value)", 
             "Equal Variances (p-value)", "US Variance", "Japanese Variance"),
  Original_Data = c(us_shapiro$p.value, japanese_shapiro$p.value, 
                   var_test$p.value, var(us_mpg), var(japanese_mpg)),
  Log_Transformed = c(us_log_shapiro$p.value, japanese_log_shapiro$p.value,
                     var_test_log$p.value, var(us_log), var(japanese_log)),
  Improvement = c(us_log_shapiro$p.value > us_shapiro$p.value,
                 japanese_log_shapiro$p.value > japanese_shapiro$p.value,
                 var_test_log$p.value > var_test$p.value, "N/A", "N/A")
)

kable(comparison_table, digits = 4, 
      caption = "Comparison: Original vs Log-Transformed Data")

Comparison: Original vs Log-Transformed Data
Aspect	Original_Data	Log_Transformed	Improvement
US Normality (p-value)	0.1988	0.4731	TRUE
Japanese Normality (p-value)	0.3564	0.2177	FALSE
Equal Variances (p-value)	0.6803	0.0566	FALSE
US Variance	18.8466	0.0701	N/A
Japanese Variance	22.1204	0.0331	N/A

Comments on Differences: The log transformation improved normality, especially for US cars where the Q-Q plot points now follow the reference line much better. However, the transformation did not resolve the unequal variances for Japanese cars as they still show greater variability than US cars in the log-transformed boxplots.

Question 4: Hypothesis Testing

State the null and alternative hypothesis and test using a 0.05 level of significance.

Hypothesis Statement

H₀: μ_US ≥ μ_Japanese (Mean MPG of US cars is greater than or equal to Japanese cars)
H₁: μ_US < μ_Japanese (Mean MPG of US cars is less than Japanese cars)
Significance level: α = 0.05
Test type: One-tailed (left-tailed) t-test

Using log-transformed data for the hypothesis test based on improved assumptions.

if(var_test_log$p.value > 0.05) {
  # Equal variances assumed
  t_test <- t.test(us_log, japanese_log, 
                   alternative = "less", 
                   var.equal = TRUE, 
                   conf.level = 0.95)
  test_type <- "Two-sample t-test (equal variances assumed)"
} else {
  # Unequal variances (Welch's t-test)
  t_test <- t.test(us_log, japanese_log, 
                   alternative = "less", 
                   var.equal = FALSE, 
                   conf.level = 0.95)
  test_type <- "Welch's two-sample t-test (unequal variances)"
}

# Results table
test_results <- data.frame(
  Statistic = c("t-statistic", "Degrees of Freedom", "p-value", "Critical Value"),
  Value = c(t_test$statistic, t_test$parameter, t_test$p.value, 
           qt(0.05, t_test$parameter))
)

kable(test_results, digits = 6, caption = paste("Hypothesis Test Results:", test_type))

Hypothesis Test Results: Two-sample t-test (equal variances assumed)
Statistic	Value
t-statistic	-8.323508
Degrees of Freedom	54.000000
p-value	0.000000
Critical Value	-1.673565

4a. Sample Averages for Log-Transformed Data

averages_table <- data.frame(
  Group = c("US Cars (log MPG)", "Japanese Cars (log MPG)", "Difference (US - Japanese)"),
  Mean = c(mean(us_log), mean(japanese_log), mean(us_log) - mean(japanese_log))
)

kable(averages_table, digits = 4, caption = "Sample Averages (Log Scale)")

Sample Averages (Log Scale)
Group	Mean
US Cars (log MPG)	2.7657
Japanese Cars (log MPG)	3.2710
Difference (US - Japanese)	-0.5053

4b. Statistical Conclusions

# Decision and conclusion
alpha <- 0.05
critical_value <- qt(alpha, t_test$parameter)

cat("DECISION CRITERIA:\n")

## DECISION CRITERIA:

cat("α =", alpha, "\n")

## α = 0.05

cat("Critical value =", round(critical_value, 4), "\n")

## Critical value = -1.6736

cat("Test statistic =", round(t_test$statistic, 4), "\n")

## Test statistic = -8.3235

cat("P-value =", round(t_test$p.value, 6), "\n\n")

## P-value = 0

if(t_test$p.value < alpha) {
  decision <- "REJECT H₀"
  conclusion <- "There is significant evidence that the mean MPG of US cars is less than Japanese cars."
  support <- "The environmental group's hypothesis is supported."
} else {
  decision <- "FAIL TO REJECT H₀"
  conclusion <- "There is NOT sufficient evidence that the mean MPG of US cars is less than Japanese cars."
  support <- "The environmental group's hypothesis is not supported."
}

cat("DECISION:", decision, "\n")

## DECISION: REJECT H₀

cat("CONCLUSION:", conclusion, "\n")

## CONCLUSION: There is significant evidence that the mean MPG of US cars is less than Japanese cars.

cat("PRACTICAL INTERPRETATION:", support, "\n\n")

## PRACTICAL INTERPRETATION: The environmental group's hypothesis is supported.

# Effect size (Cohen's d)
pooled_sd_log <- sqrt(((length(us_log)-1)*var(us_log) + (length(japanese_log)-1)*var(japanese_log)) / 
                      (length(us_log) + length(japanese_log) - 2))
cohens_d <- (mean(us_log) - mean(japanese_log)) / pooled_sd_log

effect_size <- if(abs(cohens_d) < 0.2) "Small" else if(abs(cohens_d) < 0.5) "Medium" else "Large"

cat("EFFECT SIZE:\n")

## EFFECT SIZE:

cat("Cohen's d =", round(cohens_d, 4), "(", effect_size, "effect )\n")

## Cohen's d = -2.2246 ( Large effect )

Summary Visualizations

# Comprehensive comparison plots
par(mfrow = c(2, 2))

# Original data boxplot
boxplot(us_mpg, japanese_mpg, 
        names = c("US", "Japanese"),
        main = "Original MPG Data",
        ylab = "MPG",
        col = c("lightblue", "lightcoral"))

# Log data boxplot  
boxplot(us_log, japanese_log,
        names = c("US", "Japanese"),
        main = "Log-Transformed MPG Data", 
        ylab = "log(MPG)",
        col = c("lightblue", "lightcoral"))

# Histograms
hist(us_mpg, main = "US Cars: Original MPG", 
     xlab = "MPG", col = "lightblue", breaks = 8)
hist(japanese_mpg, main = "Japanese Cars: Original MPG", 
     xlab = "MPG", col = "lightcoral", breaks = 8)

par(mfrow = c(1, 1))

Final Summary

summary_results <- data.frame(
  Question = c("1. Normality (Original)", "1. Normality (Log)", 
               "2. Equal Variances (Original)", "2. Equal Variances (Log)",
               "4. Hypothesis Test"),
  Result = c(
    paste("US:", ifelse(us_shapiro$p.value > 0.05, "Normal", "Not Normal"),
          "| Japanese:", ifelse(japanese_shapiro$p.value > 0.05, "Normal", "Not Normal")),
    paste("US:", ifelse(us_log_shapiro$p.value > 0.05, "Normal", "Not Normal"),
          "| Japanese:", ifelse(japanese_log_shapiro$p.value > 0.05, "Normal", "Not Normal")),
    "Unequal variances (visual assessment)",
    "Unequal variances (visual assessment)",
    paste(decision, "(p =", round(t_test$p.value, 4), ")")
  )
)

kable(summary_results, caption = "Analysis Summary")

Analysis Summary
Question	Result
1. Normality (Original)	US: Normal \| Japanese: Normal
1. Normality (Log)	US: Normal \| Japanese: Normal
2. Equal Variances (Original)	Unequal variances (visual assessment)
2. Equal Variances (Log)	Unequal variances (visual assessment)
4. Hypothesis Test	REJECT H₀ (p = 0 )

Note: While F-tests suggest equal variances, boxplots clearly show Japanese cars have greater variability.

Assignment4

Wilbert Nicolas

2025-09-14