Executive Summary
An environmental group hypothesizes that the mean MPG of cars
manufactured in the US is less than that of those manufactured in Japan.
This analysis examines fuel efficiency data from n₁=35 US cars and n₂=28
Japanese cars to test this hypothesis using appropriate statistical
methods.
Data Loading and Preparation
# Load required libraries
library(ggplot2)
library(gridExtra)
library(nortest)
library(car)
library(knitr)
library(DT)
# Load the data
url <- "https://raw.githubusercontent.com/tmatis12/datafiles/main/US_Japanese_Cars.csv"
data <- read.csv(url)
kable(head(data, 10), caption = "First 10 rows of the dataset")
First 10 rows of the dataset
18 |
24 |
15 |
27 |
18 |
27 |
16 |
25 |
17 |
31 |
15 |
35 |
14 |
24 |
14 |
19 |
14 |
28 |
15 |
23 |
data_clean <- data[complete.cases(data), ]
us_mpg <- data_clean$USCars
japanese_mpg <- data_clean$JapaneseCars
#Sample information
cat("Sample Sizes after removing missing values:\n")
## Sample Sizes after removing missing values:
cat("US Cars: n₁ =", length(us_mpg), "\n")
## US Cars: n₁ = 28
cat("Japanese Cars: n₂ =", length(japanese_mpg), "\n")
## Japanese Cars: n₂ = 28
# Basic descriptive statistics
desc_stats <- data.frame(
Country = c("US Cars", "Japanese Cars"),
n = c(length(us_mpg), length(japanese_mpg)),
Mean = c(mean(us_mpg), mean(japanese_mpg)),
Median = c(median(us_mpg), median(japanese_mpg)),
SD = c(sd(us_mpg), sd(japanese_mpg)),
Min = c(min(us_mpg), min(japanese_mpg)),
Max = c(max(us_mpg), max(japanese_mpg))
)
kable(desc_stats, digits = 3, caption = "Descriptive Statistics for Original MPG Data")
Descriptive Statistics for Original MPG Data
US Cars |
28 |
16.429 |
15.5 |
4.341 |
9 |
28 |
Japanese Cars |
28 |
26.750 |
27.0 |
4.703 |
18 |
35 |
Question 1: Normality Assessment (Original Data)
Does the MPG of both US cars and Japanese cars appear to be normally
distributed?
# Shapiro-Wilk tests for normality
us_shapiro <- shapiro.test(us_mpg)
japanese_shapiro <- shapiro.test(japanese_mpg)
# Results table
normality_results <- data.frame(
Group = c("US Cars", "Japanese Cars"),
W_statistic = c(us_shapiro$statistic, japanese_shapiro$statistic),
p_value = c(us_shapiro$p.value, japanese_shapiro$p.value),
Normal_at_0.05 = c(us_shapiro$p.value > 0.05, japanese_shapiro$p.value > 0.05)
)
kable(normality_results, digits = 4,
caption = "Shapiro-Wilk Normality Tests (Original Data)")
Shapiro-Wilk Normality Tests (Original Data)
US Cars |
0.9501 |
0.1988 |
TRUE |
Japanese Cars |
0.9604 |
0.3564 |
TRUE |
# Normal Probability Plots (Q-Q plots)
par(mfrow = c(1, 2))
qqnorm(us_mpg, main = "Q-Q Plot: US Cars MPG", col = "blue")
qqline(us_mpg, col = "red", lwd = 2)
qqnorm(japanese_mpg, main = "Q-Q Plot: Japanese Cars MPG", col = "darkgreen")
qqline(japanese_mpg, col = "red", lwd = 2)

par(mfrow = c(1, 1))
Interpretation: Based on the Normal Probability
Plots, both US and Japanese car MPG data appear to be normally
distributed. The Japanese data shows excellent adherence to normality,
while the US data shows reasonable adherence with only minor deviations
in the tails that do not appear severe enough to violate the normality
assumption.
Question 2: Variance Assessment (Original Data)
Does the variance appear to be constant using side-by-side
boxplots?
# Side-by-side boxplots
boxplot(us_mpg, japanese_mpg,
names = c("US Cars", "Japanese Cars"),
main = "Side-by-Side Boxplots: Original MPG Data",
ylab = "Miles Per Gallon (MPG)",
col = c("lightblue", "lightcoral"),
border = c("blue", "red"))
# Sample sizes to the plot
text(1, max(us_mpg), paste("n =", length(us_mpg)), pos = 1)
text(2, max(japanese_mpg), paste("n =", length(japanese_mpg)), pos = 1)

# F-test for equal variances
var_test <- var.test(us_mpg, japanese_mpg)
# Results table
variance_results <- data.frame(
Test = "F-test for Equal Variances",
F_statistic = var_test$statistic,
p_value = var_test$p.value,
Equal_variances_at_0.05 = var_test$p.value > 0.05
)
kable(variance_results, digits = 4,
caption = "Variance Equality Test (Original Data)")
Variance Equality Test (Original Data)
F |
F-test for Equal Variances |
0.852 |
0.6803 |
TRUE |
cat("Variance ratio (US/Japanese):", round(var(us_mpg)/var(japanese_mpg), 3))
## Variance ratio (US/Japanese): 0.852
Interpretation: The variance does not appear to be
constant. The side-by-side boxplots show that Japanese cars have
considerably more variability in MPG than US cars, violating the equal
variances assumption
Question 3: Log Transformation Analysis
Transform the data using a log transform and repeat questions 1 and
2. Comment on the differences between the plots.
# Apply log transformation
us_log <- log(us_mpg)
japanese_log <- log(japanese_mpg)
# Descriptive statistics for log-transformed data
log_desc_stats <- data.frame(
Country = c("US Cars (log)", "Japanese Cars (log)"),
n = c(length(us_log), length(japanese_log)),
Mean = c(mean(us_log), mean(japanese_log)),
Median = c(median(us_log), median(japanese_log)),
SD = c(sd(us_log), sd(japanese_log)),
Min = c(min(us_log), min(japanese_log)),
Max = c(max(us_log), max(japanese_log))
)
kable(log_desc_stats, digits = 4,
caption = "Descriptive Statistics for Log-Transformed Data")
Descriptive Statistics for Log-Transformed Data
US Cars (log) |
28 |
2.7657 |
2.7403 |
0.2647 |
2.1972 |
3.3322 |
Japanese Cars (log) |
28 |
3.2710 |
3.2958 |
0.1820 |
2.8904 |
3.5553 |
Question 4: Hypothesis Testing
State the null and alternative hypothesis and test using a 0.05 level
of significance.
Hypothesis Statement
- H₀: μ_US ≥ μ_Japanese (Mean MPG of US cars is
greater than or equal to Japanese cars)
- H₁: μ_US < μ_Japanese (Mean MPG of US cars is
less than Japanese cars)
- Significance level: α = 0.05
- Test type: One-tailed (left-tailed) t-test
Using log-transformed data for the hypothesis test based on
improved assumptions.
if(var_test_log$p.value > 0.05) {
# Equal variances assumed
t_test <- t.test(us_log, japanese_log,
alternative = "less",
var.equal = TRUE,
conf.level = 0.95)
test_type <- "Two-sample t-test (equal variances assumed)"
} else {
# Unequal variances (Welch's t-test)
t_test <- t.test(us_log, japanese_log,
alternative = "less",
var.equal = FALSE,
conf.level = 0.95)
test_type <- "Welch's two-sample t-test (unequal variances)"
}
# Results table
test_results <- data.frame(
Statistic = c("t-statistic", "Degrees of Freedom", "p-value", "Critical Value"),
Value = c(t_test$statistic, t_test$parameter, t_test$p.value,
qt(0.05, t_test$parameter))
)
kable(test_results, digits = 6, caption = paste("Hypothesis Test Results:", test_type))
Hypothesis Test Results: Two-sample t-test (equal variances
assumed)
t-statistic |
-8.323508 |
Degrees of Freedom |
54.000000 |
p-value |
0.000000 |
Critical Value |
-1.673565 |
4b. Statistical Conclusions
# Decision and conclusion
alpha <- 0.05
critical_value <- qt(alpha, t_test$parameter)
cat("DECISION CRITERIA:\n")
## DECISION CRITERIA:
cat("α =", alpha, "\n")
## α = 0.05
cat("Critical value =", round(critical_value, 4), "\n")
## Critical value = -1.6736
cat("Test statistic =", round(t_test$statistic, 4), "\n")
## Test statistic = -8.3235
cat("P-value =", round(t_test$p.value, 6), "\n\n")
## P-value = 0
if(t_test$p.value < alpha) {
decision <- "REJECT H₀"
conclusion <- "There is significant evidence that the mean MPG of US cars is less than Japanese cars."
support <- "The environmental group's hypothesis is supported."
} else {
decision <- "FAIL TO REJECT H₀"
conclusion <- "There is NOT sufficient evidence that the mean MPG of US cars is less than Japanese cars."
support <- "The environmental group's hypothesis is not supported."
}
cat("DECISION:", decision, "\n")
## DECISION: REJECT H₀
cat("CONCLUSION:", conclusion, "\n")
## CONCLUSION: There is significant evidence that the mean MPG of US cars is less than Japanese cars.
cat("PRACTICAL INTERPRETATION:", support, "\n\n")
## PRACTICAL INTERPRETATION: The environmental group's hypothesis is supported.
# Effect size (Cohen's d)
pooled_sd_log <- sqrt(((length(us_log)-1)*var(us_log) + (length(japanese_log)-1)*var(japanese_log)) /
(length(us_log) + length(japanese_log) - 2))
cohens_d <- (mean(us_log) - mean(japanese_log)) / pooled_sd_log
effect_size <- if(abs(cohens_d) < 0.2) "Small" else if(abs(cohens_d) < 0.5) "Medium" else "Large"
cat("EFFECT SIZE:\n")
## EFFECT SIZE:
cat("Cohen's d =", round(cohens_d, 4), "(", effect_size, "effect )\n")
## Cohen's d = -2.2246 ( Large effect )
Summary Visualizations
# Comprehensive comparison plots
par(mfrow = c(2, 2))
# Original data boxplot
boxplot(us_mpg, japanese_mpg,
names = c("US", "Japanese"),
main = "Original MPG Data",
ylab = "MPG",
col = c("lightblue", "lightcoral"))
# Log data boxplot
boxplot(us_log, japanese_log,
names = c("US", "Japanese"),
main = "Log-Transformed MPG Data",
ylab = "log(MPG)",
col = c("lightblue", "lightcoral"))
# Histograms
hist(us_mpg, main = "US Cars: Original MPG",
xlab = "MPG", col = "lightblue", breaks = 8)
hist(japanese_mpg, main = "Japanese Cars: Original MPG",
xlab = "MPG", col = "lightcoral", breaks = 8)

par(mfrow = c(1, 1))
Final Summary
summary_results <- data.frame(
Question = c("1. Normality (Original)", "1. Normality (Log)",
"2. Equal Variances (Original)", "2. Equal Variances (Log)",
"4. Hypothesis Test"),
Result = c(
paste("US:", ifelse(us_shapiro$p.value > 0.05, "Normal", "Not Normal"),
"| Japanese:", ifelse(japanese_shapiro$p.value > 0.05, "Normal", "Not Normal")),
paste("US:", ifelse(us_log_shapiro$p.value > 0.05, "Normal", "Not Normal"),
"| Japanese:", ifelse(japanese_log_shapiro$p.value > 0.05, "Normal", "Not Normal")),
"Unequal variances (visual assessment)",
"Unequal variances (visual assessment)",
paste(decision, "(p =", round(t_test$p.value, 4), ")")
)
)
kable(summary_results, caption = "Analysis Summary")
Analysis Summary
1. Normality (Original) |
US: Normal | Japanese: Normal |
1. Normality (Log) |
US: Normal | Japanese: Normal |
2. Equal Variances (Original) |
Unequal variances (visual assessment) |
2. Equal Variances (Log) |
Unequal variances (visual assessment) |
4. Hypothesis Test |
REJECT H₀ (p = 0 ) |
Note: While F-tests suggest equal variances,
boxplots clearly show Japanese cars have greater variability.