# Load the US vs. Japanese car mpg data. The two columns are separate samples
# stored side by side; the shorter JapaneseCars column is padded with NA
# (see the printed values below).
df <- read.csv("https://raw.githubusercontent.com/tmatis12/datafiles/main/US_Japanese_Cars.csv")
df$USCars
## [1] 18 15 18 16 17 15 14 14 14 15 15 14 15 14 22 18 21 21 10 10 11 9 28 25 19
## [26] 16 17 19 18 14 14 14 14 12 13
df$JapaneseCars
## [1] 24 27 27 25 31 35 24 19 28 23 27 20 22 18 20 31 32 31 32 24 26 29 24 24 33
## [26] 33 32 28 NA NA NA NA NA NA NA
# Normal Q-Q plot for each sample. Each plot gets its own title (the default
# "Normal Q-Q Plot" would make the two figures indistinguishable) and a
# qqline() reference line so departures from a straight line are visible.
qqnorm(df$USCars, main = "Normal Q-Q Plot: US cars (mpg)")
qqline(df$USCars)
qqnorm(df$JapaneseCars, main = "Normal Q-Q Plot: Japanese cars (mpg)")
qqline(df$JapaneseCars)
# The normal probability plots appear approximately normal for both samples.
# Side-by-side boxplots of raw mpg, one box per car origin. The list names
# supply the group labels on the x axis.
boxplot(
  list(US = df$USCars, Japan = df$JapaneseCars),
  main = "Boxplot of car Origins",
  xlab = "car origin",
  ylab = "mpg"
)
# The US sample shows the larger spread, suggesting the variances are not equal.
# Copy the raw data and append the natural log of each mpg column.
# Missing Japanese entries stay NA after the transform.
df2 <- df
df2$log_USCars <- log(df2$USCars)
df2$log_JapaneseCars <- log(df2$JapaneseCars)
print(df2)
## USCars JapaneseCars log_USCars log_JapaneseCars
## 1 18 24 2.890372 3.178054
## 2 15 27 2.708050 3.295837
## 3 18 27 2.890372 3.295837
## 4 16 25 2.772589 3.218876
## 5 17 31 2.833213 3.433987
## 6 15 35 2.708050 3.555348
## 7 14 24 2.639057 3.178054
## 8 14 19 2.639057 2.944439
## 9 14 28 2.639057 3.332205
## 10 15 23 2.708050 3.135494
## 11 15 27 2.708050 3.295837
## 12 14 20 2.639057 2.995732
## 13 15 22 2.708050 3.091042
## 14 14 18 2.639057 2.890372
## 15 22 20 3.091042 2.995732
## 16 18 31 2.890372 3.433987
## 17 21 32 3.044522 3.465736
## 18 21 31 3.044522 3.433987
## 19 10 32 2.302585 3.465736
## 20 10 24 2.302585 3.178054
## 21 11 26 2.397895 3.258097
## 22 9 29 2.197225 3.367296
## 23 28 24 3.332205 3.178054
## 24 25 24 3.218876 3.178054
## 25 19 33 2.944439 3.496508
## 26 16 33 2.772589 3.496508
## 27 17 32 2.833213 3.465736
## 28 19 28 2.944439 3.332205
## 29 18 NA 2.890372 NA
## 30 14 NA 2.639057 NA
## 31 14 NA 2.639057 NA
## 32 14 NA 2.639057 NA
## 33 14 NA 2.639057 NA
## 34 12 NA 2.484907 NA
## 35 13 NA 2.564949 NA
# Normal Q-Q plots of the log-transformed samples. Each plot is titled per
# sample and gets a qqline() reference line for judging linearity.
qqnorm(df2$log_USCars, main = "Normal Q-Q Plot: log(mpg), US cars")
qqline(df2$log_USCars)
qqnorm(df2$log_JapaneseCars, main = "Normal Q-Q Plot: log(mpg), Japanese cars")
qqline(df2$log_JapaneseCars)
# The log-transformed data is now approximately normally distributed for both
# US and Japanese cars.
# Side-by-side boxplots of log(mpg) by car origin.
boxplot(
  df2$log_USCars, df2$log_JapaneseCars,
  names = c("US", "Japan"),
  main = "Boxplot of Transformed value",
  xlab = "car origin", # the x axis is the origin group; only mpg is logged
  ylab = "log of mpg"
)
# ---- Welch two-sample t-test on log(mpg): US vs. Japanese cars ----
# Read the data; blank cells and the literal string "NA" both become NA.
cars <- read.csv(
  "https://raw.githubusercontent.com/tmatis12/datafiles/main/US_Japanese_Cars.csv",
  na.strings = c("", "NA")
)
# Coerce both columns to numeric (entries that cannot be parsed become NA).
cars$USCars <- as.numeric(cars$USCars)
cars$JapaneseCars <- as.numeric(cars$JapaneseCars)
# The two columns are independent samples of different sizes stored side by
# side, so the shorter column is padded with NA. Drop missing values within
# each sample separately: a row-wise na.omit() would also discard the valid
# US observations that merely share a row with the Japanese padding.
# cars_clean is therefore a named list of two vectors (possibly of different
# lengths) rather than a data frame; `$` access works the same way.
cars_clean <- lapply(
  cars[c("USCars", "JapaneseCars")],
  function(mpg) mpg[!is.na(mpg)]
)
# Log-transform each sample.
cars_clean$log_US <- log(cars_clean$USCars)
cars_clean$log_Japan <- log(cars_clean$JapaneseCars)
# Sample averages on the log scale.
mean_log_us <- mean(cars_clean$log_US)
mean_log_japan <- mean(cars_clean$log_Japan)
cat("Mean log(mpg) for US cars:", mean_log_us, "\n")
cat("Mean log(mpg) for Japanese cars:", mean_log_japan, "\n")
# Welch two-sample t-test (does not assume equal variances).
t_test_result <- t.test(
  cars_clean$log_US,
  cars_clean$log_Japan,
  var.equal = FALSE
)
print(t_test_result)
# NOTE: the console output previously pasted here was produced after a
# row-wise na.omit() and therefore used only the 28 US cars that shared a row
# with a Japanese observation. Re-run this section to regenerate the US mean,
# t statistic, degrees of freedom, p-value and confidence interval using every
# US observation.
# Conclusion (confirm against the regenerated output): at the 5% significance
# level the mean log(mpg) differs between US and Japanese cars; Japanese cars
# are significantly more fuel efficient.