## Homework 4: Regression Refresher, Analyzing Data Next Steps, Transformation
# load required packages
library(ggplot2)   # scatterplots and smoothing
library(car)       # durbinWatsonTest(), vif()
library(mice)      # md.pattern(), mice(), complete()
library(dplyr)     # %>% and filter()
library(gapminder) # gapminder dataset
library(MASS)      # boxcox()
# load data set
data <- mtcars
# test linearity
ggplot(data, aes(x = wt, y = mpg)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
The scatterplot shows an approximately linear relationship between mpg and wt, so the linearity assumption holds.
# test independence using durbinWatsonTest
cars_lm <- lm(mpg ~ wt, data = data)
cars_dw <- durbinWatsonTest(cars_lm)
cars_dw
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3628798      1.251727    0.03
## Alternative hypothesis: rho != 0
# validate independence using residuals
residuals <- as.numeric(cars_lm$residuals)
plot(1:length(residuals), residuals, xlab = "Index", ylab = "Residuals", main = "Residuals vs. Index")
The residuals plot shows no detectable pattern, but the Durbin-Watson test has a p-value < 0.05, suggesting that the independence assumption may not hold.
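As an extra cross-check beyond what the assignment asks for, an autocorrelation plot of the residuals shows whether adjacent residuals are correlated; a significant spike at lag 1 would echo the Durbin-Watson result.
# sketch: autocorrelation of the residuals as a second look at independence
acf(residuals(cars_lm), main = "ACF of Residuals")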
# test homoscedasticity using residuals vs. fitted plot
plot(cars_lm, 1)
The spread of the residuals stays roughly constant across the fitted values, so the homoscedasticity assumption holds.
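As a formal complement to the visual check (my own addition, assuming the lmtest package is installed), the Breusch-Pagan test gives a p-value for the null hypothesis of constant error variance.
# sketch: Breusch-Pagan test for heteroscedasticity (requires lmtest)
library(lmtest)
bptest(cars_lm)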
# isolate the Q-Q plot to test for normality
plot(cars_lm, 2)
# validate normality with Shapiro-Wilk Normality Test
shapiro.test(cars_lm$residuals)
##
## Shapiro-Wilk normality test
##
## data: cars_lm$residuals
## W = 0.94508, p-value = 0.1044
There may be a few potential outliers, but the points all fall close to the reference line. The Shapiro-Wilk test also gives a p-value > 0.05, so we fail to reject normality and can treat the residuals as normally distributed.
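As an optional visual supplement to the Q-Q plot and Shapiro-Wilk test (not part of the original output), a histogram of the residuals with a normal density curve overlaid makes the conclusion easier to see.
# sketch: histogram of residuals with a matching normal curve
res <- residuals(cars_lm)
hist(res, freq = FALSE, main = "Histogram of Residuals", xlab = "Residuals")
curve(dnorm(x, mean = mean(res), sd = sd(res)), add = TRUE, lwd = 2)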
# test for multicollinearity
cor(data[, c("mpg", "wt")])
##            mpg         wt
## mpg   1.0000000 -0.8676594
## wt   -0.8676594  1.0000000
cars_model <- lm(mpg ~ ., data = data)
vif(cars_model)
##       cyl      disp        hp      drat        wt      qsec        vs        am
## 15.373833 21.620241  9.832037  3.374620 15.164887  7.527958  4.965873  4.648487
##      gear      carb
##  5.357452  7.908747
The variance inflation factors for cyl, disp, and wt are all > 10, indicating strong multicollinearity among the predictors.
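A common next step, sketched here as my own addition rather than a required part of the homework, is to drop the highest-VIF predictors and confirm that the remaining VIFs fall below the usual cutoff of 10; which predictors to drop is a judgment call, and this is just one reasonable choice.
# sketch: refit without the highest-VIF predictors (cyl, disp, wt) and recheck
reduced_model <- lm(mpg ~ hp + drat + qsec + vs + am + gear + carb, data = data)
vif(reduced_model)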
# load iris data and keep only the four numeric measurement columns
data(iris)
iris <- iris[, 1:4]
set.seed(123) # for reproducibility
iris_missing <- iris
n <- 40
# sample (row, column) positions with replacement; duplicated pairs mean
# slightly fewer than n unique cells may end up as NA
random_row_indices <- sample(1:nrow(iris), n, replace = TRUE)
random_col_indices <- sample(1:ncol(iris), n, replace = TRUE)
for (i in 1:length(random_row_indices)) {
  iris_missing[random_row_indices[i], random_col_indices[i]] <- NA
}
# check for missing values
print(colSums(is.na(iris_missing)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 9 12 8 7
# check for patterns of missingness
md.pattern(iris_missing)
##     Petal.Width Petal.Length Sepal.Length Sepal.Width
## 121           1            1            1           1  0
##   9           1            1            1           0  1
##   5           1            1            0           1  1
##   1           1            1            0           0  2
##   5           1            0            1           1  1
##   2           1            0            0           1  2
##   4           0            1            1           1  1
##   1           0            1            1           0  2
##   1           0            1            0           0  3
##   1           0            0            1           1  2
##               7            8            9          12 36
Each row of the pattern matrix is a missingness pattern (1 = observed, 0 = missing): 121 rows are complete, the missing cells are spread across all four variables, and no single pattern dominates, which is what we would expect given that the NAs were inserted at random positions.
# impute missing values with predictive mean matching (pmm), 5 imputed datasets
imputed_data <- mice(iris_missing, m = 5, maxit = 50, method = "pmm", printFlag = FALSE)
complete_data <- complete(imputed_data)
# compare original and imputed data
original <- c()
imputed <- c()
for (i in 1:length(random_row_indices)) {
  original <- c(original, iris[random_row_indices[i], random_col_indices[i]])
  imputed <- c(imputed, complete_data[random_row_indices[i], random_col_indices[i]])
}
print(data.frame(original, imputed))
## original imputed
## 1 4.3 4.4
## 2 3.3 3.2
## 3 7.7 7.3
## 4 1.3 1.0
## 5 4.3 4.4
## 6 6.7 5.5
## 7 2.5 2.4
## 8 1.2 1.3
## 9 4.4 4.0
## 10 1.4 1.4
## 11 2.4 1.8
## 12 2.5 2.6
## 13 2.8 2.8
## 14 1.6 1.7
## 15 0.3 0.2
## 16 3.4 3.0
## 17 3.0 3.2
## 18 3.8 4.1
## 19 1.3 1.0
## 20 2.1 2.1
## 21 6.5 6.5
## 22 3.0 3.8
## 23 2.7 2.4
## 24 5.4 4.8
## 25 2.5 3.0
## 26 0.3 0.2
## 27 6.3 6.8
## 28 6.1 6.5
## 29 3.6 3.4
## 30 4.9 4.5
## 31 5.6 4.5
## 32 6.9 6.4
## 33 0.2 0.4
## 34 2.2 3.3
## 35 6.1 5.7
## 36 1.4 1.3
## 37 2.2 3.4
## 38 6.7 6.8
## 39 4.2 3.9
## 40 4.4 4.0
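To summarize imputation quality with a single number rather than eyeballing the table (my own addition), the mean absolute difference between the original and imputed values can be computed directly.
# sketch: quantify imputation error as the mean absolute difference
mean(abs(original - imputed))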
# create pair plot (back to the mtcars data)
pairs(data[, c("mpg", "hp", "wt")])
According to the pairs plot, there is a clear negative correlation between mpg and both hp and wt, indicating that heavier cars and those with higher horsepower tend to have lower fuel efficiency. Meanwhile, hp and wt appear to be positively correlated, suggesting that more powerful cars also tend to be heavier.
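To put numbers on these visual impressions (my own addition), the pairwise correlations behind the plot can be computed directly.
# sketch: correlation matrix corresponding to the pairs plot
cor(data[, c("mpg", "hp", "wt")])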
# boxplot of hp column from mtcars dataset
boxplot(data$hp)
# define outlier threshold: 2 median absolute deviations (MAD) from the median
outlier_threshold <- 2 * mad(data$hp)
# show outliers
print(data %>%
filter(abs(hp - median(hp)) > outlier_threshold))
## mpg cyl disp hp drat wt qsec vs am gear carb
## Maserati Bora 15 8 301 335 3.54 3.57 14.6 0 1 5 8
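As a cross-check on the MAD rule (an extra check of mine), the conventional 1.5 * IQR rule flags the same kind of extreme points the boxplot whiskers do.
# sketch: IQR-based outliers, as used by the boxplot whiskers
boxplot.stats(data$hp)$out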
# standardize mpg and wt columns
data_std <- data
data_std[, c("mpg", "wt")] <- scale(data[, c("mpg", "wt")])
# preview standardized data
head(data_std)
##                          mpg cyl disp  hp drat           wt  qsec vs am gear
## Mazda RX4          0.1508848   6  160 110 3.90 -0.610399567 16.46  0  1    4
## Mazda RX4 Wag      0.1508848   6  160 110 3.90 -0.349785269 17.02  0  1    4
## Datsun 710         0.4495434   4  108  93 3.85 -0.917004624 18.61  1  1    4
## Hornet 4 Drive     0.2172534   6  258 110 3.08 -0.002299538 19.44  1  0    3
## Hornet Sportabout -0.2307345   8  360 175 3.15  0.227654255 17.02  0  0    3
## Valiant           -0.3302874   6  225 105 2.76  0.248094592 20.22  1  0    3
##                   carb
## Mazda RX4            4
## Mazda RX4 Wag        4
## Datsun 710           1
## Hornet 4 Drive       1
## Hornet Sportabout    2
## Valiant              1
I used scale() here because I wanted to ensure that the variables are standardized to z-scores, with a mean of 0 and a standard deviation of 1.
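A quick sanity check (my own addition) confirms that the standardized columns really do have mean 0 and standard deviation 1.
# sketch: verify mean ~ 0 and sd = 1 after scaling
colMeans(data_std[, c("mpg", "wt")])
apply(data_std[, c("mpg", "wt")], 2, sd)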
# apply log transformation to pop and gdpPercap of gapminder dataset
data(gapminder)
# filter gapminder dataset
gapminder_2007 <- gapminder %>%
filter(year == 2007)
# plot log transformations
ggplot(gapminder_2007, aes(x = log(pop), y = log(gdpPercap))) +
geom_point(color = "blue", size = 3) +
labs(x = "Population (log)", y = "GDP per Capita (log)",
title = "GDP per Capita vs Population (2007) (Log Transformation)")
# apply Box-Cox transformation to mpg as a function of hp
boxcox_results <- boxcox(mpg ~ hp, data = data)
The Box-Cox transformation plot suggests that the optimal lambda value
is around 0, which corresponds to a log transformation of the data. The
95% confidence interval includes values near 0, meaning a log
transformation (or a small power transformation) would be appropriate to
stabilize variance and improve normality for hp and mpg in the mtcars
dataset.
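To make the choice of transformation explicit (my own sketch, using the x and y components that MASS::boxcox returns), the lambda that maximizes the profile log-likelihood can be extracted; if it is close to 0, the log transform follows naturally.
# sketch: extract the optimal lambda from the Box-Cox profile
best_lambda <- boxcox_results$x[which.max(boxcox_results$y)]
best_lambda
# a best_lambda near 0 supports using log(mpg)
mpg_log <- log(data$mpg)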