## Homework 4: Regression Refresher, Analyzing Data Next Steps, Transformation

1. 5 Assumptions of Regression (0.5 points per assumption)

# load required libraries
library(ggplot2)   # scatterplots and smoothing
library(car)       # durbinWatsonTest(), vif()
library(mice)      # md.pattern(), mice(), complete()
library(dplyr)     # %>% and filter()
library(gapminder) # gapminder dataset
library(MASS)      # boxcox()

# load data set
data <- mtcars

# test linearity
ggplot(data, aes(x = wt, y = mpg)) + 
  geom_point() + 
  geom_smooth(method = "lm", se = FALSE) + 
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

The scatterplot shows a clear downward linear trend between wt and mpg, so the linearity assumption appears to hold.
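
As a quick numeric complement to the scatterplot (my own addition, not required by the prompt), cor.test() from base R quantifies the strength of the linear association:

# quantify the linear association between wt and mpg
cor.test(data$wt, data$mpg)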

# test independence using durbinWatsonTest
cars_lm <- lm(mpg ~ wt, data = data)
cars_dw <- durbinWatsonTest(cars_lm)
cars_dw
##  lag Autocorrelation D-W Statistic p-value
##    1       0.3628798      1.251727    0.03
##  Alternative hypothesis: rho != 0
# validate independence using residuals
residuals <- as.numeric(cars_lm$residuals)
plot(1:length(residuals), residuals, xlab = "Index", ylab = "Residuals", main = "Residuals vs. Index")

The residuals plot does not show any obvious pattern, but the Durbin-Watson test has a p-value < 0.05, suggesting that the independence assumption may be violated.
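
As a supplementary check (an addition beyond the assignment), an autocorrelation plot of the residuals shows which lags drive the Durbin-Watson result; note that the mtcars row order is arbitrary, so any autocorrelation depends on that ordering:

# inspect serial correlation in the residuals at multiple lags
acf(residuals(cars_lm), main = "ACF of Residuals")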

# test homoscedasticity using residuals vs. fitted plot
plot(cars_lm, 1)

The spread of the residuals stays roughly constant across the fitted values, so the homoscedasticity assumption appears to hold.
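
A formal test could back up the visual impression; a minimal sketch using the Breusch-Pagan test, assuming the lmtest package is available (it is not loaded above):

# formal test for heteroscedasticity (H0: constant error variance)
library(lmtest)
bptest(cars_lm)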

# isolate the Q-Q plot to test for normality
plot(cars_lm, 2)

# validate normality with Shapiro-Wilk Normality Test
shapiro.test(cars_lm$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  cars_lm$residuals
## W = 0.94508, p-value = 0.1044

There may be a few potential outliers, but the points all fall fairly close to the reference line. The Shapiro-Wilk test also gives a p-value > 0.05, so we can treat the residuals as approximately normally distributed.
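
A histogram of the residuals gives one more visual check on normality (a supplementary sketch using base R graphics):

# residuals should look roughly bell-shaped and centered at zero
hist(cars_lm$residuals, breaks = 10,
     main = "Histogram of Residuals", xlab = "Residuals")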

# test for multicollinearity
cor(data[, c("mpg", "wt")])
##            mpg         wt
## mpg  1.0000000 -0.8676594
## wt  -0.8676594  1.0000000
cars_model <- lm(mpg ~ ., data = data)
vif(cars_model)
##       cyl      disp        hp      drat        wt      qsec        vs        am 
## 15.373833 21.620241  9.832037  3.374620 15.164887  7.527958  4.965873  4.648487 
##      gear      carb 
##  5.357452  7.908747

In the full model (mpg ~ .), the variance inflation factors for cyl, disp, and wt all exceed 10, indicating strong multicollinearity among the predictors.
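
A common remedy is to drop the predictor with the largest VIF and refit; a minimal sketch (disp has the highest VIF in the output above):

# refit without disp and re-check the inflation factors
cars_model_reduced <- lm(mpg ~ . - disp, data = data)
vif(cars_model_reduced)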

2. Missing Data (1.5 points)

# load iris data
data(iris)
iris <- iris[,1:4]
set.seed(123) # for reproducibility
iris_missing <- iris
n <- 40
# note: sampling with replacement can select the same cell twice,
# so fewer than n unique cells end up missing (36 here, per the counts below)
random_row_indices <- sample(1:nrow(iris), n, replace = TRUE)
random_col_indices <- sample(1:ncol(iris), n, replace = TRUE)
for (i in 1:length(random_row_indices)) {
  iris_missing[random_row_indices[i], random_col_indices[i]] <- NA
}

# check for missing values
print(colSums(is.na(iris_missing)))
## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##            9           12            8            7
# check for patterns of missingness
md.pattern(iris_missing)

##     Petal.Width Petal.Length Sepal.Length Sepal.Width   
## 121           1            1            1           1  0
## 9             1            1            1           0  1
## 5             1            1            0           1  1
## 1             1            1            0           0  2
## 5             1            0            1           1  1
## 2             1            0            0           1  2
## 4             0            1            1           1  1
## 1             0            1            1           0  2
## 1             0            1            0           0  3
## 1             0            0            1           1  2
##               7            8            9          12 36
# impute data
imputed_data <- mice(iris_missing, m = 5, maxit = 50, method = "pmm", printFlag = FALSE)

complete_data <- complete(imputed_data)

# compare original and imputed data
original <- numeric(length(random_row_indices))
imputed <- numeric(length(random_row_indices))
for (i in seq_along(random_row_indices)) {
  original[i] <- iris[random_row_indices[i], random_col_indices[i]]
  imputed[i] <- complete_data[random_row_indices[i], random_col_indices[i]]
}
print(data.frame(original, imputed))
##    original imputed
## 1       4.3     4.4
## 2       3.3     3.2
## 3       7.7     7.3
## 4       1.3     1.0
## 5       4.3     4.4
## 6       6.7     5.5
## 7       2.5     2.4
## 8       1.2     1.3
## 9       4.4     4.0
## 10      1.4     1.4
## 11      2.4     1.8
## 12      2.5     2.6
## 13      2.8     2.8
## 14      1.6     1.7
## 15      0.3     0.2
## 16      3.4     3.0
## 17      3.0     3.2
## 18      3.8     4.1
## 19      1.3     1.0
## 20      2.1     2.1
## 21      6.5     6.5
## 22      3.0     3.8
## 23      2.7     2.4
## 24      5.4     4.8
## 25      2.5     3.0
## 26      0.3     0.2
## 27      6.3     6.8
## 28      6.1     6.5
## 29      3.6     3.4
## 30      4.9     4.5
## 31      5.6     4.5
## 32      6.9     6.4
## 33      0.2     0.4
## 34      2.2     3.3
## 35      6.1     5.7
## 36      1.4     1.3
## 37      2.2     3.4
## 38      6.7     6.8
## 39      4.2     3.9
## 40      4.4     4.0
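
To summarize how close the imputations come to the true values (an addition beyond the assignment), the mean absolute error and the correlation between the two columns can be computed:

# quantify imputation quality across the masked cells
mean(abs(original - imputed)) # mean absolute error
cor(original, imputed)        # agreement between true and imputed values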

3. Pair Plots (2 points)

# create pair plot
pairs(data[, c("mpg", "hp", "wt")])

The pair plot shows a clear negative correlation between mpg and both hp and wt, indicating that heavier cars and those with more horsepower tend to have lower fuel efficiency. Meanwhile, hp and wt appear to be positively correlated, suggesting that more powerful cars also tend to be heavier.
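
If the GGally package is available (an assumption; it is not loaded above), ggpairs() produces the same grid annotated with correlation coefficients:

# pair plot with correlation coefficients in the upper panels
library(GGally)
ggpairs(data[, c("mpg", "hp", "wt")])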

4. Outliers (2 points)

# boxplot of hp column from mtcars dataset
boxplot(data$hp)

# define outlier threshold; mad() is scaled by 1.4826 by default,
# making it a robust estimate of the standard deviation
outlier_threshold <- 2 * mad(data$hp)

# show outliers
print(data %>%
        filter(abs(hp - median(hp)) > outlier_threshold))
##               mpg cyl disp  hp drat   wt qsec vs am gear carb
## Maserati Bora  15   8  301 335 3.54 3.57 14.6  0  1    5    8
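
As a cross-check (an addition beyond the assignment), the conventional 1.5 * IQR rule that boxplot() itself uses to flag points can be applied directly:

# flag hp values beyond 1.5 * IQR from the quartiles (the boxplot rule)
q <- quantile(data$hp, c(0.25, 0.75))
lower <- q[1] - 1.5 * IQR(data$hp)
upper <- q[2] + 1.5 * IQR(data$hp)
data %>% filter(hp < lower | hp > upper)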

5. Linear Transformation (1 point)

# standardize mpg and wt columns
data_std <- data
data_std[, c("mpg", "wt")] <- scale(data[, c("mpg", "wt")])

# preview standardized data
head(data_std)
##                          mpg cyl disp  hp drat           wt  qsec vs am gear
## Mazda RX4          0.1508848   6  160 110 3.90 -0.610399567 16.46  0  1    4
## Mazda RX4 Wag      0.1508848   6  160 110 3.90 -0.349785269 17.02  0  1    4
## Datsun 710         0.4495434   4  108  93 3.85 -0.917004624 18.61  1  1    4
## Hornet 4 Drive     0.2172534   6  258 110 3.08 -0.002299538 19.44  1  0    3
## Hornet Sportabout -0.2307345   8  360 175 3.15  0.227654255 17.02  0  0    3
## Valiant           -0.3302874   6  225 105 2.76  0.248094592 20.22  1  0    3
##                   carb
## Mazda RX4            4
## Mazda RX4 Wag        4
## Datsun 710           1
## Hornet 4 Drive       1
## Hornet Sportabout    2
## Valiant              1

I used scale() here because it standardizes both columns in one step, guaranteeing that each variable has a mean of 0 and a standard deviation of 1.
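
A quick sanity check confirms the standardization (a minimal sketch):

# verify mean ~ 0 and sd = 1 for the standardized columns
colMeans(data_std[, c("mpg", "wt")])
apply(data_std[, c("mpg", "wt")], 2, sd)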

6. Log Transformation (1 point)

# apply log transformation to pop and gdpPercap of the gapminder dataset
# (the dataset is attached via library(gapminder) above)
data(gapminder)

# filter gapminder dataset
gapminder_2007 <- gapminder %>%
  filter(year == 2007)

# plot log transformations
ggplot(gapminder_2007, aes(x = log(pop), y = log(gdpPercap))) +
         geom_point(color = "blue", size = 3) + 
         labs(x = "Population (log)", y = "GDP per Capita (log)",
              title = "GDP per Capita vs Population (2007) (Log Transformation)")

7. Box Cox Transformation (1 point)

# apply Box-Cox transformation to the response (mpg) in the model mpg ~ hp
boxcox_results <- boxcox(mpg ~ hp, data = data)

The Box-Cox plot suggests that the optimal lambda is around 0, which corresponds to a log transformation of the response. Since the 95% confidence interval includes values near 0, a log transformation (or a small power transformation) of mpg would be appropriate to stabilize variance and improve normality in the mpg ~ hp regression on the mtcars dataset.
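
To act on this, the lambda that maximizes the profile log-likelihood can be extracted from the boxcox() output and applied (a minimal sketch; the 0.05 cutoff for "close enough to zero" is my own choice):

# extract the lambda that maximizes the profile log-likelihood
lambda <- boxcox_results$x[which.max(boxcox_results$y)]
lambda

# apply the log transform if lambda is near 0, else the power transform
mpg_transformed <- if (abs(lambda) < 0.05) log(data$mpg) else (data$mpg^lambda - 1) / lambda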