library(kknn)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# Load the built-in mtcars data set and print summary statistics per column.
data("mtcars")
summary(object = mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
The scatterplot shows a negative correlation between mpg and hp, suggesting that cars with higher horsepower tend to get fewer miles per gallon.
# Scatterplot with mpg on the x axis and hp on the y axis
ggplot(data = mtcars, mapping = aes(x = mpg, y = hp)) +
  geom_point() +
  labs(title = "Scatterplot: mpg vs. hp", x = "mpg", y = "hp")
The histogram shows the distribution of the mpg variable.
# Histogram of mpg using 5-unit-wide bins.
# The title names the plotted variable (mpg), and the y axis is labelled
# "Frequency" because each bar height is a count of cars in that bin.
ggplot(mtcars, aes(x = mpg)) +
  geom_histogram(binwidth = 5, fill = "navy", color = "white") +
  labs(title = "Histogram: mpg", x = "mpg", y = "Frequency")
Identify any missing data and impute it with the column mean.
# Count the missing values in every column; the result is a one-row data
# frame with one column per variable.
# across() is used in place of the superseded summarise_all().
missing_vals <- mtcars %>%
  summarise(across(everything(), ~ sum(is.na(.x))))
print(missing_vals)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 0 0 0 0 0 0 0 0 0 0 0
# Impute with the mean: in every column, replace each NA with that column's
# mean computed over the non-missing values.
# across() is used in place of the superseded mutate_all().
# NOTE(review): the models in this file are fitted on `mtcars`, not on
# `mean_impute` -- confirm that is intended.
mean_impute <- mtcars %>%
  mutate(across(
    everything(),
    ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)
  ))
# Multiple linear regression of mpg on every other column of mtcars
lg_model <- lm(formula = mpg ~ ., data = mtcars)
# Interpreting the coefficients
summary(object = lg_model)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
# Evaluate the model using MSE (Mean Squared Error); predictions are made on
# the same rows of mtcars that are passed as newdata.
predictions <- predict(object = lg_model, newdata = mtcars)
mse <- mean((predictions - mtcars[["mpg"]])^2)
# Report the MSE
cat("Mean Squared Error (MSE):", mse, "\n")
## Mean Squared Error (MSE): 4.609201
k <- 5 # number of neighbours
# mtcars is passed as both the training and the test set
knn_fit_std <- kknn(formula = mpg ~ ., train = mtcars, test = mtcars, k = k)
# Data frame pairing each actual mpg with the corresponding k-NN fitted value
comparison_df <- data.frame(Actual = mtcars[["mpg"]])
comparison_df[["knn_std_predicted"]] <- knn_fit_std[["fitted.values"]]
head(comparison_df)
## Actual knn_std_predicted
## 1 21.0 21.22292
## 2 21.0 21.22292
## 3 22.8 25.63612
## 4 21.4 20.65088
## 5 18.7 17.98029
## 6 18.1 19.93957
# Hold-out evaluation of k-NN: fit on a 70% training split and score each
# candidate k by its mean squared error on the remaining rows.
set.seed(1) # fixed seed so the split is reproducible
n <- nrow(mtcars)
# seq_len() is safe for n == 0; floor() makes the truncation of the
# fractional sample size explicit.
index <- sample(seq_len(n), floor(n * 0.7)) # 70% for training
train <- mtcars[index, ]
test <- mtcars[-index, ]

# Score all candidate k values with vapply() and build the result table once,
# rather than growing a data frame with rbind() inside a loop. Working inside
# a function also keeps the top-level `k` and `mse` objects from being
# overwritten by the sweep.
k_values <- c(1, 3, 5, 7, 9, 11)
holdout_mse <- vapply(
  k_values,
  function(k_i) {
    fit <- kknn(mpg ~ ., train, test, k = k_i)
    mean((fit$fitted.values - test$mpg)^2)
  },
  numeric(1)
)
mse_df <- data.frame(k = k_values, MSE = holdout_mse)
print(mse_df)
## k MSE
## 1 1 23.725000
## 2 3 15.618086
## 3 5 11.350048
## 4 7 8.676762
## 5 9 6.971479
## 6 11 6.066938
# Predicted-vs-actual scatterplot for the linear regression
plot(
  x = predictions, y = mtcars$mpg,
  main = "Linear Regression",
  xlab = "Predicted mpg", ylab = "Actual mpg",
  pch = 19, col = "blue"
)
# Predicted-vs-actual scatterplot for k-NN
plot(
  x = comparison_df$knn_std_predicted, y = comparison_df$Actual,
  main = "KNN",
  xlab = "Predicted mpg", ylab = "Actual mpg",
  pch = 19, col = "blue"
)
The linear regression model does not seem to capture some non-linear patterns, whereas the KNN model appears to capture them more accurately. Although KNN is more sensitive to the choice of k, it seems like the better option for this dataset. Note that both plots use predictions made on the same data the models were fitted to, so this comparison is in-sample.
Smaller values of k are more sensitive to local patterns but are prone to high variance due to outliers. Larger values of k are less sensitive to outliers but more prone to missing local patterns.
Linear regression assumes that the relationship between the predictors and the response is linear. This assumption appears to be met in this dataset, as the scatterplot shows a roughly linear relationship.
# Add the drat-by-vs product as a new column of mtcars, then regress mpg on
# both variables and their product. This replaces the earlier lg_model.
mtcars[["interaction_term"]] <- with(mtcars, drat * vs)
lg_model <- lm(formula = mpg ~ drat + vs + interaction_term, data = mtcars)
summary(object = lg_model) # check for significance
##
## Call:
## lm(formula = mpg ~ drat + vs + interaction_term, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.1204 -2.1777 0.1506 1.9699 7.1849
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.2128 6.7565 -0.031 0.975
## drat 4.9612 1.9737 2.514 0.018 *
## vs 1.6822 10.6457 0.158 0.876
## interaction_term 1.0212 2.8928 0.353 0.727
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.857 on 28 degrees of freedom
## Multiple R-squared: 0.6301, Adjusted R-squared: 0.5905
## F-statistic: 15.9 on 3 and 28 DF, p-value: 3.146e-06
summary(object = lg_model)[["r.squared"]] # model performance with R^2
## [1] 0.6301176
# Boxplot of mpg; x is the constant 1, so a single box is drawn
ggplot(mtcars, aes(x = 1, y = mpg)) +
  geom_boxplot() +
  ggtitle("Boxplot of mpg") +
  xlab("") +
  ylab("mpg")
# Row positions whose mpg is among the values boxplot.stats() reports in $out
outlier_index <- with(
  mtcars,
  which(is.element(mpg, boxplot.stats(mpg)$out))
)
outlier_index
## integer(0)
The result integer(0) means that no mpg values were flagged as outliers.
Feature scaling affects how KNN calculates distances and can therefore change its accuracy.