# Install and load required library packages
# install.packages("tidyverse")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
Load the data:
data(mtcars)
?mtcars
attach(mtcars)
## The following object is masked from package:ggplot2:
##
## mpg
# first 6 rows
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
#structure
str(mtcars)
## 'data.frame': 32 obs. of 11 variables:
## $ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
## $ cyl : num 6 6 4 6 8 6 8 4 4 6 ...
## $ disp: num 160 160 108 258 360 ...
## $ hp : num 110 110 93 110 175 105 245 62 95 123 ...
## $ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
## $ wt : num 2.62 2.88 2.32 3.21 3.44 ...
## $ qsec: num 16.5 17 18.6 19.4 17 ...
## $ vs : num 0 0 1 1 0 1 0 1 1 1 ...
## $ am : num 1 1 1 0 0 0 0 0 0 0 ...
## $ gear: num 4 4 4 3 3 3 3 4 4 4 ...
## $ carb: num 4 4 1 1 2 1 4 2 2 4 ...
We can see that every variable is numeric. There are 32 observations and 11 variables.
summary(mtcars)
## mpg cyl disp hp
## Min. :10.40 Min. :4.000 Min. : 71.1 Min. : 52.0
## 1st Qu.:15.43 1st Qu.:4.000 1st Qu.:120.8 1st Qu.: 96.5
## Median :19.20 Median :6.000 Median :196.3 Median :123.0
## Mean :20.09 Mean :6.188 Mean :230.7 Mean :146.7
## 3rd Qu.:22.80 3rd Qu.:8.000 3rd Qu.:326.0 3rd Qu.:180.0
## Max. :33.90 Max. :8.000 Max. :472.0 Max. :335.0
## drat wt qsec vs
## Min. :2.760 Min. :1.513 Min. :14.50 Min. :0.0000
## 1st Qu.:3.080 1st Qu.:2.581 1st Qu.:16.89 1st Qu.:0.0000
## Median :3.695 Median :3.325 Median :17.71 Median :0.0000
## Mean :3.597 Mean :3.217 Mean :17.85 Mean :0.4375
## 3rd Qu.:3.920 3rd Qu.:3.610 3rd Qu.:18.90 3rd Qu.:1.0000
## Max. :4.930 Max. :5.424 Max. :22.90 Max. :1.0000
## am gear carb
## Min. :0.0000 Min. :3.000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.:2.000
## Median :0.0000 Median :4.000 Median :2.000
## Mean :0.4062 Mean :3.688 Mean :2.812
## 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :1.0000 Max. :5.000 Max. :8.000
table(mtcars$mpg)
##
## 10.4 13.3 14.3 14.7 15 15.2 15.5 15.8 16.4 17.3 17.8 18.1 18.7 19.2 19.7 21
## 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2
## 21.4 21.5 22.8 24.4 26 27.3 30.4 32.4 33.9
## 2 1 2 1 1 1 2 1 1
table(mtcars$cyl) #categorical
##
## 4 6 8
## 11 7 14
table(mtcars$disp)
##
## 71.1 75.7 78.7 79 95.1 108 120.1 120.3 121 140.8 145 146.7 160
## 1 1 1 1 1 1 1 1 1 1 1 1 2
## 167.6 225 258 275.8 301 304 318 350 351 360 400 440 460
## 2 1 1 3 1 1 1 1 1 2 1 1 1
## 472
## 1
table(mtcars$hp)
##
## 52 62 65 66 91 93 95 97 105 109 110 113 123 150 175 180 205 215 230 245
## 1 1 1 2 1 1 1 1 1 1 3 1 2 2 3 3 1 1 1 2
## 264 335
## 1 1
table(mtcars$drat)
##
## 2.76 2.93 3 3.07 3.08 3.15 3.21 3.23 3.54 3.62 3.69 3.7 3.73 3.77 3.85 3.9
## 2 1 1 3 2 2 1 1 1 1 1 1 1 1 1 2
## 3.92 4.08 4.11 4.22 4.43 4.93
## 3 2 1 2 1 1
table(mtcars$wt)
##
## 1.513 1.615 1.835 1.935 2.14 2.2 2.32 2.465 2.62 2.77 2.78 2.875 3.15
## 1 1 1 1 1 1 1 1 1 1 1 1 1
## 3.17 3.19 3.215 3.435 3.44 3.46 3.52 3.57 3.73 3.78 3.84 3.845 4.07
## 1 1 1 1 3 1 1 2 1 1 1 1 1
## 5.25 5.345 5.424
## 1 1 1
table(mtcars$qsec)
##
## 14.5 14.6 15.41 15.5 15.84 16.46 16.7 16.87 16.9 17.02 17.05 17.3 17.4
## 1 1 1 1 1 1 1 1 1 2 1 1 1
## 17.42 17.6 17.82 17.98 18 18.3 18.52 18.6 18.61 18.9 19.44 19.47 19.9
## 1 1 1 1 1 1 1 1 1 2 1 1 1
## 20 20.01 20.22 22.9
## 1 1 1 1
table(mtcars$vs) #categorical
##
## 0 1
## 18 14
table(mtcars$am) #categorical
##
## 0 1
## 19 13
table(mtcars$gear) #categorical
##
## 3 4 5
## 15 12 5
table(mtcars$carb) #categorical
##
## 1 2 3 4 6 8
## 7 10 3 10 1 1
Looking at the data, there do not seem to be any obvious outliers. However, some variables, such as hp and disp, have very large maximum values compared to the other variables.
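The variables flagged below as categorical (cyl, vs, am, gear, carb) are stored as numerics. If we wanted R to treat them as categorical, a minimal sketch would be the following (working on a copy, mtcars_fct, which is only illustrative so the models later in this report are unchanged):
# Convert the categorical-looking columns to factors in a copy of the data
mtcars_fct <- mtcars
cat_cols <- c("cyl", "vs", "am", "gear", "carb")
mtcars_fct[cat_cols] <- lapply(mtcars_fct[cat_cols], factor)
str(mtcars_fct[cat_cols])  # now listed as factors with their levels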
str(mtcars$mpg)
## num [1:32] 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
class(mtcars$mpg)
## [1] "numeric"
Visual exploration to understand the data set.
ggplot(mtcars, aes(x= mpg)) +
geom_histogram(binwidth=5, color="darkblue", fill="lightblue") +
ggtitle("MPG distribution") +
xlab("Miles/(US) gallon") +
ylab("Frequency")
ggplot(mtcars, aes(x= carb)) +
geom_histogram(binwidth=1, color="darkblue", fill="lightblue") +
ggtitle("Number of carburetors") +
xlab("Carburetors") +
ylab("Frequency")
hist(mtcars$cyl, col = 'blue', border = "black", xlab = 'Cyl', ylab = 'Frequency', main = 'Number of cylinders')
hist(mtcars$disp, col = 'blue', border = "black", xlab = 'Disp', ylab = 'Frequency', main = 'Displacement')
hist(mtcars$hp, col = 'blue', border = "black", xlab = 'hp', ylab = 'Frequency', main = 'Gross horsepower')
hist(mtcars$drat, col = 'blue', border = "black", xlab = 'drat', ylab = 'Frequency', main = ' Rear axle ratio')
hist(mtcars$wt, col = 'blue', border = "black", xlab = 'wt', ylab = 'Frequency', main = 'Weight (1000 lbs)')
hist(mtcars$qsec, col = 'blue', border = "black", xlab = 'qsec', ylab = 'Frequency', main = '1/4 mile time Distribution')
ggplot(mtcars, aes(x=wt, y=mpg)) + geom_point(color="blue") +
ggtitle("Weight interaction with mpg")
ggplot(mtcars, aes(x=disp, y=mpg)) + geom_point(color="blue") +
ggtitle("Displacement interaction with mpg")
ggplot(mtcars, aes(x=hp, y=mpg)) + geom_point(color="blue") +
ggtitle("Gross horsepower interaction with mpg")
ggplot(mtcars, aes(x=drat, y=mpg)) + geom_point(color="blue") +
ggtitle("Rear axle ratio interaction with mpg")
ggplot(mtcars, aes(x=hp, y=wt)) + geom_point(color="blue") +
ggtitle(" Gross horsepower interaction with wt")
ggplot(mtcars, aes(x=qsec, y=mpg)) + geom_point(color="blue") +
ggtitle("Weight interaction with mpg")
boxplot(mpg ~ cyl, data = mtcars,
main = "Mileage and cylinders Boxplot",
ylab = "Miles Per Gallon",
xlab = "Cylinders",
col = "yellow")
boxplot(mpg ~ vs, data = mtcars,
main = "Mileage and Engine Boxplot",
ylab = "Miles Per Gallon",
xlab = "Engine",
col = "yellow")
boxplot(mpg ~ carb, data = mtcars,
main = "Mileage and Carbureators Boxplot",
ylab = "Miles Per Gallon",
xlab = "Num. Carbureators",
col = "yellow")
boxplot(hp ~ carb, data = mtcars,
main = "Horsepower and Carbureators Boxplot",
ylab = "Gross Horsepower",
xlab = "Num. Carbureators",
col = "yellow")
corr_matrix <- cor(mtcars)
print(corr_matrix)
## mpg cyl disp hp drat wt
## mpg 1.0000000 -0.8521620 -0.8475514 -0.7761684 0.68117191 -0.8676594
## cyl -0.8521620 1.0000000 0.9020329 0.8324475 -0.69993811 0.7824958
## disp -0.8475514 0.9020329 1.0000000 0.7909486 -0.71021393 0.8879799
## hp -0.7761684 0.8324475 0.7909486 1.0000000 -0.44875912 0.6587479
## drat 0.6811719 -0.6999381 -0.7102139 -0.4487591 1.00000000 -0.7124406
## wt -0.8676594 0.7824958 0.8879799 0.6587479 -0.71244065 1.0000000
## qsec 0.4186840 -0.5912421 -0.4336979 -0.7082234 0.09120476 -0.1747159
## vs 0.6640389 -0.8108118 -0.7104159 -0.7230967 0.44027846 -0.5549157
## am 0.5998324 -0.5226070 -0.5912270 -0.2432043 0.71271113 -0.6924953
## gear 0.4802848 -0.4926866 -0.5555692 -0.1257043 0.69961013 -0.5832870
## carb -0.5509251 0.5269883 0.3949769 0.7498125 -0.09078980 0.4276059
## qsec vs am gear carb
## mpg 0.41868403 0.6640389 0.59983243 0.4802848 -0.55092507
## cyl -0.59124207 -0.8108118 -0.52260705 -0.4926866 0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692 0.39497686
## hp -0.70822339 -0.7230967 -0.24320426 -0.1257043 0.74981247
## drat 0.09120476 0.4402785 0.71271113 0.6996101 -0.09078980
## wt -0.17471588 -0.5549157 -0.69249526 -0.5832870 0.42760594
## qsec 1.00000000 0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs 0.74453544 1.0000000 0.16834512 0.2060233 -0.56960714
## am -0.22986086 0.1683451 1.00000000 0.7940588 0.05753435
## gear -0.21268223 0.2060233 0.79405876 1.0000000 0.27407284
## carb -0.65624923 -0.5696071 0.05753435 0.2740728 1.00000000
library(corrplot)
## corrplot 0.92 loaded
corrplot(corr_matrix, method="circle", type="upper", order="hclust",
tl.col="black", tl.srt=45)
colSums(is.na(mtcars))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 0 0 0 0 0 0 0 0 0 0 0
There is no missing data in the data set.
boxplot(mtcars, las=2, cex.axis=0.6)
All of the variables look reasonable, except hp and disp, which take much larger values and show at least one apparent outlier. Therefore we will create rescaled versions of these variables.
summary(mtcars$disp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 71.1 120.8 196.3 230.7 326.0 472.0
Range standardization rescales every value to lie between 0 and 1. We will apply it to hp and disp.
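As a small sketch, the same transformation can be written as a reusable helper (the name range01 is ours, not from any package):
# Map a numeric vector onto [0, 1]: (x - min) / (max - min)
range01 <- function(x) (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE))
# For example, summary(range01(mtcars$hp)) runs from 0 to 1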
Let’s fit a linear regression model on the data with all the variables.
data(mtcars)
mtcars_lm <- lm(mpg ~ ., data = mtcars)
summary(mtcars_lm)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
What if we standardize some variables? Does the model change?
min_disp <- min(mtcars$disp)
max_disp <- max(mtcars$disp)
mtcars$disp <- (mtcars$disp - min_disp) / (max_disp - min_disp)
summary(mtcars$disp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1240 0.3123 0.3982 0.6358 1.0000
mtcars_lm1 <- lm(mpg ~ ., data = mtcars)
summary(mtcars_lm1)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.25151 18.73849 0.707 0.4872
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 5.34610 7.15907 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
We can observe that the standardized variable has the same t value and p-value; only the coefficient and its standard error are rescaled.
mtcars_lm_qsec <- lm(mpg ~ qsec , data = mtcars)
summary(mtcars_lm_qsec)
##
## Call:
## lm(formula = mpg ~ qsec, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8760 -3.4539 -0.7203 2.2774 11.6491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.1140 10.0295 -0.510 0.6139
## qsec 1.4121 0.5592 2.525 0.0171 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.564 on 30 degrees of freedom
## Multiple R-squared: 0.1753, Adjusted R-squared: 0.1478
## F-statistic: 6.377 on 1 and 30 DF, p-value: 0.01708
On its own, qsec is a significant predictor of miles per gallon.
mtcars_lm_vs <- lm(mpg ~ vs , data = mtcars)
summary(mtcars_lm_vs)
##
## Call:
## lm(formula = mpg ~ vs, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.757 -3.082 -1.267 2.828 9.383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 16.617 1.080 15.390 8.85e-16 ***
## vs 7.940 1.632 4.864 3.42e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.581 on 30 degrees of freedom
## Multiple R-squared: 0.4409, Adjusted R-squared: 0.4223
## F-statistic: 23.66 on 1 and 30 DF, p-value: 3.416e-05
mtcars_lm_wt <- lm(mpg ~ wt , data = mtcars)
summary(mtcars_lm_wt)
##
## Call:
## lm(formula = mpg ~ wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5432 -2.3647 -0.1252 1.4096 6.8727
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2851 1.8776 19.858 < 2e-16 ***
## wt -5.3445 0.5591 -9.559 1.29e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.046 on 30 degrees of freedom
## Multiple R-squared: 0.7528, Adjusted R-squared: 0.7446
## F-statistic: 91.38 on 1 and 30 DF, p-value: 1.294e-10
Run separately, these predictors appear to be significant. However, when they are all included together, most of the variables are no longer significant. Let's try a model with the categorical variables.
mtcars_lm_cat <- lm(mpg ~ vs + am + gear + carb + cyl , data = mtcars)
summary(mtcars_lm_cat)
##
## Call:
## lm(formula = mpg ~ vs + am + gear + carb + cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5403 -1.1582 0.2528 1.2787 5.5597
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 25.1591 7.7570 3.243 0.00323 **
## vs 0.8784 1.9957 0.440 0.66347
## am 3.5989 1.8694 1.925 0.06522 .
## gear 1.2516 1.4730 0.850 0.40323
## carb -1.4071 0.5368 -2.621 0.01444 *
## cyl -1.2239 0.7510 -1.630 0.11521
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.808 on 26 degrees of freedom
## Multiple R-squared: 0.8179, Adjusted R-squared: 0.7829
## F-statistic: 23.36 on 5 and 26 DF, p-value: 7.418e-09
Now only carb appears to be significant.
Let's create another model using a categorical variable, a continuous variable and their interaction.
mtcars_lm_int <- lm(mpg ~ hp + cyl + hp*cyl, data = mtcars)
summary(mtcars_lm_int)
##
## Call:
## lm(formula = mpg ~ hp + cyl + hp * cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.778 -1.969 -0.228 1.403 6.491
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 50.751207 6.511686 7.794 1.72e-08 ***
## hp -0.170680 0.069102 -2.470 0.019870 *
## cyl -4.119140 0.988229 -4.168 0.000267 ***
## hp:cyl 0.019737 0.008811 2.240 0.033202 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.974 on 28 degrees of freedom
## Multiple R-squared: 0.7801, Adjusted R-squared: 0.7566
## F-statistic: 33.11 on 3 and 28 DF, p-value: 2.386e-09
All of them are significant!
lm_mse <- mean((mtcars_lm$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for Linear Model:", round(lm_mse, 2)))
## [1] "Mean Squared Error for Linear Model: 4.61"
The MSE is 4.61. Since it is not very high, the model does not have a lot of error.
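Since we will compute this quantity repeatedly below, the formula mean((actual - predicted)^2) could be wrapped in a small helper (the name mse_of is ours, added for illustration):
# Mean squared error of a set of predictions
mse_of <- function(actual, predicted) mean((actual - predicted)^2)
# mse_of(mtcars$mpg, mtcars_lm$fitted.values) reproduces the value above (about 4.61)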
We can compile the actual and predicted values and view the first six records.
# Create a data frame to compare actual and predicted values
comparison_df <- data.frame(Actual = mtcars$mpg, lm_predicted = mtcars_lm$fitted.values)
head(comparison_df)
## Actual lm_predicted
## Mazda RX4 21.0 22.59951
## Mazda RX4 Wag 21.0 22.11189
## Datsun 710 22.8 26.25064
## Hornet 4 Drive 21.4 21.23740
## Hornet Sportabout 18.7 17.69343
## Valiant 18.1 20.38304
The predicted values track the actual values closely.
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(mtcars_lm)
As we can see in these plots, the residuals are fairly evenly spread around zero across the range of fitted values. This agrees with the low MSE: the model does not have much error.
# Run library
library(kknn)
# Perform K-NN
k <- 5 # Number of neighbors
knn_fit_std_cars <- kknn(mpg ~ ., mtcars, mtcars, k=k)
summary(knn_fit_std_cars)
##
## Call:
## kknn(formula = mpg ~ ., train = mtcars, test = mtcars, k = k)
##
## Response: "continuous"
We can now add our KNN predicted values to the data frame that contains linear model predicted values and true values.
# Create a data frame to compare actual and predicted values
comparison_df$knn_std_predicted <- knn_fit_std_cars$fitted.values
head(comparison_df)
## Actual lm_predicted knn_std_predicted
## Mazda RX4 21.0 22.59951 21.22292
## Mazda RX4 Wag 21.0 22.11189 21.22292
## Datsun 710 22.8 26.25064 25.63612
## Hornet 4 Drive 21.4 21.23740 20.65088
## Hornet Sportabout 18.7 17.69343 17.98029
## Valiant 18.1 20.38304 19.93957
knn_std_mse <- mean((knn_fit_std_cars$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse, 2)))
## [1] "Mean Squared Error for stdKNN: 2.16"
The MSE is lower than for the linear regression model.
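To make the comparison explicit, we can put the two training errors side by side, reusing the lm_mse and knn_std_mse objects computed above:
# Side-by-side comparison of training-set MSE for the two models
data.frame(model = c("Linear regression", "KNN (k = 5, scaled)"),
           MSE = round(c(lm_mse, knn_std_mse), 2))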
#scale = FALSE, if you don't want to standardize the model
knn_fit_cars <- kknn(mpg ~ ., mtcars, mtcars, k=k, scale = FALSE)
# Create a data frame to compare actual and predicted values
comparison_df$knn_nostd_predicted_cars <- knn_fit_cars$fitted.values
head(comparison_df)
## Actual lm_predicted knn_std_predicted
## Mazda RX4 21.0 22.59951 21.22292
## Mazda RX4 Wag 21.0 22.11189 21.22292
## Datsun 710 22.8 26.25064 25.63612
## Hornet 4 Drive 21.4 21.23740 20.65088
## Hornet Sportabout 18.7 17.69343 17.98029
## Valiant 18.1 20.38304 19.93957
## knn_nostd_predicted_cars
## Mazda RX4 21.85512
## Mazda RX4 Wag 21.33473
## Datsun 710 23.30812
## Hornet 4 Drive 21.22027
## Hornet Sportabout 18.33042
## Valiant 19.78832
knn_mse_cars_nonstd <- mean((knn_fit_cars$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for KNN:", round(knn_mse_cars_nonstd, 2)))
## [1] "Mean Squared Error for KNN: 2.67"
The MSE without standardized data (2.67) is larger than with standardized data. Therefore it is better to standardize the data.
Let's try other values of k.
# Initialize a dataframe to store MSE for each k
mse_df <- data.frame(k = integer(), MSE = numeric())
for (k in c(1, 3, 5, 7, 9, 11)) {
# Fit the k-NN model using kknn function
knn_model_k <- kknn(mpg ~ ., train = mtcars, test = mtcars, k = k)
# Calculate the MSE
mse <- mean((knn_model_k$fitted.values - mtcars$mpg)^2)
mse_df <- rbind(mse_df, data.frame(k = k, MSE = mse))
}
# Show the MSE dataframe
print(mse_df)
## k MSE
## 1 1 0.0000000
## 2 3 0.8303497
## 3 5 2.1607451
## 4 7 3.4229878
## 5 9 4.3303489
## 6 11 4.9779654
We can see that trying different values of k changes the MSE: the larger the k, the larger the MSE, and vice versa.
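A quick plot of the table above makes the pattern easier to see (this plot is an addition, reusing the mse_df data frame just built):
# Training-set MSE grows as the number of neighbors grows
ggplot(mse_df, aes(x = k, y = MSE)) +
  geom_line(color = "darkblue") +
  geom_point(color = "darkblue") +
  ggtitle("Training-set MSE by number of neighbors (k)")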
As we saw above, the smaller the value of k, the smaller the MSE; in fact the MSE of 0 at k = 1 simply reflects that, when we predict the training data itself, each point's nearest neighbor is the point itself. A small value of k provides the most flexible fit, which has low bias but high variance. The variance comes from the fact that the prediction in a given region depends entirely on just one observation. Therefore, a small k does not necessarily mean the model is better, because it relies on a single neighbor rather than on several.
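To see this without the optimism of predicting the training data itself, a minimal leave-one-out sketch can be run, where each car is predicted from the remaining 31. This is an illustration added here, not part of the original analysis, and its numbers will differ from the table above:
# Leave-one-out cross-validation: predict each row from all the other rows
ks <- c(1, 3, 5, 7, 9, 11)
loocv_mse <- sapply(ks, function(k) {
  preds <- sapply(seq_len(nrow(mtcars)), function(i) {
    fit <- kknn(mpg ~ ., train = mtcars[-i, ], test = mtcars[i, , drop = FALSE], k = k)
    fit$fitted.values
  })
  mean((preds - mtcars$mpg)^2)
})
data.frame(k = ks, LOOCV_MSE = round(loocv_mse, 2))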
# Diagnostic Plots
par(mfrow = c(2, 2))
plot(mtcars_lm)
When we use linear regression we assume that there is a linear relationship between the response and the predictors. Based on this data set and the plots we created, we can assume an approximately linear relationship between mpg and the other variables. Looking at the normal Q-Q plot, for example, the residuals deviate from the line in the upper tail, suggesting some positive skew, but overall the assumptions appear reasonably well met, which is why this model works well for this data set.
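As a quick numeric companion to the Q-Q plot, one could also test the residuals directly (a supplementary check, not part of the original write-up):
# Shapiro-Wilk test of the residuals from the full model;
# a large p-value is consistent with approximately normal residuals
shapiro.test(residuals(mtcars_lm))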
# linear regression with miles per gallon and rear axle ratio and qsec interaction
mtcars_lm_drqs <- lm(mpg ~ drat + qsec + drat*qsec, data = mtcars)
summary(mtcars_lm_drqs)
##
## Call:
## lm(formula = mpg ~ drat + qsec + drat * qsec, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.2311 -2.5841 -0.0963 2.1933 10.2717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.3372 68.7326 0.179 0.859
## drat -3.4371 18.2960 -0.188 0.852
## qsec -1.0241 3.8196 -0.268 0.791
## drat:qsec 0.5973 1.0142 0.589 0.561
##
## Residual standard error: 4.025 on 28 degrees of freedom
## Multiple R-squared: 0.5972, Adjusted R-squared: 0.554
## F-statistic: 13.84 on 3 and 28 DF, p-value: 1.014e-05
#lm model comparing the effect of cylinders and displacement to miles per gallon
mtcars_lm_int <- lm(mpg ~ disp + cyl + disp*cyl, data = mtcars)
summary(mtcars_lm_int)
##
## Call:
## lm(formula = mpg ~ disp + cyl + disp * cyl, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0809 -1.6054 -0.2948 1.0546 5.7981
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.6903 3.1676 12.214 9.8e-13 ***
## disp -58.3413 16.0370 -3.638 0.00110 **
## cyl -2.2780 0.6561 -3.472 0.00170 **
## disp:cyl 6.3558 1.9836 3.204 0.00337 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.66 on 28 degrees of freedom
## Multiple R-squared: 0.8241, Adjusted R-squared: 0.8052
## F-statistic: 43.72 on 3 and 28 DF, p-value: 1.078e-10
#lm model comparing the effect of weight and gross horsepower to miles per gallon
mtcars_lm_hp_wt <- lm(mpg ~ hp + wt + hp*wt, data = mtcars)
summary(mtcars_lm_hp_wt)
##
## Call:
## lm(formula = mpg ~ hp + wt + hp * wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0632 -1.6491 -0.7362 1.4211 4.5513
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 49.80842 3.60516 13.816 5.01e-14 ***
## hp -0.12010 0.02470 -4.863 4.04e-05 ***
## wt -8.21662 1.26971 -6.471 5.20e-07 ***
## hp:wt 0.02785 0.00742 3.753 0.000811 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.153 on 28 degrees of freedom
## Multiple R-squared: 0.8848, Adjusted R-squared: 0.8724
## F-statistic: 71.66 on 3 and 28 DF, p-value: 2.981e-13
As we can see, when we looked at the interaction of rear axle ratio and 1/4 mile time, the interaction term was not significant, which means it does not help explain mpg. However, in the other two models, cylinders with displacement (which the correlation matrix already showed to be highly correlated) and weight with horsepower, the variables and their interactions were all significant. In the third model, the R-squared value of 0.8848 indicates that weight, horsepower and their interaction explain about 88.48% of the variance in mpg. The overall p-value below 0.05 tells us that at least one coefficient differs from zero, so the model is statistically significant; here both variables and their interaction are significant. Based on the main-effect coefficients, increasing horsepower by 100 is associated with a drop of about 12 mpg, and each additional 1000 lbs of weight with a drop of about 8.2 mpg, while the positive hp:wt interaction softens both effects for heavier, more powerful cars.
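Because of the hp:wt interaction, the slope of hp depends on the car's weight. A small sketch evaluating the implied hp slope at a few weights (the values 2, 3 and 4 thousand lbs are just illustrative):
# d(mpg)/d(hp) = beta_hp + beta_hp:wt * wt, evaluated at a few weights
b <- coef(mtcars_lm_hp_wt)
wt_grid <- c(2, 3, 4)  # weight in 1000 lbs
data.frame(wt = wt_grid,
           hp_slope = round(b["hp"] + b["hp:wt"] * wt_grid, 4))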
mtcars_lm_noint <- lm(mpg ~ hp + wt , data = mtcars)
summary(mtcars_lm_noint)
##
## Call:
## lm(formula = mpg ~ hp + wt, data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.941 -1.600 -0.182 1.050 5.854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.22727 1.59879 23.285 < 2e-16 ***
## hp -0.03177 0.00903 -3.519 0.00145 **
## wt -3.87783 0.63273 -6.129 1.12e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.593 on 29 degrees of freedom
## Multiple R-squared: 0.8268, Adjusted R-squared: 0.8148
## F-statistic: 69.21 on 2 and 29 DF, p-value: 9.109e-12
Now that we have fit a linear regression model with no interaction, we can see that the R-squared is lower: this model explains about 82.68% of the variance in mpg. However, both hp and wt are still significant.
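A formal way to compare the models with and without the interaction is an F test on the nested models (a supplementary check):
# Nested-model F test: does adding hp:wt significantly improve the fit?
anova(mtcars_lm_noint, mtcars_lm_hp_wt)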
#load data again
data(mtcars)
boxplot(mtcars, las=2, cex.axis=0.6)
Looking at this boxplot we can see that hp probably has one outlier and that both hp and disp take much larger values compared to the other variables. Therefore we will winsorize both variables, but first we will run a linear regression model and KNN to see whether it changes the outcome.
data(mtcars)
mtcars_lm <- lm(mpg ~ ., data = mtcars)
summary(mtcars_lm)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4506 -1.6044 -0.1196 1.2193 4.6271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.30337 18.71788 0.657 0.5181
## cyl -0.11144 1.04502 -0.107 0.9161
## disp 0.01334 0.01786 0.747 0.4635
## hp -0.02148 0.02177 -0.987 0.3350
## drat 0.78711 1.63537 0.481 0.6353
## wt -3.71530 1.89441 -1.961 0.0633 .
## qsec 0.82104 0.73084 1.123 0.2739
## vs 0.31776 2.10451 0.151 0.8814
## am 2.52023 2.05665 1.225 0.2340
## gear 0.65541 1.49326 0.439 0.6652
## carb -0.19942 0.82875 -0.241 0.8122
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.65 on 21 degrees of freedom
## Multiple R-squared: 0.869, Adjusted R-squared: 0.8066
## F-statistic: 13.93 on 10 and 21 DF, p-value: 3.793e-07
summary(mtcars$hp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 52.0 96.5 123.0 146.7 180.0 335.0
We can observe that hp has a high maximum. Let's winsorize hp and disp.
# Calculate the 5th and 95th percentiles for 'hp'
lower_bound_hp <- quantile(mtcars$hp, 0.05, na.rm = TRUE)
upper_bound_hp <- quantile(mtcars$hp, 0.95, na.rm = TRUE)
# Winsorize the data
mtcars$hp[mtcars$hp < lower_bound_hp] <- lower_bound_hp
mtcars$hp[mtcars$hp > upper_bound_hp] <- upper_bound_hp
summary(mtcars$hp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 63.65 96.50 123.00 144.23 180.00 253.55
We can see that the minimum and maximum have been pulled in toward the 5th and 95th percentiles; the winsorization worked.
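Since we repeat this capping for disp below (and again later for the KNN model), the same logic could be wrapped in a small helper (the name winsorize05 is ours, added for illustration):
# Cap a numeric vector at its 5th and 95th percentiles
winsorize05 <- function(x) {
  bounds <- quantile(x, c(0.05, 0.95), na.rm = TRUE)
  pmin(pmax(x, bounds[1]), bounds[2])
}
# e.g. mtcars$disp <- winsorize05(mtcars$disp) would replicate the steps below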
# Calculate the 5th and 95th percentiles for 'disp'
lower_bound_disp <- quantile(mtcars$disp, 0.05, na.rm = TRUE)
upper_bound_disp <- quantile(mtcars$disp, 0.95, na.rm = TRUE)
# Winsorize the data
mtcars$disp[mtcars$disp < lower_bound_disp] <- lower_bound_disp
mtcars$disp[mtcars$disp > upper_bound_disp] <- upper_bound_disp
mtcars_lm <- lm(mpg ~ ., data = mtcars)
summary(mtcars_lm)
##
## Call:
## lm(formula = mpg ~ ., data = mtcars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3638 -1.5102 -0.1574 1.0032 4.6336
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.82127 18.77505 0.789 0.4387
## cyl -0.22715 1.04397 -0.218 0.8299
## disp 0.01685 0.01809 0.931 0.3622
## hp -0.02959 0.02553 -1.159 0.2594
## drat 1.04159 1.62022 0.643 0.5273
## wt -3.67611 1.80306 -2.039 0.0543 .
## qsec 0.71683 0.74474 0.963 0.3467
## vs 0.19506 2.04433 0.095 0.9249
## am 2.26933 2.03967 1.113 0.2785
## gear 0.52738 1.47401 0.358 0.7241
## carb -0.21180 0.77065 -0.275 0.7861
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.623 on 21 degrees of freedom
## Multiple R-squared: 0.8717, Adjusted R-squared: 0.8105
## F-statistic: 14.26 on 10 and 21 DF, p-value: 3.097e-07
After winsorizing we can see that the p-value for both variables is smaller, meaning they are closer to significance than before: disp moves from 0.4635 to 0.3622 and hp from 0.3350 to 0.2594. However, neither is statistically significant in the full model, so we still cannot conclude that they have an effect on mpg.
#load data again
data(mtcars)
# Perform K-NN
k <- 5 # Number of neighbors
knn_fit_std_cars <- kknn(mpg ~ ., mtcars, mtcars, k=k)
summary(knn_fit_std_cars)
##
## Call:
## kknn(formula = mpg ~ ., train = mtcars, test = mtcars, k = k)
##
## Response: "continuous"
Evaluate the model
knn_std_mse <- mean((knn_fit_std_cars$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse, 2)))
## [1] "Mean Squared Error for stdKNN: 2.16"
Now we will winsorize the data again to see whether the KNN model changes. Winsorize hp and disp:
# Calculate the 5th and 95th percentiles for 'hp'
lower_bound_hp <- quantile(mtcars$hp, 0.05, na.rm = TRUE)
upper_bound_hp <- quantile(mtcars$hp, 0.95, na.rm = TRUE)
# Winsorize the data
mtcars$hp[mtcars$hp < lower_bound_hp] <- lower_bound_hp
mtcars$hp[mtcars$hp > upper_bound_hp] <- upper_bound_hp
# Calculate the 5th and 95th percentiles for 'disp'
lower_bound_disp <- quantile(mtcars$disp, 0.05, na.rm = TRUE)
upper_bound_disp <- quantile(mtcars$disp, 0.95, na.rm = TRUE)
# Winsorize the data
mtcars$disp[mtcars$disp < lower_bound_disp] <- lower_bound_disp
mtcars$disp[mtcars$disp > upper_bound_disp] <- upper_bound_disp
Let's perform KNN again:
# Perform K-NN
k <- 5 # Number of neighbors
knn_fit_std_cars <- kknn(mpg ~ ., mtcars, mtcars, k=k)
summary(knn_fit_std_cars)
##
## Call:
## kknn(formula = mpg ~ ., train = mtcars, test = mtcars, k = k)
##
## Response: "continuous"
knn_std_mse <- mean((knn_fit_std_cars$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse, 2)))
## [1] "Mean Squared Error for stdKNN: 2.15"
The mean squared error was reduced by 0.01 (from 2.16 to 2.15), so winsorization also improves the KNN model slightly. As we saw before, rescaling does not change the linear regression fit, whereas KNN is sensitive to it: without scaling, the KNN MSE was noticeably larger.
Let's look at the KNN model with explicit feature scaling to see how it affects the model.
#load data again
data(mtcars)
summary(mtcars$hp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 52.0 96.5 123.0 146.7 180.0 335.0
# Perform K-NN
k <- 5 # Number of neighbors
knn_fit_std_cars <- kknn(mpg ~ ., mtcars, mtcars, k=k)
summary(knn_fit_std_cars)
##
## Call:
## kknn(formula = mpg ~ ., train = mtcars, test = mtcars, k = k)
##
## Response: "continuous"
knn_std_mse <- mean((knn_fit_std_cars$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse, 2)))
## [1] "Mean Squared Error for stdKNN: 2.16"
As we observed before, the MSE is 2.16. Let's look at the MSE after feature scaling.
min_disp <- min(mtcars$disp)
max_disp <- max(mtcars$disp)
mtcars$disp <- (mtcars$disp - min_disp) / (max_disp - min_disp)
summary(mtcars$disp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1240 0.3123 0.3982 0.6358 1.0000
min_hp <- min(mtcars$hp)
max_hp <- max(mtcars$hp)
mtcars$hp <- (mtcars$hp - min_hp) / (max_hp - min_hp)
summary(mtcars$hp)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.1572 0.2509 0.3346 0.4523 1.0000
# Perform K-NN
k <- 5 # Number of neighbors
knn_fit_std_cars_stand <- kknn(mpg ~ ., mtcars, mtcars, k=k)
summary(knn_fit_std_cars_stand)
##
## Call:
## kknn(formula = mpg ~ ., train = mtcars, test = mtcars, k = k)
##
## Response: "continuous"
knn_std_mse_stand <- mean((knn_fit_std_cars_stand$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse_stand, 2)))
## [1] "Mean Squared Error for stdKNN: 2.16"
As we can see, manually rescaling the features does not change the MSE, because kknn already scales the predictors by default. However, if we turn scaling off, the model changes.
#scale = FALSE, if you don't want to standardize the model
k <- 5
knn_fit_cars_non <- kknn(mpg ~ ., mtcars, mtcars, k=k, scale = FALSE)
knn_std_mse_nonst <- mean((knn_fit_cars_non$fitted.values - mtcars$mpg)^2)
print(paste("Mean Squared Error for stdKNN:", round(knn_std_mse_nonst, 2)))
## [1] "Mean Squared Error for stdKNN: 2.56"
Now we can see that the MSE is larger when there is no scaling: it increases from 2.16 to 2.56. This is because, without scaling, the variables are no longer on a comparable scale, so the ones with the largest numeric ranges dominate the distance calculation used to choose the neighbors.
kknn standardizes the predictor variables before fitting the model by default, and with standardized predictors the KNN model fits better. Linear regression, on the other hand, does not standardize the variables, but when we ran the model, standardized the values and ran it again, the p-values did not change, so there is generally no need to standardize data for linear regression models. However, when we winsorize (truncate) variables to limit outliers, both the knn and lm models improve slightly and the MSE gets smaller. When we compile the actual and predicted values in one data frame for both models, we observe that KNN fits the values more closely, and reducing k makes the model reproduce the training data even better. Comparing the MSE of all the models, KNN has a smaller training error than linear regression, and the smaller the k, the smaller that error. Both are reasonable models. KNN has smaller bias, is more flexible and works better for nonlinear relationships, but it has higher variance and tends to overfit; a small value of k gives the most flexible fit, with low bias but high variance, because the prediction in a given region depends entirely on a single observation, and with k = 1 the model relies on only one neighbor, which is usually not enough to generalize. KNN is mainly preferable when the relationship is not linear and there is a lot of data. Linear regression, on the other hand, has larger bias, fits a line through the data, is better suited to smaller data sets and works well when the relationships are linear, but it can suffer from multicollinearity.
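Collecting the training-set errors reported above in one place makes this comparison concrete (reusing objects computed earlier; all of these are errors on the training data itself):
# Training-set MSE for the models discussed above
data.frame(model = c("Linear regression (all predictors)",
                     "KNN, k = 5, scaled",
                     "KNN, k = 5, unscaled"),
           MSE = round(c(lm_mse, knn_std_mse, knn_std_mse_nonst), 2))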
In this case, with the mtcars data set, I think that linear regression is the best model. It is true that it has a larger bias, but the variance will be smaller than with KNN. This data set is not very big and most variables have an approximately linear relationship with mpg. Also, with linear regression we can see which variables best predict mpg by comparing the p-value of each predictor. Therefore, I think linear regression is probably the best model for this data set.