Study summary as;
Main actions and model building includes;
As with all my personal studies, final conclusion comes first in order to save you from the chaos below! :) These are my exercises and my experiences. Sharing is caring!
All interpretations made over latest model, lm2.
Data extracted from public data set for Machine Learning at UC Irvine (https://archive.ics.uci.edu/ml/datasets.html)
fileUrl <- "https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data"
auto <- read.table(fileUrl)
names(auto) <- c("mpg", "cylinders", "displacement", "horsepower", "weight"
, "acceleration", "model_year", "origin", "car_name")
library(readxl)
Output removed due to packages information lines. Aim is to have a readable output. For more details about packages, one may check raw codes.
Exploratory Data Analysis (EDA) is an approach/philosophy for data analysis that employs a variety of techniques (mostly graphical) to maximize insight into a data set;
This step includes;
dim(auto)
## [1] 398 9
str(auto)
## 'data.frame': 398 obs. of 9 variables:
## $ mpg : num 18 15 18 16 17 15 14 14 14 15 ...
## $ cylinders : int 8 8 8 8 8 8 8 8 8 8 ...
## $ displacement: num 307 350 318 304 302 429 454 440 455 390 ...
## $ horsepower : chr "130.0" "165.0" "150.0" "150.0" ...
## $ weight : num 3504 3693 3436 3433 3449 ...
## $ acceleration: num 12 11.5 11 12 10.5 10 9 8.5 10 8.5 ...
## $ model_year : int 70 70 70 70 70 70 70 70 70 70 ...
## $ origin : int 1 1 1 1 1 1 1 1 1 1 ...
## $ car_name : chr "chevrolet chevelle malibu" "buick skylark 320" "plymouth satellite" "amc rebel sst" ...
length(unique(auto$car_name))
## [1] 305
There are 305 distinct values for car_name. We will remove this column while in regression phase.
Also, horsepower column should be in numerical format. We will convert type of this column.
auto$car_name <- NULL #removed car_name column from df.
auto$horsepower <- as.numeric(auto$horsepower)
#we will remove model_year variable as well.
auto$model_year <- NULL
miss_var_summary(auto)
## # A tibble: 7 x 3
## variable n_miss pct_miss
## <chr> <int> <dbl>
## 1 horsepower 6 1.51
## 2 mpg 0 0
## 3 cylinders 0 0
## 4 displacement 0 0
## 5 weight 0 0
## 6 acceleration 0 0
## 7 origin 0 0
vis_miss(auto)
#Replace missing values of horsepower column with a mean value of the horsepower column.
auto$horsepower[is.na(auto$horsepower)] <- mean(auto$horsepower, na.rm=TRUE)
vis_miss(auto)
par(mfrow=c(2,4))
for (i in names(auto)) {
boxplot(auto[,i], names = "names(auto[,i])")
}
outlier.scores <- lofactor(auto, k=5)
plot(density(outlier.scores))
# pick top 5 as outliers
outliers <- order(outlier.scores, decreasing=T)[1:5]
# who are outliers
n <- nrow(auto)
pch <- rep(".", n)
pch[outliers] <- "+"
col <- rep("black", n)
col[outliers] <- "red"
pairs(auto, pch=pch, col=col)
pairs.panels(auto,method = "pearson",hist.col ="#00AFBB" ,density = TRUE,ellipses = TRUE)
# cross-validation options.
# CV is a re-sample procedure and evaluates the model.
# Data set will split into 10 subsets.
train.control <- trainControl(method = "cv", number = 10)
# Stepwise regression with a K-fold CV.
step.model <- train(mpg ~., data = auto,
method = "lmStepAIC",
trControl = train.control,
trace = FALSE
)
# Model accuracy
step.model$results
## parameter RMSE Rsquared MAE RMSESD RsquaredSD MAESD
## 1 none 4.172355 0.7248044 3.204317 0.466853 0.04169929 0.3070223
# Final model coefficients
step.model$finalModel
##
## Call:
## lm(formula = .outcome ~ horsepower + weight + origin, data = dat)
##
## Coefficients:
## (Intercept) horsepower weight origin
## 41.480044 -0.048941 -0.005038 1.342764
# Summary of the model
summary(step.model$finalModel)
##
## Call:
## lm(formula = .outcome ~ horsepower + weight + origin, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.4032 -2.8540 -0.3403 2.2684 14.9027
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 41.4800443 1.3047868 31.791 < 2e-16 ***
## horsepower -0.0489405 0.0108507 -4.510 8.55e-06 ***
## weight -0.0050379 0.0005359 -9.400 < 2e-16 ***
## origin 1.3427638 0.3233991 4.152 4.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.18 on 394 degrees of freedom
## Multiple R-squared: 0.7162, Adjusted R-squared: 0.714
## F-statistic: 331.4 on 3 and 394 DF, p-value: < 2.2e-16
auto2 <- auto[,c("mpg","horsepower","weight","displacement","origin")]
set.seed(123)
training.samples <- auto2$mpg %>%
createDataPartition(p = 0.8, list = FALSE)
train.data <- auto2[training.samples, ]
test.data <- auto2[-training.samples, ]
model_1 <- lm(mpg ~ horsepower*displacement + weight + origin, data = train.data)
summary(model_1)
##
## Call:
## lm(formula = mpg ~ horsepower * displacement + weight + origin,
## data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.962 -2.563 -0.238 1.828 15.888
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.150e+01 2.037e+00 25.281 < 2e-16 ***
## horsepower -1.791e-01 2.375e-02 -7.539 5.08e-13 ***
## displacement -5.412e-02 1.161e-02 -4.662 4.64e-06 ***
## weight -3.420e-03 8.105e-04 -4.219 3.21e-05 ***
## origin 9.980e-01 3.711e-01 2.689 0.00754 **
## horsepower:displacement 4.189e-04 6.289e-05 6.661 1.21e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.921 on 315 degrees of freedom
## Multiple R-squared: 0.754, Adjusted R-squared: 0.7501
## F-statistic: 193.1 on 5 and 315 DF, p-value: < 2.2e-16
# Make predictions
predictions <- model_1 %>% predict(test.data)
# Model performance
# (a) Prediction error, RMSE
R2(predictions, test.data$mpg)
## [1] 0.757189
glance(model_1) %>%
dplyr::select(adj.r.squared, sigma, AIC, BIC, p.value) %>% kable()
| adj.r.squared | sigma | AIC | BIC | p.value |
|---|---|---|---|---|
| 0.7500539 | 3.921153 | 1796.121 | 1822.521 | 0 |
plot(model_1, 1)
plot(model_1, 3)
# Breusch-Pagan test
bptest(model_1)
##
## studentized Breusch-Pagan test
##
## data: model_1
## BP = 34.087, df = 5, p-value = 2.288e-06
model_2 <- lm(log(mpg) ~ horsepower*displacement + weight + origin, data = train.data)
summary(model_2)
##
## Call:
## lm(formula = log(mpg) ~ horsepower * displacement + weight +
## origin, data = train.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.49245 -0.10085 -0.00438 0.09320 0.55724
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.246e+00 7.778e-02 54.588 < 2e-16 ***
## horsepower -5.943e-03 9.069e-04 -6.554 2.29e-10 ***
## displacement -1.482e-03 4.433e-04 -3.344 0.000925 ***
## weight -1.827e-04 3.095e-05 -5.904 9.20e-09 ***
## origin 3.127e-02 1.417e-02 2.207 0.028043 *
## horsepower:displacement 1.113e-05 2.401e-06 4.635 5.23e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1497 on 315 degrees of freedom
## Multiple R-squared: 0.8088, Adjusted R-squared: 0.8058
## F-statistic: 266.6 on 5 and 315 DF, p-value: < 2.2e-16
# Breusch-Pagan test
bptest(model_2)
##
## studentized Breusch-Pagan test
##
## data: model_2
## BP = 13.3, df = 5, p-value = 0.02072
plot(model_2, 3)
glance(model_2) %>%
dplyr::select(adj.r.squared, sigma, AIC, BIC, p.value) %>% kable()
| adj.r.squared | sigma | AIC | BIC | p.value |
|---|---|---|---|---|
| 0.805797 | 0.1497138 | -300.2754 | -273.8753 | 0 |
plot(model_2, 2)
plot(model_2, 5)
w <- abs(rstudent(model_2)) < 3 & abs(cooks.distance(model_2)) < 4/nrow(model_2$model)
lm2 <- update(model_2, weights=as.numeric(w))
plot(lm2, 5)
summary(lm2)
##
## Call:
## lm(formula = log(mpg) ~ horsepower * displacement + weight +
## origin, data = train.data, weights = as.numeric(w))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.27339 -0.08549 0.00000 0.08133 0.31665
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.329e+00 7.018e-02 61.686 < 2e-16 ***
## horsepower -7.096e-03 8.406e-04 -8.442 1.43e-15 ***
## displacement -2.457e-03 4.069e-04 -6.038 4.68e-09 ***
## weight -1.415e-04 2.839e-05 -4.983 1.07e-06 ***
## origin 2.204e-02 1.217e-02 1.811 0.0711 .
## horsepower:displacement 1.599e-05 2.163e-06 7.389 1.53e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1247 on 295 degrees of freedom
## Multiple R-squared: 0.857, Adjusted R-squared: 0.8546
## F-statistic: 353.7 on 5 and 295 DF, p-value: < 2.2e-16
glance(lm2) %>%
dplyr::select(adj.r.squared, sigma, AIC, BIC, p.value) %>% kable()
| adj.r.squared | sigma | AIC | BIC | p.value |
|---|---|---|---|---|
| 0.8546076 | 0.1247051 | -391.1054 | -365.1556 | 0 |
car::vif(lm2)
## horsepower displacement weight
## 18.757602 33.953057 11.085264
## origin horsepower:displacement
## 1.778838 41.429320