In this exercise, we will examine a vehicle dataset (`EPA_19.RData`) and use a neural network model to estimate each vehicle's combined fuel economy. By leveraging the power of a neural network, we aim to create a predictive model that can learn complex patterns and relationships between vehicle features (engine displacement, number of cylinders, transmission, drive type, fuel type, etc.) and fuel economy.
# Packages used by this script.
pacotes <- c(
  "tidyverse", "lubridate", "stringr", "viridis", "rpart", "rpart.plot",
  "gtools", "Rmisc", "scales", "caret", "neuralnet", "gamlss", "gamlss.add",
  "randomForest"
)

# Install whichever packages are missing. The comparison is made against the
# package names (row names of installed.packages()), not against every cell
# of the matrix it returns. install.packages() is vectorised, so no loop is
# needed.
instalador <- setdiff(pacotes, rownames(installed.packages()))
if (length(instalador) > 0) {
  install.packages(instalador, dependencies = TRUE)
}

# Attach every package. require() returns FALSE (with a warning) instead of
# stopping, so the named logical vector printed here shows which packages
# were actually loaded.
vapply(pacotes, require, logical(1), character.only = TRUE)
# Restore the objects stored in the RData file (it is expected to provide the
# data frame `df` used below) and print a compact overview of its columns.
load("EPA_19.RData")
str(df)
## 'data.frame': 1272 obs. of 12 variables:
## $ fuel_economy_combined: num 0.221 0.368 0.219 0.326 0.361 ...
## $ eng_disp : num 0.357 0.114 0.429 0.143 0.143 ...
## $ num_cyl : num 0.2308 0.0769 0.3846 0.0769 0.0769 ...
## $ transmission : Factor w/ 7 levels "A","AM","AMS",..: 3 2 4 3 4 3 4 4 4 7 ...
## $ num_gears : num 0.889 0.556 0.778 0.667 0.778 ...
## $ air_aspired_method : Factor w/ 5 levels "Naturally Aspirated",..: 4 4 4 4 4 4 3 1 3 3 ...
## $ regen_brake : Factor w/ 3 levels "Electrical Regen Brake",..: 1 3 3 3 3 3 3 3 3 3 ...
## $ batt_capacity_ah : num 0.25 0 0 0 0 0 0 0 0 0 ...
## $ drive : Factor w/ 5 levels "2-Wheel Drive, Front",..: 4 2 2 4 2 4 2 2 2 2 ...
## $ fuel_type : Factor w/ 5 levels "Diesel, ultra low sulfur (15 ppm, maximum)",..: 4 3 3 5 3 4 4 4 4 4 ...
## $ cyl_deactivate : Factor w/ 2 levels "N","Y": 1 1 1 1 1 2 1 2 2 1 ...
## $ variable_valve : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
# Numeric columns to be rescaled to the [0, 1] interval.
cols <- c(
  "fuel_economy_combined", "eng_disp", "num_cyl", "num_gears",
  "batt_capacity_ah"
)

# Min-max normalisation: the smallest value maps to 0 and the largest to 1.
# Missing values are ignored when the bounds are computed.
range01 <- function(x) {
  bounds <- range(x, na.rm = TRUE)
  (x - bounds[1]) / (bounds[2] - bounds[1])
}

# Rescale each selected column in place.
df[cols] <- lapply(df[cols], range01)
We need to transform the categorical variables to numeric ones.
# Expand the factor columns into 0/1 dummy columns. model.matrix() also adds
# an "(Intercept)" column and leaves the response out of the design matrix.
m <- model.matrix(fuel_economy_combined ~ ., data = df)

# By default rows containing NA are dropped while the design matrix is built;
# stop early if that happened so the response appended below cannot be
# misaligned with the predictors.
stopifnot(nrow(m) == nrow(df))

# Append the response under its own name, selecting it by name instead of by
# column position. data.frame() turns the dummy column names into
# syntactically valid ones; the result is converted back to a numeric matrix.
m <- as.matrix(
  data.frame(m, fuel_economy_combined = df[["fuel_economy_combined"]])
)
Using a ‘for’ loop, let’s train a neural network model and evaluate it with k-fold cross-validation.
# k-fold cross-validation of the neural network.
# Note: the row shuffle and the network's starting weights are random; call
# set.seed() beforehand if reproducible numbers are needed.
k <- 10 # number of folds
m2 <- m[sample(nrow(m)), ] # shuffled copy of the data
N <- nrow(m2)
stats <- NULL # not used in the code shown here; kept for compatibility

# Out-of-fold predictions, stored at the same row positions as `m2` so that
# they line up with the observed values when the metrics are computed.
pred <- rep(NA_real_, N)

for (i in 0:(k - 1)) {
  # Rows of the current fold: the (i + 1)-th of k contiguous slices of the
  # shuffled data. Everything else is used for training.
  cv_test <- seq_len(N) > N * (i / k) & seq_len(N) <= N * ((i + 1) / k)
  cv_train <- !cv_test

  # Training
  nn <- neuralnet(fuel_economy_combined ~ .,
    data = m2[cv_train, , drop = FALSE],
    hidden = c(7, 3),
    # threshold = 0.8,
    linear.output = TRUE
  )

  # Evaluating on the held-out fold (drop = FALSE keeps a one-row fold as a
  # matrix; predict() returns a one-column matrix for this single response).
  pred[cv_test] <- predict(nn, m2[cv_test, , drop = FALSE])[, 1]
}

# RMSE, R-squared and MAE of the out-of-fold predictions; the response is
# selected by name rather than by column position.
caret::postResample(pred, m2[, "fuel_economy_combined"])
## RMSE Rsquared MAE
## 0.04670791 0.86270240 0.03086598
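Across the held-out folds we obtain an R² of ~0.86, which indicates that our model has good accuracy when predicting from new data. Next, we refit the network on the full dataset; note that the metrics reported for this final model are computed on the same data it was trained on, so they are optimistic compared with the cross-validated ones.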
# Refit the same architecture (hidden layers of 7 and 3 units) on the
# complete dataset.
nn_final <- neuralnet(
  fuel_economy_combined ~ .,
  data = m,
  hidden = c(7, 3),
  linear.output = TRUE
)

# Predict the rows the network was trained on, then plot and score those
# predictions against the observed values (in-sample fit).
pred2 <- predict(nn_final, newdata = m)
plot(pred2, df$fuel_economy_combined)
caret::postResample(pred = pred2, obs = df$fuel_economy_combined)
## RMSE Rsquared MAE
## 0.03374856 0.92761338 0.02560527

# Draw the fitted network.
plot(nn_final)