In this exercise, we will examine an EPA vehicle dataset and use a neural network model to estimate combined fuel economy. By leveraging the power of a neural network, we aim to build a predictive model that can learn the complex patterns and relationships between a vehicle's characteristics and its fuel consumption.

Loading packages

pacotes <- c('tidyverse', 'lubridate', 'stringr', 'viridis', 'rpart', 'rpart.plot', 'gtools',
             'Rmisc', 'scales', 'caret', 'neuralnet', 'gamlss', 'gamlss.add', 'randomForest')

# Install any packages that are still missing, then load all of them
if(sum(as.numeric(!pacotes %in% installed.packages())) != 0){
  instalador <- pacotes[!pacotes %in% installed.packages()]
  install.packages(instalador, dependencies = TRUE)
}
sapply(pacotes, require, character.only = TRUE)

Loading the data

load(file='EPA_19.RData')
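load() restores the objects stored in the file under their original names; the rest of the exercise assumes the file contains a data frame called df. A quick way to confirm what was restored (output omitted):

ls()  # 'df' should now appear among the objects in the workspace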

Checking its structure

df %>% str
## 'data.frame':    1272 obs. of  12 variables:
##  $ fuel_economy_combined: num  0.221 0.368 0.219 0.326 0.361 ...
##  $ eng_disp             : num  0.357 0.114 0.429 0.143 0.143 ...
##  $ num_cyl              : num  0.2308 0.0769 0.3846 0.0769 0.0769 ...
##  $ transmission         : Factor w/ 7 levels "A","AM","AMS",..: 3 2 4 3 4 3 4 4 4 7 ...
##  $ num_gears            : num  0.889 0.556 0.778 0.667 0.778 ...
##  $ air_aspired_method   : Factor w/ 5 levels "Naturally Aspirated",..: 4 4 4 4 4 4 3 1 3 3 ...
##  $ regen_brake          : Factor w/ 3 levels "Electrical Regen Brake",..: 1 3 3 3 3 3 3 3 3 3 ...
##  $ batt_capacity_ah     : num  0.25 0 0 0 0 0 0 0 0 0 ...
##  $ drive                : Factor w/ 5 levels "2-Wheel Drive, Front",..: 4 2 2 4 2 4 2 2 2 2 ...
##  $ fuel_type            : Factor w/ 5 levels "Diesel, ultra low sulfur (15 ppm, maximum)",..: 4 3 3 5 3 4 4 4 4 4 ...
##  $ cyl_deactivate       : Factor w/ 2 levels "N","Y": 1 1 1 1 1 2 1 2 2 1 ...
##  $ variable_valve       : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
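Before rescaling the numeric columns, it is worth confirming that they contain no missing values, since the scaling function used below applies na.rm = TRUE and would silently ignore them. A quick check (output omitted):

sapply(df, function(x) sum(is.na(x)))  # count of missing values per column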

Separating the numeric variables

cols <- c("fuel_economy_combined", 'eng_disp', 'num_cyl', 'num_gears', 'batt_capacity_ah')

Function for min-max scaling (rescaling each variable to the [0, 1] interval)

range01 <- function(x){(x-min(x, na.rm=TRUE))/(max(x, na.rm=TRUE)-min(x, na.rm=TRUE))}
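As a quick illustration, applying range01 to a small vector maps its minimum to 0 and its maximum to 1 (a toy example, not part of the original script):

range01(c(10, 15, 20))  # returns 0.0, 0.5, 1.0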

Applying the scaling to the numeric variables

df[cols] <- lapply(df[cols], range01)
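After the transformation every column in cols lies in the [0, 1] interval, which can be verified with a quick check (output omitted). Note that range01 does not keep the original minima and maxima, so if predictions ever had to be reported back in the original units those values would need to be stored before this step.

sapply(df[cols], range)  # each column should now span exactly 0 to 1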

One-Hot Encoding

The neuralnet package only works with numeric inputs, so the categorical (factor) variables must be converted into dummy (0/1) columns.

m <- model.matrix(fuel_economy_combined ~ ., data = df)
m <- as.matrix(data.frame(m, df[, 1]))

Renaming the output column that was appended to “m”

colnames(m)[ncol(m)] <- "fuel_economy_combined"  # the response is the last column of m
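A quick look at the resulting matrix confirms that the factors were expanded into dummy columns and that the response sits in the last position (a sanity check, output omitted):

dim(m)                # rows and dummy-encoded columns
tail(colnames(m), 3)  # the last name should be fuel_economy_combined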

Training and testing the neural network

Using a ‘for’ loop, let’s train a neural network and evaluate it with k-fold cross-validation.

Setting up the k folds

k <- 10                       # number of folds
m2 <- m[sample(1:nrow(m)), ]  # shuffle the rows before splitting into folds
N <- nrow(m2)

pred <- NULL                  # out-of-fold predictions accumulated across iterations

for (i in 0:(k-1)){
  # Logical masks: fold i is the test set, all remaining rows form the training set
  cv_test <- seq(N) > N*(i/k) & seq(N) <= N*((i+1)/k)
  cv_train <- !cv_test

  # Training
  nn <- neuralnet(fuel_economy_combined ~ .,
                  data = m2[cv_train, ],
                  hidden = c(7, 3),       # two hidden layers, with 7 and 3 neurons
                  # threshold = 0.8,
                  linear.output = TRUE)   # regression: no activation on the output node

  # Evaluating on the held-out fold
  pred_tmp <- predict(nn, m2[cv_test, ])
  pred <- rbind(pred, pred_tmp)
}
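The fold masks above simply slice the shuffled matrix into ten consecutive blocks. An alternative, arguably more idiomatic, way to build the folds is caret::createFolds; the sketch below assumes the rest of the loop stays unchanged and is not the code actually used above:

set.seed(123)  # hypothetical seed, only to make the split reproducible
folds <- caret::createFolds(m2[, "fuel_economy_combined"], k = 10)
# Each element of 'folds' holds the row indices of one test fold, e.g.:
# cv_test_rows  <- folds[[1]]
# cv_train_rows <- setdiff(seq_len(nrow(m2)), cv_test_rows)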

Evaluation

caret::postResample(pred, m2[, "fuel_economy_combined"])
##       RMSE   Rsquared        MAE 
## 0.04670791 0.86270240 0.03086598

We have a cross-validated R² of ~0.86, which indicates that the model explains most of the variance in fuel economy and should predict well on new data.
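For reference, the three figures reported by postResample can be reproduced by hand from the stacked out-of-fold predictions (a sketch using the same definitions; caret computes Rsquared as the squared correlation between predictions and observations):

obs <- m2[, "fuel_economy_combined"]
prd <- as.vector(unlist(pred))  # works whether 'pred' is a matrix or a data frame
sqrt(mean((prd - obs)^2))       # RMSE
cor(prd, obs)^2                 # Rsquared
mean(abs(prd - obs))            # MAE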

Creating the final model (using all rows)

nn_final <- neuralnet(fuel_economy_combined ~ .,
                      data = m,
                      hidden = c(7, 3),
                      linear.output = TRUE)

# Observed vs. predicted values on the full data set
pred2 <- predict(nn_final, m)
plot(x = pred2, y = df$fuel_economy_combined,
     xlab = "Predicted", ylab = "Observed")

caret::postResample(pred2, df$fuel_economy_combined)
##       RMSE   Rsquared        MAE 
## 0.03374856 0.92761338 0.02560527
plot(nn_final)
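Finally, the fitted network can be saved for later use, for example with saveRDS (a minimal sketch; the file name is arbitrary):

saveRDS(nn_final, "nn_final.rds")      # save the fitted model to disk
# nn_final <- readRDS("nn_final.rds")  # reload it in a later session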