Consider the data on used cars (ToyotaCorolla.csv) with
1436 records and details on 38 attributes, including Price, Age, KM, HP,
and other specifications. The goal is to predict the price of a used
Toyota Corolla based on its specifications.
## 'data.frame': 1436 obs. of 39 variables:
## $ Id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Model : chr "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" "TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors" ...
## $ Price : int 13500 13750 13950 14950 13750 12950 16900 18600 21500 12950 ...
## $ Age_08_04 : int 23 23 24 26 30 32 27 30 27 23 ...
## $ Mfg_Month : int 10 10 9 7 3 1 6 3 6 10 ...
## $ Mfg_Year : int 2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
## $ KM : int 46986 72937 41711 48000 38500 61000 94612 75889 19700 71138 ...
## $ Fuel_Type : chr "Diesel" "Diesel" "Diesel" "Diesel" ...
## $ HP : int 90 90 90 90 90 90 90 90 192 69 ...
## $ Met_Color : int 1 1 1 0 0 0 1 1 0 0 ...
## $ Color : chr "Blue" "Silver" "Blue" "Black" ...
## $ Automatic : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CC : int 2000 2000 2000 2000 2000 2000 2000 2000 1800 1900 ...
## $ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Cylinders : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Gears : int 5 5 5 5 5 5 5 5 5 5 ...
## $ Quarterly_Tax : int 210 210 210 210 210 210 210 210 100 185 ...
## $ Weight : int 1165 1165 1165 1165 1170 1170 1245 1245 1185 1105 ...
## $ Mfr_Guarantee : int 0 0 1 1 1 0 0 1 0 0 ...
## $ BOVAG_Guarantee : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Guarantee_Period : int 3 3 3 3 3 3 3 3 3 3 ...
## $ ABS : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_1 : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Airbag_2 : int 1 1 1 1 1 1 1 1 0 1 ...
## $ Airco : int 0 1 0 0 1 1 1 1 1 1 ...
## $ Automatic_airco : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Boardcomputer : int 1 1 1 1 1 1 1 1 0 1 ...
## $ CD_Player : int 0 1 0 0 0 0 0 1 0 0 ...
## $ Central_Lock : int 1 1 0 0 1 1 1 1 1 0 ...
## $ Powered_Windows : int 1 0 0 0 1 1 1 1 1 0 ...
## $ Power_Steering : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Radio : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Mistlamps : int 0 0 0 0 1 1 0 0 0 0 ...
## $ Sport_Model : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Backseat_Divider : int 1 1 1 1 1 1 1 1 0 1 ...
## $ Metallic_Rim : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Radio_cassette : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Parking_Assistant: int 0 0 0 0 0 0 0 0 0 0 ...
## $ Tow_Bar : int 0 0 0 0 0 0 0 0 0 0 ...
# Type your code here
set.seed(54321)
# Load necessary libraries
library(tidyverse)
library(caret) # for splitting the data
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following objects are masked from 'package:openintro':
##
## ethanol, lsegments
##
## Attaching package: 'caret'
## The following object is masked from 'package:openintro':
##
## dotPlot
## The following object is masked from 'package:purrr':
##
## lift
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
library(dplyr)
# Load the dataset (Assuming the dataset is loaded in the variable 'cars')
# cars <- read.csv("ToyotaCorolla.csv")
# Step 1: Select the outcome (Price) and the predictors named in the task
model_cols <- c(
  "Price", "Age_08_04", "KM", "Fuel_Type", "HP", "Automatic", "Doors",
  "Quarterly_Tax", "Mfr_Guarantee", "Guarantee_Period", "Airco",
  "Automatic_airco", "CD_Player", "Powered_Windows", "Sport_Model", "Tow_Bar"
)
# Fail early with a clear message: R ships a built-in `cars` data set, so a
# forgotten read.csv() would otherwise surface as "undefined columns selected".
missing_cols <- setdiff(model_cols, names(cars))
if (length(missing_cols) > 0) {
  stop(
    "`cars` is missing expected columns: ",
    paste(missing_cols, collapse = ", "),
    ". Load ToyotaCorolla.csv into `cars` first.",
    call. = FALSE
  )
}
cars_selected <- cars[, model_cols]
# Step 2: Rescale the outcome and the continuous predictors to the 0-1 range
# NOTE(review): min/max are taken over the full data set, i.e. before the
# train/test split, so test rows influence the scaling.
# NOTE(review): Doors is numeric but is not rescaled here - confirm intended.
rescale01 <- function(x) {
  (x - min(x)) / (max(x) - min(x))
}
scale_cols <- c(
  "Price", "Age_08_04", "KM", "HP", "Quarterly_Tax", "Guarantee_Period"
)
scaled_cars <- cars_selected %>%
  mutate(across(all_of(scale_cols), rescale01))
# Step 3: Convert 'Fuel_Type' to dummy variables (one-hot encoding);
# `- 1` drops the intercept so every fuel type gets its own 0/1 column
fuel_dummies <- model.matrix(~ Fuel_Type - 1, data = scaled_cars)
# Step 4: Bind the dummy variables back to the data frame
scaled_cars <- bind_cols(scaled_cars, as.data.frame(fuel_dummies))
# Remove only the original 'Fuel_Type' column. The match must be exact: the
# dummy columns are named "Fuel_Type<level>", so a pattern match on
# "Fuel_Type" would delete them as well.
scaled_cars <- scaled_cars[, colnames(scaled_cars) != "Fuel_Type"]
# Check the updated structure of the data frame
str(scaled_cars)
# Recorded output of an earlier run in which the Fuel_Type dummy columns had
# been dropped (hence 15 variables):
## 'data.frame': 1436 obs. of 15 variables:
## $ Price : num 0.325 0.334 0.341 0.377 0.334 ...
## $ Age_08_04 : num 0.278 0.278 0.291 0.316 0.367 ...
## $ KM : num 0.193 0.3 0.172 0.198 0.158 ...
## $ HP : num 0.171 0.171 0.171 0.171 0.171 ...
## $ Automatic : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Quarterly_Tax : num 0.723 0.723 0.723 0.723 0.723 ...
## $ Mfr_Guarantee : int 0 0 1 1 1 0 0 1 0 0 ...
## $ Guarantee_Period: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Airco : int 0 1 0 0 1 1 1 1 1 1 ...
## $ Automatic_airco : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CD_Player : int 0 1 0 0 0 0 0 1 0 0 ...
## $ Powered_Windows : int 1 0 0 0 1 1 1 1 1 0 ...
## $ Sport_Model : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Tow_Bar : int 0 0 0 0 0 0 0 0 0 0 ...
# Step 5: Split the data into training (70%) and test (30%) sets
set.seed(54321) # fixed seed so the partition is reproducible
# list = FALSE returns the selected row numbers as a one-column matrix,
# which can be used directly (and negated) as a row index
trainIndex <- createDataPartition(scaled_cars$Price, p = 0.7, list = FALSE)
# Create training and testing datasets
train_data <- scaled_cars[trainIndex, ]
test_data <- scaled_cars[-trainIndex, ]
# View the structure of the train and test data
str(train_data)
str(test_data)
# Recorded output of an earlier run (train_data first, then test_data):
## 'data.frame': 1007 obs. of 15 variables:
## $ Price : num 0.325 0.341 0.377 0.334 0.506 ...
## $ Age_08_04 : num 0.278 0.291 0.316 0.367 0.367 ...
## $ KM : num 0.193 0.172 0.198 0.158 0.312 ...
## $ HP : num 0.171 0.171 0.171 0.171 0.171 ...
## $ Automatic : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Quarterly_Tax : num 0.723 0.723 0.723 0.723 0.723 ...
## $ Mfr_Guarantee : int 0 1 1 1 1 0 1 1 1 1 ...
## $ Guarantee_Period: num 0 0 0 0 0 ...
## $ Airco : int 0 0 0 1 1 1 1 1 1 1 ...
## $ Automatic_airco : int 0 0 0 0 0 0 1 1 1 1 ...
## $ CD_Player : int 0 0 0 0 1 0 1 0 0 1 ...
## $ Powered_Windows : int 1 0 0 1 1 1 1 1 1 1 ...
## $ Sport_Model : int 0 0 0 0 0 0 0 1 1 1 ...
## $ Tow_Bar : int 0 0 0 0 0 0 0 0 0 0 ...
## 'data.frame': 429 obs. of 15 variables:
## $ Price : num 0.334 0.306 0.446 0.306 0.448 ...
## $ Age_08_04 : num 0.278 0.392 0.329 0.278 0.367 ...
## $ KM : num 0.3 0.251 0.389 0.293 0.265 ...
## $ HP : num 0.171 0.171 0.171 0 0.333 ...
## $ Automatic : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Quarterly_Tax : num 0.723 0.723 0.723 0.629 0.25 ...
## $ Mfr_Guarantee : int 0 0 0 0 1 1 0 1 1 1 ...
## $ Guarantee_Period: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Airco : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Automatic_airco : int 0 0 0 0 0 0 1 1 1 1 ...
## $ CD_Player : int 1 0 0 0 1 1 0 1 1 0 ...
## $ Powered_Windows : int 0 1 1 0 1 1 1 1 1 1 ...
## $ Sport_Model : int 0 0 1 0 1 1 1 1 1 1 ...
## $ Tow_Bar : int 0 0 0 0 0 1 0 0 0 0 ...
Fit a neural network model to the data. Use a single hidden layer
with 2 nodes.
- Use predictors Age_08_04, KM, Fuel_Type, HP, Automatic, Doors,
Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, Airco, Automatic_airco,
CD_Player, Powered_Windows, Sport_Model, and Tow_Bar.
- Record the RMSE for the training data and the test data.
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(caret)
library(neuralnet) # provides neuralnet(), plus plot/predict methods for it
# Step 1: Split the data into training (70%) and test (30%) sets
# Re-seeding here also fixes the random numbers drawn after the split.
set.seed(54321)
trainIndex <- createDataPartition(scaled_cars$Price, p = 0.7, list = FALSE)
train_data <- scaled_cars[trainIndex, ]
test_data <- scaled_cars[-trainIndex, ]
# Step 2: Define the formula for the neural network model
# Every column except the outcome is a predictor. Building the formula from
# the column names (rather than hard-coding it) means Fuel_Type dummy columns
# are included whenever they are present in scaled_cars.
predictors <- setdiff(names(train_data), "Price")
if (!any(startsWith(predictors, "Fuel_Type"))) {
  warning(
    "No Fuel_Type dummy columns found in scaled_cars; ",
    "the model will omit fuel type.",
    call. = FALSE
  )
}
formula <- reformulate(predictors, response = "Price")
# Step 3: Fit the neural network model with a single hidden layer (2 nodes)
# linear.output = TRUE: no activation on the output node (regression)
nn_model <- neuralnet(
  formula,
  data = train_data,
  hidden = 2,
  linear.output = TRUE
)
# rep = "best" draws on the current graphics device; the default rep = NULL
# opens a new device for every repetition.
plot(nn_model, rep = "best")
# Step 4: Make predictions on the training and test data
train_predictions <- predict(nn_model, train_data)
test_predictions <- predict(nn_model, test_data)
# Step 5: Calculate RMSE for both the training and test data
# (computed on whatever scale the Price column of scaled_cars is on)
rmse_train <- sqrt(mean((train_predictions - train_data$Price)^2))
rmse_test <- sqrt(mean((test_predictions - test_data$Price)^2))
# Print RMSE values; 4 decimals so that train and test can be told apart
print(paste("RMSE on training data:", round(rmse_train, 4)))
print(paste("RMSE on test data:", round(rmse_test, 4)))
# Recorded output of an earlier run (values rounded to 2 decimals):
## [1] "RMSE on training data: 0.04"
## [1] "RMSE on test data: 0.04"
Repeat the process, changing the number of hidden layers and nodes to
{single layer with 5 nodes} and {two layers, 5 nodes in each layer}.
i. What happens to the RMS error for the training data as the number of
layers and nodes increases?
ii. What happens to the RMS error for the validation data?
iii. Comment on the appropriate number of layers and nodes for this
application.
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Step 1: Split the data into training (70%) and test (30%) sets
# Re-seeding here also fixes the random numbers drawn after the split.
set.seed(54321)
trainIndex <- createDataPartition(scaled_cars$Price, p = 0.7, list = FALSE)
train_data <- scaled_cars[trainIndex, ]
test_data <- scaled_cars[-trainIndex, ]
# Step 2: Define the formula for the neural network model
# Every column except the outcome is a predictor. Building the formula from
# the column names (rather than hard-coding it) means Fuel_Type dummy columns
# are included whenever they are present in scaled_cars.
predictors <- setdiff(names(train_data), "Price")
if (!any(startsWith(predictors, "Fuel_Type"))) {
  warning(
    "No Fuel_Type dummy columns found in scaled_cars; ",
    "the model will omit fuel type.",
    call. = FALSE
  )
}
formula <- reformulate(predictors, response = "Price")
# -----------------------------------------------
# Model 1: Single hidden layer with 5 nodes
# -----------------------------------------------
nn_model_1 <- neuralnet(
  formula,
  data = train_data,
  hidden = 5,
  linear.output = TRUE
)
# Plot the neural network model for Model 1
# rep = "best" draws on the current graphics device; the default rep = NULL
# opens a new device for every repetition.
plot(
  nn_model_1,
  rep = "best",
  main = "Model 1: Single Hidden Layer with 5 Nodes"
)
# Step 3: Make predictions on the training and test data
train_predictions_1 <- predict(nn_model_1, train_data)
test_predictions_1 <- predict(nn_model_1, test_data)
# Step 4: Calculate RMSE for both the training and test data for Model 1
rmse_train_1 <- sqrt(mean((train_predictions_1 - train_data$Price)^2))
rmse_test_1 <- sqrt(mean((test_predictions_1 - test_data$Price)^2))
# Print RMSE values for Model 1; 4 decimals so the models can be compared
cat("Model 1 - Single hidden layer with 5 nodes:\n")
cat("RMSE on training data:", round(rmse_train_1, 4), "\n")
cat("RMSE on test data:", round(rmse_test_1, 4), "\n")
# Recorded output of an earlier run (values rounded to 2 decimals):
## Model 1 - Single hidden layer with 5 nodes:
## RMSE on training data: 0.04
## RMSE on test data: 0.04
# -----------------------------------------------
# Model 2: Two hidden layers, 5 nodes in each layer
# -----------------------------------------------
# hidden = c(5, 5): one vector entry per hidden layer
nn_model_2 <- neuralnet(
  formula,
  data = train_data,
  hidden = c(5, 5),
  linear.output = TRUE
)
# Plot the neural network model for Model 2
# rep = "best" draws on the current graphics device; the default rep = NULL
# opens a new device for every repetition.
plot(
  nn_model_2,
  rep = "best",
  main = "Model 2: Two Hidden Layers with 5 Nodes Each"
)
# Step 3: Make predictions on the training and test data
train_predictions_2 <- predict(nn_model_2, train_data)
test_predictions_2 <- predict(nn_model_2, test_data)
# Step 4: Calculate RMSE for both the training and test data for Model 2
rmse_train_2 <- sqrt(mean((train_predictions_2 - train_data$Price)^2))
rmse_test_2 <- sqrt(mean((test_predictions_2 - test_data$Price)^2))
# Print RMSE values for Model 2; 4 decimals so the models can be compared
cat("Model 2 - Two hidden layers with 5 nodes each:\n")
cat("RMSE on training data:", round(rmse_train_2, 4), "\n")
cat("RMSE on test data:", round(rmse_test_2, 4), "\n")
# Recorded output of an earlier run (values rounded to 2 decimals):
## Model 2 - Two hidden layers with 5 nodes each:
## RMSE on training data: 0.03
## RMSE on test data: 0.04