Assignment Prompt:

Perform an analysis of the dataset(s) used in Homework #2 using the SVM algorithm. Compare the results with the results from the previous homework.

Libraries

library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(readr)
library(GGally)
library(tidymodels)
library(randomForest)
library(e1071)

Process Data

Here we process the data in the same way as in Homework #2.

# Load Datasets

cars <- "/Users/alecmccabe/Downloads/honda_sell_data.csv"
data <- read_csv(cars)
# remove all rows where there is no price

data <- data[data$Price != 'Not Priced', ]
# parse the price value

data$Price <- parse_number(data$Price)
# replace the '-' placeholder with NA

# the '-' placeholder can be read off a row known to be missing (row 2 here)
repl_char <- data$Mileage[2]
data$Mileage <- as.numeric(na_if(data$Mileage, repl_char))
## Warning: NAs introduced by coercion
# calculate median and replace NAs with it

repl_median <- median(data$Mileage, na.rm=TRUE)
data$Mileage <- replace(data$Mileage, is.na(data$Mileage), repl_median)
# drop rating features

data <- data %>%
  select(-c('Comfort_Rating', 'Interior_Design_Rating', 'Performance_Rating',
            'Value_For_Money_Rating', 'Exterior_Styling_Rating', 'Reliability_Rating',
            'Consumer_Rating', 'Consumer_Review_#', 'VIN', 'Stock_#', 'Seller_Type'))
# extract the range separator (a hyphen) from a sample MPG string
test <- data$MPG[4]
hyphen <- substr(test, 3, 3)
# regex patterns matching the number on each side of the hyphen
pattern1 <- paste("\\d+", hyphen, sep='')
pattern2 <- paste(hyphen, "\\d+", sep='')
# create function for calculating mean MPG from a range string
mean_mpg <- function(x){
  
  if (is.na(x)) {
    return(NA)
  }
  
  # number to the left of the separator
  left <- str_extract(x, regex(pattern1))
  left <- as.numeric(str_replace(left, hyphen, ""))
  
  # number to the right of the separator
  right <- str_extract(x, regex(pattern2))
  right <- as.numeric(str_replace(right, hyphen, ""))
  
  mean_val <- (left + right) / 2
  
  return(mean_val)
  
}
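
As a quick sanity check, assuming the separator extracted above is a plain hyphen and MPG strings take the form "19-25", the helper averages the two endpoints:

mean_mpg("19-25")
## [1] 22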
# apply function to dataset
data$MPG <- unlist(lapply(data$MPG, mean_mpg))
# replace with median

repl_median <- median(data$MPG, na.rm=TRUE)
data$MPG <- replace(data$MPG, is.na(data$MPG), repl_median)
# drop remaining NAs

data <- data %>% drop_na()
# function to combine colors

simplify_colors <- function(x) {
  x <- tolower(x)
  
  if (str_detect(x, 'black')) {return('black')}
  if (str_detect(x, 'white')) {return('white')}
  if (str_detect(x, 'gold')) {return('gold')}
  if (str_detect(x, 'silver')) {return('silver')}
  if (str_detect(x, 'red')) {return('red')}
  if (str_detect(x, 'blue')) {return('blue')}
  if (str_detect(x, 'green')) {return('green')}
  if (str_detect(x, 'tan')) {return('tan')}
  if (str_detect(x, 'yellow')) {return('yellow')}
  if (str_detect(x, 'orange')) {return('orange')}
  if (str_detect(x, 'purple')) {return('purple')}
  if (str_detect(x, 'cyan')) {return('cyan')}
  if (str_detect(x, 'pearl')) {return('pearl')}
  if (str_detect(x, 'platinum')) {return('platinum')}
  if (str_detect(x, 'magenta')) {return('magenta')}
  # treat 'grey' and 'gray' as a single category
  if (str_detect(x, 'grey')) {return('gray')}
  if (str_detect(x, 'gray')) {return('gray')}
  if (str_detect(x, 'metallic')) {return('metallic')}
  if (str_detect(x, 'maroon')) {return('maroon')}
  
  return('other')

}
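
The checks run top-down, so a string containing several keywords maps to the first one matched. For example, with a hypothetical listing color:

simplify_colors("Crystal Black Pearl")
## [1] "black"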
# create simplified color features

data$exterior_color_2 <- sapply(data$Exterior_Color, simplify_colors)
data$interior_color_2 <- sapply(data$Interior_Color, simplify_colors)
# function to simplify model

simplify_model <- function(x) {
  x <- tolower(x)
  
  if (str_detect(x, 'accord')) {return('accord')}
  if (str_detect(x, 'civic')) {return('civic')}
  if (str_detect(x, 'clarity')) {return('clarity')}
  if (str_detect(x, 'cr-v')) {return('cr-v')}
  if (str_detect(x, 'crosstour')) {return('crosstour')}
  if (str_detect(x, 'del sol')) {return('del sol')}
  if (str_detect(x, 'element')) {return('element')}
  if (str_detect(x, 'fit')) {return('fit')}
  if (str_detect(x, 'hr-v')) {return('hr-v')}
  if (str_detect(x, 'insight')) {return('insight')}
  if (str_detect(x, 'odyssey')) {return('odyssey')}
  if (str_detect(x, 'passport')) {return('passport')}
  if (str_detect(x, 'pilot')) {return('pilot')}
  if (str_detect(x, 'prelude')) {return('prelude')}
  if (str_detect(x, 'ridgeline')) {return('ridgeline')}
  if (str_detect(x, 's2000')) {return('s2000')}
  
  
  return('other')

}
# create new model column

data$model_2 <- sapply(data$Model, simplify_model)
# function to simplify transmission

simplify_transmission <- function(x) {
  x <- tolower(x)
  
  if (str_detect(x, 'automatic')) {return('automatic')}
  if (str_detect(x, 'cvt')) {return('cvt')}
  if (str_detect(x, 'continuous')) {return('cvt')}
  if (str_detect(x, 'manual')) {return('manual')}
  
  return('other')
}
# create new transmission column

data$transmission_2 <- sapply(data$Transmission, simplify_transmission)
# function to extract liters

extract_liters <- function(x) {
  liters <- str_extract(x, regex("\\d\\.\\dL"))
  
  if (is.na(liters)) {
    return('other')
  }
  else {
    return(liters)
  }
}
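
For example, assuming Engine strings resembling the following, the function returns the displacement when present and falls back to 'other' otherwise:

extract_liters("3.5L V6 24V SOHC")
## [1] "3.5L"
extract_liters("Electric Motor")
## [1] "other"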
# create new liters column

data$engine_liters <- sapply(data$Engine, extract_liters)
# function to extract cylinders

extract_cylinders <- function(x) {
  # assumes cylinder counts appear in Engine strings as e.g. "I4" or "V-6"
  cylinders <- str_extract(x, regex("[IV]-?\\d+", ignore_case = TRUE))
  
  if (is.na(cylinders)) {
    return('other')
  }
  else {
    return(cylinders)
  }
}
# create new cylinders column

data$engine_cylinders <- sapply(data$Engine, extract_cylinders)
# select final features

data <- data %>%
  select(model_2, Condition, exterior_color_2, interior_color_2, Drivetrain, Fuel_Type, transmission_2, engine_liters, Year, Mileage, MPG, Price)

Partition Data

# Partition data - train (80%) & test (20%)
set.seed(42)
ind <- sample(2, nrow(data), replace = TRUE, prob = c(0.80, 0.20))
train <- data[ind==1,]
test <- data[ind==2,]
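
Because the split is drawn probabilistically, the realized proportions are only approximately 80/20; a quick check:

# share of rows assigned to train (1) and test (2)
prop.table(table(ind))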

SVM Models

Model 1

The first SVM uses the same single predictor, model_2, as the first model we tested in homework 2.

svm_model1 <- svm(Price ~ model_2,
                 data=train,
                 kernel="polynomial",
                 scale=FALSE)

svm_model1
## 
## Call:
## svm(formula = Price ~ model_2, data = train, kernel = "polynomial", 
##     scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##       gamma:  0.05882353 
##      coef.0:  0 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  3880
predictions <- predict(svm_model1, newdata=test)

postResample(predictions, test$Price)
##         RMSE     Rsquared          MAE 
## 1.019014e+04 2.670564e-01 8.376551e+03
sqrt(mean((test$Price - predictions)^2))
## [1] 10190.14

When compared to the tree-based models that we tested in homework 2, the SVM model performs significantly worse. The RMSE exceeds 10,000, and the MAE of roughly 8,400 means the model mis-prices the typical car by several thousand dollars. Given that many used cars are worth less than $10,000 outright, this model is not acceptable.
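
For reference, the MAE reported by postResample above is simply the mean absolute pricing error, computable directly from the predictions:

# MAE: less sensitive to large outlier errors than RMSE
mean(abs(test$Price - predictions))
## [1] 8376.551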

For comparison, the tree-based model trained on only the model_2 feature scored an RMSE of around 8,000.

Model 2

For this model, we will fit against all predictors, as we did with our optimal tree-based model.

## One-hot encode all categorical features

dmy <- dummyVars(" ~ .", data = data)
onehot <- data.frame(predict(dmy, newdata = data))
head(onehot)
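
dummyVars expands each categorical column into one 0/1 indicator per level and leaves numeric columns untouched, so the encoded frame is much wider than the original 12 columns. A quick way to confirm:

# number of columns before and after one-hot encoding
dim(data)
dim(onehot)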
# Partition data - train (80%) & test (20%)
set.seed(42)
ind <- sample(2, nrow(onehot), replace = TRUE, prob = c(0.80, 0.20))
dummy_train <- onehot[ind==1,]
dummy_test <- onehot[ind==2,]
svm_model2 <- svm(Price ~ .,
                 data=dummy_train,
                 scale=FALSE)

svm_model2
## 
## Call:
## svm(formula = Price ~ ., data = dummy_train, scale = FALSE)
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.01282051 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  3880
predictions <- predict(svm_model2, newdata=dummy_test)

postResample(predictions, dummy_test$Price)
##         RMSE     Rsquared          MAE 
## 1.002443e+04 3.422036e-01 8.213972e+03
sqrt(mean((dummy_test$Price - predictions)^2))
## [1] 10024.43
# refit the random forest from homework 2 on the same partition for comparison
rf_2 <- randomForest(Price ~ ., data = dummy_train, importance = TRUE, na.action = na.omit)
# make predictions
p3 <- predict(rf_2, dummy_test)
# Root Mean Square Error
sqrt(mean((dummy_test$Price - p3)^2))
## [1] 2592.207

Using all predictors yields only a slight improvement over Model 1: the SVM's RMSE is still above 10,000.

For comparison, our best tree-based model fitted against all predictors (the random forest) achieved an RMSE of 2,729.943 in homework 2; refit above on the same one-hot encoded partition, it achieves an RMSE of roughly 2,592.
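
One follow-up not explored here: both SVMs above were fit with scale=FALSE and default hyperparameters, and SVMs are sensitive to both feature scaling and the cost/gamma settings. A minimal tuning sketch using e1071's tune.svm, with an illustrative (not validated) grid:

# grid-search cost and gamma with cross-validation (tune's default);
# scale = TRUE standardizes columns before fitting, which typically helps RBF kernels
tuned <- tune.svm(Price ~ ., data = dummy_train,
                  gamma = 10^(-3:-1), cost = 10^(0:2),
                  scale = TRUE)
summary(tuned)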