Assignment Prompt:
Perform an analysis of the dataset(s) used in Homework #2 using the SVM algorithm. Compare the results with the results from previous homework.
Libraries
library(tidyverse)
library(dplyr)
library(tidyr)
library(rpart)
library(rpart.plot)
library(lubridate)
library(skimr)
library(stringr)
library(corrplot)
library(ggplot2)
library(fpp3)
library(caret)
library(readr)
library(GGally)
library(tidymodels)
library(randomForest)
library(e1071)Process Data
Here we will process the data in the same way that we did in homework 2
# Load Datasets
cars <- "/Users/alecmccabe/Downloads/honda_sell_data.csv"
data <- read_csv(cars)# remove all rows where there is no price
data <- data[data$Price != 'Not Priced', ]# parse the price value
data$Price <- parse_number(data$Price)# replace '-' with null
repl_char <- data$Mileage[2]
data$Mileage <- as.numeric(na_if(data$Mileage, repl_char))## Warning: NAs introduced by coercion
# calculate median and replace NAs with it
repl_median <- median(data$Mileage, na.rm=TRUE)
data$Mileage <- replace(data$Mileage, is.na(data$Mileage), repl_median)# drop rating features
data <- data %>%
select(-c('Comfort_Rating','Interior_Design_Rating','Performance_Rating','Value_For_Money_Rating','Exterior_Styling_Rating','Reliability_Rating','Consumer_Rating','Consumer_Review_#','VIN','Stock_#','Seller_Type'))# extract special character 'hyphen'
test <- data$MPG[4]
hyphen <- substr(test,3,3)# create regex patterns
pattern1 <- paste("\\d+",hyphen,sep='')
pattern2 <- paste(hyphen,"\\d+",sep='')# create function for calculating mean mpg given string
mean_mpg <- function(x){
if (is.na(x)) {
return(NA)
}
#print(x)
left <- str_extract(x, regex(pattern1))
left <- as.numeric(str_replace(left, hyphen,""))
#print(left)
right <- str_extract(x, regex(pattern2))
right <- as.numeric(str_replace(right,hyphen,""))
#print(right)
mean_val <- (left + right) / 2
return(mean_val)
}# apply function to dataset
data$MPG <- unlist(lapply(data$MPG, mean_mpg))# replace with median
repl_median <- median(data$MPG, na.rm=TRUE)
data$MPG <- replace(data$MPG, is.na(data$MPG), repl_median)# drop nas
data <- data %>% drop_na()# function to combine colors
simplify_colors <- function(x) {
x <- tolower(x)
if (str_detect(x, 'black')) {return('black')}
if (str_detect(x, 'white')) {return('white')}
if (str_detect(x, 'gold')) {return('gold')}
if (str_detect(x, 'silver')) {return('silver')}
if (str_detect(x, 'red')) {return('red')}
if (str_detect(x, 'blue')) {return('blue')}
if (str_detect(x, 'green')) {return('green')}
if (str_detect(x, 'tan')) {return('tan')}
if (str_detect(x, 'yellow')) {return('yellow')}
if (str_detect(x, 'orange')) {return('orange')}
if (str_detect(x, 'purple')) {return('purple')}
if (str_detect(x, 'cyan')) {return('cyan')}
if (str_detect(x, 'pearl')) {return('pearl')}
if (str_detect(x, 'platinum')) {return('platinum')}
if (str_detect(x, 'magenta')) {return('magenta')}
if (str_detect(x, 'grey')) {return('grey')}
if (str_detect(x, 'gray')) {return('gray')}
if (str_detect(x, 'metallic')) {return('metallic')}
if (str_detect(x, 'maroon')) {return('maroon')}
return('other')
}# create simplified color features
data$exterior_color_2 <- sapply(data$Exterior_Color, simplify_colors)
data$interior_color_2 <- sapply(data$Interior_Color, simplify_colors)# function to simplify model
simplify_model <- function(x) {
x <- tolower(x)
if (str_detect(x, 'accord')) {return('accord')}
if (str_detect(x, 'civic')) {return('civic')}
if (str_detect(x, 'clarity')) {return('clarity')}
if (str_detect(x, 'cr-v')) {return('cr-v')}
if (str_detect(x, 'crosstour')) {return('crosstour')}
if (str_detect(x, 'del sol')) {return('del sol')}
if (str_detect(x, 'element')) {return('element')}
if (str_detect(x, 'fit')) {return('fit')}
if (str_detect(x, 'hr-v')) {return('hr-v')}
if (str_detect(x, 'insight')) {return('insight')}
if (str_detect(x, 'odyssey')) {return('odyssey')}
if (str_detect(x, 'passport')) {return('passport')}
if (str_detect(x, 'pilot')) {return('pilot')}
if (str_detect(x, 'prelude')) {return('prelude')}
if (str_detect(x, 'ridgeline')) {return('ridgeline')}
if (str_detect(x, 's2000')) {return('s2000')}
return('other')
}# create new model column
data$model_2 <- sapply(data$Model, simplify_model)# function to simplify transmission
simplify_transmission <- function(x) {
x <- tolower(x)
if (str_detect(x, 'automatic')) {return('automatic')}
if (str_detect(x, 'cvt')) {return('cvt')}
if (str_detect(x, 'continuous')) {return('cvt')}
if (str_detect(x, 'manual')) {return('manual')}
return('other')
}# create new transmission column
data$transmission_2 <- sapply(data$Transmission, simplify_transmission)# function to extract liters
extract_liters <- function(x) {
liters <- str_extract(x, regex("\\d\\.\\dL"))
if (is.na(liters)) {
return('other')
}
else {
return(liters)
}
}# create new liters column
data$engine_liters <- sapply(data$Engine, extract_liters)# function to extract cylinders
extract_cylinders <- function(x) {
liters <- str_extract(x, regex("\\d\\.\\dL"))
if (is.na(liters)) {
return('other')
}
else {
return(liters)
}
}# create new cylinders column
data$engine_cylinders <- sapply(data$Engine, extract_cylinders)# select final features
data <- data %>%
select(model_2, Condition, exterior_color_2, interior_color_2, Drivetrain, Fuel_Type, transmission_2, engine_liters, Year, Mileage, MPG, Price)Partition Data
# Partition data - train (80%) & test (20%)
set.seed(42)
ind <- sample(2, nrow(data), replace = T, prob = c(0.80, 0.20))
train <- data[ind==1,]
test <- data[ind==2,]SVM Models
Model 1
The first model will use the same predictors as the first model that we tested in homework 2.
svm_model1 <- svm(Price ~ `model_2`,
data=train,
kernel="polynomial",
scale=FALSE)
svm_model1##
## Call:
## svm(formula = Price ~ model_2, data = train, kernel = "polynomial",
## scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## gamma: 0.05882353
## coef.0: 0
## epsilon: 0.1
##
##
## Number of Support Vectors: 3880
predictions <- predict(svm_model1, newdata=test)
postResample(predictions, test$Price)## RMSE Rsquared MAE
## 1.019014e+04 2.670564e-01 8.376551e+03
sqrt(mean((test$Price - predictions)^2))## [1] 10190.14
When compared to the tree-based models that we tested in homework 2, the SVM model performs significantly worse. Here we are seeing RSME values of over 10,000, which translates to this model on average mis-pricing a car by over $10,000. There are some cars that are easily worth less than 10,000, so this model is not acceptable.
For comparison, the tree-based model which was trained on only the model_2 feature scored an RSME of around 8,000.
Model 2
For this model, we will fit against all predictors, as we did with our optimal tree-based model.
## One hot encode all categorical features
dmy <- dummyVars(" ~ .", data = data)
onehot <- data.frame(predict(dmy, newdata = data))
head(onehot)# Partition data - train (80%) & test (20%)
set.seed(42)
ind <- sample(2, nrow(onehot), replace = T, prob = c(0.80, 0.20))
dummy_train <- onehot[ind==1,]
dummy_test <- onehot[ind==2,]svm_model2 <- svm(Price ~ .,
data=dummy_train,
scale=FALSE)
svm_model2##
## Call:
## svm(formula = Price ~ ., data = dummy_train, scale = FALSE)
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.01282051
## epsilon: 0.1
##
##
## Number of Support Vectors: 3880
predictions <- predict(svm_model2, newdata=dummy_test)
postResample(predictions, test$Price)## RMSE Rsquared MAE
## 1.002443e+04 3.422036e-01 8.213972e+03
sqrt(mean((dummy_test$Price - predictions)^2))## [1] 10024.43
rf_2 <- randomForest(Price ~ ., importance = TRUE, na.action = na.omit, dummy_train)# make predictions
p3 <- predict(rf_2, dummy_test)
# Root Mean Square Error
sqrt(mean((dummy_test$Price - p3)^2))## [1] 2592.207
We see only a slight reduction in performance when using all features. RSME is still above 10,000 using SVM.
For comparison, our best tree-based model fitted against all predictors (the random forest) achieved an RSME of 2729.943.