Introduction

Employed data, scripts and a brief description can be found at the original repository. Further references can be found at the page of Prof. Ian Watson.

library(FNN)
library(here)
library(magrittr)
library(tidyverse)

source(here::here("code/calc_KNN_error.R"))

theme_set(theme_bw())

Data Overview

Data has the following attributes:

Make: Make of the car;
Model: Model of the car;
Year: Manufacturing Date;
Engine.Fuel.Type: Kind of fuel the engine runs on;
Engine.HP: Engine HorsePower;
Engine.Cylinders: Number of cylinders in the engine;
Transmission.Type: Type of car transmission;
Driven_Wheels: Wheels added;
Number.of.Doors: Number of doors;
Vehicle.Size: Vehycle size;
Vehicle.Style: Vehycle style;
highway.MPG: Miles per gallon on road;
city.mpg: Miles per gallon on city;
Popularity: Car popularity;
MSRP: Manufacturer’s Suggested Retail Price and target variable.

Loading Data

read_csv(here::here("data/data.csv"),
         progress = FALSE, 
         col_types =
           cols(
            Make = col_character(),
            Model = col_character(),
            Year = col_integer(),
            `Engine Fuel Type` = col_character(),
            `Engine HP` = col_integer(),
            `Engine Cylinders` = col_integer(),
            `Transmission Type` = col_character(),
            Driven_Wheels = col_character(),
            `Number of Doors` = col_integer(),
            `Market Category` = col_character(),
            `Vehicle Size` = col_character(),
            `Vehicle Style` = col_character(),
            `highway MPG` = col_integer(),
            `city mpg` = col_integer(),
            Popularity = col_integer(),
            MSRP = col_integer()
            )) %>% 
  drop_na() -> car_data

Dummify Categorical Variables

car_data %>%
  mutate(
    Make = as.numeric(factor(Make)),
    Model = as.numeric(factor(Model)),
    `Engine Fuel Type` = as.numeric(factor(`Engine Fuel Type`)),
    `Transmission Type` = as.numeric(factor(`Transmission Type`)),
    Driven_Wheels = as.numeric(factor(Driven_Wheels)),
    `Market Category` = as.numeric(factor(`Market Category`)),
    `Vehicle Size` = as.numeric(factor(`Vehicle Size`)),
    `Vehicle Style` = as.numeric(factor(`Vehicle Style`)))-> car_data 

car_data %>% 
    glimpse()

## Observations: 11,812
## Variables: 16
## $ Make                <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
## $ Model               <dbl> 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Year                <int> 2011, 2011, 2011, 2011, 2011, 2012, 2012, ...
## $ `Engine Fuel Type`  <dbl> 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ...
## $ `Engine HP`         <int> 335, 300, 300, 230, 230, 230, 300, 300, 23...
## $ `Engine Cylinders`  <int> 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, ...
## $ `Transmission Type` <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
## $ Driven_Wheels       <dbl> 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, ...
## $ `Number of Doors`   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ `Market Category`   <dbl> 38, 67, 64, 67, 63, 67, 67, 64, 63, 63, 64...
## $ `Vehicle Size`      <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ `Vehicle Style`     <dbl> 9, 7, 9, 9, 7, 9, 7, 9, 7, 7, 9, 9, 7, 7, ...
## $ `highway MPG`       <int> 26, 28, 28, 28, 28, 28, 26, 28, 28, 27, 28...
## $ `city mpg`          <int> 19, 19, 20, 18, 18, 18, 17, 20, 18, 18, 20...
## $ Popularity          <int> 3916, 3916, 3916, 3916, 3916, 3916, 3916, ...
## $ MSRP                <int> 46135, 40650, 36350, 29450, 34500, 31200, ...

Checking for missing values

row.has.na <- apply(car_data, 
                    1,
                    function(x){any(is.na(x))})
noquote(paste('Number of rows with misssing values: ',
            sum(row.has.na)))

## [1] Number of rows with misssing values:  0

Applying scale to predictor variables

num.vars <- sapply(car_data,
                   is.numeric, 
                   simplify=F)

num.vars$MSRP = FALSE
num.vars <- unlist(num.vars)

car_data[num.vars] <- lapply(car_data[num.vars],
                             scale)
car_data %>% 
  sample_n(10)

Validation

Split data into training/testing sets

set.seed(101)

## Adding surrogate key to dataframe
car_data$id <- 1:nrow(car_data)

car_data %>% 
  dplyr::sample_frac(.8) -> train

dplyr::anti_join(car_data, 
                 train, 
                 by = 'id') -> test

Dissociate predictors from target variable/surrogate key

train %>% 
  select(-MSRP,-id) -> train.predictors

train %>% 
  select(MSRP, id) -> train.response

test %>% 
  select(-MSRP,-id) -> test.predictors

test %>% 
  select(MSRP, id) -> test.response

Apply K Nearest Neighbor

Calculate accumulated error

results <- data.frame(matrix(ncol = 0, nrow = 10))
results$k <- seq(1,10,1)

accum_err <- c()

for(num in results$k) {

  calc_KNN_error(num,
         train.predictors,
         test.predictors,
         train$id,
         train.response,
         test.response) -> err
  
  accum_err <-c(accum_err, err)
}

results$accum_err <- accum_err

results

results %>%
  ggplot(aes(k,accum_err)) +
  geom_point(size = 3,
             alpha = .6) + 
  geom_line() +
  scale_x_continuous(breaks=seq(1,10,1)) +
  labs(y="Accumulated Error", x= "K Value") +
  ggtitle("Accumulated Error by K value")

Results

A smaller K seems to render less accumulated error.
At K = 2, we have an unusual spike in terms of accumulated error, which one could impute on a probable overfit.
K = 1 renders the smallest amount of Accumulated Error.

Case Based Reasoning System for MSRP estimation

José Benardi de Souza Nunes

07/07/2018

Introduction

Data Overview

Loading Data

Dummify Categorical Variables

Checking for missing values

Applying scale to predictor variables

Validation

Split data into training/testing sets

Dissociate predictors from target variable/surrogate key

Apply K Nearest Neighbor

Calculate accumulated error

Results