Libraries

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

library(ggplot2)
library(gridExtra)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load Data

# Read the vehicle price data set (path relative to the working directory)
df <- read.csv(file = "Vehicle_Retail_Price_Assignment.csv")

Summary Statistics of Data

dim(df) # dimensions of the data: number of rows, then number of columns

## [1] 205  26

head(df) # first 6 rows of the data (the default row count for head())

##   car_ID symboling                  CarName fueltype aspiration doornumber
## 1      1         3       alfa-romero giulia      gas        std        two
## 2      2         3      alfa-romero stelvio      gas        std        two
## 3      3         1 alfa-romero Quadrifoglio      gas        std        two
## 4      4         2              audi 100 ls      gas        std       four
## 5      5         2               audi 100ls      gas        std       four
## 6      6         2                 audi fox      gas        std        two
##       carbody drivewheel enginelocation wheelbase carlength carwidth carheight
## 1 convertible        rwd          front      88.6     168.8     64.1      48.8
## 2 convertible        rwd          front      88.6     168.8     64.1      48.8
## 3   hatchback        rwd          front      94.5     171.2     65.5      52.4
## 4       sedan        fwd          front      99.8     176.6     66.2      54.3
## 5       sedan        4wd          front      99.4     176.6     66.4      54.3
## 6       sedan        fwd          front      99.8     177.3     66.3      53.1
##   curbweight enginetype cylindernumber enginesize fuelsystem boreratio stroke
## 1       2548       dohc           four        130       mpfi      3.47   2.68
## 2       2548       dohc           four        130       mpfi      3.47   2.68
## 3       2823       ohcv            six        152       mpfi      2.68   3.47
## 4       2337        ohc           four        109       mpfi      3.19   3.40
## 5       2824        ohc           five        136       mpfi      3.19   3.40
## 6       2507        ohc           five        136       mpfi      3.19   3.40
##   compressionratio horsepower peakrpm citympg highwaympg price
## 1              9.0        111    5000      21         27 13495
## 2              9.0        111    5000      21         27 16500
## 3              9.0        154    5000      19         26 16500
## 4             10.0        102    5500      24         30 13950
## 5              8.0        115    5500      18         22 17450
## 6              8.5        110    5500      19         25 15250

str(df) # structure of the data: each column's type and its first few values

## 'data.frame':    205 obs. of  26 variables:
##  $ car_ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ symboling       : int  3 3 1 2 2 2 1 1 1 0 ...
##  $ CarName         : chr  "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
##  $ fueltype        : chr  "gas" "gas" "gas" "gas" ...
##  $ aspiration      : chr  "std" "std" "std" "std" ...
##  $ doornumber      : chr  "two" "two" "two" "four" ...
##  $ carbody         : chr  "convertible" "convertible" "hatchback" "sedan" ...
##  $ drivewheel      : chr  "rwd" "rwd" "rwd" "fwd" ...
##  $ enginelocation  : chr  "front" "front" "front" "front" ...
##  $ wheelbase       : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ carlength       : num  169 169 171 177 177 ...
##  $ carwidth        : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ carheight       : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curbweight      : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ enginetype      : chr  "dohc" "dohc" "ohcv" "ohc" ...
##  $ cylindernumber  : chr  "four" "four" "six" "four" ...
##  $ enginesize      : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : chr  "mpfi" "mpfi" "mpfi" "mpfi" ...
##  $ boreratio       : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower      : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peakrpm         : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ citympg         : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highwaympg      : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num  13495 16500 16500 13950 17450 ...

summary(df) # per-column statistical summary of the data

##      car_ID      symboling         CarName            fueltype        
##  Min.   :  1   Min.   :-2.0000   Length:205         Length:205        
##  1st Qu.: 52   1st Qu.: 0.0000   Class :character   Class :character  
##  Median :103   Median : 1.0000   Mode  :character   Mode  :character  
##  Mean   :103   Mean   : 0.8341                                        
##  3rd Qu.:154   3rd Qu.: 2.0000                                        
##  Max.   :205   Max.   : 3.0000                                        
##   aspiration         doornumber          carbody           drivewheel       
##  Length:205         Length:205         Length:205         Length:205        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  enginelocation       wheelbase        carlength        carwidth    
##  Length:205         Min.   : 86.60   Min.   :141.1   Min.   :60.30  
##  Class :character   1st Qu.: 94.50   1st Qu.:166.3   1st Qu.:64.10  
##  Mode  :character   Median : 97.00   Median :173.2   Median :65.50  
##                     Mean   : 98.76   Mean   :174.0   Mean   :65.91  
##                     3rd Qu.:102.40   3rd Qu.:183.1   3rd Qu.:66.90  
##                     Max.   :120.90   Max.   :208.1   Max.   :72.30  
##    carheight       curbweight    enginetype        cylindernumber    
##  Min.   :47.80   Min.   :1488   Length:205         Length:205        
##  1st Qu.:52.00   1st Qu.:2145   Class :character   Class :character  
##  Median :54.10   Median :2414   Mode  :character   Mode  :character  
##  Mean   :53.72   Mean   :2556                                        
##  3rd Qu.:55.50   3rd Qu.:2935                                        
##  Max.   :59.80   Max.   :4066                                        
##    enginesize     fuelsystem          boreratio        stroke     
##  Min.   : 61.0   Length:205         Min.   :2.54   Min.   :2.070  
##  1st Qu.: 97.0   Class :character   1st Qu.:3.15   1st Qu.:3.110  
##  Median :120.0   Mode  :character   Median :3.31   Median :3.290  
##  Mean   :126.9                      Mean   :3.33   Mean   :3.255  
##  3rd Qu.:141.0                      3rd Qu.:3.58   3rd Qu.:3.410  
##  Max.   :326.0                      Max.   :3.94   Max.   :4.170  
##  compressionratio   horsepower       peakrpm        citympg     
##  Min.   : 7.00    Min.   : 48.0   Min.   :4150   Min.   :13.00  
##  1st Qu.: 8.60    1st Qu.: 70.0   1st Qu.:4800   1st Qu.:19.00  
##  Median : 9.00    Median : 95.0   Median :5200   Median :24.00  
##  Mean   :10.14    Mean   :104.1   Mean   :5125   Mean   :25.22  
##  3rd Qu.: 9.40    3rd Qu.:116.0   3rd Qu.:5500   3rd Qu.:30.00  
##  Max.   :23.00    Max.   :288.0   Max.   :6600   Max.   :49.00  
##    highwaympg        price      
##  Min.   :16.00   Min.   : 5118  
##  1st Qu.:25.00   1st Qu.: 7788  
##  Median :30.00   Median :10295  
##  Mean   :30.75   Mean   :13277  
##  3rd Qu.:34.00   3rd Qu.:16503  
##  Max.   :54.00   Max.   :45400

Check for Missing Data

# Check for NAs: count the missing values in each column
colSums(is.na(df)) # every count in the output below is 0, so the data has no NAs

##           car_ID        symboling          CarName         fueltype 
##                0                0                0                0 
##       aspiration       doornumber          carbody       drivewheel 
##                0                0                0                0 
##   enginelocation        wheelbase        carlength         carwidth 
##                0                0                0                0 
##        carheight       curbweight       enginetype   cylindernumber 
##                0                0                0                0 
##       enginesize       fuelsystem        boreratio           stroke 
##                0                0                0                0 
## compressionratio       horsepower          peakrpm          citympg 
##                0                0                0                0 
##       highwaympg            price 
##                0                0

Convert Categorical Variables to Factor

# Convert every character column to a factor.
# The column mask is computed once with vapply(), which always returns a
# logical vector (sapply() can change its return type on unusual input).
is_chr <- vapply(df, is.character, logical(1))
df[is_chr] <- lapply(df[is_chr], as.factor)

# symboling is read as integer codes but is treated as categorical here
df$symboling <- as.factor(df$symboling)
str(df)

## 'data.frame':    205 obs. of  26 variables:
##  $ car_ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ symboling       : Factor w/ 6 levels "-2","-1","0",..: 6 6 4 5 5 5 4 4 4 3 ...
##  $ CarName         : Factor w/ 147 levels "alfa-romero giulia",..: 1 3 2 4 5 9 5 7 6 8 ...
##  $ fueltype        : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
##  $ aspiration      : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
##  $ doornumber      : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
##  $ carbody         : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
##  $ drivewheel      : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
##  $ enginelocation  : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
##  $ wheelbase       : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ carlength       : num  169 169 171 177 177 ...
##  $ carwidth        : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ carheight       : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curbweight      : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ enginetype      : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
##  $ cylindernumber  : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
##  $ enginesize      : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ boreratio       : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower      : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peakrpm         : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ citympg         : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highwaympg      : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num  13495 16500 16500 13950 17450 ...

Exploratory Data Analysis

Exploring the Effect of Car Dimensions on Price

# Effect of car dimensions on price.
# Every panel is a scatter plot of price against one dimension with a fitted
# linear trend line and no confidence band. The shared layers are kept in a
# list, which ggplot2 adds element by element.
dimension_layers <- list(
  geom_point(),
  stat_smooth(method = "lm", se = FALSE), # FALSE, not F: F can be reassigned
  theme_minimal()
)

d1 <- ggplot(df, aes(x = wheelbase, y = price)) + dimension_layers
d2 <- ggplot(df, aes(x = carlength, y = price)) + dimension_layers
d3 <- ggplot(df, aes(x = carwidth, y = price)) + dimension_layers
d4 <- ggplot(df, aes(x = carheight, y = price)) + dimension_layers
d5 <- ggplot(df, aes(x = curbweight, y = price)) + dimension_layers

# Draw the five panels in one grid under a common title
grid.arrange(d1, d2, d3, d4, d5,
             top = "Variation of Price with Car Dimensions")

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Engine Technical Specifications on Price

# Engine technical specifications against price.
# Every panel is a scatter plot of price against one specification with a
# fitted linear trend line and no confidence band. The shared layers are kept
# in a list, which ggplot2 adds element by element.
engine_layers <- list(
  geom_point(),
  stat_smooth(method = "lm", se = FALSE), # FALSE, not F: F can be reassigned
  theme_minimal()
)

et1 <- ggplot(df, aes(x = boreratio, y = price)) + engine_layers
et2 <- ggplot(df, aes(x = stroke, y = price)) + engine_layers
et3 <- ggplot(df, aes(x = compressionratio, y = price)) + engine_layers
et4 <- ggplot(df, aes(x = horsepower, y = price)) + engine_layers
et5 <- ggplot(df, aes(x = peakrpm, y = price)) + engine_layers

# Draw the five panels in one grid under a common title
grid.arrange(et1, et2, et3, et4, et5,
             top = "Price with Technical Engine Specification")

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Mean Price Based on Car Configuration

# Mean price for each door count, one filled bar per level, legend hidden
c1 <- df %>%
  group_by(doornumber) %>%
  summarise(mean_price = mean(price)) %>%
  ggplot(aes(x = doornumber, y = mean_price, fill = doornumber)) +
  geom_col() +
  theme_minimal() +
  theme(legend.position = "none")

## `summarise()` ungrouping output (override with `.groups` argument)

# Mean price for each car body style, one filled bar per level, legend hidden
c2 <- df %>%
  group_by(carbody) %>%
  summarise(mean_price = mean(price)) %>%
  ggplot(aes(x = carbody, y = mean_price, fill = carbody)) +
  geom_col() +
  theme_minimal() +
  theme(legend.position = "none")

## `summarise()` ungrouping output (override with `.groups` argument)

# Mean price for each drive wheel type, one filled bar per level, legend hidden
c3 <- df %>%
  group_by(drivewheel) %>%
  summarise(mean_price = mean(price)) %>%
  ggplot(aes(x = drivewheel, y = mean_price, fill = drivewheel)) +
  geom_col() +
  theme_minimal() +
  theme(legend.position = "none")

## `summarise()` ungrouping output (override with `.groups` argument)

# Mean price for each engine location, one filled bar per level, legend hidden
c4 <- df %>%
  group_by(enginelocation) %>%
  summarise(mean_price = mean(price)) %>%
  ggplot(aes(x = enginelocation, y = mean_price, fill = enginelocation)) +
  geom_col() +
  theme_minimal() +
  theme(legend.position = "none")

## `summarise()` ungrouping output (override with `.groups` argument)

# Arrange the four mean-price bar charts in one grid under a common title
grid.arrange(c1, c2, c3, c4,
             top = "Mean Price for Different Configurations")

MPG with Price

# Fuel economy (mpg) against price.
# Both panels are scatter plots with a fitted linear trend line and no
# confidence band. The shared layers are kept in a list, which ggplot2 adds
# element by element.
mpg_layers <- list(
  geom_point(),
  stat_smooth(method = "lm", se = FALSE), # FALSE, not F: F can be reassigned
  theme_minimal()
)

mpg1 <- ggplot(df, aes(x = citympg, y = price)) + mpg_layers
mpg2 <- ggplot(df, aes(x = highwaympg, y = price)) + mpg_layers

grid.arrange(mpg1, mpg2, top = "Price with MPG")

## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Effect of Symboling on Price

# Mean price for each symboling level, one filled bar per level, legend hidden
df %>%
  group_by(symboling) %>%
  summarise(mean_price = mean(price)) %>%
  ggplot(aes(x = symboling, y = mean_price, fill = symboling)) +
  geom_col() +
  theme_minimal() +
  theme(legend.position = "none")

## `summarise()` ungrouping output (override with `.groups` argument)

Boxplots to Show Variation of Price with Different Factors

# Boxplot of price for each symboling level.
# No group_by() is needed: the boxes are split by the discrete x aesthetic,
# so the plot is built directly from the plain data frame.
box1 <- ggplot(df, aes(x = symboling, y = price, fill = symboling)) +
  geom_boxplot() +
  theme_minimal() +
  theme(legend.position = "none")
box1

# Boxplots of price for each remaining categorical variable.
# No group_by() is needed: the boxes are split by the discrete x aesthetic.
# The layers common to every panel are kept in a list, which ggplot2 adds
# element by element.
boxplot_layers <- list(
  geom_boxplot(),
  theme_minimal(),
  theme(legend.position = "none")
)

box2 <- ggplot(df, aes(x = fueltype, y = price, fill = fueltype)) +
  boxplot_layers

box3 <- ggplot(df, aes(x = aspiration, y = price, fill = aspiration)) +
  boxplot_layers

box4 <- ggplot(df, aes(x = doornumber, y = price, fill = doornumber)) +
  boxplot_layers

box5 <- ggplot(df, aes(x = carbody, y = price, fill = carbody)) +
  boxplot_layers

box6 <- ggplot(df, aes(x = drivewheel, y = price, fill = drivewheel)) +
  boxplot_layers

box7 <- ggplot(df, aes(x = enginelocation, y = price, fill = enginelocation)) +
  boxplot_layers

box8 <- ggplot(df, aes(x = enginetype, y = price, fill = enginetype)) +
  boxplot_layers

box9 <- ggplot(df, aes(x = cylindernumber, y = price, fill = cylindernumber)) +
  boxplot_layers

box10 <- ggplot(df, aes(x = fuelsystem, y = price, fill = fuelsystem)) +
  boxplot_layers

# Car configuration variables
grid.arrange(box4, box5, box6, box7,
             top = "Variation of Price for Different Configurations")

# Engine-related variables
grid.arrange(box2, box3, box8, box9, box10,
             top = "Variation of Price for Different Engine Specifications")

Correlation Analysis

# Correlation analysis: keep the numeric predictors only.
# vapply() guarantees a logical mask whatever the input looks like.
is_num <- vapply(df, is.numeric, logical(1))
numericData <- df[, is_num, drop = FALSE]

# Drop the id column and the dependent variable by name rather than by
# position, so adding or reordering numeric columns cannot drop the wrong ones.
numericData <- numericData[, !(names(numericData) %in% c("car_ID", "price")),
                           drop = FALSE]

library(corrplot)

## corrplot 0.84 loaded

# Correlation matrix of the numeric predictors
corMat <- cor(numericData)

# Lower triangle of the matrix, with each coefficient printed as a number
corrplot(corMat, method = "number", type = "lower")

# Indices of the columns flagged by findCorrelation() at the 0.7 cutoff,
# translated back to column names
highlyCorrelated <- findCorrelation(corMat, cutoff = 0.7)
highlyCorCol <- colnames(corMat)[highlyCorrelated]
highlyCorCol

## [1] "curbweight" "carlength"  "carwidth"   "highwaympg" "enginesize"
## [6] "citympg"

Feature Selection

Feature selection is performed using recursive feature elimination (RFE).

Preprocess the Data for RFE

library(caret)
# Predictors for RFE: every column except car_ID, CarName and price (the
# dependent variable). Columns are dropped by name so that a change in column
# order cannot silently remove the wrong ones.
x <- df[, !(names(df) %in% c("car_ID", "CarName", "price")), drop = FALSE]

# Convert factors to numeric for lmFuncs.
# NOTE(review): as.numeric() on a factor returns its integer level codes, so
# nominal variables (e.g. carbody, fuelsystem) receive an arbitrary ordering
# based on level order. Dummy variables may be more appropriate; the encoding
# is left unchanged here to preserve the reported results.
factor_cols <- vapply(x, is.factor, logical(1))
x[factor_cols] <- lapply(x[factor_cols], as.numeric)

# Normalize data with caret's preProcess() default settings
normalization <- preProcess(x)
x <- predict(normalization, x)
x <- as.data.frame(x) # predictors (independent variables)
y <- df[["price"]] # dependent variable

Apply RFE

# Recursive feature elimination using caret's linear-model helper functions.
# The seed is fixed first so the resampling can be reproduced.
set.seed(5)
rfe_settings <- rfeControl(functions = lmFuncs, rerank = TRUE, number = 200)
lmProfile2 <- rfe(x, y, sizes = c(10:15, 20, 23), rfeControl = rfe_settings)

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

# Variable importance from the RFE fit; the row names (variable names) are
# copied into an explicit `variable` column for plotting
lmpImp <- data.frame(varImp(lmProfile2))
lmpImp <- data.frame(variable = rownames(lmpImp), lmpImp)

# Variable importance plot: horizontal bars ordered by the Overall score
ggplot(lmpImp, aes(x = reorder(variable, Overall), y = Overall)) +
  geom_col() +
  coord_flip()

rownames(lmpImp) <- NULL
predictors(lmProfile2) # list of selected variables

##  [1] "enginesize"       "fueltype"         "compressionratio" "carwidth"        
##  [5] "curbweight"       "citympg"          "enginelocation"   "highwaympg"      
##  [9] "horsepower"       "wheelbase"        "peakrpm"          "carlength"

ggplot(lmProfile2) # RFE profile: RMSE against the number of variables in the model

ggplot(lmProfile2, metric = "Rsquared") # the same profile for R-squared

Random Forest

Selecting Variables from RFE and Train - Test Split

# Variables selected by RFE
selected_vars <- predictors(lmProfile2)

# Keep the dependent variable (price) and the row id (car_ID) next to the
# selected predictors; car_ID is needed to build the test set by anti-join.
selected_vars <- c(selected_vars, "price", "car_ID")
df3 <- df[, selected_vars]
library(dplyr)

# Train/test split: 75% of the rows are sampled for training and the rows
# whose car_ID was not sampled form the test set.
set.seed(125)

train <- df3 %>% dplyr::sample_frac(0.75)
test <- dplyr::anti_join(df3, train, by = "car_ID")

# Remember which car each row belongs to before the id column is removed
train_id <- data.frame(car_ID = train$car_ID)
test_id <- data.frame(car_ID = test$car_ID)

# Columns are removed and extracted by name, not by position, so the split
# stays correct if RFE selects a different number of predictors.
train <- train[, names(train) != "car_ID", drop = FALSE]
test <- test[, names(test) != "car_ID", drop = FALSE]

X_train <- train[, names(train) != "price", drop = FALSE]
Y_train <- train[["price"]]

X_test <- test[, names(test) != "price", drop = FALSE]
Y_test <- test[["price"]]

Vanilla Random Forest

# Train the model 
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

# Fit a random forest on the training predictors, leaving every tuning
# argument at its default
regr <- randomForest(x = X_train, y = Y_train)

# Predictions for the training rows
pred_train <- predict(regr, newdata = X_train)

# One row per training car: its id, actual price and predicted price
result_train <- data.frame(
  car_ID = train_id,
  price = Y_train,
  predictions = pred_train
)
head(result_train)

##   car_ID price predictions
## 1     30 12964   13409.197
## 2    191  9980    9980.839
## 3     72 34184   33986.350
## 4     24  7957    8263.308
## 5    179 16558   16903.626
## 6    121  6229    6440.978

# Predictions for the held-out test rows
pred_test <- predict(regr, newdata = X_test)

# One row per test car: its id, actual price and predicted price
result_test <- data.frame(
  car_ID = test_id,
  price = Y_test,
  predictions = pred_test
)
head(result_test)

##   car_ID price predictions
## 1      3 16500   15489.136
## 2      7 17710   19762.496
## 3     14 21105   19374.672
## 4     21  6575    6162.734
## 5     28  8558    8322.231
## 6     29  8921    9758.028

#Performance Metrics
library(Metrics)

## 
## Attaching package: 'Metrics'

## The following objects are masked from 'package:caret':
## 
##     precision, recall

# Root mean squared error on the test rows (actual price vs prediction)
test_rmse <- rmse(result_test$price, result_test$predictions)
print(paste0("Test RMSE: ", test_rmse))

## [1] "Test RMSE: 1701.49632721413"

# Root mean squared error on the training rows (actual price vs prediction)
train_rmse <- rmse(result_train$price, result_train$predictions)
print(paste0("Train RMSE: ", train_rmse))

## [1] "Train RMSE: 1162.73815748046"

# R-squared on the test rows, taken from caret::postResample()
test_r2 <- caret::postResample(
  pred = result_test$predictions,
  obs = result_test$price
)[["Rsquared"]]
print(paste0("Test R2: ", test_r2))

## [1] "Test R2: 0.952186518295217"

# R-squared on the training rows, taken from caret::postResample()
train_r2 <- caret::postResample(
  pred = result_train$predictions,
  obs = result_train$price
)[["Rsquared"]]
print(paste0("Train R2: ", train_r2))

## [1] "Train R2: 0.981086098247521"

# Actual price against predicted price for the test rows
ggplot(result_test, aes(x = predictions, y = price)) +
  geom_point()

Visualizing Actual and Predicted

library(reshape2)

# On test data: reshape to long form, keeping car_ID as the id column, so
# that price and predictions can be drawn as two coloured series
melt_pred_test <- melt(result_test, id.vars = "car_ID")

ggplot(melt_pred_test, aes(x = car_ID, y = value, colour = variable)) +
  geom_point() +
  geom_line() +
  ggtitle("Actual vs Predicted for Test Data")

# On train data: reshape to long form, keeping car_ID as the id column, so
# that price and predictions can be drawn as two coloured series
melt_pred_train <- melt(result_train, id.vars = "car_ID")

ggplot(melt_pred_train, aes(x = car_ID, y = value, colour = variable)) +
  geom_point() +
  geom_line() +
  ggtitle("Actual vs Predicted for Train Data")

Car Price Prediction

Mohammad Amir

09/09/2020

Libraries

Load Data

Summary Statistics of Data

Check for Missing Data

Convert Categorical Variables to Factor

Exploratory Data Analysis

Exploring the Effect of Car Dimensions on Price

Engine Technical Specifications on Price

Mean Price Based on Car Configuration

MPG with Price

Effect of Symboling on Price

Boxplots to Show Variation of Price with Different Factors

Correlation Analysis

Feature Selection

Preprocess the Data for RFE

Apply RFE

Random Forest

Selecting Variables from RFE and Train - Test Split

Vanilla Random Forest

Visualizing Actual and Predicted