Libraries

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(gridExtra)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Load Data

df <- read.csv("Vehicle_Retail_Price_Assignment.csv")

Summary Statistics of Data

dim(df) #number of rows and columns in data
## [1] 205  26
head(df) #first 6 rows of data
##   car_ID symboling                  CarName fueltype aspiration doornumber
## 1      1         3       alfa-romero giulia      gas        std        two
## 2      2         3      alfa-romero stelvio      gas        std        two
## 3      3         1 alfa-romero Quadrifoglio      gas        std        two
## 4      4         2              audi 100 ls      gas        std       four
## 5      5         2               audi 100ls      gas        std       four
## 6      6         2                 audi fox      gas        std        two
##       carbody drivewheel enginelocation wheelbase carlength carwidth carheight
## 1 convertible        rwd          front      88.6     168.8     64.1      48.8
## 2 convertible        rwd          front      88.6     168.8     64.1      48.8
## 3   hatchback        rwd          front      94.5     171.2     65.5      52.4
## 4       sedan        fwd          front      99.8     176.6     66.2      54.3
## 5       sedan        4wd          front      99.4     176.6     66.4      54.3
## 6       sedan        fwd          front      99.8     177.3     66.3      53.1
##   curbweight enginetype cylindernumber enginesize fuelsystem boreratio stroke
## 1       2548       dohc           four        130       mpfi      3.47   2.68
## 2       2548       dohc           four        130       mpfi      3.47   2.68
## 3       2823       ohcv            six        152       mpfi      2.68   3.47
## 4       2337        ohc           four        109       mpfi      3.19   3.40
## 5       2824        ohc           five        136       mpfi      3.19   3.40
## 6       2507        ohc           five        136       mpfi      3.19   3.40
##   compressionratio horsepower peakrpm citympg highwaympg price
## 1              9.0        111    5000      21         27 13495
## 2              9.0        111    5000      21         27 16500
## 3              9.0        154    5000      19         26 16500
## 4             10.0        102    5500      24         30 13950
## 5              8.0        115    5500      18         22 17450
## 6              8.5        110    5500      19         25 15250
str(df) #structure of data
## 'data.frame':    205 obs. of  26 variables:
##  $ car_ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ symboling       : int  3 3 1 2 2 2 1 1 1 0 ...
##  $ CarName         : chr  "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
##  $ fueltype        : chr  "gas" "gas" "gas" "gas" ...
##  $ aspiration      : chr  "std" "std" "std" "std" ...
##  $ doornumber      : chr  "two" "two" "two" "four" ...
##  $ carbody         : chr  "convertible" "convertible" "hatchback" "sedan" ...
##  $ drivewheel      : chr  "rwd" "rwd" "rwd" "fwd" ...
##  $ enginelocation  : chr  "front" "front" "front" "front" ...
##  $ wheelbase       : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ carlength       : num  169 169 171 177 177 ...
##  $ carwidth        : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ carheight       : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curbweight      : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ enginetype      : chr  "dohc" "dohc" "ohcv" "ohc" ...
##  $ cylindernumber  : chr  "four" "four" "six" "four" ...
##  $ enginesize      : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : chr  "mpfi" "mpfi" "mpfi" "mpfi" ...
##  $ boreratio       : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower      : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peakrpm         : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ citympg         : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highwaympg      : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num  13495 16500 16500 13950 17450 ...
summary(df) #statistical summary of data
##      car_ID      symboling         CarName            fueltype        
##  Min.   :  1   Min.   :-2.0000   Length:205         Length:205        
##  1st Qu.: 52   1st Qu.: 0.0000   Class :character   Class :character  
##  Median :103   Median : 1.0000   Mode  :character   Mode  :character  
##  Mean   :103   Mean   : 0.8341                                        
##  3rd Qu.:154   3rd Qu.: 2.0000                                        
##  Max.   :205   Max.   : 3.0000                                        
##   aspiration         doornumber          carbody           drivewheel       
##  Length:205         Length:205         Length:205         Length:205        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  enginelocation       wheelbase        carlength        carwidth    
##  Length:205         Min.   : 86.60   Min.   :141.1   Min.   :60.30  
##  Class :character   1st Qu.: 94.50   1st Qu.:166.3   1st Qu.:64.10  
##  Mode  :character   Median : 97.00   Median :173.2   Median :65.50  
##                     Mean   : 98.76   Mean   :174.0   Mean   :65.91  
##                     3rd Qu.:102.40   3rd Qu.:183.1   3rd Qu.:66.90  
##                     Max.   :120.90   Max.   :208.1   Max.   :72.30  
##    carheight       curbweight    enginetype        cylindernumber    
##  Min.   :47.80   Min.   :1488   Length:205         Length:205        
##  1st Qu.:52.00   1st Qu.:2145   Class :character   Class :character  
##  Median :54.10   Median :2414   Mode  :character   Mode  :character  
##  Mean   :53.72   Mean   :2556                                        
##  3rd Qu.:55.50   3rd Qu.:2935                                        
##  Max.   :59.80   Max.   :4066                                        
##    enginesize     fuelsystem          boreratio        stroke     
##  Min.   : 61.0   Length:205         Min.   :2.54   Min.   :2.070  
##  1st Qu.: 97.0   Class :character   1st Qu.:3.15   1st Qu.:3.110  
##  Median :120.0   Mode  :character   Median :3.31   Median :3.290  
##  Mean   :126.9                      Mean   :3.33   Mean   :3.255  
##  3rd Qu.:141.0                      3rd Qu.:3.58   3rd Qu.:3.410  
##  Max.   :326.0                      Max.   :3.94   Max.   :4.170  
##  compressionratio   horsepower       peakrpm        citympg     
##  Min.   : 7.00    Min.   : 48.0   Min.   :4150   Min.   :13.00  
##  1st Qu.: 8.60    1st Qu.: 70.0   1st Qu.:4800   1st Qu.:19.00  
##  Median : 9.00    Median : 95.0   Median :5200   Median :24.00  
##  Mean   :10.14    Mean   :104.1   Mean   :5125   Mean   :25.22  
##  3rd Qu.: 9.40    3rd Qu.:116.0   3rd Qu.:5500   3rd Qu.:30.00  
##  Max.   :23.00    Max.   :288.0   Max.   :6600   Max.   :49.00  
##    highwaympg        price      
##  Min.   :16.00   Min.   : 5118  
##  1st Qu.:25.00   1st Qu.: 7788  
##  Median :30.00   Median :10295  
##  Mean   :30.75   Mean   :13277  
##  3rd Qu.:34.00   3rd Qu.:16503  
##  Max.   :54.00   Max.   :45400

Check for Missing Data

#checking for NAs
colSums(is.na(df)) #there is no NA's in data
##           car_ID        symboling          CarName         fueltype 
##                0                0                0                0 
##       aspiration       doornumber          carbody       drivewheel 
##                0                0                0                0 
##   enginelocation        wheelbase        carlength         carwidth 
##                0                0                0                0 
##        carheight       curbweight       enginetype   cylindernumber 
##                0                0                0                0 
##       enginesize       fuelsystem        boreratio           stroke 
##                0                0                0                0 
## compressionratio       horsepower          peakrpm          citympg 
##                0                0                0                0 
##       highwaympg            price 
##                0                0

Convert Categorical Variables to Factor

#convert character variables in factor
df[sapply(df, is.character)] <- lapply(df[sapply(df, is.character)], 
                                       as.factor)
df$symboling <- as.factor(df$symboling)
str(df)
## 'data.frame':    205 obs. of  26 variables:
##  $ car_ID          : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ symboling       : Factor w/ 6 levels "-2","-1","0",..: 6 6 4 5 5 5 4 4 4 3 ...
##  $ CarName         : Factor w/ 147 levels "alfa-romero giulia",..: 1 3 2 4 5 9 5 7 6 8 ...
##  $ fueltype        : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
##  $ aspiration      : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
##  $ doornumber      : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
##  $ carbody         : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
##  $ drivewheel      : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
##  $ enginelocation  : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
##  $ wheelbase       : num  88.6 88.6 94.5 99.8 99.4 ...
##  $ carlength       : num  169 169 171 177 177 ...
##  $ carwidth        : num  64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
##  $ carheight       : num  48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
##  $ curbweight      : int  2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
##  $ enginetype      : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
##  $ cylindernumber  : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
##  $ enginesize      : int  130 130 152 109 136 136 136 136 131 131 ...
##  $ fuelsystem      : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ boreratio       : num  3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
##  $ stroke          : num  2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
##  $ compressionratio: num  9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
##  $ horsepower      : int  111 111 154 102 115 110 110 110 140 160 ...
##  $ peakrpm         : int  5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
##  $ citympg         : int  21 21 19 24 18 19 19 19 17 16 ...
##  $ highwaympg      : int  27 27 26 30 22 25 25 25 20 22 ...
##  $ price           : num  13495 16500 16500 13950 17450 ...

Exploratory Data Analysis

Exploring the Effect of Car Dimensions on Price

#effect of car dimensions on price
d1 <- ggplot(df, aes(x = wheelbase, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

d2 <- ggplot(df, aes(x = carlength, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

d3 <- ggplot(df, aes(x = carwidth,  y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

d4 <- ggplot(df, aes(x = carheight, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

d5 <- ggplot(df, aes(x = curbweight, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()


grid.arrange(d1, d2, d3, d4, d5, top = "Variation of Price with Car Dimensions")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Engine Technical Specifications on Price

#engine technical specifications
et1 <- ggplot(df, aes(x = boreratio, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

et2 <- ggplot(df, aes(x = stroke, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

et3 <- ggplot(df, aes(x = compressionratio, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

et4 <- ggplot(df, aes(x = horsepower, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

et5 <- ggplot(df, aes(x = peakrpm, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

grid.arrange(et1, et2, et3, et4, et5, top = "Price with Technical Engine Specification")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Mean Price Based on Engine Specification

c1 <- df %>% group_by(doornumber) %>% summarise(mean_price = mean(price)) %>% {
      ggplot(., aes(x = doornumber, y = mean_price, fill = doornumber)) + 
      geom_bar(stat = "identity") +
      theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
c2 <- df %>% group_by(carbody) %>% summarise(mean_price = mean(price)) %>% {
  ggplot(., aes(x = carbody, y = mean_price, fill = carbody)) + 
    geom_bar(stat = "identity") +
    theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
c3 <- df %>% group_by(drivewheel) %>% summarise(mean_price = mean(price)) %>% {
  ggplot(., aes(x = drivewheel, y = mean_price, fill = drivewheel)) + 
    geom_bar(stat = "identity") +
    theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
c4 <- df %>% group_by(enginelocation) %>% summarise(mean_price = mean(price)) %>% {
  ggplot(., aes(x = enginelocation, y = mean_price, fill = enginelocation)) + 
    geom_bar(stat = "identity") +
    theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
grid.arrange(c1, c2, c3, c4, top = "Mean Price for Different Configuations")

MPG with Price

#mpg with price
mpg1 <- ggplot(df, aes(x = citympg, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

mpg2 <- ggplot(df, aes(x = highwaympg, y = price)) + 
  geom_point() + 
  stat_smooth(method="lm", se=F) + theme_minimal()

grid.arrange(mpg1, mpg2, top = "Price with MPG")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

Effect of Symboling on Price

#symboling
df %>% group_by(symboling) %>% summarise(mean_price = mean(price)) %>% {
  ggplot(., aes(x = symboling, y = mean_price, fill = symboling)) + 
    geom_bar(stat = "identity") +
    theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)

Boxplots to Show Variation of Price with Different Factors

#boxplots
box1 <- df %>% group_by(symboling)  %>% {
        ggplot(., aes(x = symboling, y = price, fill = symboling)) + 
        geom_boxplot() +
        theme_minimal() + theme(legend.position = "none")
}
box1

box2 <- df %>% group_by(fueltype)  %>% {
    ggplot(., aes(x = fueltype, y = price, fill = fueltype)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box3 <- df %>% group_by(aspiration)  %>% {
  ggplot(., aes(x = aspiration, y = price, fill = aspiration)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box4 <- df %>% group_by(doornumber)  %>% {
  ggplot(., aes(x = doornumber, y = price, fill = doornumber)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box5 <- df %>% group_by(carbody)  %>% {
  ggplot(., aes(x = carbody, y = price, fill = carbody)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box6 <- df %>% group_by(drivewheel)  %>% {
  ggplot(., aes(x = drivewheel, y = price, fill = drivewheel)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box7 <- df %>% group_by(enginelocation)  %>% {
  ggplot(., aes(x = enginelocation, y = price, fill = enginelocation)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box8 <- df %>% group_by(enginetype)  %>% {
  ggplot(., aes(x = enginetype, y = price, fill = enginetype)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box9 <- df %>% group_by(cylindernumber)  %>% {
  ggplot(., aes(x = cylindernumber, y = price, fill = cylindernumber)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}

box10 <- df %>% group_by(fuelsystem)  %>% {
  ggplot(., aes(x = fuelsystem, y = price, fill = fuelsystem)) + 
    geom_boxplot() +
    theme_minimal() + theme(legend.position = "none")
}
grid.arrange(box4, box5, box6, box7, top = "Variation of Price for Different Configurations")

grid.arrange(box2, box3, box8, box9, box10, top = "Variation of Price for Different Engine Specifications")

Correlation Analysis

#correlation Analysis
numericData <- df[,sapply(df, is.numeric)] #filter all numeric vars
numericData <- numericData[, -c(1, 15)] #drop the id column and dependent var

library(corrplot)
## corrplot 0.84 loaded
corMat <- cor(numericData) #correlation matrix
corrplot(corMat, method = "number", type = "lower") #plot of corr matrix

highlyCorrelated <- findCorrelation(corMat, cutoff = 0.7) #find highly correlated
highlyCorCol <- colnames(numericData)[highlyCorrelated]
highlyCorCol
## [1] "curbweight" "carlength"  "carwidth"   "highwaympg" "enginesize"
## [6] "citympg"

Feature Selection

Feature selection is done using RFE (Recursive Feature Elimination)

Preprocess the Data for RFE

library(caret)
x <- df[, -c(1, 3, 26)] #drop car_ID, CarName and Price (Dependent Var)

#Convert Fators to Numeric for lmFuncs
x$symboling <- as.numeric(x$symboling)
x$fueltype  <- as.numeric(x$fueltype)
x$aspiration<- as.numeric(x$aspiration)
x$doornumber<- as.numeric(x$doornumber)
x$carbody   <- as.numeric(x$carbody)
x$drivewheel<- as.numeric(x$drivewheel)
x$enginelocation<- as.numeric(x$enginelocation)
x$enginetype<- as.numeric(x$enginetype)
x$cylindernumber<- as.numeric(x$cylindernumber)
x$fuelsystem<- as.numeric(x$fuelsystem)

#Normalize Data
normalization <- preProcess(x)
x <- predict(normalization, x)
x <- as.data.frame(x) #Predictors (Independent Vars)
y <- df[, 26] #Dependent Variable

Apply RFE

set.seed(5)
lmProfile2 <- rfe(x, y,
                  sizes = c(10:15, 20, 23),
                  rfeControl = rfeControl(functions = lmFuncs,
                                          rerank = TRUE,
                                          number = 200))
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading

## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
lmpImp <- data.frame(varImp(lmProfile2))
lmpImp <- data.frame(variable = rownames(lmpImp), lmpImp)

ggplot(data=lmpImp, 
       aes(x=reorder(variable, 
                     Overall), 
           y=Overall)) +
  geom_bar(stat="identity") + 
  coord_flip()                 #Variable importance plot

rownames(lmpImp) <- NULL
predictors(lmProfile2) #list of selected variables
##  [1] "enginesize"       "fueltype"         "compressionratio" "carwidth"        
##  [5] "curbweight"       "citympg"          "enginelocation"   "highwaympg"      
##  [9] "horsepower"       "wheelbase"        "peakrpm"          "carlength"
ggplot(lmProfile2) #plot of RMSE with number of variables in model

ggplot(lmProfile2, metric = "Rsquared") #plot of r-squared

Random Forest

Selecting Variables from RFE and Train - Test Split

selected_vars <- predictors(lmProfile2)
#list of selected variables from RFE

selected_vars <- append(selected_vars, c("price", "car_ID")) #add price to data
df3 <- df[, selected_vars]
library(dplyr)

#train test split
set.seed(125)

train <- df3 %>% dplyr::sample_frac(.75)
test  <- dplyr::anti_join(df3, train, by = 'car_ID')

train_id <- data.frame(car_ID = train$car_ID)
test_id  <- data.frame(car_ID = test$car_ID)

train <- train[,-14]
test  <- test[, -14]

X_train <- train[, -13]
Y_train <- train[, 13]

X_test <- test[, -13]
Y_test <- test[, 13]

Vanilla Random Forest

# Train the model 
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
regr <- randomForest(x = X_train, 
                     y = Y_train)

# Make prediction Train
pred_train <- predict(regr, X_train)

result_train <- data.frame(car_ID = train_id, 
                     price = Y_train,
                     predictions = pred_train)
head(result_train)
##   car_ID price predictions
## 1     30 12964   13409.197
## 2    191  9980    9980.839
## 3     72 34184   33986.350
## 4     24  7957    8263.308
## 5    179 16558   16903.626
## 6    121  6229    6440.978
# Make prediction Test
pred_test <- predict(regr, X_test)

result_test <- data.frame(car_ID = test_id, 
                     price = Y_test,
                     predictions = pred_test)
head(result_test)
##   car_ID price predictions
## 1      3 16500   15489.136
## 2      7 17710   19762.496
## 3     14 21105   19374.672
## 4     21  6575    6162.734
## 5     28  8558    8322.231
## 6     29  8921    9758.028
#Performance Matrices
library(Metrics)
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
print(paste0('Test RMSE: ' , rmse(result_test$price,
                                  result_test$predictions) )) #testRMSE
## [1] "Test RMSE: 1701.49632721413"
print(paste0('Train RMSE: ' , rmse(result_train$price,
                                  result_train$predictions) )) #RMSE
## [1] "Train RMSE: 1162.73815748046"
print(paste0('Test R2: ' ,
             caret::postResample(result_test$predictions , result_test$price)['Rsquared'] ))
## [1] "Test R2: 0.952186518295217"
print(paste0('Train R2: ' ,
             caret::postResample(result_train$predictions , result_train$price)['Rsquared'] ))
## [1] "Train R2: 0.981086098247521"
ggplot(result_test, aes(x = predictions, y = price)) + geom_point()

Visualizing Actual and Predicted

library(reshape2)

#On Test Data
melt_pred_test <- melt(result_test, id.vars = "car_ID")

ggplot(melt_pred_test, 
       aes(y = value, 
           x = car_ID,
           colour = variable)) +
      geom_point() +
      geom_line() +
  ggtitle("Actual vs Predicted for Test Data")

#On Train Data
melt_pred_train <- melt(result_train, id.vars = "car_ID")

ggplot(melt_pred_train, 
       aes(y = value, 
           x = car_ID,
           colour = variable)) +
      geom_point() +
      geom_line() +
  ggtitle("Actual vs Predicted for Train Data")