library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(ggplot2)
library(gridExtra)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- read.csv("Vehicle_Retail_Price_Assignment.csv")
dim(df) #number of rows and columns in data
## [1] 205 26
head(df) #first 6 rows of data
## car_ID symboling CarName fueltype aspiration doornumber
## 1 1 3 alfa-romero giulia gas std two
## 2 2 3 alfa-romero stelvio gas std two
## 3 3 1 alfa-romero Quadrifoglio gas std two
## 4 4 2 audi 100 ls gas std four
## 5 5 2 audi 100ls gas std four
## 6 6 2 audi fox gas std two
## carbody drivewheel enginelocation wheelbase carlength carwidth carheight
## 1 convertible rwd front 88.6 168.8 64.1 48.8
## 2 convertible rwd front 88.6 168.8 64.1 48.8
## 3 hatchback rwd front 94.5 171.2 65.5 52.4
## 4 sedan fwd front 99.8 176.6 66.2 54.3
## 5 sedan 4wd front 99.4 176.6 66.4 54.3
## 6 sedan fwd front 99.8 177.3 66.3 53.1
## curbweight enginetype cylindernumber enginesize fuelsystem boreratio stroke
## 1 2548 dohc four 130 mpfi 3.47 2.68
## 2 2548 dohc four 130 mpfi 3.47 2.68
## 3 2823 ohcv six 152 mpfi 2.68 3.47
## 4 2337 ohc four 109 mpfi 3.19 3.40
## 5 2824 ohc five 136 mpfi 3.19 3.40
## 6 2507 ohc five 136 mpfi 3.19 3.40
## compressionratio horsepower peakrpm citympg highwaympg price
## 1 9.0 111 5000 21 27 13495
## 2 9.0 111 5000 21 27 16500
## 3 9.0 154 5000 19 26 16500
## 4 10.0 102 5500 24 30 13950
## 5 8.0 115 5500 18 22 17450
## 6 8.5 110 5500 19 25 15250
str(df) #structure of data
## 'data.frame': 205 obs. of 26 variables:
## $ car_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ symboling : int 3 3 1 2 2 2 1 1 1 0 ...
## $ CarName : chr "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
## $ fueltype : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ doornumber : chr "two" "two" "two" "four" ...
## $ carbody : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drivewheel : chr "rwd" "rwd" "rwd" "fwd" ...
## $ enginelocation : chr "front" "front" "front" "front" ...
## $ wheelbase : num 88.6 88.6 94.5 99.8 99.4 ...
## $ carlength : num 169 169 171 177 177 ...
## $ carwidth : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ carheight : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ curbweight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ enginetype : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ cylindernumber : chr "four" "four" "six" "four" ...
## $ enginesize : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuelsystem : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ boreratio : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ compressionratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ horsepower : int 111 111 154 102 115 110 110 110 140 160 ...
## $ peakrpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ citympg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ highwaympg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : num 13495 16500 16500 13950 17450 ...
summary(df) #statistical summary of data
## car_ID symboling CarName fueltype
## Min. : 1 Min. :-2.0000 Length:205 Length:205
## 1st Qu.: 52 1st Qu.: 0.0000 Class :character Class :character
## Median :103 Median : 1.0000 Mode :character Mode :character
## Mean :103 Mean : 0.8341
## 3rd Qu.:154 3rd Qu.: 2.0000
## Max. :205 Max. : 3.0000
## aspiration doornumber carbody drivewheel
## Length:205 Length:205 Length:205 Length:205
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## enginelocation wheelbase carlength carwidth
## Length:205 Min. : 86.60 Min. :141.1 Min. :60.30
## Class :character 1st Qu.: 94.50 1st Qu.:166.3 1st Qu.:64.10
## Mode :character Median : 97.00 Median :173.2 Median :65.50
## Mean : 98.76 Mean :174.0 Mean :65.91
## 3rd Qu.:102.40 3rd Qu.:183.1 3rd Qu.:66.90
## Max. :120.90 Max. :208.1 Max. :72.30
## carheight curbweight enginetype cylindernumber
## Min. :47.80 Min. :1488 Length:205 Length:205
## 1st Qu.:52.00 1st Qu.:2145 Class :character Class :character
## Median :54.10 Median :2414 Mode :character Mode :character
## Mean :53.72 Mean :2556
## 3rd Qu.:55.50 3rd Qu.:2935
## Max. :59.80 Max. :4066
## enginesize fuelsystem boreratio stroke
## Min. : 61.0 Length:205 Min. :2.54 Min. :2.070
## 1st Qu.: 97.0 Class :character 1st Qu.:3.15 1st Qu.:3.110
## Median :120.0 Mode :character Median :3.31 Median :3.290
## Mean :126.9 Mean :3.33 Mean :3.255
## 3rd Qu.:141.0 3rd Qu.:3.58 3rd Qu.:3.410
## Max. :326.0 Max. :3.94 Max. :4.170
## compressionratio horsepower peakrpm citympg
## Min. : 7.00 Min. : 48.0 Min. :4150 Min. :13.00
## 1st Qu.: 8.60 1st Qu.: 70.0 1st Qu.:4800 1st Qu.:19.00
## Median : 9.00 Median : 95.0 Median :5200 Median :24.00
## Mean :10.14 Mean :104.1 Mean :5125 Mean :25.22
## 3rd Qu.: 9.40 3rd Qu.:116.0 3rd Qu.:5500 3rd Qu.:30.00
## Max. :23.00 Max. :288.0 Max. :6600 Max. :49.00
## highwaympg price
## Min. :16.00 Min. : 5118
## 1st Qu.:25.00 1st Qu.: 7788
## Median :30.00 Median :10295
## Mean :30.75 Mean :13277
## 3rd Qu.:34.00 3rd Qu.:16503
## Max. :54.00 Max. :45400
#checking for NAs
colSums(is.na(df)) #there is no NA's in data
## car_ID symboling CarName fueltype
## 0 0 0 0
## aspiration doornumber carbody drivewheel
## 0 0 0 0
## enginelocation wheelbase carlength carwidth
## 0 0 0 0
## carheight curbweight enginetype cylindernumber
## 0 0 0 0
## enginesize fuelsystem boreratio stroke
## 0 0 0 0
## compressionratio horsepower peakrpm citympg
## 0 0 0 0
## highwaympg price
## 0 0
#convert character variables in factor
df[sapply(df, is.character)] <- lapply(df[sapply(df, is.character)],
as.factor)
df$symboling <- as.factor(df$symboling)
str(df)
## 'data.frame': 205 obs. of 26 variables:
## $ car_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ symboling : Factor w/ 6 levels "-2","-1","0",..: 6 6 4 5 5 5 4 4 4 3 ...
## $ CarName : Factor w/ 147 levels "alfa-romero giulia",..: 1 3 2 4 5 9 5 7 6 8 ...
## $ fueltype : Factor w/ 2 levels "diesel","gas": 2 2 2 2 2 2 2 2 2 2 ...
## $ aspiration : Factor w/ 2 levels "std","turbo": 1 1 1 1 1 1 1 1 2 2 ...
## $ doornumber : Factor w/ 2 levels "four","two": 2 2 2 1 1 2 1 1 1 2 ...
## $ carbody : Factor w/ 5 levels "convertible",..: 1 1 3 4 4 4 4 5 4 3 ...
## $ drivewheel : Factor w/ 3 levels "4wd","fwd","rwd": 3 3 3 2 1 2 2 2 2 1 ...
## $ enginelocation : Factor w/ 2 levels "front","rear": 1 1 1 1 1 1 1 1 1 1 ...
## $ wheelbase : num 88.6 88.6 94.5 99.8 99.4 ...
## $ carlength : num 169 169 171 177 177 ...
## $ carwidth : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ carheight : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ curbweight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ enginetype : Factor w/ 7 levels "dohc","dohcv",..: 1 1 6 4 4 4 4 4 4 4 ...
## $ cylindernumber : Factor w/ 7 levels "eight","five",..: 3 3 4 3 2 2 2 2 2 2 ...
## $ enginesize : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuelsystem : Factor w/ 8 levels "1bbl","2bbl",..: 6 6 6 6 6 6 6 6 6 6 ...
## $ boreratio : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ compressionratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ horsepower : int 111 111 154 102 115 110 110 110 140 160 ...
## $ peakrpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ citympg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ highwaympg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : num 13495 16500 16500 13950 17450 ...
#effect of car dimensions on price
d1 <- ggplot(df, aes(x = wheelbase, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
d2 <- ggplot(df, aes(x = carlength, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
d3 <- ggplot(df, aes(x = carwidth, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
d4 <- ggplot(df, aes(x = carheight, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
d5 <- ggplot(df, aes(x = curbweight, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
grid.arrange(d1, d2, d3, d4, d5, top = "Variation of Price with Car Dimensions")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#engine technical specifications
et1 <- ggplot(df, aes(x = boreratio, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
et2 <- ggplot(df, aes(x = stroke, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
et3 <- ggplot(df, aes(x = compressionratio, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
et4 <- ggplot(df, aes(x = horsepower, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
et5 <- ggplot(df, aes(x = peakrpm, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
grid.arrange(et1, et2, et3, et4, et5, top = "Price with Technical Engine Specification")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
c1 <- df %>% group_by(doornumber) %>% summarise(mean_price = mean(price)) %>% {
ggplot(., aes(x = doornumber, y = mean_price, fill = doornumber)) +
geom_bar(stat = "identity") +
theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
c2 <- df %>% group_by(carbody) %>% summarise(mean_price = mean(price)) %>% {
ggplot(., aes(x = carbody, y = mean_price, fill = carbody)) +
geom_bar(stat = "identity") +
theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
c3 <- df %>% group_by(drivewheel) %>% summarise(mean_price = mean(price)) %>% {
ggplot(., aes(x = drivewheel, y = mean_price, fill = drivewheel)) +
geom_bar(stat = "identity") +
theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
c4 <- df %>% group_by(enginelocation) %>% summarise(mean_price = mean(price)) %>% {
ggplot(., aes(x = enginelocation, y = mean_price, fill = enginelocation)) +
geom_bar(stat = "identity") +
theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
grid.arrange(c1, c2, c3, c4, top = "Mean Price for Different Configuations")
#mpg with price
mpg1 <- ggplot(df, aes(x = citympg, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
mpg2 <- ggplot(df, aes(x = highwaympg, y = price)) +
geom_point() +
stat_smooth(method="lm", se=F) + theme_minimal()
grid.arrange(mpg1, mpg2, top = "Price with MPG")
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'
#symboling
df %>% group_by(symboling) %>% summarise(mean_price = mean(price)) %>% {
ggplot(., aes(x = symboling, y = mean_price, fill = symboling)) +
geom_bar(stat = "identity") +
theme_minimal() + theme(legend.position = "none")
}
## `summarise()` ungrouping output (override with `.groups` argument)
#boxplots
box1 <- df %>% group_by(symboling) %>% {
ggplot(., aes(x = symboling, y = price, fill = symboling)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box1
box2 <- df %>% group_by(fueltype) %>% {
ggplot(., aes(x = fueltype, y = price, fill = fueltype)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box3 <- df %>% group_by(aspiration) %>% {
ggplot(., aes(x = aspiration, y = price, fill = aspiration)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box4 <- df %>% group_by(doornumber) %>% {
ggplot(., aes(x = doornumber, y = price, fill = doornumber)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box5 <- df %>% group_by(carbody) %>% {
ggplot(., aes(x = carbody, y = price, fill = carbody)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box6 <- df %>% group_by(drivewheel) %>% {
ggplot(., aes(x = drivewheel, y = price, fill = drivewheel)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box7 <- df %>% group_by(enginelocation) %>% {
ggplot(., aes(x = enginelocation, y = price, fill = enginelocation)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box8 <- df %>% group_by(enginetype) %>% {
ggplot(., aes(x = enginetype, y = price, fill = enginetype)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box9 <- df %>% group_by(cylindernumber) %>% {
ggplot(., aes(x = cylindernumber, y = price, fill = cylindernumber)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
box10 <- df %>% group_by(fuelsystem) %>% {
ggplot(., aes(x = fuelsystem, y = price, fill = fuelsystem)) +
geom_boxplot() +
theme_minimal() + theme(legend.position = "none")
}
grid.arrange(box4, box5, box6, box7, top = "Variation of Price for Different Configurations")
grid.arrange(box2, box3, box8, box9, box10, top = "Variation of Price for Different Engine Specifications")
#correlation Analysis
numericData <- df[,sapply(df, is.numeric)] #filter all numeric vars
numericData <- numericData[, -c(1, 15)] #drop the id column and dependent var
library(corrplot)
## corrplot 0.84 loaded
corMat <- cor(numericData) #correlation matrix
corrplot(corMat, method = "number", type = "lower") #plot of corr matrix
highlyCorrelated <- findCorrelation(corMat, cutoff = 0.7) #find highly correlated
highlyCorCol <- colnames(numericData)[highlyCorrelated]
highlyCorCol
## [1] "curbweight" "carlength" "carwidth" "highwaympg" "enginesize"
## [6] "citympg"
Feature selection is done using RFE (Recursive Feature Elimination)
library(caret)
x <- df[, -c(1, 3, 26)] #drop car_ID, CarName and Price (Dependent Var)
#Convert Fators to Numeric for lmFuncs
x$symboling <- as.numeric(x$symboling)
x$fueltype <- as.numeric(x$fueltype)
x$aspiration<- as.numeric(x$aspiration)
x$doornumber<- as.numeric(x$doornumber)
x$carbody <- as.numeric(x$carbody)
x$drivewheel<- as.numeric(x$drivewheel)
x$enginelocation<- as.numeric(x$enginelocation)
x$enginetype<- as.numeric(x$enginetype)
x$cylindernumber<- as.numeric(x$cylindernumber)
x$fuelsystem<- as.numeric(x$fuelsystem)
#Normalize Data
normalization <- preProcess(x)
x <- predict(normalization, x)
x <- as.data.frame(x) #Predictors (Independent Vars)
y <- df[, 26] #Dependent Variable
set.seed(5)
lmProfile2 <- rfe(x, y,
sizes = c(10:15, 20, 23),
rfeControl = rfeControl(functions = lmFuncs,
rerank = TRUE,
number = 200))
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
## Warning in predict.lm(object, x): prediction from a rank-deficient fit may be
## misleading
lmpImp <- data.frame(varImp(lmProfile2))
lmpImp <- data.frame(variable = rownames(lmpImp), lmpImp)
ggplot(data=lmpImp,
aes(x=reorder(variable,
Overall),
y=Overall)) +
geom_bar(stat="identity") +
coord_flip() #Variable importance plot
rownames(lmpImp) <- NULL
predictors(lmProfile2) #list of selected variables
## [1] "enginesize" "fueltype" "compressionratio" "carwidth"
## [5] "curbweight" "citympg" "enginelocation" "highwaympg"
## [9] "horsepower" "wheelbase" "peakrpm" "carlength"
ggplot(lmProfile2) #plot of RMSE with number of variables in model
ggplot(lmProfile2, metric = "Rsquared") #plot of r-squared
selected_vars <- predictors(lmProfile2)
#list of selected variables from RFE
selected_vars <- append(selected_vars, c("price", "car_ID")) #add price to data
df3 <- df[, selected_vars]
library(dplyr)
#train test split
set.seed(125)
train <- df3 %>% dplyr::sample_frac(.75)
test <- dplyr::anti_join(df3, train, by = 'car_ID')
train_id <- data.frame(car_ID = train$car_ID)
test_id <- data.frame(car_ID = test$car_ID)
train <- train[,-14]
test <- test[, -14]
X_train <- train[, -13]
Y_train <- train[, 13]
X_test <- test[, -13]
Y_test <- test[, 13]
# Train the model
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
regr <- randomForest(x = X_train,
y = Y_train)
# Make prediction Train
pred_train <- predict(regr, X_train)
result_train <- data.frame(car_ID = train_id,
price = Y_train,
predictions = pred_train)
head(result_train)
## car_ID price predictions
## 1 30 12964 13409.197
## 2 191 9980 9980.839
## 3 72 34184 33986.350
## 4 24 7957 8263.308
## 5 179 16558 16903.626
## 6 121 6229 6440.978
# Make prediction Test
pred_test <- predict(regr, X_test)
result_test <- data.frame(car_ID = test_id,
price = Y_test,
predictions = pred_test)
head(result_test)
## car_ID price predictions
## 1 3 16500 15489.136
## 2 7 17710 19762.496
## 3 14 21105 19374.672
## 4 21 6575 6162.734
## 5 28 8558 8322.231
## 6 29 8921 9758.028
#Performance Matrices
library(Metrics)
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
print(paste0('Test RMSE: ' , rmse(result_test$price,
result_test$predictions) )) #testRMSE
## [1] "Test RMSE: 1701.49632721413"
print(paste0('Train RMSE: ' , rmse(result_train$price,
result_train$predictions) )) #RMSE
## [1] "Train RMSE: 1162.73815748046"
print(paste0('Test R2: ' ,
caret::postResample(result_test$predictions , result_test$price)['Rsquared'] ))
## [1] "Test R2: 0.952186518295217"
print(paste0('Train R2: ' ,
caret::postResample(result_train$predictions , result_train$price)['Rsquared'] ))
## [1] "Train R2: 0.981086098247521"
ggplot(result_test, aes(x = predictions, y = price)) + geom_point()
library(reshape2)
#On Test Data
melt_pred_test <- melt(result_test, id.vars = "car_ID")
ggplot(melt_pred_test,
aes(y = value,
x = car_ID,
colour = variable)) +
geom_point() +
geom_line() +
ggtitle("Actual vs Predicted for Test Data")
#On Train Data
melt_pred_train <- melt(result_train, id.vars = "car_ID")
ggplot(melt_pred_train,
aes(y = value,
x = car_ID,
colour = variable)) +
geom_point() +
geom_line() +
ggtitle("Actual vs Predicted for Train Data")