Predictive Analysis Final-Term

I. DATA PREPARATION

str(train)
## 'data.frame':    6000 obs. of  13 variables:
##  $ name         : Factor w/ 30 levels "Amb","Ash","Aud",..: 28 11 29 11 30 8 11 11 18 11 ...
##  $ year         : int  2018 2018 2015 2015 2019 2015 2013 2017 2011 2012 ...
##  $ selling_price: int  850000 750000 1500000 795000 3800000 500000 325000 270000 174000 270000 ...
##  $ km_driven    : int  9500 30000 80000 35000 20000 96500 70000 50000 100000 56000 ...
##  $ fuel         : Factor w/ 4 levels "CNG","Diesel",..: 2 2 2 4 2 2 4 4 2 4 ...
##  $ seller_type  : Factor w/ 3 levels "Dealer","Individual",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ transmission : Factor w/ 2 levels "Automatic","Manual": 2 2 1 2 1 2 2 2 2 2 ...
##  $ owner        : Factor w/ 5 levels "First Owner",..: 1 1 3 1 1 1 1 1 3 1 ...
##  $ mileage      : num  21.5 21.2 12.6 17 18 ...
##  $ engine       : num  1497 1396 2982 1591 1969 ...
##  $ max_power    : num  108.5 88.8 168.5 121.3 190 ...
##  $ torque       : chr  "260Nm@ 1500-2750rpm" "219.66nm@ 1500-2750rpm" "360Nm@ 1400-3200rpm" "154.9Nm@ 4200rpm" ...
##  $ seats        : Factor w/ 9 levels "2","4","5","6",..: 3 3 5 3 3 3 3 3 3 3 ...
str(test)
## 'data.frame':    2128 obs. of  12 variables:
##  $ name        : Factor w/ 24 levels "Amb","Aud","BMW",..: 21 9 10 14 8 15 15 10 4 10 ...
##  $ year        : int  2016 2017 2014 2014 2015 2008 2007 2011 2013 2019 ...
##  $ km_driven   : int  11000 35000 80000 145241 92651 120000 70000 70000 56000 3500 ...
##  $ fuel        : Factor w/ 4 levels "CNG","Diesel",..: 4 4 4 2 2 4 4 4 2 4 ...
##  $ seller_type : Factor w/ 3 levels "Dealer","Individual",..: 1 2 2 2 1 2 2 2 1 2 ...
##  $ transmission: Factor w/ 2 levels "Automatic","Manual": 1 2 2 2 2 2 2 2 2 2 ...
##  $ owner       : Factor w/ 4 levels "First Owner",..: 1 1 1 1 1 3 4 1 1 1 ...
##  $ mileage     : num  14.3 17.8 19.1 12.1 21.7 ...
##  $ engine      : num  1598 1497 1197 2179 1498 ...
##  $ max_power   : num  104 117 82 120 99 ...
##  $ torque      : chr  "153Nm@ 3800rpm" "145Nm@ 4600rpm" "114Nm@ 4000rpm" "290Nm@ 1800-2800rpm" ...
##  $ seats       : Factor w/ 7 levels "4","5","6","7",..: 2 2 2 4 2 5 2 2 2 2 ...
# Descriptive statistics
summary(train)
##       name           year      selling_price        km_driven      
##  Mar    :1792   Min.   :1983   Min.   :   30000   Min.   :   1000  
##  Hyu    :1034   1st Qu.:2011   1st Qu.:  255750   1st Qu.:  35000  
##  Mah    : 595   Median :2015   Median :  450000   Median :  60000  
##  Tat    : 546   Mean   :2014   Mean   :  638776   Mean   :  69972  
##  Toy    : 366   3rd Qu.:2017   3rd Qu.:  675000   3rd Qu.:  99000  
##  Hon    : 335   Max.   :2020   Max.   :10000000   Max.   :1500000  
##  (Other):1332                                                      
##      fuel                seller_type      transmission 
##  CNG   :  44   Dealer          : 820   Automatic: 772  
##  Diesel:3265   Individual      :5006   Manual   :5228  
##  LPG   :  27   Trustmark Dealer: 174                   
##  Petrol:2664                                           
##                                                        
##                                                        
##                                                        
##                   owner         mileage          engine       max_power     
##  First Owner         :3904   Min.   : 0.00   Min.   : 624   Min.   :  0.00  
##  Fourth & Above Owner: 119   1st Qu.:16.78   1st Qu.:1197   1st Qu.: 68.05  
##  Second Owner        :1558   Median :19.30   Median :1248   Median : 82.00  
##  Test Drive Car      :   5   Mean   :19.36   Mean   :1464   Mean   : 91.59  
##  Third Owner         : 414   3rd Qu.:22.32   3rd Qu.:1582   3rd Qu.:102.00  
##                              Max.   :42.00   Max.   :3604   Max.   :400.00  
##                              NA's   :226     NA's   :159    NA's   :155     
##     torque              seats     
##  Length:6000        5      :4590  
##  Class :character   7      : 842  
##  Mode  :character   8      : 175  
##                     4      :  99  
##                     9      :  68  
##                     (Other):  67  
##                     NA's   : 159
# Check for missing values
colSums(is.na(train))
##          name          year selling_price     km_driven          fuel 
##             0             0             0             0             0 
##   seller_type  transmission         owner       mileage        engine 
##             0             0             0           226           159 
##     max_power        torque         seats 
##           155             0           159
# Replace NA values in the numeric columns with the median
train = train %>% 
  mutate(across(c(mileage, engine, max_power),
                ~replace_na(., median(., na.rm=TRUE))))

# Mode of a vector, ignoring NAs; returns the first mode in case of ties
find_mode <- function(x) {
  u <- unique(x[!is.na(x)])
  tab <- tabulate(match(x, u))
  u[which.max(tab)]
}
# Impute missing entries with the mode
replace_by_mode <- function(x) {
  x[is.na(x)] <- find_mode(x)
  return(x)
}
train <- train %>%
        mutate_if(is.factor, replace_by_mode)
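
A quick illustration of the helpers on hypothetical values (not from the data):

find_mode(factor(c("Diesel", "Petrol", "Petrol", NA)))        # "Petrol"
replace_by_mode(factor(c("Diesel", "Petrol", "Petrol", NA)))  # NA becomes "Petrol"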

# Remove "torque" variable
head(levels(as.factor(train$torque)), 10)
##  [1] ""                      "10.2@ 2,600(kgm@ rpm)" "10.4@ 3,200(kgm@ rpm)"
##  [4] "10.7@ 2,500(kgm@ rpm)" "100Nm@ 2700rpm"        "101Nm@ 3000rpm"       
##  [7] "102Nm@ 2600rpm"        "102Nm@ 4000rpm"        "103Nm@ 4500rpm"       
## [10] "104Nm@ 3100rpm"
train$torque <- NULL

# Summarize data
summary(train)
##       name           year      selling_price        km_driven      
##  Mar    :1792   Min.   :1983   Min.   :   30000   Min.   :   1000  
##  Hyu    :1034   1st Qu.:2011   1st Qu.:  255750   1st Qu.:  35000  
##  Mah    : 595   Median :2015   Median :  450000   Median :  60000  
##  Tat    : 546   Mean   :2014   Mean   :  638776   Mean   :  69972  
##  Toy    : 366   3rd Qu.:2017   3rd Qu.:  675000   3rd Qu.:  99000  
##  Hon    : 335   Max.   :2020   Max.   :10000000   Max.   :1500000  
##  (Other):1332                                                      
##      fuel                seller_type      transmission 
##  CNG   :  44   Dealer          : 820   Automatic: 772  
##  Diesel:3265   Individual      :5006   Manual   :5228  
##  LPG   :  27   Trustmark Dealer: 174                   
##  Petrol:2664                                           
##                                                        
##                                                        
##                                                        
##                   owner         mileage          engine       max_power     
##  First Owner         :3904   Min.   : 0.00   Min.   : 624   Min.   :  0.00  
##  Fourth & Above Owner: 119   1st Qu.:16.80   1st Qu.:1197   1st Qu.: 68.10  
##  Second Owner        :1558   Median :19.30   Median :1248   Median : 82.00  
##  Test Drive Car      :   5   Mean   :19.36   Mean   :1459   Mean   : 91.34  
##  Third Owner         : 414   3rd Qu.:22.07   3rd Qu.:1582   3rd Qu.:102.00  
##                              Max.   :42.00   Max.   :3604   Max.   :400.00  
##                                                                             
##      seats     
##  5      :4749  
##  7      : 842  
##  8      : 175  
##  4      :  99  
##  9      :  68  
##  6      :  49  
##  (Other):  18
colSums(is.na(train))
##          name          year selling_price     km_driven          fuel 
##             0             0             0             0             0 
##   seller_type  transmission         owner       mileage        engine 
##             0             0             0             0             0 
##     max_power         seats 
##             0             0
for (col in c('fuel', 'seller_type', 'transmission', 'owner', 'seats')) {
  # .data[[col]] is the tidy-eval replacement for the deprecated aes_string()
  plot <- ggplot(train,
                 aes(x = .data[[col]], y = selling_price,
                     group = .data[[col]], fill = .data[[col]])) +
    geom_boxplot(outlier.colour = 'red', outlier.size = 1, show.legend = FALSE) +
    ggtitle(glue::glue("Boxplot of Selling Price according to {col}"))
  print(plot)
}

# Remove outliers 
rm.outlier <- function(x){
  # Values outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are replaced by the mean
  Q <- quantile(x, probs = c(.25, .75), na.rm = TRUE)
  iqr <- IQR(x, na.rm = TRUE)

  above <- Q[2] + 1.5 * iqr
  below <- Q[1] - 1.5 * iqr
  x[x > above | x < below] <- mean(x, na.rm = TRUE)
  return(x)
}
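
A quick illustration on toy values (hypothetical, not from the data):

rm.outlier(c(1:10, 100))  # 100 exceeds Q3 + 1.5*IQR and becomes the mean, ~14.1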

train <- train %>% mutate_if(is.numeric, rm.outlier)
summary(train)
##       name           year      selling_price       km_driven          fuel     
##  Mar    :1792   Min.   :2002   Min.   :  30000   Min.   :  1000   CNG   :  44  
##  Hyu    :1034   1st Qu.:2012   1st Qu.: 255750   1st Qu.: 35000   Diesel:3265  
##  Mah    : 595   Median :2015   Median : 450000   Median : 60000   LPG   :  27  
##  Tat    : 546   Mean   :2014   Mean   : 465936   Mean   : 65907   Petrol:2664  
##  Toy    : 366   3rd Qu.:2017   3rd Qu.: 638776   3rd Qu.: 90000                
##  Hon    : 335   Max.   :2020   Max.   :1300000   Max.   :195000                
##  (Other):1332                                                                  
##            seller_type      transmission                   owner     
##  Dealer          : 820   Automatic: 772   First Owner         :3904  
##  Individual      :5006   Manual   :5228   Fourth & Above Owner: 119  
##  Trustmark Dealer: 174                    Second Owner        :1558  
##                                           Test Drive Car      :   5  
##                                           Third Owner         : 414  
##                                                                      
##                                                                      
##     mileage          engine       max_power          seats     
##  Min.   : 9.00   Min.   : 624   Min.   : 32.80   5      :4749  
##  1st Qu.:16.80   1st Qu.:1197   1st Qu.: 68.10   7      : 842  
##  Median :19.30   Median :1248   Median : 82.00   8      : 175  
##  Mean   :19.39   Mean   :1310   Mean   : 84.69   4      :  99  
##  3rd Qu.:22.07   3rd Qu.:1459   3rd Qu.: 93.70   9      :  68  
##  Max.   :28.40   Max.   :2148   Max.   :152.00   6      :  49  
##                                                  (Other):  18
# Apply the same cleaning steps to the test set
test$torque <- NULL
test = test %>% 
  mutate(across(c(mileage, engine, max_power),
                ~replace_na(., median(., na.rm=TRUE))))

test <- test %>%
        mutate_if(is.factor, replace_by_mode)
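
Mirroring the training-set check, we can confirm that no missing values remain (this output was not part of the original):

colSums(is.na(test))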

II. DATA VISUALIZATION

1. Single variables

1.1. Factor variables

#fuel
ggplot(train, aes(x = fuel, fill = fuel)) + geom_bar() +
  scale_fill_manual(values = c("CNG" = "#B5350D",
                                "Diesel" = "#6D1F06",
                                "LPG" = "#6B4B42",
                                "Petrol" = "#EC3D0C"))

Diesel and petrol cars are used by most drivers.

#seller_type
ggplot(train, aes(x = seller_type, fill = seller_type)) + geom_bar() +
  scale_fill_manual(values = c("Dealer" = "#B5350D",
                                "Individual" = "#6D1F06",
                                "Trustmark Dealer" = "#EC3D0C"))

Most used cars are sold by individual sellers.

#transmission
ggplot(train, aes(x = transmission, fill = transmission)) + geom_bar() +
  scale_fill_manual(values = c("Automatic" = "#6D1F06",
                                "Manual" = "#EC3D0C"))

Vehicles with manual transmission are used by most customers

#owner
ggplot(train, aes(x = owner, fill = owner)) + geom_bar() +
  scale_fill_manual(values = c("First Owner" = "#B5350D",
                               "Fourth & Above Owner" = "#6D1F06",
                               "Second Owner" = "#6B4B42",
                               "Test Drive Car" = "#EC3D0C",
                               "Third Owner" = "#D67878"))

First-owner cars are sold the most, followed by second-owner and then third-owner cars. Fourth & above owner and test-drive cars are sold far less often.

1.2. Numeric variables
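
For reference, the ten most frequent makes in the training set are tabulated below. The original chunk is not shown; a minimal sketch that reproduces the table:

name_freq <- as.data.frame(sort(table(train$name), decreasing = TRUE))
names(name_freq) <- c("name", "Freq")
head(name_freq, 10)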

##    name Freq
## 1   Mar 1792
## 2   Hyu 1034
## 3   Mah  595
## 4   Tat  546
## 5   Toy  366
## 6   Hon  335
## 7   For  293
## 8   Vol  186
## 9   Che  170
## 10  Ren  164

The left figure illustrates the probability density of the selling_price variable. Overall, the data lie roughly in the interval of 30,000 to 1,300,000. The majority of observations concentrate on the left-hand side of the plot; that is, the selling price of most used cars is approximately 125,000 to 750,000. The density pattern clearly does not match the overlaid curve (representing the normal distribution), so the selling_price variable does not follow a normal distribution. For greater certainty, the right figure shows a Q-Q plot. Once more, a relatively large share of the observed data does not lie on the line expected under normality.
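
The chunk that produced these two panels is not shown; a sketch of how they could be drawn, assuming the overlaid curve is a normal density with the sample mean and standard deviation:

p1 <- ggplot(train, aes(x = selling_price)) +
  geom_density(fill = "red", alpha = 0.4) +
  stat_function(fun = dnorm,
                args = list(mean = mean(train$selling_price),
                            sd = sd(train$selling_price)),
                colour = "black")
p2 <- ggplot(train, aes(sample = selling_price)) +
  stat_qq(colour = "red") + stat_qq_line()

grid.arrange(p1, p2, ncol = 2, nrow = 1)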

#km_driven
p1 <- ggplot(train, aes(x = km_driven)) +
  geom_histogram(bins = 50, fill = "red", alpha = 0.7) 

p2 <- ggplot(train,aes(km_driven,selling_price)) +
  geom_point(alpha=0.2, colour="red")

grid.arrange(p1, p2, ncol = 2, nrow = 1)

#mileage
p1 <- ggplot(train, aes(x = mileage)) +
  geom_histogram(bins = 50, fill = "red", alpha = 0.7) 

p2 <- ggplot(train,aes(mileage,selling_price)) +
  geom_point(alpha=0.2, colour="red")

grid.arrange(p1, p2, ncol = 2, nrow = 1)

#engine
p1 <- ggplot(train, aes(x = engine)) +
  geom_histogram(bins = 50, fill = "red", alpha = 0.7) 

p2 <- ggplot(train,aes(engine,selling_price)) +
  geom_point(alpha=0.2, colour="red")

grid.arrange(p1, p2, ncol = 2, nrow = 1)

#max_power
p1 <- ggplot(train, aes(x = max_power)) +
  geom_histogram(bins = 50, fill = "red", alpha = 0.7) 

p2 <- ggplot(train,aes(max_power,selling_price)) +
  geom_point(alpha=0.2, colour="red")

grid.arrange(p1, p2, ncol = 2, nrow = 1)

2. Multiple variables

library(corrplot)
corrplot(cor(train[sapply(train, is.numeric)]),
         method = "color", col = COL1(n = 20),
         order = 'AOE', addCoef.col = 'white')

III. BUILDING PREDICTIVE MODELS

set.seed(1)
train_index <- createDataPartition(train$selling_price,
                                   times = 1,
                                   p = 0.8,
                                   list = FALSE)
train1 <- train[train_index, ]
test1 <- train[-train_index, ]

1. Generalized additive models (GAM)

The generalized additive model (GAM) is an extension of the standard linear model. Where a linear model forces each predictor to enter linearly, a GAM lets each predictor enter through its own function, linear or nonlinear: \[ Y_i = \beta_0 + f_1(X_{i1}) + f_2(X_{i2}) + \dots + f_p(X_{ip}) + \varepsilon_i \] where each \(f_j\) can be a constant, a polynomial, a natural spline, a smoothing spline, etc.

gam1 <- gam(selling_price ~ name + s(year) + s(km_driven) + fuel + seller_type + transmission + owner + s(mileage) + s(engine) + s(max_power) + seats,
           data = train1, select=TRUE)  # to improve the model, try changing df = 1, 2, 3
gam1.pred <- predict(gam1, newdata = test1)
RMSE(gam1.pred, test1$selling_price)
## [1] 135203
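
Assuming gam() here comes from the mgcv package (consistent with the select = TRUE argument above), the estimated smooth terms can be inspected directly:

# Estimated smooth effects of year, km_driven, mileage, engine and max_power
plot(gam1, pages = 1, shade = TRUE)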

2. Decision tree model

We build a decision tree model; among the candidates tried, the tree using all variables gives the smallest RMSE.

# tree() takes plain predictors (s() smooths are gam-specific)
tree1 = tree(selling_price ~ name + year + km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats,
  data = train1,
  control = tree.control(
    nobs = nrow(train1),
    mincut = 2,
    minsize = 4,
    mindev = 0.003
  )
)

We use cross-validation to find the appropriate number of leaves (terminal nodes) for the model.

set.seed(1)
cv.tree.all = cv.tree(tree1, K = 5)
l = cv.tree.all$size[which.min(cv.tree.all$dev)]
l
## [1] 26
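
One way to visualize the cross-validation curve behind this choice (a sketch; this plot is not part of the original output):

plot(cv.tree.all$size, cv.tree.all$dev, type = "b",
     xlab = "Tree size (number of leaves)", ylab = "CV deviance")
abline(v = l, lty = 2)  # selected size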

Prune the tree to the optimal number of leaves:

tree1 = prune.tree(tree1, best = l)
plot(tree1)
text(tree1, pretty = 0)

tree1.pred = predict(tree1, test1)
RMSE(tree1.pred, test1$selling_price)
## [1] 132187.6

3. Ensemble methods

3.1. Random forest

Random forest is another ensemble machine-learning algorithm based on the bagging technique; it is an extension of the bagging estimator. The base estimators in a random forest are decision tree models. At each node of each tree, the random forest randomly selects a subset of features to be used to decide the best split.

This is implemented as follows:

  • Step 1: Divide the original data set into random subsets.

  • Step 2: At each node in a decision tree, only a random subset of features is considered when deciding the best split.

  • Step 3: A decision tree model is fitted on each subset.

  • Step 4: The final prediction is calculated by averaging the predictions from all decision tree models.

library(rpart.plot)
# A single CART tree, shown for illustration (the random forest itself is fitted below)
cart_fit <- rpart(selling_price ~ ., data = train1, method = "anova")
rpart.plot(cart_fit, type = 3)

rf_fit=randomForest(selling_price~.,data=train1,
                    mtry=4,
                    ntree=10)
rf_fit
## 
## Call:
##  randomForest(formula = selling_price ~ ., data = train1, mtry = 4,      ntree = 10) 
##                Type of random forest: regression
##                      Number of trees: 10
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 9302932747
##                     % Var explained: 85.13
plot(rf_fit)

rf_pred <- predict(rf_fit,test1)
RMSE(rf_pred,test1$selling_price)
## [1] 97869.32
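
mtry was fixed at 4 above. randomForest also provides tuneRF() to search for this value; a sketch (no such tuning was done in the original report):

set.seed(1)
tuned <- tuneRF(x = train1[, names(train1) != "selling_price"],
                y = train1$selling_price,
                ntreeTry = 100, stepFactor = 1.5, improve = 0.01)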

3.2. Bagging

bag.fit<-randomForest(selling_price~.,data=train1,
                      mtry=ncol(train1)-1,
                      importance=TRUE,
                      ntree=200)
bag.pred<-predict(bag.fit,newdata=test1)
sqrt(mean((bag.pred-test1$selling_price)^2))
## [1] 94581.97
RMSE(bag.pred, test1$selling_price)
## [1] 94581.97
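
Because the bagged forest was fitted with importance = TRUE, the contribution of each predictor can be inspected (output not shown in the original report):

importance(bag.fit)   # %IncMSE and IncNodePurity per predictor
varImpPlot(bag.fit)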

3.3. Boosting

Gradient Boosting, or GBM, is another ensemble machine-learning algorithm used for both regression and classification. GBM uses the boosting technique, which combines several weak learners into a strong learner. It usually uses a decision tree as the base model; each subsequent tree is built on the errors computed by the previous tree.

In random forests and bagging, the individual models are trained independently, neither relating to nor affecting one another. This can lead to poor results when the trained models all produce similar predictions, and we cannot control which submodels end up in the bag. What we would like is for weak models to support each other and learn from each other's mistakes, which bagging cannot provide. Boosting was designed to overcome these limitations: the basic idea is to create a sequence of weak models that learn to complement one another, so that each later model tries to correct the errors of the earlier ones.

boost.fit <- gbm(selling_price ~.,
                 data = train1,
                 n.trees = 5000,
                 interaction.depth = 5,
                 shrinkage = 0.005
)
## Distribution not specified, assuming gaussian ...
boost.pred <- predict(boost.fit, newdata = test1, n.trees = 5000)
RMSE(boost.pred, test1$selling_price)
## [1] 98747.8
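
gbm's summary() method reports the relative influence of each predictor (output not shown in the original report):

summary(boost.fit, n.trees = 5000)  # relative influence, sorted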

The performance of our models (how well they predict the target variable) can be measured using the root mean squared error:

\[ RMSE = \sqrt{\frac{\sum_{i=1}^{n}(\text{prediction}_i - \text{actual}_i)^2}{n}} \]

RMSE is preferable to MAE (mean absolute error) here because RMSE squares the difference between the actual and predicted values, so predictions with larger errors are penalized more heavily. This metric is often used to compare two or more alternative models, even though it is harder to interpret than MAE. We can use the RMSE() function from the caret package.
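
For clarity, a minimal sketch: computing the formula by hand gives the same number as caret's RMSE(), as the bagging section above already demonstrated:

rmse <- function(pred, obs) sqrt(mean((pred - obs)^2))
rmse(bag.pred, test1$selling_price)  # equals RMSE(bag.pred, test1$selling_price)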

table1<-data.frame(
  Model=c("GAM","DecisionTree","RandomForest","Bagging","Boosting"),
  RMSE=c(RMSE(gam1.pred, test1$selling_price),RMSE(tree1.pred, test1$selling_price),RMSE(rf_pred,test1$selling_price),RMSE(bag.pred, test1$selling_price), RMSE(boost.pred, test1$selling_price)))

kbl(table1, caption = "") %>%
  row_spec(row =0, bold= TRUE, color = "black", background = "#eaf9f3") %>%
  kable_styling(bootstrap_options = "striped", full_width = F, position = "center")
Model          RMSE
GAM            135203.02
DecisionTree   132187.64
RandomForest    97869.32
Bagging         94581.97
Boosting        98747.80

Bagging has the smallest RMSE.

4. Predict the selling price in the test dataset

# Ensure consistent numeric types
test$year <- as.numeric(test$year)
test$km_driven <- as.numeric(test$km_driven)

# Align factor levels between train1 and test: temporarily bind one training
# row (without the response) to the test set, then drop it again
selling_price = train1$selling_price
train1$selling_price <- NULL
test <- rbind(train1[1, ], test)
test <- test[-1, ]
train1$selling_price <- selling_price  # restore the response before refitting

bag.fit<-randomForest(selling_price~.,data=train1,
                      mtry=ncol(train1)-1,
                      importance=TRUE,
                      ntree=200)
bag.pred<-predict(bag.fit,newdata=test)
write.csv(bag.pred, file = "selling_price.csv", row.names = FALSE)  # write.csv fixes the separator to ","