Predictive Analysis Final-Term
I. DATA PREPARATION
str(train)
## 'data.frame': 6000 obs. of 13 variables:
## $ name : Factor w/ 30 levels "Amb","Ash","Aud",..: 28 11 29 11 30 8 11 11 18 11 ...
## $ year : int 2018 2018 2015 2015 2019 2015 2013 2017 2011 2012 ...
## $ selling_price: int 850000 750000 1500000 795000 3800000 500000 325000 270000 174000 270000 ...
## $ km_driven : int 9500 30000 80000 35000 20000 96500 70000 50000 100000 56000 ...
## $ fuel : Factor w/ 4 levels "CNG","Diesel",..: 2 2 2 4 2 2 4 4 2 4 ...
## $ seller_type : Factor w/ 3 levels "Dealer","Individual",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ transmission : Factor w/ 2 levels "Automatic","Manual": 2 2 1 2 1 2 2 2 2 2 ...
## $ owner : Factor w/ 5 levels "First Owner",..: 1 1 3 1 1 1 1 1 3 1 ...
## $ mileage : num 21.5 21.2 12.6 17 18 ...
## $ engine : num 1497 1396 2982 1591 1969 ...
## $ max_power : num 108.5 88.8 168.5 121.3 190 ...
## $ torque : chr "260Nm@ 1500-2750rpm" "219.66nm@ 1500-2750rpm" "360Nm@ 1400-3200rpm" "154.9Nm@ 4200rpm" ...
## $ seats : Factor w/ 9 levels "2","4","5","6",..: 3 3 5 3 3 3 3 3 3 3 ...
str(test)
## 'data.frame': 2128 obs. of 12 variables:
## $ name : Factor w/ 24 levels "Amb","Aud","BMW",..: 21 9 10 14 8 15 15 10 4 10 ...
## $ year : int 2016 2017 2014 2014 2015 2008 2007 2011 2013 2019 ...
## $ km_driven : int 11000 35000 80000 145241 92651 120000 70000 70000 56000 3500 ...
## $ fuel : Factor w/ 4 levels "CNG","Diesel",..: 4 4 4 2 2 4 4 4 2 4 ...
## $ seller_type : Factor w/ 3 levels "Dealer","Individual",..: 1 2 2 2 1 2 2 2 1 2 ...
## $ transmission: Factor w/ 2 levels "Automatic","Manual": 1 2 2 2 2 2 2 2 2 2 ...
## $ owner : Factor w/ 4 levels "First Owner",..: 1 1 1 1 1 3 4 1 1 1 ...
## $ mileage : num 14.3 17.8 19.1 12.1 21.7 ...
## $ engine : num 1598 1497 1197 2179 1498 ...
## $ max_power : num 104 117 82 120 99 ...
## $ torque : chr "153Nm@ 3800rpm" "145Nm@ 4600rpm" "114Nm@ 4000rpm" "290Nm@ 1800-2800rpm" ...
## $ seats : Factor w/ 7 levels "4","5","6","7",..: 2 2 2 4 2 5 2 2 2 2 ...
# Descriptive statistics
summary(train)
## name year selling_price km_driven
## Mar :1792 Min. :1983 Min. : 30000 Min. : 1000
## Hyu :1034 1st Qu.:2011 1st Qu.: 255750 1st Qu.: 35000
## Mah : 595 Median :2015 Median : 450000 Median : 60000
## Tat : 546 Mean :2014 Mean : 638776 Mean : 69972
## Toy : 366 3rd Qu.:2017 3rd Qu.: 675000 3rd Qu.: 99000
## Hon : 335 Max. :2020 Max. :10000000 Max. :1500000
## (Other):1332
## fuel seller_type transmission
## CNG : 44 Dealer : 820 Automatic: 772
## Diesel:3265 Individual :5006 Manual :5228
## LPG : 27 Trustmark Dealer: 174
## Petrol:2664
##
##
##
## owner mileage engine max_power
## First Owner :3904 Min. : 0.00 Min. : 624 Min. : 0.00
## Fourth & Above Owner: 119 1st Qu.:16.78 1st Qu.:1197 1st Qu.: 68.05
## Second Owner :1558 Median :19.30 Median :1248 Median : 82.00
## Test Drive Car : 5 Mean :19.36 Mean :1464 Mean : 91.59
## Third Owner : 414 3rd Qu.:22.32 3rd Qu.:1582 3rd Qu.:102.00
## Max. :42.00 Max. :3604 Max. :400.00
## NA's :226 NA's :159 NA's :155
## torque seats
## Length:6000 5 :4590
## Class :character 7 : 842
## Mode :character 8 : 175
## 4 : 99
## 9 : 68
## (Other): 67
## NA's : 159
# Check NA
colSums(is.na(train))
## name year selling_price km_driven fuel
## 0 0 0 0 0
## seller_type transmission owner mileage engine
## 0 0 0 226 159
## max_power torque seats
## 155 0 159
# Replace NA values by median
train = train %>%
mutate(across(c(mileage, engine, max_power),
~replace_na(., median(., na.rm=TRUE))))
# Replace NA values in factor variables by the mode
find_mode <- function(x) {
  u <- unique(x[!is.na(x)])          # candidate levels, ignoring NA
  tab <- tabulate(match(x, u))
  u[which.max(tab)]                  # return a single mode (first one in case of ties)
}
replace_by_mode <- function(x) {
  x[is.na(x)] <- find_mode(x)
  return(x)
}
train <- train %>%
  mutate_if(is.factor, replace_by_mode)
# Remove "torque" variable
head(levels(as.factor(train$torque)), 10)
## [1] "" "10.2@ 2,600(kgm@ rpm)" "10.4@ 3,200(kgm@ rpm)"
## [4] "10.7@ 2,500(kgm@ rpm)" "100Nm@ 2700rpm" "101Nm@ 3000rpm"
## [7] "102Nm@ 2600rpm" "102Nm@ 4000rpm" "103Nm@ 4500rpm"
## [10] "104Nm@ 3100rpm"
train$torque <- NULL
# Summarize data
summary(train)
## name year selling_price km_driven
## Mar :1792 Min. :1983 Min. : 30000 Min. : 1000
## Hyu :1034 1st Qu.:2011 1st Qu.: 255750 1st Qu.: 35000
## Mah : 595 Median :2015 Median : 450000 Median : 60000
## Tat : 546 Mean :2014 Mean : 638776 Mean : 69972
## Toy : 366 3rd Qu.:2017 3rd Qu.: 675000 3rd Qu.: 99000
## Hon : 335 Max. :2020 Max. :10000000 Max. :1500000
## (Other):1332
## fuel seller_type transmission
## CNG : 44 Dealer : 820 Automatic: 772
## Diesel:3265 Individual :5006 Manual :5228
## LPG : 27 Trustmark Dealer: 174
## Petrol:2664
##
##
##
## owner mileage engine max_power
## First Owner :3904 Min. : 0.00 Min. : 624 Min. : 0.00
## Fourth & Above Owner: 119 1st Qu.:16.80 1st Qu.:1197 1st Qu.: 68.10
## Second Owner :1558 Median :19.30 Median :1248 Median : 82.00
## Test Drive Car : 5 Mean :19.36 Mean :1459 Mean : 91.34
## Third Owner : 414 3rd Qu.:22.07 3rd Qu.:1582 3rd Qu.:102.00
## Max. :42.00 Max. :3604 Max. :400.00
##
## seats
## 5 :4749
## 7 : 842
## 8 : 175
## 4 : 99
## 9 : 68
## 6 : 49
## (Other): 18
colSums(is.na(train))
## name year selling_price km_driven fuel
## 0 0 0 0 0
## seller_type transmission owner mileage engine
## 0 0 0 0 0
## max_power seats
## 0 0
for (col in c('fuel', 'seller_type', 'transmission', 'owner','seats')) {
plot <- ggplot(train,
aes_string(x = col, y = 'selling_price', group = col, fill = col)) +
stat_boxplot(outlier.colour = 'red',outlier.size = 1, show.legend = FALSE) +
ggtitle(glue::glue("Boxplot of Selling Price according to {col}"))
print(plot)
}
# Remove outliers
rm.outlier <- function(x){
Q <- quantile(x, probs=c(.25, .75), na.rm = FALSE)
Iqr = IQR(x)
above = Q[2] + 1.5*Iqr
below = Q[1] - 1.5*Iqr
x[x > above | x < below] <- mean(x, na.rm = TRUE)
return(x)
}
train <- train %>% mutate_if(is.numeric, rm.outlier)
summary(train)
## name year selling_price km_driven fuel
## Mar :1792 Min. :2002 Min. : 30000 Min. : 1000 CNG : 44
## Hyu :1034 1st Qu.:2012 1st Qu.: 255750 1st Qu.: 35000 Diesel:3265
## Mah : 595 Median :2015 Median : 450000 Median : 60000 LPG : 27
## Tat : 546 Mean :2014 Mean : 465936 Mean : 65907 Petrol:2664
## Toy : 366 3rd Qu.:2017 3rd Qu.: 638776 3rd Qu.: 90000
## Hon : 335 Max. :2020 Max. :1300000 Max. :195000
## (Other):1332
## seller_type transmission owner
## Dealer : 820 Automatic: 772 First Owner :3904
## Individual :5006 Manual :5228 Fourth & Above Owner: 119
## Trustmark Dealer: 174 Second Owner :1558
## Test Drive Car : 5
## Third Owner : 414
##
##
## mileage engine max_power seats
## Min. : 9.00 Min. : 624 Min. : 32.80 5 :4749
## 1st Qu.:16.80 1st Qu.:1197 1st Qu.: 68.10 7 : 842
## Median :19.30 Median :1248 Median : 82.00 8 : 175
## Mean :19.39 Mean :1310 Mean : 84.69 4 : 99
## 3rd Qu.:22.07 3rd Qu.:1459 3rd Qu.: 93.70 9 : 68
## Max. :28.40 Max. :2148 Max. :152.00 6 : 49
## (Other): 18
# Test Set
test$torque <- NULL
test = test %>%
mutate(across(c(mileage, engine, max_power),
~replace_na(., median(., na.rm=TRUE))))
test <- test %>%
mutate_if(is.factor, replace_by_mode)
II. DATA VISUALIZATION
1. Single variables
1.1. Factor variable
#fuel
ggplot(train, aes(x = fuel, fill = fuel)) + geom_bar() +
scale_fill_manual(values = c("CNG" = "#B5350D",
"Diesel" = "#6D1F06",
"LPG" = "#6B4B42",
"Petrol" = "#EC3D0C"))Diesel aand petrol fuel cars are used by most divers
#seller_type
ggplot(train, aes(x = seller_type, fill = seller_type)) + geom_bar() +
scale_fill_manual(values = c("Dealer" = "#B5350D",
"Individual" = "#6D1F06",
"Trustmark Dealer" = "#EC3D0C"))Most used cars are sold by individual seller
#transmission
ggplot(train, aes(x = transmission, fill = transmission)) + geom_bar() +
scale_fill_manual(values = c("Automatic" = "#6D1F06",
"Manual" = "#EC3D0C"))Vehicles with manual transmission are used by most customers
#owner
ggplot(train, aes(x = owner, fill = owner)) + geom_bar() +
scale_fill_manual(values = c("First Owner" = "#B5350D",
"Fourth & Above Owner" = "#6D1F06",
"Second Owner" = "#6B4B42",
"Test Drive Car" = "#EC3D0C",
"Third Owner" = "#D67878"))First owner cars are sold the most, then second owner cars and the third ower cars. Fourth & above owner and test drive cars are sold fewer ### 1.2. Numeric variable
## name Freq
## 1 Mar 1792
## 2 Hyu 1034
## 3 Mah 595
## 4 Tat 546
## 5 Toy 366
## 6 Hon 335
## 7 For 293
## 8 Vol 186
## 9 Che 170
## 10 Ren 164
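The code for this brand frequency table is not echoed above; a minimal sketch that could produce an equivalent table (the helper data frame name name_freq is an assumption) is:
# Sketch only: top-10 car brands by frequency in the training set
name_freq <- as.data.frame(table(name = train$name))
head(name_freq[order(-name_freq$Freq), ], 10)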
The left figure illustrates the probability density of the selling_price variable. In general, the data lie in the interval from about 3,000 to 1,210,000. The majority of the observations concentrate on the left-hand side of the plot; that is, the selling price of most used cars is approximately 125,000 to 750,000. The density pattern clearly does not match the overlaid curve (representing the normal distribution), so the selling_price variable does not follow a normal distribution. For further confirmation, the right figure shows a Q-Q plot. Once again, a relatively large portion of the observed data does not lie on the line expected under a normal distribution.
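The code for these two panels is not echoed above; a minimal sketch that could reproduce them (assuming ggplot2 and gridExtra are loaded, as elsewhere in this report, and that the reference curve is a normal density with the sample mean and standard deviation) is:
# Sketch only: density of selling_price with a reference normal curve (left)
# and a normal Q-Q plot (right)
p1 <- ggplot(train, aes(x = selling_price)) +
  geom_density(fill = "red", alpha = 0.4) +
  stat_function(fun = dnorm,
                args = list(mean = mean(train$selling_price),
                            sd = sd(train$selling_price)),
                colour = "black", linetype = "dashed") +
  ggtitle("Density of selling_price")
p2 <- ggplot(train, aes(sample = selling_price)) +
  stat_qq(colour = "red", alpha = 0.3) +
  stat_qq_line() +
  ggtitle("Normal Q-Q plot of selling_price")
grid.arrange(p1, p2, ncol = 2, nrow = 1)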
#km_driven
p1 <- ggplot(train, aes(x = km_driven)) +
geom_histogram(bins = 50, fill = "red", alpha = 0.7)
p2 <- ggplot(train,aes(km_driven,selling_price)) +
geom_point(alpha=0.2, colour="red")
grid.arrange(p1, p2, ncol = 2, nrow = 1)
#mileage
p1 <- ggplot(train, aes(x = mileage)) +
geom_histogram(bins = 50, fill = "red", alpha = 0.7)
p2 <- ggplot(train,aes(mileage,selling_price)) +
geom_point(alpha=0.2, colour="red")
grid.arrange(p1, p2, ncol = 2, nrow = 1)
#engine
p1 <- ggplot(train, aes(x = engine)) +
geom_histogram(bins = 50, fill = "red", alpha = 0.7)
p2 <- ggplot(train,aes(engine,selling_price)) +
geom_point(alpha=0.2, colour="red")
grid.arrange(p1, p2, ncol = 2, nrow = 1)
#max_power
p1 <- ggplot(train, aes(x = max_power)) +
geom_histogram(bins = 50, fill = "red", alpha = 0.7)
p2 <- ggplot(train,aes(max_power,selling_price)) +
geom_point(alpha=0.2, colour="red")
grid.arrange(p1, p2, ncol = 2, nrow = 1)
2. Multivariable
library(corrplot)
corrplot(cor(train[sapply(train, is.numeric)]), method = "color", col = COL1(n = 20), order = 'AOE', addCoef.col = 'white')
III. BUILDING PREDICTIVE MODELS
set.seed(1)
train_index <- createDataPartition(train$selling_price,
                                   times = 1,
                                   p = 0.8,
                                   list = FALSE)
train1 <- train[train_index, ]
test1 <- train[-train_index, ]
1. Generalized additive models (GAM)
The generalized additive model (GAM) is an extension of the standard linear model. Whereas in a linear model each independent variable enters only through a linear term, in a GAM each independent variable enters through its own function, which may be linear or nonlinear. \[ Y_i = \beta_0 + f_1(X_{i1}) + f_2(X_{i2}) + ... + f_p(X_{ip}) + \varepsilon_i \] where each \(f_j\) can be a constant, a polynomial, a natural spline, a smoothing spline, etc.
gam1 <- gam(selling_price ~ name + s(year) + s(km_driven) + fuel + seller_type + transmission + owner + s(mileage) + s(engine) + s(max_power) + seats,
            data = train1, select = TRUE) # to improve the model, try changing df = 1, 2, 3
gam1.pred <- predict(gam1, newdata = test1)
RMSE(gam1.pred, test1$selling_price)
## [1] 135203
2. Decision tree model
Among the decision tree models we fitted, the tree that uses all variables gives the smallest RMSE.
tree1 = tree(selling_price ~ name + year + km_driven + fuel + seller_type + transmission + owner + mileage + engine + max_power + seats,
             data = train,
             control = tree.control(
               nobs = nrow(train),
               mincut = 2,
               minsize = 4,
               mindev = 0.003
             )
)
Use the cross-validation method to find the appropriate number of leaves for the model.
set.seed(1)
cv.tree.all = cv.tree(tree1, K = 5)
l = cv.tree.all$size[which.min(cv.tree.all$dev)]
l
## [1] 26
Build the decision tree model with the optimal number of leaves.
tree1 = prune.tree(tree1, best = l)
plot(tree1)
text(tree1, pretty = 0)
tree1.pred = predict(tree1, test1)
RMSE(tree1.pred, test1$selling_price)
## [1] 132187.6
3. Ensemble methods
3.1. Random forest
Random forest is another ensemble machine learning algorithm based on the bagging technique; it is an extension of the bagging estimator. The base estimators in a random forest are decision trees, and the algorithm randomly selects a subset of features to decide the best split at each node of each tree.
This is implemented as follows:
Step 1: Divide the original data set into random subsets (bootstrap samples).
Step 2: At each node of a decision tree, only a random subset of features is considered when deciding the best split.
Step 3: A decision tree model is fitted on each subset.
Step 4: The final prediction is obtained by averaging the predictions of all the decision tree models.
library(rpart.plot)
rf_fit<-rpart(selling_price~.,data=train1,method="anova")
rpart.plot(rf_fit, type = 3)
rf_fit = randomForest(selling_price ~ ., data = train1,
mtry=4,
ntree=10)
rf_fit
##
## Call:
## randomForest(formula = selling_price ~ ., data = train1, mtry = 4, ntree = 10)
## Type of random forest: regression
## Number of trees: 10
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 9302932747
## % Var explained: 85.13
plot(rf_fit)
rf_pred <- predict(rf_fit, test1)
RMSE(rf_pred, test1$selling_price)
## [1] 97869.32
3.2. Bagging
bag.fit<-randomForest(selling_price~.,data=train1,
mtry=ncol(train1)-1,
importance=TRUE,
ntree=200)
bag.pred<-predict(bag.fit,newdata=test1)
sqrt(mean((bag.pred - test1$selling_price)^2))
## [1] 94581.97
RMSE(bag.pred, test1$selling_price)
## [1] 94581.97
3.3 Boosting
Gradient boosting (GBM) is another ensemble machine learning algorithm used for both regression and classification. GBM uses the boosting technique, which combines several weak learners into a strong learner. It usually uses a decision tree as the base model, and each subsequent tree is built on the errors computed by the previous tree.
In random forests and bagging, the individual models are trained separately; they do not affect one another, which can lead to poor results when the trained models produce very similar predictions. We cannot control which submodels are added to a bagging ensemble, even though we would like the weak models to complement one another and learn from the mistakes of earlier models. Boosting was designed to address this limitation: it builds a sequence of weak models in which each later model tries to correct the errors of the earlier ones.
boost.fit <- gbm(selling_price ~.,
data = train1,
n.trees = 5000,
interaction.depth = 5,
shrinkage = 0.005
)
## Distribution not specified, assuming gaussian ...
boost.pred <- predict(boost.fit, newdata = test1, n.trees = 5000)
RMSE(boost.pred, test1$selling_price)
## [1] 98747.8
The performance of the models (how well they predict the target variable) can be measured with the root mean squared error:
\[ RMSE = \sqrt{\frac{\sum_i (prediction_i - actual_i)^2}{n}} \]
RMSE is preferred here over MAE (mean absolute error) because it squares the difference between the actual and predicted values, so predictions with larger errors are penalized more heavily. This metric is often used to compare two or more alternative models, even though it is harder to interpret than MAE. We use the RMSE() function from the caret package.
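As a quick sanity check, the formula above can also be computed by hand and compared with caret::RMSE(); a small sketch using the GAM predictions already in memory:
# Sketch only: manual RMSE versus caret::RMSE() on the GAM predictions
rmse_manual <- sqrt(mean((gam1.pred - test1$selling_price)^2))
rmse_caret <- RMSE(gam1.pred, test1$selling_price)
all.equal(rmse_manual, rmse_caret) # expected to be TRUE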
table1<-data.frame(
Model=c("GAM","DecisionTree","RandomForest","Bagging","Boosting"),
RMSE=c(RMSE(gam1.pred, test1$selling_price),RMSE(tree1.pred, test1$selling_price),RMSE(rf_pred,test1$selling_price),RMSE(bag.pred, test1$selling_price), RMSE(boost.pred, test1$selling_price)))
kbl(table1, caption = "") %>%
row_spec(row =0, bold= TRUE, color = "black", background = "#eaf9f3") %>%
kable_styling(bootstrap_options = "striped", full_width = F, position = "center")
| Model | RMSE |
|---|---|
| GAM | 135203.02 |
| DecisionTree | 132187.64 |
| RandomForest | 97869.32 |
| Bagging | 94581.97 |
| Boosting | 98747.80 |
Bagging has the smallest RMSE, so the bagging model is used for the final predictions.
4. Predict the used cars in the test dataset
test$year <- as.numeric(test$year)
test$km_driven <- as.numeric(test$km_driven)
# Align the factor levels of test with those of train1: temporarily drop the
# response, bind one training row onto test so the factor levels match,
# then remove that row again and restore the response.
selling_price = train1$selling_price
train1$selling_price <- NULL
test <- rbind(train1[1, ], test)
test <- test[-1, ]
train1$selling_price <- selling_price
bag.fit <- randomForest(selling_price ~ ., data = train1,
                        mtry = ncol(train1) - 1,
                        importance = TRUE,
                        ntree = 200)
bag.pred <- predict(bag.fit, newdata = test)
write.csv(bag.pred, file = "selling_price.csv", row.names = FALSE)