# Load the data
accidents.df <- mlba::AccidentsFull
head(accidents.df)
## HOUR_I_R ALCHL_I ALIGN_I STRATUM_R WRK_ZONE WKDY_I_R INT_HWY LGTCON_I_R
## 1 0 2 2 1 0 1 0 3
## 2 1 2 1 0 0 1 1 3
## 3 1 2 1 0 0 1 0 3
## 4 1 2 1 1 0 0 0 3
## 5 1 1 1 0 0 1 0 3
## 6 1 2 1 1 0 1 0 3
## MANCOL_I_R PED_ACC_R RELJCT_I_R REL_RWY_R PROFIL_I_R SPD_LIM SUR_COND
## 1 0 0 1 0 1 40 4
## 2 2 0 1 1 1 70 4
## 3 2 0 1 1 1 35 4
## 4 2 0 1 1 1 35 4
## 5 2 0 0 1 1 25 4
## 6 0 0 1 0 1 70 4
## TRAF_CON_R TRAF_WAY VEH_INVL WEATHER_R INJURY_CRASH NO_INJ_I PRPTYDMG_CRASH
## 1 0 3 1 1 1 1 0
## 2 0 3 2 2 0 0 1
## 3 1 2 2 2 0 0 1
## 4 1 2 2 1 0 0 1
## 5 0 2 3 1 0 0 1
## 6 0 2 1 2 1 1 0
## FATALITIES MAX_SEV_IR
## 1 0 1
## 2 0 0
## 3 0 0
## 4 0 0
## 5 0 0
## 6 0 1
# Create and insert a dummy variable called "INJURY"
accidents.df$INJURY <- ifelse(accidents.df$MAX_SEV_IR>0, "yes", "no")
# Create a table based on injury and assign variables
inj.tbl <- table(accidents.df$INJURY)
show(inj.tbl)
##
## no yes
## 20721 21462
injury_yes <- inj.tbl["yes"]
injury_no <- inj.tbl["no"]
total_accident <- injury_yes + injury_no
# Calculate the probability of injury
injury_prob <- injury_yes / total_accident * 100
cat('Injury Probability:', injury_prob, '\n')
## Injury Probability: 50.87832
# i. Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns.
# Convert variables to categorical type
for (i in c(1:dim(accidents.df)[2])){
accidents.df[,i] <- as.factor(accidents.df[,i])
}
new_df <- accidents.df[1:12, c("INJURY", "WEATHER_R", "TRAF_CON_R")]
str(new_df)
## 'data.frame': 12 obs. of 3 variables:
## $ INJURY : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
## $ WEATHER_R : Factor w/ 2 levels "1","2": 1 2 2 1 1 2 2 1 2 2 ...
## $ TRAF_CON_R: Factor w/ 3 levels "0","1","2": 1 1 2 2 1 1 1 1 1 1 ...
rpivotTable::rpivotTable(new_df)
# ii. Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors.
#To find P(Injury=yes|WEATHER_R=1, TRAF_CON_R=0)
numerator1 <- 2/3 * 3/12
denominator1 <- 3/12
prob1 <- numerator1/denominator1
cat('1st Probability:', prob1, '\n')
## 1st Probability: 0.6666667
#To find P(Injury=yes|WEATHER_R=1, TRAF_CON_R=1)
numerator2 <- 0 * 3/12
denominator2 <- 1/12
prob2 <- numerator2/denominator2
cat('2nd Probability:', prob2, '\n')
## 2nd Probability: 0
#To find P(Injury=yes|WEATHER_R=1, TRAF_CON_R=2)
numerator3 <- 0 * 3/12
denominator3 <- 1/12
prob3 <- numerator3/denominator3
cat('3rd Probability:', prob3, '\n')
## 3rd Probability: 0
#To find P(Injury=yes|WEATHER_R=2, TRAF_CON_R=0)
numerator4 <- 1/3 * 3/12
denominator4 <- 6/12
prob4 <- numerator4/denominator4
cat('4th Probability:', prob4, '\n')
## 4th Probability: 0.1666667
#To find P(Injury=yes|WEATHER_R=2, TRAF_CON_R=1)
numerator5 <- 0 * 3/12
denominator5 <- 1/12
prob5 <- numerator5/denominator5
cat('5th Probability:', prob5, '\n')
## 5th Probability: 0
#To find P(Injury=yes|WEATHER_R=2, TRAF_CON_R=2)
numerator6 <- 0 * 3/12
denominator6 <- 0
prob6 <- numerator6/denominator6
cat('6th Probability:', prob6, '\n')
## 6th Probability: NaN
# Create the data frame for conclusion
a <- c(1,2,3,4,5,6)
b <- c(prob1,prob2,prob3,prob4,prob5,prob6)
prob.df <- data.frame(a,b)
names(prob.df) <- c('Combination No','Probability')
prob.df %>% mutate_if(is.numeric, round, 3)
## Combination No Probability
## 1 1 0.667
## 2 2 0.000
## 3 3 0.000
## 4 4 0.167
## 5 5 0.000
## 6 6 NaN
# iii. Classify the 12 accidents using these probabilities and a cutoff of 0.5
# Create new data frame for probabilities calculation
new_prob_df <- new_df
head(new_df)
## INJURY WEATHER_R TRAF_CON_R
## 1 yes 1 0
## 2 no 2 0
## 3 no 2 1
## 4 no 1 1
## 5 no 1 0
## 6 yes 2 0
# Create new data frame from previous 12 records in (i)
prob.inj <- c(0.667, 0.167, 0, 0, 0.667, 0.167, 0.167, 0.667, 0.167, 0.167, 0.167, 0)
new_prob_df$PROB_INJURY<-prob.inj
# Calculate the probabilities when cutoff is 0.5
new_prob_df$PREDICT_PROB <- ifelse(new_prob_df$PROB_INJURY>.5,"yes","no")
print(new_prob_df)
## INJURY WEATHER_R TRAF_CON_R PROB_INJURY PREDICT_PROB
## 1 yes 1 0 0.667 yes
## 2 no 2 0 0.167 no
## 3 no 2 1 0.000 no
## 4 no 1 1 0.000 no
## 5 no 1 0 0.667 yes
## 6 yes 2 0 0.167 no
## 7 no 2 0 0.167 no
## 8 yes 1 0 0.667 yes
## 9 no 2 0 0.167 no
## 10 no 2 0 0.167 no
## 11 no 2 0 0.167 no
## 12 no 1 2 0.000 no
# iv. Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1
# To find P(Injury=yes| WEATHER_R=1, TRAF_CON_R=1):
# Probability of injury involved in accidents =
#(proportion of WEATHER_R=1 when Injury = yes)
#*(proportion of TRAF_CON_R=1 when Injury = yes)
#*(propotion of Injury = yes in all cases)
man.prob <- 2/3 * 0/3 * 3/12
man.prob
## [1] 0
# v. Run a Naive Bayes classifier on the 12 records and two predictors using R.
nb <- naiveBayes(INJURY ~ TRAF_CON_R + WEATHER_R, data=new_df)
(predict(nb, newdata = accidents.df[1:12, c("INJURY", "WEATHER_R", "TRAF_CON_R")],
type = "raw"))
## no yes
## [1,] 0.5000000 0.5000000000
## [2,] 0.8000000 0.2000000000
## [3,] 0.9992506 0.0007494379
## [4,] 0.9970090 0.0029910269
## [5,] 0.5000000 0.5000000000
## [6,] 0.8000000 0.2000000000
## [7,] 0.8000000 0.2000000000
## [8,] 0.5000000 0.5000000000
## [9,] 0.8000000 0.2000000000
## [10,] 0.8000000 0.2000000000
## [11,] 0.8000000 0.2000000000
## [12,] 0.9940358 0.0059642147
# Test NB with caret
library(caret)
## Loading required package: lattice
x=new_df[,-3]
y=new_df$INJURY
model <- train(x,y,'nb', trControl = trainControl(method = 'cv',number=10))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(model)
## Naive Bayes
##
## 12 samples
## 2 predictor
## 2 classes: 'no', 'yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 11, 10, 11, 10, 11, 11, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.9444444 0.6666667
## TRUE 0.9444444 0.6666667
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
## = 1.
# Generate prediction
model.pred <- predict(model$finalModel,x)
model.pred
## $class
## 1 2 3 4 5 6 7 8 9 10 11 12
## yes no no no no yes no yes no no no no
## Levels: no yes
##
## $posterior
## no yes
## 1 0.001497753 0.9985022466
## 2 0.999833361 0.0001666389
## 3 0.999833361 0.0001666389
## 4 0.999333777 0.0006662225
## 5 0.999333777 0.0006662225
## 6 0.005964215 0.9940357853
## 7 0.999833361 0.0001666389
## 8 0.001497753 0.9985022466
## 9 0.999833361 0.0001666389
## 10 0.999833361 0.0001666389
## 11 0.999833361 0.0001666389
## 12 0.999333777 0.0006662225
# Build a confusion matrix to show errors
table(model.pred$class,y)
## y
## no yes
## no 9 0
## yes 0 3
# Compare with the manually calculated results
new_prob_df$PREDICT_PROB_NB <- model.pred$class
new_prob_df
## INJURY WEATHER_R TRAF_CON_R PROB_INJURY PREDICT_PROB PREDICT_PROB_NB
## 1 yes 1 0 0.667 yes yes
## 2 no 2 0 0.167 no no
## 3 no 2 1 0.000 no no
## 4 no 1 1 0.000 no no
## 5 no 1 0 0.667 yes no
## 6 yes 2 0 0.167 no yes
## 7 no 2 0 0.167 no no
## 8 yes 1 0 0.667 yes yes
## 9 no 2 0 0.167 no no
## 10 no 2 0 0.167 no no
## 11 no 2 0 0.167 no no
## 12 no 1 2 0.000 no no
# Partiton the data into training (60%) and validation (40%)
set.seed(1)
train.index <- sample(c(1:dim(accidents.df)[1]), dim(accidents.df)[1]*0.6)
valid.index <- setdiff(c(1:dim(accidents.df)[1]), train.index)
train.df <- accidents.df[train.index, ]
valid.df <- accidents.df[valid.index, ]
# i. Assuming that no information or initial reports about the accident itself are available at the time of prediction (only location characteristics, weather conditions, etc.), which predictors can we include in the analysis?
# Define the predictors are non-specific to the accident
#"INJURY", "HOUR_I_R", "ALIGN_I" ,"WRK_ZONE", "WKDY_I_R", "INT_HWY", "LGTCON_I_R", "PROFIL_I_R", "SPD_LIM", "SUR_COND","TRAF_CON_R", "TRAF_WAY", "WEATHER_R")
vars <- c(25, 1, 3, 5, 6, 7, 8, 13, 14, 15, 16, 17, 19)
# ii. Run a naive Bayes classifier on the complete training set with the relevant predictors (and INJURY as the response). Note that all predictors are categorical. Show the confusion matrix.
nb <- naiveBayes(INJURY ~ ., data = train.df[,vars])
# Generate the confusion matrix using the train.df, the prediction and the classes
confusionMatrix(train.df$INJURY, predict(nb, train.df[, vars]), positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5097 7405
## yes 4230 8577
##
## Accuracy : 0.5403
## 95% CI : (0.5341, 0.5464)
## No Information Rate : 0.6315
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0776
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5367
## Specificity : 0.5465
## Pos Pred Value : 0.6697
## Neg Pred Value : 0.4077
## Prevalence : 0.6315
## Detection Rate : 0.3389
## Detection Prevalence : 0.5060
## Balanced Accuracy : 0.5416
##
## 'Positive' Class : yes
##
# Calculate nb error
nb_acc <- 0.5403
nb_er <- 1 - nb_acc
nb_er
## [1] 0.4597
# iii. What is the overall error for the holdout set?
vars2 <- c("INJURY", "HOUR_I_R", "ALIGN_I" ,"WRK_ZONE", "WKDY_I_R",
"INT_HWY", "LGTCON_I_R", "PROFIL_I_R", "SPD_LIM", "SUR_COND",
"TRAF_CON_R", "TRAF_WAY", "WEATHER_R")
confusionMatrix(valid.df$INJURY, predict(nb, valid.df[, vars2]), positive = "yes")
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 3203 5016
## yes 2862 5793
##
## Accuracy : 0.5331
## 95% CI : (0.5256, 0.5407)
## No Information Rate : 0.6406
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0594
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5359
## Specificity : 0.5281
## Pos Pred Value : 0.6693
## Neg Pred Value : 0.3897
## Prevalence : 0.6406
## Detection Rate : 0.3433
## Detection Prevalence : 0.5129
## Balanced Accuracy : 0.5320
##
## 'Positive' Class : yes
##
# Calculate validation error
val_acc <- 0.5331
val_er <- 1 - val_acc
val_er
## [1] 0.4669
# iv. What is the percent improvement relative to the naive rule?
imp_rate <- val_er - nb_er
cat("The percent improvement: ",scales::percent(imp_rate,0.01))
## The percent improvement: 0.72%
# v.Examine the conditional probabilities output. Why do we get a probability of zero for P(INJURY = No j SPD_LIM = 5)?
# Determine the number of records with speed limit=5 and no injury
table(train.df$INJURY, train.df$SPD_LIM)
##
## 5 10 15 20 25 30 35 40 45 50 55 60 65 70
## no 1 6 55 107 1402 1076 2371 1203 1942 510 1988 444 807 512
## yes 2 4 52 50 1161 1102 2719 1369 2016 505 1984 551 796 399
##
## 75
## no 78
## yes 97
nb$tables$SPD_LIM
## SPD_LIM
## Y 5 10 15 20 25
## no 0.0000799872 0.0004799232 0.0043992961 0.0085586306 0.1121420573
## yes 0.0001561646 0.0003123292 0.0040602795 0.0039041149 0.0906535488
## SPD_LIM
## Y 30 35 40 45 50
## no 0.0860662294 0.1896496561 0.0962246041 0.1553351464 0.0407934730
## yes 0.0860466932 0.2123057703 0.1068946670 0.1574139143 0.0394315609
## SPD_LIM
## Y 55 60 65 70 75
## no 0.1590145577 0.0355143177 0.0645496721 0.0409534474 0.0062390018
## yes 0.1549152807 0.0430233466 0.0621535098 0.0311548372 0.0075739830
# Load the data
car.df<-mlba::ToyotaCorolla
View(car.df)
# Partition data into 60/40
set.seed(1)
percnt.of.data<- 0.6
train.index <- sample(c(1:dim(car.df)[1]), dim(car.df)[1]*percnt.of.data)
# The validation is the remaining record
valid.index <- setdiff(c(1:dim(car.df)[1]), train.index)
train.df <- car.df[train.index, ]
valid.df <- car.df[valid.index, ]
tr <- rpart(Price ~ Age_08_04 + KM + Fuel_Type +
HP + Automatic + Doors + Quarterly_Tax +
Mfr_Guarantee + Guarantee_Period + Airco +
Automatic_airco + CD_Player + Powered_Windows +
Sport_Model + Tow_Bar, data = train.df, method = "anova", minbucket = 1, maxdepth = 30, cp = 0.001)
prp(tr)
# i. Which appear to be the three or four most important car specifications for predicting the car's price?
# Using str we can identify the components of the tree
str(tr)
## List of 15
## $ frame :'data.frame': 67 obs. of 8 variables:
## ..$ var : chr [1:67] "Age_08_04" "Age_08_04" "Age_08_04" "KM" ...
## ..$ n : int [1:67] 861 743 471 226 72 154 113 41 245 97 ...
## ..$ wt : num [1:67] 861 743 471 226 72 154 113 41 245 97 ...
## ..$ dev : num [1:67] 1.20e+10 2.90e+09 8.14e+08 2.57e+08 8.76e+07 ...
## ..$ yval : num [1:67] 10813 9616 8648 7876 7318 ...
## ..$ complexity: num [1:67] 0.64776 0.10048 0.0216 0.00274 0.00051 ...
## ..$ ncompete : int [1:67] 4 4 4 4 0 4 0 0 4 4 ...
## ..$ nsurrogate: int [1:67] 5 5 5 3 0 1 0 0 5 0 ...
## $ where : Named int [1:861] 13 17 53 29 35 40 40 7 7 29 ...
## ..- attr(*, "names")= chr [1:861] "1017" "679" "129" "930" ...
## $ call : language rpart(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors + Quarterly_Tax + Mfr_Guarantee | __truncated__ ...
## $ terms :Classes 'terms', 'formula' language Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors + Quarterly_Tax + Mfr_Guarantee + Guarantee_Peri| __truncated__ ...
## .. ..- attr(*, "variables")= language list(Price, Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, | __truncated__ ...
## .. ..- attr(*, "factors")= int [1:16, 1:15] 0 1 0 0 0 0 0 0 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:16] "Price" "Age_08_04" "KM" "Fuel_Type" ...
## .. .. .. ..$ : chr [1:15] "Age_08_04" "KM" "Fuel_Type" "HP" ...
## .. ..- attr(*, "term.labels")= chr [1:15] "Age_08_04" "KM" "Fuel_Type" "HP" ...
## .. ..- attr(*, "order")= int [1:15] 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(Price, Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax, Mfr_Guarantee, Guarantee_Period, | __truncated__ ...
## .. ..- attr(*, "dataClasses")= Named chr [1:16] "numeric" "numeric" "numeric" "character" ...
## .. .. ..- attr(*, "names")= chr [1:16] "Price" "Age_08_04" "KM" "Fuel_Type" ...
## $ cptable : num [1:32, 1:5] 0.6478 0.1005 0.0472 0.0216 0.0184 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:32] "1" "2" "3" "4" ...
## .. ..$ : chr [1:5] "CP" "nsplit" "rel error" "xerror" ...
## $ method : chr "anova"
## $ parms : NULL
## $ control :List of 9
## ..$ minsplit : num 3
## ..$ minbucket : num 1
## ..$ cp : num 0.001
## ..$ maxcompete : int 4
## ..$ maxsurrogate : int 5
## ..$ usesurrogate : int 2
## ..$ surrogatestyle: int 0
## ..$ maxdepth : num 30
## ..$ xval : int 10
## $ functions :List of 2
## ..$ summary:function (yval, dev, wt, ylevel, digits)
## ..$ text :function (yval, dev, wt, ylevel, digits, n, use.n)
## $ numresp : int 1
## $ splits : num [1:258, 1:5] 861 861 861 861 861 0 0 0 0 0 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:258] "Age_08_04" "Automatic_airco" "KM" "CD_Player" ...
## .. ..$ : chr [1:5] "count" "ncat" "improve" "index" ...
## $ csplit : int [1:17, 1:3] 3 1 3 2 1 3 3 1 1 1 ...
## $ variable.importance: Named num [1:14] 9.89e+09 2.99e+09 2.89e+09 1.89e+09 1.67e+09 ...
## ..- attr(*, "names")= chr [1:14] "Age_08_04" "Automatic_airco" "KM" "Quarterly_Tax" ...
## $ y : Named int [1:861] 9250 9895 17950 9995 10900 13995 10950 7950 8950 10450 ...
## ..- attr(*, "names")= chr [1:861] "1017" "679" "129" "930" ...
## $ ordered : Named logi [1:15] FALSE FALSE FALSE FALSE FALSE FALSE ...
## ..- attr(*, "names")= chr [1:15] "Age_08_04" "KM" "Fuel_Type" "HP" ...
## - attr(*, "xlevels")=List of 1
## ..$ Fuel_Type: chr [1:3] "CNG" "Diesel" "Petrol"
## - attr(*, "class")= chr "rpart"
# Transpose of the gives us column vector with the variable labels
t(t(tr$variable.importance))
## [,1]
## Age_08_04 9885745349
## Automatic_airco 2987651311
## KM 2886703313
## Quarterly_Tax 1885088412
## HP 1665897142
## Guarantee_Period 430364046
## CD_Player 357332146
## Fuel_Type 206914123
## Airco 122474786
## Powered_Windows 109250988
## Doors 63966015
## Mfr_Guarantee 21300178
## Automatic 19776457
## Sport_Model 3395399
# ii. Compare the prediction errors of the training and validation sets by examining their RMS error and by plotting the two boxplots. How does the predictive performance of the validation set compare to the training set? Why does this occur?
# Calculate Training accuracy
accuracy(predict(tr, train.df), train.df$Price)
## ME RMSE MAE MPE MAPE
## Test set -7.061715e-14 986.5121 760.0899 -1.006552 7.671931
# Calculate Validation accuracy
accuracy(predict(tr, valid.df), valid.df$Price)
## ME RMSE MAE MPE MAPE
## Test set 11.42479 1192.877 907.2528 -1.008939 9.038641
# To get the errors subtract the prediction from the actual
train.err <- predict(tr, train.df) - train.df$Price
valid.err <- predict(tr, valid.df) - valid.df$Price
# Create a data frame from the training and validation errors
err <- data.frame(Error = c(train.err, valid.err),
Set = c(rep("Training", length(train.err)),
rep("Validation", length(valid.err))))
ggplot(err, aes(x = Set, y = Error)) + geom_boxplot()
# iv. Create a less deep tree by leaving the arguments cp, minbucket, and maxdepth at their defaults. Compared to the deeper tree, what is the predictive performance on the validation set?
# Prune the tree
tr.shallow <- rpart(Price ~ Age_08_04 + KM + Fuel_Type +
HP + Automatic + Doors + Quarterly_Tax +
Mfr_Guarantee + Guarantee_Period + Airco +
Automatic_airco + CD_Player + Powered_Windows +
Sport_Model + Tow_Bar, data = train.df)
prp(tr.shallow)
# Determine the training accuracy
accuracy(predict(tr.shallow, train.df), train.df$Price)
## ME RMSE MAE MPE MAPE
## Test set 2.11238e-13 1396.864 1010.652 -1.735082 9.987881
# Determine the validation accuracy
accuracy(predict(tr.shallow, valid.df), valid.df$Price)
## ME RMSE MAE MPE MAPE
## Test set 35.55491 1341.264 1022.343 -1.05491 9.829217
# Create the bins based on the car prices
bins <- seq(min(car.df$Price),
max(car.df$Price),
(max(car.df$Price) - min(car.df$Price))/20)
# Determine the number of bins based on the min and max prices
bins
## [1] 4350.0 5757.5 7165.0 8572.5 9980.0 11387.5 12795.0 14202.5 15610.0
## [10] 17017.5 18425.0 19832.5 21240.0 22647.5 24055.0 25462.5 26870.0 28277.5
## [19] 29685.0 31092.5 32500.0
# Use bincode to determine the bin assignments
Binned_Price <- .bincode(car.df$Price,
bins,
include.lowest = TRUE)
# Convert the Binned_Price to factors for classification
Binned_Price <- as.factor(Binned_Price)
Binned_Price
## [1] 7 7 7 8 7 7 9 11 13 7 12 12 11 13 13 13 14 10 9 9 9 9 9 9
## [25] 9 9 10 9 9 10 7 9 9 8 8 9 9 8 9 8 7 9 7 9 9 11 10 9
## [49] 10 13 10 9 12 13 8 7 8 8 11 9 8 9 11 10 10 9 11 8 13 9 9 7
## [73] 11 9 12 9 11 11 9 8 11 10 8 10 9 10 8 10 9 13 9 13 12 9 11 12
## [97] 9 9 11 10 11 9 11 11 11 9 11 10 10 20 19 20 15 15 14 15 13 10 11 13
## [121] 11 12 9 11 9 13 9 9 10 9 9 9 9 9 9 9 11 9 14 12 9 14 12 11
## [145] 11 9 12 15 11 12 10 12 11 11 13 9 11 11 11 11 11 12 11 11 10 12 12 12
## [169] 12 10 10 14 11 11 13 12 11 12 13 13 11 11 12 13 10 10 2 4 6 3 6 1
## [193] 1 6 7 6 6 8 4 6 6 5 5 5 7 6 6 5 6 6 7 8 6 6 7 5
## [217] 7 5 5 7 6 6 6 8 6 7 6 6 6 6 6 7 6 7 6 6 5 7 7 6
## [241] 5 6 6 7 6 7 6 7 7 6 6 5 6 8 4 7 7 6 6 7 6 6 7 6
## [265] 6 6 6 6 8 5 7 7 7 7 7 6 7 6 6 8 7 7 7 7 6 7 6 4
## [289] 6 7 6 7 5 6 7 5 7 7 7 7 6 6 7 6 7 6 4 7 6 6 7 7
## [313] 6 6 4 7 7 5 4 6 6 5 7 5 7 6 5 7 7 6 5 7 6 6 6 6
## [337] 7 6 6 6 6 6 8 6 7 8 7 7 7 6 6 4 6 6 8 7 6 8 6 8
## [361] 7 6 6 7 7 5 5 6 6 7 5 7 6 7 7 6 6 7 2 2 2 3 4 3
## [385] 4 4 5 4 3 4 3 3 4 1 4 4 4 6 5 5 4 5 1 5 4 4 5 6
## [409] 4 6 3 5 4 6 5 4 4 5 4 4 5 4 4 6 4 4 6 6 5 7 6 5
## [433] 5 5 5 5 6 4 5 6 6 5 6 6 6 5 6 5 6 4 5 6 6 6 6 4
## [457] 5 5 4 5 4 6 5 4 4 6 4 6 7 5 5 4 4 6 5 4 5 4 5 6
## [481] 6 6 6 4 4 5 5 4 6 4 5 5 4 6 6 5 6 5 5 4 4 6 4 5
## [505] 4 6 6 6 5 5 6 6 7 5 5 5 6 5 5 6 4 6 4 11 6 5 6 4
## [529] 5 7 4 5 5 6 7 6 5 4 5 6 5 6 5 5 7 5 6 4 5 6 5 5
## [553] 7 5 6 5 6 7 5 7 5 5 4 7 4 5 5 5 5 7 7 6 5 6 4 6
## [577] 6 6 6 6 6 5 4 5 5 7 4 7 4 4 5 5 4 5 5 5 5 5 5 7
## [601] 5 3 4 2 3 2 3 3 2 1 2 3 3 3 3 2 4 2 3 3 4 2 4 4
## [625] 3 4 4 4 3 3 3 4 4 4 4 4 5 3 5 4 4 4 3 5 4 4 3 2
## [649] 3 4 4 3 4 4 2 3 4 3 4 5 3 4 4 4 4 3 4 4 4 4 2 3
## [673] 3 4 2 4 4 4 4 4 3 4 3 4 4 4 4 4 4 4 4 4 4 4 4 4
## [697] 6 4 5 4 3 4 3 5 3 4 5 3 4 4 4 3 4 4 3 3 4 4 4 3
## [721] 3 3 4 3 2 3 5 4 4 4 6 5 5 4 5 5 4 4 4 4 3 5 4 4
## [745] 3 3 3 5 4 4 5 5 3 4 4 4 4 4 3 5 3 3 4 4 5 5 4 4
## [769] 5 3 3 3 4 5 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 4 3
## [793] 3 4 6 4 6 4 4 3 4 5 4 5 4 4 4 3 3 4 3 4 4 5 4 4
## [817] 3 4 4 5 4 3 4 5 2 4 2 4 4 4 4 4 5 5 4 4 5 4 4 5
## [841] 5 4 5 4 3 4 5 5 4 4 3 4 3 4 4 4 4 3 3 4 4 5 4 3
## [865] 4 4 5 4 4 5 4 4 5 4 4 4 4 4 3 4 5 4 4 3 4 4 5 4
## [889] 5 4 3 6 4 3 5 4 3 4 4 4 3 4 4 4 4 4 4 4 3 3 4 4
## [913] 4 7 4 5 3 5 4 4 5 4 4 3 4 4 5 4 4 5 4 5 5 4 4 5
## [937] 5 4 4 5 4 4 5 3 5 5 6 4 3 4 3 4 3 4 4 4 5 4 4 4
## [961] 4 4 4 4 5 4 4 4 4 5 4 5 4 3 5 4 4 4 4 4 5 4 4 3
## [985] 4 4 3 4 5 4 5 3 4 4 3 4 4 4 4 5 4 4 3 5 4 4 5 5
## [1009] 4 4 4 4 4 3 5 5 4 4 5 4 6 3 5 5 5 4 5 4 5 5 4 5
## [1033] 5 5 5 6 4 5 4 5 4 5 5 4 2 2 2 1 1 2 3 2 2 1 4 2
## [1057] 2 2 5 3 3 2 2 2 1 2 4 2 3 3 3 3 2 3 2 1 2 2 3 4
## [1081] 3 4 4 2 3 3 2 3 2 3 4 3 3 1 3 2 3 3 3 3 3 2 2 3
## [1105] 3 3 3 3 3 4 3 3 3 1 2 2 2 3 4 3 3 3 3 4 3 2 2 4
## [1129] 3 3 3 4 2 4 3 2 2 2 4 3 2 3 3 4 3 2 2 3 2 3 4 3
## [1153] 3 3 2 3 2 4 2 4 3 3 3 4 4 4 3 2 3 4 2 2 3 2 3 4
## [1177] 4 3 3 4 3 2 4 3 4 2 3 3 3 3 2 3 2 3 3 4 4 4 3 4
## [1201] 4 3 2 3 3 2 3 3 3 3 3 3 3 2 4 4 3 3 4 3 3 3 3 3
## [1225] 4 3 2 3 3 4 4 2 3 3 4 3 3 2 3 2 4 4 3 3 2 3 3 2
## [1249] 3 3 4 3 3 2 3 3 3 3 3 4 3 4 4 3 3 4 2 3 4 4 3 2
## [1273] 3 2 4 3 3 4 3 3 3 3 3 4 4 3 3 3 4 3 3 3 3 3 2 3
## [1297] 3 2 3 4 3 2 3 3 3 4 3 4 3 4 4 4 4 2 3 4 3 3 3 3
## [1321] 4 3 4 4 3 2 3 4 2 3 4 2 3 5 2 4 3 4 3 4 4 2 3 3
## [1345] 4 3 3 3 4 2 3 2 3 3 4 2 3 3 3 4 4 2 3 2 3 3 3 4
## [1369] 4 3 4 4 2 3 4 3 3 4 4 3 4 3 2 5 4 3 4 3 4 4 3 4
## [1393] 3 3 3 4 4 3 4 4 3 4 5 2 3 3 4 3 4 3 3 3 4 4 3 2
## [1417] 4 4 3 3 3 3 3 3 3 3 4 4 3 4 3 3 5 3 3 2
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 19 20
# Add the binned price to the data frame by the indexes
# Identify the training and validation data frames
train.df$Binned_Price <- Binned_Price[train.index]
valid.df$Binned_Price <- Binned_Price[-train.index]
# i. Compare the tree generated by the CT with the generated by the less deep RT. Are they different? (Look at structure, the top predictors, size oftree, etc.) Why?
tr.binned <- rpart(Binned_Price ~ Age_08_04 + KM + Fuel_Type +
HP + Automatic + Doors + Quarterly_Tax +
Mfr_Guarantee + Guarantee_Period + Airco +
Automatic_airco + CD_Player + Powered_Windows +
Sport_Model + Tow_Bar, data = train.df)
prp(tr.binned)
t(t(tr.binned$variable.importance))
## [,1]
## Age_08_04 152.5600337
## KM 58.4034964
## CD_Player 26.9950441
## Automatic_airco 19.8587797
## Sport_Model 18.6676759
## Airco 18.2834578
## Quarterly_Tax 16.1438410
## HP 7.2273021
## Powered_Windows 7.0148174
## Mfr_Guarantee 2.9656114
## Doors 2.6224520
## Fuel_Type 1.0327227
## Automatic 0.4534818
## Guarantee_Period 0.4534818
# ii. Predict the price, using the less deep RT and the CT, of a used Toyota Corolla with the specifications listed in Table 9.6.
# Create new record
new.record <- data.frame(Age_08_04 = 77,
KM = 117000,
Fuel_Type = "Petrol",
HP = 110,
Automatic = 0,
Doors = 5,
Quarterly_Tax = 100,
Mfr_Guarantee = 0,
Guarantee_Period = 3,
Airco = 1,
Automatic_airco = 0,
CD_Player = 0,
Powered_Windows = 0,
Sport_Model = 0,
Tow_Bar = 1)
# Using regression tree
price.tr <- predict(tr, newdata = new.record)
price.tr
## 1
## 7318.056
# Using classification tree
price.tr.bin <- bins[predict(tr.binned, newdata = new.record, type = "class")]
cat(paste("Regression Price Estimate: ",scales::dollar(price.tr,0.01)),
paste("Classification Price Estimate: ",scales::dollar(price.tr.bin,0.01)),
sep='\n')
## Regression Price Estimate: $7,318.06
## Classification Price Estimate: $7,165.00