EOC_Problem 8.2 & 9.3

Problem 8.2

# Load the data
accidents.df <- mlba::AccidentsFull
head(accidents.df)

##   HOUR_I_R ALCHL_I ALIGN_I STRATUM_R WRK_ZONE WKDY_I_R INT_HWY LGTCON_I_R
## 1        0       2       2         1        0        1       0          3
## 2        1       2       1         0        0        1       1          3
## 3        1       2       1         0        0        1       0          3
## 4        1       2       1         1        0        0       0          3
## 5        1       1       1         0        0        1       0          3
## 6        1       2       1         1        0        1       0          3
##   MANCOL_I_R PED_ACC_R RELJCT_I_R REL_RWY_R PROFIL_I_R SPD_LIM SUR_COND
## 1          0         0          1         0          1      40        4
## 2          2         0          1         1          1      70        4
## 3          2         0          1         1          1      35        4
## 4          2         0          1         1          1      35        4
## 5          2         0          0         1          1      25        4
## 6          0         0          1         0          1      70        4
##   TRAF_CON_R TRAF_WAY VEH_INVL WEATHER_R INJURY_CRASH NO_INJ_I PRPTYDMG_CRASH
## 1          0        3        1         1            1        1              0
## 2          0        3        2         2            0        0              1
## 3          1        2        2         2            0        0              1
## 4          1        2        2         1            0        0              1
## 5          0        2        3         1            0        0              1
## 6          0        2        1         2            1        1              0
##   FATALITIES MAX_SEV_IR
## 1          0          1
## 2          0          0
## 3          0          0
## 4          0          0
## 5          0          0
## 6          0          1

# Create and insert a dummy variable called "INJURY" 
accidents.df$INJURY <- ifelse(accidents.df$MAX_SEV_IR>0, "yes", "no")

a. Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?

# Create a table based on injury and assign variables
inj.tbl <- table(accidents.df$INJURY)
show(inj.tbl)

## 
##    no   yes 
## 20721 21462

injury_yes <- inj.tbl["yes"]
injury_no <- inj.tbl["no"]
total_accident <- injury_yes + injury_no

# Calculate the probability of injury
injury_prob <- injury_yes / total_accident * 100
cat('Injury Probability:', injury_prob, '\n')

## Injury Probability: 50.87832

b. Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.

# i. Create a pivot table that examines INJURY as a function of the two predictors for these 12 records. Use all three variables in the pivot table as rows/columns.

# Convert variables to categorical type
for (i in c(1:dim(accidents.df)[2])){
  accidents.df[,i] <- as.factor(accidents.df[,i])
}

new_df <- accidents.df[1:12, c("INJURY", "WEATHER_R", "TRAF_CON_R")]
str(new_df)

## 'data.frame':    12 obs. of  3 variables:
##  $ INJURY    : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ WEATHER_R : Factor w/ 2 levels "1","2": 1 2 2 1 1 2 2 1 2 2 ...
##  $ TRAF_CON_R: Factor w/ 3 levels "0","1","2": 1 1 2 2 1 1 1 1 1 1 ...

rpivotTable::rpivotTable(new_df)

# ii. Compute the exact Bayes conditional probabilities of an injury (INJURY = Yes) given the six possible combinations of the predictors.

#To find P(Injury=yes|WEATHER_R=1, TRAF_CON_R=0)
numerator1 <- 2/3 * 3/12
denominator1 <- 3/12
prob1 <- numerator1/denominator1
cat('1st Probability:', prob1, '\n')

## 1st Probability: 0.6666667

#To find P(Injury=yes|WEATHER_R=1, TRAF_CON_R=1)
numerator2 <- 0 * 3/12
denominator2 <- 1/12
prob2 <- numerator2/denominator2
cat('2nd Probability:', prob2, '\n')

## 2nd Probability: 0

#To find P(Injury=yes|WEATHER_R=1, TRAF_CON_R=2)
numerator3 <- 0 * 3/12
denominator3 <- 1/12
prob3 <- numerator3/denominator3
cat('3rd Probability:', prob3, '\n')

## 3rd Probability: 0

#To find P(Injury=yes|WEATHER_R=2, TRAF_CON_R=0)
numerator4 <- 1/3 * 3/12
denominator4 <- 6/12
prob4 <- numerator4/denominator4
cat('4th Probability:', prob4, '\n')

## 4th Probability: 0.1666667

#To find P(Injury=yes|WEATHER_R=2, TRAF_CON_R=1)
numerator5 <- 0 * 3/12
denominator5 <- 1/12
prob5 <- numerator5/denominator5
cat('5th Probability:', prob5, '\n')

## 5th Probability: 0

#To find P(Injury=yes|WEATHER_R=2, TRAF_CON_R=2)
numerator6 <- 0 * 3/12
denominator6 <- 0
prob6 <- numerator6/denominator6
cat('6th Probability:', prob6, '\n')

## 6th Probability: NaN

# Create the data frame for conclusion
a <- c(1,2,3,4,5,6)
b <- c(prob1,prob2,prob3,prob4,prob5,prob6)

prob.df <- data.frame(a,b)
names(prob.df) <- c('Combination No','Probability')
prob.df %>% mutate_if(is.numeric, round, 3)

##   Combination No Probability
## 1              1       0.667
## 2              2       0.000
## 3              3       0.000
## 4              4       0.167
## 5              5       0.000
## 6              6         NaN

# iii. Classify the 12 accidents using these probabilities and a cutoff of 0.5
# Create new data frame for probabilities calculation
new_prob_df <- new_df
head(new_df)

##   INJURY WEATHER_R TRAF_CON_R
## 1    yes         1          0
## 2     no         2          0
## 3     no         2          1
## 4     no         1          1
## 5     no         1          0
## 6    yes         2          0

# Create new data frame from previous 12 records in (i)
prob.inj <- c(0.667, 0.167, 0, 0, 0.667, 0.167, 0.167, 0.667, 0.167, 0.167, 0.167, 0)
new_prob_df$PROB_INJURY<-prob.inj

# Calculate the probabilities when cutoff is 0.5
new_prob_df$PREDICT_PROB <- ifelse(new_prob_df$PROB_INJURY>.5,"yes","no")
print(new_prob_df)

##    INJURY WEATHER_R TRAF_CON_R PROB_INJURY PREDICT_PROB
## 1     yes         1          0       0.667          yes
## 2      no         2          0       0.167           no
## 3      no         2          1       0.000           no
## 4      no         1          1       0.000           no
## 5      no         1          0       0.667          yes
## 6     yes         2          0       0.167           no
## 7      no         2          0       0.167           no
## 8     yes         1          0       0.667          yes
## 9      no         2          0       0.167           no
## 10     no         2          0       0.167           no
## 11     no         2          0       0.167           no
## 12     no         1          2       0.000           no

# iv. Compute manually the naive Bayes conditional probability of an injury given WEATHER_R = 1 and TRAF_CON_R = 1
# To find P(Injury=yes| WEATHER_R=1, TRAF_CON_R=1):
  # Probability of injury involved in accidents =   
        #(proportion of WEATHER_R=1 when Injury = yes)
       #*(proportion of TRAF_CON_R=1 when Injury = yes)
       #*(propotion of Injury = yes in all cases)

man.prob <- 2/3 * 0/3 * 3/12
man.prob

## [1] 0

# v. Run a Naive Bayes classifier on the 12 records and two predictors using R.
nb <- naiveBayes(INJURY ~ TRAF_CON_R + WEATHER_R, data=new_df)
(predict(nb, newdata = accidents.df[1:12, c("INJURY", "WEATHER_R", "TRAF_CON_R")],
        type = "raw"))

##              no          yes
##  [1,] 0.5000000 0.5000000000
##  [2,] 0.8000000 0.2000000000
##  [3,] 0.9992506 0.0007494379
##  [4,] 0.9970090 0.0029910269
##  [5,] 0.5000000 0.5000000000
##  [6,] 0.8000000 0.2000000000
##  [7,] 0.8000000 0.2000000000
##  [8,] 0.5000000 0.5000000000
##  [9,] 0.8000000 0.2000000000
## [10,] 0.8000000 0.2000000000
## [11,] 0.8000000 0.2000000000
## [12,] 0.9940358 0.0059642147

# Test NB with caret 
library(caret)

## Loading required package: lattice

x=new_df[,-3]
y=new_df$INJURY
model <- train(x,y,'nb', trControl = trainControl(method = 'cv',number=10))

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

print(model)

## Naive Bayes 
## 
## 12 samples
##  2 predictor
##  2 classes: 'no', 'yes' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 11, 10, 11, 10, 11, 11, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE      0.9444444  0.6666667
##    TRUE      0.9444444  0.6666667
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = FALSE and adjust
##  = 1.

# Generate prediction
model.pred <- predict(model$finalModel,x)
model.pred

## $class
##   1   2   3   4   5   6   7   8   9  10  11  12 
## yes  no  no  no  no yes  no yes  no  no  no  no 
## Levels: no yes
## 
## $posterior
##             no          yes
## 1  0.001497753 0.9985022466
## 2  0.999833361 0.0001666389
## 3  0.999833361 0.0001666389
## 4  0.999333777 0.0006662225
## 5  0.999333777 0.0006662225
## 6  0.005964215 0.9940357853
## 7  0.999833361 0.0001666389
## 8  0.001497753 0.9985022466
## 9  0.999833361 0.0001666389
## 10 0.999833361 0.0001666389
## 11 0.999833361 0.0001666389
## 12 0.999333777 0.0006662225

# Build a confusion matrix to show errors
table(model.pred$class,y)

##      y
##       no yes
##   no   9   0
##   yes  0   3

# Compare with the manually calculated results
new_prob_df$PREDICT_PROB_NB <- model.pred$class
new_prob_df

##    INJURY WEATHER_R TRAF_CON_R PROB_INJURY PREDICT_PROB PREDICT_PROB_NB
## 1     yes         1          0       0.667          yes             yes
## 2      no         2          0       0.167           no              no
## 3      no         2          1       0.000           no              no
## 4      no         1          1       0.000           no              no
## 5      no         1          0       0.667          yes              no
## 6     yes         2          0       0.167           no             yes
## 7      no         2          0       0.167           no              no
## 8     yes         1          0       0.667          yes             yes
## 9      no         2          0       0.167           no              no
## 10     no         2          0       0.167           no              no
## 11     no         2          0       0.167           no              no
## 12     no         1          2       0.000           no              no

c. Let us now return to the entire dataset. Partition the data into training (60%) and validation (40%).

# Partiton the data into training (60%) and validation (40%) 
set.seed(1)
train.index <- sample(c(1:dim(accidents.df)[1]), dim(accidents.df)[1]*0.6)  
valid.index <- setdiff(c(1:dim(accidents.df)[1]), train.index)  
train.df <- accidents.df[train.index, ]
valid.df <- accidents.df[valid.index, ]

# i. Assuming that no information or initial reports about the accident itself are available at the time of prediction (only location characteristics, weather conditions, etc.), which predictors can we include in the analysis?

# Define the  predictors are non-specific to the accident
#"INJURY", "HOUR_I_R",  "ALIGN_I" ,"WRK_ZONE",  "WKDY_I_R", "INT_HWY",  "LGTCON_I_R", "PROFIL_I_R", "SPD_LIM", "SUR_COND","TRAF_CON_R",   "TRAF_WAY",   "WEATHER_R")

vars <- c(25, 1, 3, 5,  6, 7,  8, 13, 14, 15, 16, 17, 19)

# ii.   Run a naive Bayes classifier on the complete training set with the relevant predictors (and INJURY as the response). Note that all predictors are categorical. Show the confusion matrix.

nb <- naiveBayes(INJURY ~ ., data = train.df[,vars])

# Generate the confusion matrix using the train.df, the prediction and the classes
confusionMatrix(train.df$INJURY, predict(nb, train.df[, vars]), positive = "yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  5097 7405
##        yes 4230 8577
##                                           
##                Accuracy : 0.5403          
##                  95% CI : (0.5341, 0.5464)
##     No Information Rate : 0.6315          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0776          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5367          
##             Specificity : 0.5465          
##          Pos Pred Value : 0.6697          
##          Neg Pred Value : 0.4077          
##              Prevalence : 0.6315          
##          Detection Rate : 0.3389          
##    Detection Prevalence : 0.5060          
##       Balanced Accuracy : 0.5416          
##                                           
##        'Positive' Class : yes             
##

# Calculate nb error
nb_acc <- 0.5403
nb_er <- 1 - nb_acc
nb_er

## [1] 0.4597

# iii. What is the overall error for the holdout set?
vars2 <- c("INJURY", "HOUR_I_R",    "ALIGN_I"   ,"WRK_ZONE",    "WKDY_I_R",
          "INT_HWY",    "LGTCON_I_R",   "PROFIL_I_R",   "SPD_LIM",  "SUR_COND",
          "TRAF_CON_R", "TRAF_WAY", "WEATHER_R")

confusionMatrix(valid.df$INJURY, predict(nb, valid.df[, vars2]), positive = "yes")

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   no  yes
##        no  3203 5016
##        yes 2862 5793
##                                           
##                Accuracy : 0.5331          
##                  95% CI : (0.5256, 0.5407)
##     No Information Rate : 0.6406          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0594          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5359          
##             Specificity : 0.5281          
##          Pos Pred Value : 0.6693          
##          Neg Pred Value : 0.3897          
##              Prevalence : 0.6406          
##          Detection Rate : 0.3433          
##    Detection Prevalence : 0.5129          
##       Balanced Accuracy : 0.5320          
##                                           
##        'Positive' Class : yes             
##

# Calculate validation error
val_acc <- 0.5331
val_er <- 1 - val_acc
val_er

## [1] 0.4669

# iv. What is the percent improvement relative to the naive rule?
imp_rate <- val_er - nb_er
cat("The percent improvement: ",scales::percent(imp_rate,0.01))

## The percent improvement:  0.72%

# v.Examine the conditional probabilities output. Why do we get a probability of zero for P(INJURY = No j SPD_LIM = 5)?
# Determine the number of records with speed limit=5 and no injury
table(train.df$INJURY, train.df$SPD_LIM)

##      
##          5   10   15   20   25   30   35   40   45   50   55   60   65   70
##   no     1    6   55  107 1402 1076 2371 1203 1942  510 1988  444  807  512
##   yes    2    4   52   50 1161 1102 2719 1369 2016  505 1984  551  796  399
##      
##         75
##   no    78
##   yes   97

nb$tables$SPD_LIM

##      SPD_LIM
## Y                5           10           15           20           25
##   no  0.0000799872 0.0004799232 0.0043992961 0.0085586306 0.1121420573
##   yes 0.0001561646 0.0003123292 0.0040602795 0.0039041149 0.0906535488
##      SPD_LIM
## Y               30           35           40           45           50
##   no  0.0860662294 0.1896496561 0.0962246041 0.1553351464 0.0407934730
##   yes 0.0860466932 0.2123057703 0.1068946670 0.1574139143 0.0394315609
##      SPD_LIM
## Y               55           60           65           70           75
##   no  0.1590145577 0.0355143177 0.0645496721 0.0409534474 0.0062390018
##   yes 0.1549152807 0.0430233466 0.0621535098 0.0311548372 0.0075739830

Problem 9.3

# Load the data
car.df<-mlba::ToyotaCorolla
View(car.df)

# Partition data into 60/40
set.seed(1)  
percnt.of.data<- 0.6
train.index <- sample(c(1:dim(car.df)[1]), dim(car.df)[1]*percnt.of.data)

# The validation is the remaining record
valid.index <- setdiff(c(1:dim(car.df)[1]), train.index)  

train.df <- car.df[train.index, ]
valid.df <- car.df[valid.index, ]

Part A

tr <- rpart(Price ~  Age_08_04 + KM + Fuel_Type + 
              HP + Automatic + Doors + Quarterly_Tax + 
              Mfr_Guarantee + Guarantee_Period + Airco + 
              Automatic_airco + CD_Player + Powered_Windows + 
              Sport_Model + Tow_Bar, data = train.df, method = "anova", minbucket = 1, maxdepth = 30, cp = 0.001)
prp(tr)

# i. Which appear to be the three or four most important car specifications for predicting the car's price?
# Using str we can identify the components of the tree
str(tr)

## List of 15
##  $ frame              :'data.frame': 67 obs. of  8 variables:
##   ..$ var       : chr [1:67] "Age_08_04" "Age_08_04" "Age_08_04" "KM" ...
##   ..$ n         : int [1:67] 861 743 471 226 72 154 113 41 245 97 ...
##   ..$ wt        : num [1:67] 861 743 471 226 72 154 113 41 245 97 ...
##   ..$ dev       : num [1:67] 1.20e+10 2.90e+09 8.14e+08 2.57e+08 8.76e+07 ...
##   ..$ yval      : num [1:67] 10813 9616 8648 7876 7318 ...
##   ..$ complexity: num [1:67] 0.64776 0.10048 0.0216 0.00274 0.00051 ...
##   ..$ ncompete  : int [1:67] 4 4 4 4 0 4 0 0 4 4 ...
##   ..$ nsurrogate: int [1:67] 5 5 5 3 0 1 0 0 5 0 ...
##  $ where              : Named int [1:861] 13 17 53 29 35 40 40 7 7 29 ...
##   ..- attr(*, "names")= chr [1:861] "1017" "679" "129" "930" ...
##  $ call               : language rpart(formula = Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors +      Quarterly_Tax + Mfr_Guarantee | __truncated__ ...
##  $ terms              :Classes 'terms', 'formula'  language Price ~ Age_08_04 + KM + Fuel_Type + HP + Automatic + Doors + Quarterly_Tax +      Mfr_Guarantee + Guarantee_Peri| __truncated__ ...
##   .. ..- attr(*, "variables")= language list(Price, Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax,      Mfr_Guarantee, Guarantee_Period, | __truncated__ ...
##   .. ..- attr(*, "factors")= int [1:16, 1:15] 0 1 0 0 0 0 0 0 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:16] "Price" "Age_08_04" "KM" "Fuel_Type" ...
##   .. .. .. ..$ : chr [1:15] "Age_08_04" "KM" "Fuel_Type" "HP" ...
##   .. ..- attr(*, "term.labels")= chr [1:15] "Age_08_04" "KM" "Fuel_Type" "HP" ...
##   .. ..- attr(*, "order")= int [1:15] 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Price, Age_08_04, KM, Fuel_Type, HP, Automatic, Doors, Quarterly_Tax,      Mfr_Guarantee, Guarantee_Period, | __truncated__ ...
##   .. ..- attr(*, "dataClasses")= Named chr [1:16] "numeric" "numeric" "numeric" "character" ...
##   .. .. ..- attr(*, "names")= chr [1:16] "Price" "Age_08_04" "KM" "Fuel_Type" ...
##  $ cptable            : num [1:32, 1:5] 0.6478 0.1005 0.0472 0.0216 0.0184 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:32] "1" "2" "3" "4" ...
##   .. ..$ : chr [1:5] "CP" "nsplit" "rel error" "xerror" ...
##  $ method             : chr "anova"
##  $ parms              : NULL
##  $ control            :List of 9
##   ..$ minsplit      : num 3
##   ..$ minbucket     : num 1
##   ..$ cp            : num 0.001
##   ..$ maxcompete    : int 4
##   ..$ maxsurrogate  : int 5
##   ..$ usesurrogate  : int 2
##   ..$ surrogatestyle: int 0
##   ..$ maxdepth      : num 30
##   ..$ xval          : int 10
##  $ functions          :List of 2
##   ..$ summary:function (yval, dev, wt, ylevel, digits)  
##   ..$ text   :function (yval, dev, wt, ylevel, digits, n, use.n)  
##  $ numresp            : int 1
##  $ splits             : num [1:258, 1:5] 861 861 861 861 861 0 0 0 0 0 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:258] "Age_08_04" "Automatic_airco" "KM" "CD_Player" ...
##   .. ..$ : chr [1:5] "count" "ncat" "improve" "index" ...
##  $ csplit             : int [1:17, 1:3] 3 1 3 2 1 3 3 1 1 1 ...
##  $ variable.importance: Named num [1:14] 9.89e+09 2.99e+09 2.89e+09 1.89e+09 1.67e+09 ...
##   ..- attr(*, "names")= chr [1:14] "Age_08_04" "Automatic_airco" "KM" "Quarterly_Tax" ...
##  $ y                  : Named int [1:861] 9250 9895 17950 9995 10900 13995 10950 7950 8950 10450 ...
##   ..- attr(*, "names")= chr [1:861] "1017" "679" "129" "930" ...
##  $ ordered            : Named logi [1:15] FALSE FALSE FALSE FALSE FALSE FALSE ...
##   ..- attr(*, "names")= chr [1:15] "Age_08_04" "KM" "Fuel_Type" "HP" ...
##  - attr(*, "xlevels")=List of 1
##   ..$ Fuel_Type: chr [1:3] "CNG" "Diesel" "Petrol"
##  - attr(*, "class")= chr "rpart"

# Transpose of the gives us column vector with the variable labels
t(t(tr$variable.importance))

##                        [,1]
## Age_08_04        9885745349
## Automatic_airco  2987651311
## KM               2886703313
## Quarterly_Tax    1885088412
## HP               1665897142
## Guarantee_Period  430364046
## CD_Player         357332146
## Fuel_Type         206914123
## Airco             122474786
## Powered_Windows   109250988
## Doors              63966015
## Mfr_Guarantee      21300178
## Automatic          19776457
## Sport_Model         3395399

# ii.   Compare the prediction errors of the training and validation sets by examining their RMS error and by plotting the two boxplots. How does the predictive performance of the validation set compare to the training set? Why does this occur?

# Calculate Training accuracy
accuracy(predict(tr, train.df), train.df$Price)

##                     ME     RMSE      MAE       MPE     MAPE
## Test set -7.061715e-14 986.5121 760.0899 -1.006552 7.671931

# Calculate Validation accuracy
accuracy(predict(tr, valid.df), valid.df$Price)

##                ME     RMSE      MAE       MPE     MAPE
## Test set 11.42479 1192.877 907.2528 -1.008939 9.038641

# To get the errors subtract the prediction from the actual
train.err <- predict(tr, train.df) - train.df$Price
valid.err <- predict(tr, valid.df) - valid.df$Price

# Create a data frame from the training and validation errors
err <- data.frame(Error = c(train.err, valid.err), 
                  Set = c(rep("Training", length(train.err)),
                          rep("Validation", length(valid.err))))

ggplot(err, aes(x = Set, y = Error)) + geom_boxplot()

# iv. Create a less deep tree by leaving the arguments cp, minbucket, and maxdepth at their defaults. Compared to the deeper tree, what is the predictive performance on the validation set?

# Prune the tree
tr.shallow <- rpart(Price ~  Age_08_04 + KM + Fuel_Type + 
              HP + Automatic + Doors + Quarterly_Tax + 
              Mfr_Guarantee + Guarantee_Period + Airco + 
              Automatic_airco + CD_Player + Powered_Windows + 
              Sport_Model + Tow_Bar, data = train.df)
           
prp(tr.shallow)

# Determine the training accuracy 
accuracy(predict(tr.shallow, train.df), train.df$Price)

##                   ME     RMSE      MAE       MPE     MAPE
## Test set 2.11238e-13 1396.864 1010.652 -1.735082 9.987881

# Determine the validation accuracy
accuracy(predict(tr.shallow, valid.df), valid.df$Price)

##                ME     RMSE      MAE      MPE     MAPE
## Test set 35.55491 1341.264 1022.343 -1.05491 9.829217

Part B

# Create the bins based on the car prices
bins <- seq(min(car.df$Price), 
            max(car.df$Price), 
            (max(car.df$Price) - min(car.df$Price))/20)

# Determine the number of bins based on the min and max prices
bins

##  [1]  4350.0  5757.5  7165.0  8572.5  9980.0 11387.5 12795.0 14202.5 15610.0
## [10] 17017.5 18425.0 19832.5 21240.0 22647.5 24055.0 25462.5 26870.0 28277.5
## [19] 29685.0 31092.5 32500.0

# Use bincode to determine the bin assignments
Binned_Price <- .bincode(car.df$Price, 
                         bins, 
                         include.lowest = TRUE)

# Convert the Binned_Price to factors for classification
Binned_Price <- as.factor(Binned_Price)
Binned_Price

##    [1] 7  7  7  8  7  7  9  11 13 7  12 12 11 13 13 13 14 10 9  9  9  9  9  9 
##   [25] 9  9  10 9  9  10 7  9  9  8  8  9  9  8  9  8  7  9  7  9  9  11 10 9 
##   [49] 10 13 10 9  12 13 8  7  8  8  11 9  8  9  11 10 10 9  11 8  13 9  9  7 
##   [73] 11 9  12 9  11 11 9  8  11 10 8  10 9  10 8  10 9  13 9  13 12 9  11 12
##   [97] 9  9  11 10 11 9  11 11 11 9  11 10 10 20 19 20 15 15 14 15 13 10 11 13
##  [121] 11 12 9  11 9  13 9  9  10 9  9  9  9  9  9  9  11 9  14 12 9  14 12 11
##  [145] 11 9  12 15 11 12 10 12 11 11 13 9  11 11 11 11 11 12 11 11 10 12 12 12
##  [169] 12 10 10 14 11 11 13 12 11 12 13 13 11 11 12 13 10 10 2  4  6  3  6  1 
##  [193] 1  6  7  6  6  8  4  6  6  5  5  5  7  6  6  5  6  6  7  8  6  6  7  5 
##  [217] 7  5  5  7  6  6  6  8  6  7  6  6  6  6  6  7  6  7  6  6  5  7  7  6 
##  [241] 5  6  6  7  6  7  6  7  7  6  6  5  6  8  4  7  7  6  6  7  6  6  7  6 
##  [265] 6  6  6  6  8  5  7  7  7  7  7  6  7  6  6  8  7  7  7  7  6  7  6  4 
##  [289] 6  7  6  7  5  6  7  5  7  7  7  7  6  6  7  6  7  6  4  7  6  6  7  7 
##  [313] 6  6  4  7  7  5  4  6  6  5  7  5  7  6  5  7  7  6  5  7  6  6  6  6 
##  [337] 7  6  6  6  6  6  8  6  7  8  7  7  7  6  6  4  6  6  8  7  6  8  6  8 
##  [361] 7  6  6  7  7  5  5  6  6  7  5  7  6  7  7  6  6  7  2  2  2  3  4  3 
##  [385] 4  4  5  4  3  4  3  3  4  1  4  4  4  6  5  5  4  5  1  5  4  4  5  6 
##  [409] 4  6  3  5  4  6  5  4  4  5  4  4  5  4  4  6  4  4  6  6  5  7  6  5 
##  [433] 5  5  5  5  6  4  5  6  6  5  6  6  6  5  6  5  6  4  5  6  6  6  6  4 
##  [457] 5  5  4  5  4  6  5  4  4  6  4  6  7  5  5  4  4  6  5  4  5  4  5  6 
##  [481] 6  6  6  4  4  5  5  4  6  4  5  5  4  6  6  5  6  5  5  4  4  6  4  5 
##  [505] 4  6  6  6  5  5  6  6  7  5  5  5  6  5  5  6  4  6  4  11 6  5  6  4 
##  [529] 5  7  4  5  5  6  7  6  5  4  5  6  5  6  5  5  7  5  6  4  5  6  5  5 
##  [553] 7  5  6  5  6  7  5  7  5  5  4  7  4  5  5  5  5  7  7  6  5  6  4  6 
##  [577] 6  6  6  6  6  5  4  5  5  7  4  7  4  4  5  5  4  5  5  5  5  5  5  7 
##  [601] 5  3  4  2  3  2  3  3  2  1  2  3  3  3  3  2  4  2  3  3  4  2  4  4 
##  [625] 3  4  4  4  3  3  3  4  4  4  4  4  5  3  5  4  4  4  3  5  4  4  3  2 
##  [649] 3  4  4  3  4  4  2  3  4  3  4  5  3  4  4  4  4  3  4  4  4  4  2  3 
##  [673] 3  4  2  4  4  4  4  4  3  4  3  4  4  4  4  4  4  4  4  4  4  4  4  4 
##  [697] 6  4  5  4  3  4  3  5  3  4  5  3  4  4  4  3  4  4  3  3  4  4  4  3 
##  [721] 3  3  4  3  2  3  5  4  4  4  6  5  5  4  5  5  4  4  4  4  3  5  4  4 
##  [745] 3  3  3  5  4  4  5  5  3  4  4  4  4  4  3  5  3  3  4  4  5  5  4  4 
##  [769] 5  3  3  3  4  5  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  4  3 
##  [793] 3  4  6  4  6  4  4  3  4  5  4  5  4  4  4  3  3  4  3  4  4  5  4  4 
##  [817] 3  4  4  5  4  3  4  5  2  4  2  4  4  4  4  4  5  5  4  4  5  4  4  5 
##  [841] 5  4  5  4  3  4  5  5  4  4  3  4  3  4  4  4  4  3  3  4  4  5  4  3 
##  [865] 4  4  5  4  4  5  4  4  5  4  4  4  4  4  3  4  5  4  4  3  4  4  5  4 
##  [889] 5  4  3  6  4  3  5  4  3  4  4  4  3  4  4  4  4  4  4  4  3  3  4  4 
##  [913] 4  7  4  5  3  5  4  4  5  4  4  3  4  4  5  4  4  5  4  5  5  4  4  5 
##  [937] 5  4  4  5  4  4  5  3  5  5  6  4  3  4  3  4  3  4  4  4  5  4  4  4 
##  [961] 4  4  4  4  5  4  4  4  4  5  4  5  4  3  5  4  4  4  4  4  5  4  4  3 
##  [985] 4  4  3  4  5  4  5  3  4  4  3  4  4  4  4  5  4  4  3  5  4  4  5  5 
## [1009] 4  4  4  4  4  3  5  5  4  4  5  4  6  3  5  5  5  4  5  4  5  5  4  5 
## [1033] 5  5  5  6  4  5  4  5  4  5  5  4  2  2  2  1  1  2  3  2  2  1  4  2 
## [1057] 2  2  5  3  3  2  2  2  1  2  4  2  3  3  3  3  2  3  2  1  2  2  3  4 
## [1081] 3  4  4  2  3  3  2  3  2  3  4  3  3  1  3  2  3  3  3  3  3  2  2  3 
## [1105] 3  3  3  3  3  4  3  3  3  1  2  2  2  3  4  3  3  3  3  4  3  2  2  4 
## [1129] 3  3  3  4  2  4  3  2  2  2  4  3  2  3  3  4  3  2  2  3  2  3  4  3 
## [1153] 3  3  2  3  2  4  2  4  3  3  3  4  4  4  3  2  3  4  2  2  3  2  3  4 
## [1177] 4  3  3  4  3  2  4  3  4  2  3  3  3  3  2  3  2  3  3  4  4  4  3  4 
## [1201] 4  3  2  3  3  2  3  3  3  3  3  3  3  2  4  4  3  3  4  3  3  3  3  3 
## [1225] 4  3  2  3  3  4  4  2  3  3  4  3  3  2  3  2  4  4  3  3  2  3  3  2 
## [1249] 3  3  4  3  3  2  3  3  3  3  3  4  3  4  4  3  3  4  2  3  4  4  3  2 
## [1273] 3  2  4  3  3  4  3  3  3  3  3  4  4  3  3  3  4  3  3  3  3  3  2  3 
## [1297] 3  2  3  4  3  2  3  3  3  4  3  4  3  4  4  4  4  2  3  4  3  3  3  3 
## [1321] 4  3  4  4  3  2  3  4  2  3  4  2  3  5  2  4  3  4  3  4  4  2  3  3 
## [1345] 4  3  3  3  4  2  3  2  3  3  4  2  3  3  3  4  4  2  3  2  3  3  3  4 
## [1369] 4  3  4  4  2  3  4  3  3  4  4  3  4  3  2  5  4  3  4  3  4  4  3  4 
## [1393] 3  3  3  4  4  3  4  4  3  4  5  2  3  3  4  3  4  3  3  3  4  4  3  2 
## [1417] 4  4  3  3  3  3  3  3  3  3  4  4  3  4  3  3  5  3  3  2 
## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 19 20

# Add the binned price to the data frame by the indexes 
# Identify the training and validation data frames

train.df$Binned_Price <- Binned_Price[train.index]
valid.df$Binned_Price <- Binned_Price[-train.index]

# i. Compare the tree generated by the CT with the generated by the less deep RT. Are they different? (Look at structure, the top predictors, size oftree, etc.) Why?
tr.binned <- rpart(Binned_Price ~  Age_08_04 + KM + Fuel_Type + 
                      HP + Automatic + Doors + Quarterly_Tax + 
                      Mfr_Guarantee + Guarantee_Period + Airco + 
                      Automatic_airco + CD_Player + Powered_Windows + 
                      Sport_Model + Tow_Bar, data = train.df)
prp(tr.binned)

t(t(tr.binned$variable.importance))

##                         [,1]
## Age_08_04        152.5600337
## KM                58.4034964
## CD_Player         26.9950441
## Automatic_airco   19.8587797
## Sport_Model       18.6676759
## Airco             18.2834578
## Quarterly_Tax     16.1438410
## HP                 7.2273021
## Powered_Windows    7.0148174
## Mfr_Guarantee      2.9656114
## Doors              2.6224520
## Fuel_Type          1.0327227
## Automatic          0.4534818
## Guarantee_Period   0.4534818

# ii. Predict the price, using the less deep RT and the CT, of a used Toyota Corolla with the specifications listed in Table 9.6.  
# Create new record
new.record <- data.frame(Age_08_04 = 77, 
                         KM = 117000, 
                         Fuel_Type = "Petrol", 
                         HP = 110, 
                         Automatic = 0, 
                         Doors = 5, 
                         Quarterly_Tax = 100, 
                         Mfr_Guarantee = 0, 
                         Guarantee_Period = 3, 
                         Airco = 1, 
                         Automatic_airco = 0, 
                         CD_Player = 0, 
                         Powered_Windows = 0, 
                         Sport_Model = 0, 
                         Tow_Bar = 1)

# Using regression tree
price.tr <- predict(tr, newdata = new.record)
price.tr

##        1 
## 7318.056

# Using classification tree
price.tr.bin <- bins[predict(tr.binned, newdata = new.record, type = "class")]

cat(paste("Regression Price Estimate: ",scales::dollar(price.tr,0.01)), 
      paste("Classification Price Estimate: ",scales::dollar(price.tr.bin,0.01)),
      sep='\n')

## Regression Price Estimate:  $7,318.06
## Classification Price Estimate:  $7,165.00

EOC_Problem 8.2 & 9.3

Bowie Pattanan Stanescki

Problem 8.2

a. Using the information in this dataset, if an accident has just been reported and no further information is available, what should the prediction be? (INJURY = Yes or No?) Why?

b. Select the first 12 records in the dataset and look only at the response (INJURY) and the two predictors WEATHER_R and TRAF_CON_R.

c. Let us now return to the entire dataset. Partition the data into training (60%) and validation (40%).

Problem 9.3

Part A

Part B