Build a tree based classification model to predict which customers are most likely to repurchase

Part 1 - tree model is decision tree based on Ionosphere exercise

Part 2 - pruning tree model based on 02 decision_tree_with_pruning

Target is the predictor we are trying to classify on

#import data
repurchase_tree <- read.csv("raw_data/repurchase_training.csv")

#Explore dataset
nrow(repurchase_tree)
## [1] 131337
ncol(repurchase_tree)
## [1] 17
summary(repurchase_tree)
##        ID             Target               age_band         gender     
##  Min.   :     1   Min.   :0.00000   NULL       :112375   Female:25957  
##  1st Qu.: 38563   1st Qu.:0.00000   4. 45 to 54:  4058   Male  :36072  
##  Median : 77132   Median :0.00000   3. 35 to 44:  3833   NULL  :69308  
##  Mean   : 77097   Mean   :0.02681   2. 25 to 34:  3548                 
##  3rd Qu.:115668   3rd Qu.:0.00000   5. 55 to 64:  3397                 
##  Max.   :154139   Max.   :1.00000   6. 65 to 74:  2140                 
##                                     (Other)    :  1986                 
##    car_model           car_segment    age_of_vehicle_years
##  model_2:34491   Large/SUV   :52120   Min.   : 1.000      
##  model_5:24674   LCV         :24606   1st Qu.: 3.000      
##  model_3:17074   Other       :   58   Median : 5.000      
##  model_1:15331   Small/Medium:54553   Mean   : 5.493      
##  model_4:15155                        3rd Qu.: 8.000      
##  model_7: 8167                        Max.   :10.000      
##  (Other):16445                                            
##  sched_serv_warr  non_sched_serv_warr sched_serv_paid  non_sched_serv_paid
##  Min.   : 1.000   Min.   : 1.000      Min.   : 1.000   Min.   : 1.000     
##  1st Qu.: 3.000   1st Qu.: 3.000      1st Qu.: 3.000   1st Qu.: 3.000     
##  Median : 5.000   Median : 5.000      Median : 5.000   Median : 5.000     
##  Mean   : 5.452   Mean   : 5.473      Mean   : 5.452   Mean   : 5.497     
##  3rd Qu.: 8.000   3rd Qu.: 8.000      3rd Qu.: 8.000   3rd Qu.: 8.000     
##  Max.   :10.000   Max.   :10.000      Max.   :10.000   Max.   :10.000     
##                                                                           
##  total_paid_services total_services   mth_since_last_serv
##  Min.   : 1.000      Min.   : 1.000   Min.   : 1.00      
##  1st Qu.: 3.000      1st Qu.: 3.000   1st Qu.: 3.00      
##  Median : 5.000      Median : 5.000   Median : 5.00      
##  Mean   : 5.482      Mean   : 5.455   Mean   : 5.47      
##  3rd Qu.: 8.000      3rd Qu.: 8.000   3rd Qu.: 8.00      
##  Max.   :10.000      Max.   :10.000   Max.   :10.00      
##                                                          
##  annualised_mileage num_dealers_visited num_serv_dealer_purchased
##  Min.   : 1.000     Min.   : 1.000      Min.   : 1.000           
##  1st Qu.: 3.000     1st Qu.: 3.000      1st Qu.: 3.000           
##  Median : 5.000     Median : 5.000      Median : 5.000           
##  Mean   : 5.503     Mean   : 5.485      Mean   : 5.481           
##  3rd Qu.: 8.000     3rd Qu.: 8.000      3rd Qu.: 8.000           
##  Max.   :10.000     Max.   :10.000      Max.   :10.000           
## 
#clean the repurchase_tree data of NAs - excluding age and gender variables due to NA
repurchase_tree <- repurchase_tree %>%
  select(ID, Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage,num_dealers_visited,num_serv_dealer_purchased)

repurchase_tree$Target <- as.factor(repurchase_tree$Target)

#find name of predicted variable
names(repurchase_tree)
##  [1] "ID"                        "Target"                   
##  [3] "car_model"                 "car_segment"              
##  [5] "age_of_vehicle_years"      "sched_serv_warr"          
##  [7] "non_sched_serv_warr"       "sched_serv_paid"          
##  [9] "non_sched_serv_paid"       "total_paid_services"      
## [11] "total_services"            "mth_since_last_serv"      
## [13] "annualised_mileage"        "num_dealers_visited"      
## [15] "num_serv_dealer_purchased"
#get index of predicted variable
typeColNum <- grep("Target",names(repurchase_tree))

Try checking the data to see it can work

View(repurchase_tree)

Creating test and training sets

#create training and test sets

## 80% of the sample size, use floor to round down to nearest integer
trainset_size <- floor(0.80 * nrow(repurchase_tree))

#set random seed (try diffe)
set.seed(42)

trainset_indices <- sample(seq_len(nrow(repurchase_tree)), size = trainset_size)
#assign observations to training and testing sets

trainset <- repurchase_tree[trainset_indices, ]
testset <- repurchase_tree[-trainset_indices, ]

#rowcounts to check
nrow(trainset)
## [1] 105069
nrow(testset)
## [1] 26268
nrow(repurchase_tree)
## [1] 131337

Grow the tree with rpart

Do I need to make Target a factor first?

#default params. This is a classification problem so set method="class" and exclude ID
fit <- rpart(Target~.,data = trainset[,-1], method='class', model=TRUE)

printcp(fit) # display the results 
## 
## Classification tree:
## rpart(formula = Target ~ ., data = trainset[, -1], method = "class", 
##     model = TRUE)
## 
## Variables actually used in tree construction:
## [1] age_of_vehicle_years      annualised_mileage       
## [3] mth_since_last_serv       num_dealers_visited      
## [5] num_serv_dealer_purchased sched_serv_paid          
## [7] sched_serv_warr           total_paid_services      
## [9] total_services           
## 
## Root node error: 2790/105069 = 0.026554
## 
## n= 105069 
## 
##         CP nsplit rel error  xerror     xstd
## 1 0.054480      0   1.00000 1.00000 0.018679
## 2 0.042294      4   0.78208 0.78208 0.016568
## 3 0.024731      5   0.73978 0.73978 0.016123
## 4 0.024373      6   0.71505 0.72473 0.015961
## 5 0.017563      7   0.69068 0.69068 0.015589
## 6 0.011350      8   0.67312 0.67312 0.015393
## 7 0.010275     11   0.63907 0.64659 0.015092
## 8 0.010000     14   0.60824 0.62294 0.014818
plotcp(fit) # visualize cross-validation results 

summary(fit) # detailed summary of splits
## Call:
## rpart(formula = Target ~ ., data = trainset[, -1], method = "class", 
##     model = TRUE)
##   n= 105069 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.05448029      0 1.0000000 1.0000000 0.01867901
## 2 0.04229391      4 0.7820789 0.7820789 0.01656785
## 3 0.02473118      5 0.7397849 0.7397849 0.01612288
## 4 0.02437276      6 0.7150538 0.7247312 0.01596124
## 5 0.01756272      7 0.6906810 0.6906810 0.01558896
## 6 0.01135006      8 0.6731183 0.6731183 0.01539314
## 7 0.01027479     11 0.6390681 0.6465950 0.01509223
## 8 0.01000000     14 0.6082437 0.6229391 0.01481831
## 
## Variable importance
##       mth_since_last_serv            total_services 
##                        27                        18 
##       total_paid_services       non_sched_serv_warr 
##                        13                         8 
##        annualised_mileage           sched_serv_paid 
##                         8                         5 
##           sched_serv_warr       num_dealers_visited 
##                         5                         5 
##       non_sched_serv_paid      age_of_vehicle_years 
##                         5                         3 
## num_serv_dealer_purchased 
##                         1 
## 
## Node number 1: 105069 observations,    complexity param=0.05448029
##   predicted class=0  expected loss=0.02655398  P(node) =1
##     class counts: 102279  2790
##    probabilities: 0.973 0.027 
##   left son=2 (72613 obs) right son=3 (32456 obs)
##   Primary splits:
##       sched_serv_warr      < 3.5 to the right, improve=197.46810, (0 missing)
##       sched_serv_paid      < 3.5 to the right, improve=183.13970, (0 missing)
##       total_services       < 3.5 to the right, improve=174.74720, (0 missing)
##       mth_since_last_serv  < 4.5 to the right, improve=121.36940, (0 missing)
##       age_of_vehicle_years < 5.5 to the right, improve= 92.88465, (0 missing)
##   Surrogate splits:
##       sched_serv_paid     < 3.5 to the right, agree=0.907, adj=0.700, (0 split)
##       annualised_mileage  < 2.5 to the right, agree=0.883, adj=0.622, (0 split)
##       total_services      < 3.5 to the right, agree=0.874, adj=0.591, (0 split)
##       total_paid_services < 2.5 to the right, agree=0.856, adj=0.534, (0 split)
##       mth_since_last_serv < 2.5 to the right, agree=0.841, adj=0.485, (0 split)
## 
## Node number 2: 72613 observations,    complexity param=0.02437276
##   predicted class=0  expected loss=0.006059521  P(node) =0.6910982
##     class counts: 72173   440
##    probabilities: 0.994 0.006 
##   left son=4 (72545 obs) right son=5 (68 obs)
##   Primary splits:
##       mth_since_last_serv  < 1.5 to the right, improve=134.48270, (0 missing)
##       annualised_mileage   < 2.5 to the right, improve=120.45060, (0 missing)
##       age_of_vehicle_years < 1.5 to the right, improve= 78.79551, (0 missing)
##       num_dealers_visited  < 2.5 to the right, improve= 14.90362, (0 missing)
##       sched_serv_warr      < 4.5 to the right, improve= 14.42322, (0 missing)
## 
## Node number 3: 32456 observations,    complexity param=0.05448029
##   predicted class=0  expected loss=0.07240572  P(node) =0.3089018
##     class counts: 30106  2350
##    probabilities: 0.928 0.072 
##   left son=6 (20358 obs) right son=7 (12098 obs)
##   Primary splits:
##       annualised_mileage        < 2.5 to the left,  improve=231.90760, (0 missing)
##       num_serv_dealer_purchased < 6.5 to the left,  improve=159.90600, (0 missing)
##       mth_since_last_serv       < 6.5 to the right, improve= 80.65430, (0 missing)
##       non_sched_serv_paid       < 3.5 to the left,  improve= 57.41369, (0 missing)
##       age_of_vehicle_years      < 6.5 to the right, improve= 56.40522, (0 missing)
##   Surrogate splits:
##       total_services      < 2.5 to the left,  agree=0.870, adj=0.651, (0 split)
##       mth_since_last_serv < 2.5 to the left,  agree=0.870, adj=0.650, (0 split)
##       total_paid_services < 2.5 to the left,  agree=0.847, adj=0.589, (0 split)
##       num_dealers_visited < 3.5 to the left,  agree=0.833, adj=0.551, (0 split)
##       non_sched_serv_warr < 3.5 to the left,  agree=0.831, adj=0.548, (0 split)
## 
## Node number 4: 72545 observations
##   predicted class=0  expected loss=0.005127852  P(node) =0.690451
##     class counts: 72173   372
##    probabilities: 0.995 0.005 
## 
## Node number 5: 68 observations
##   predicted class=1  expected loss=0  P(node) =0.0006471937
##     class counts:     0    68
##    probabilities: 0.000 1.000 
## 
## Node number 6: 20358 observations
##   predicted class=0  expected loss=0.02632872  P(node) =0.1937584
##     class counts: 19822   536
##    probabilities: 0.974 0.026 
## 
## Node number 7: 12098 observations,    complexity param=0.05448029
##   predicted class=0  expected loss=0.1499421  P(node) =0.1151434
##     class counts: 10284  1814
##    probabilities: 0.850 0.150 
##   left son=14 (8203 obs) right son=15 (3895 obs)
##   Primary splits:
##       mth_since_last_serv  < 4.5 to the right, improve=686.3013, (0 missing)
##       total_services       < 2.5 to the right, improve=586.0275, (0 missing)
##       total_paid_services  < 1.5 to the right, improve=570.3556, (0 missing)
##       age_of_vehicle_years < 1.5 to the right, improve=293.9976, (0 missing)
##       num_dealers_visited  < 2.5 to the right, improve=279.4856, (0 missing)
##   Surrogate splits:
##       age_of_vehicle_years < 3.5 to the right, agree=0.717, adj=0.120, (0 split)
##       total_paid_services  < 1.5 to the right, agree=0.701, adj=0.072, (0 split)
##       total_services       < 2.5 to the right, agree=0.698, adj=0.063, (0 split)
##       num_dealers_visited  < 2.5 to the right, agree=0.686, adj=0.025, (0 split)
##       car_model            splits as  LLLLLLLLLR-LLLLLLLL, agree=0.678, adj=0.001, (0 split)
## 
## Node number 14: 8203 observations,    complexity param=0.01756272
##   predicted class=0  expected loss=0.03389004  P(node) =0.0780725
##     class counts:  7925   278
##    probabilities: 0.966 0.034 
##   left son=28 (8134 obs) right son=29 (69 obs)
##   Primary splits:
##       total_paid_services  < 1.5 to the right, improve=93.84841, (0 missing)
##       total_services       < 1.5 to the right, improve=78.80645, (0 missing)
##       age_of_vehicle_years < 1.5 to the right, improve=72.02925, (0 missing)
##       mth_since_last_serv  < 5.5 to the right, improve=39.89778, (0 missing)
##       num_dealers_visited  < 2.5 to the right, improve=16.73143, (0 missing)
##   Surrogate splits:
##       total_services < 1.5 to the right, agree=0.993, adj=0.116, (0 split)
## 
## Node number 15: 3895 observations,    complexity param=0.05448029
##   predicted class=0  expected loss=0.3943517  P(node) =0.03707088
##     class counts:  2359  1536
##    probabilities: 0.606 0.394 
##   left son=30 (2743 obs) right son=31 (1152 obs)
##   Primary splits:
##       total_services      < 2.5 to the right, improve=446.7662, (0 missing)
##       total_paid_services < 1.5 to the right, improve=264.6727, (0 missing)
##       sched_serv_paid     < 3.5 to the right, improve=263.4874, (0 missing)
##       mth_since_last_serv < 2.5 to the right, improve=255.6778, (0 missing)
##       non_sched_serv_warr < 5.5 to the right, improve=249.5261, (0 missing)
##   Surrogate splits:
##       non_sched_serv_warr < 3.5 to the right, agree=0.833, adj=0.435, (0 split)
##       non_sched_serv_paid < 3.5 to the right, agree=0.830, adj=0.426, (0 split)
##       total_paid_services < 2.5 to the right, agree=0.822, adj=0.398, (0 split)
##       mth_since_last_serv < 1.5 to the right, agree=0.714, adj=0.033, (0 split)
##       car_model           splits as  LLLLRLRLLL-LLLLLLLL, agree=0.705, adj=0.003, (0 split)
## 
## Node number 28: 8134 observations
##   predicted class=0  expected loss=0.02692402  P(node) =0.07741579
##     class counts:  7915   219
##    probabilities: 0.973 0.027 
## 
## Node number 29: 69 observations
##   predicted class=1  expected loss=0.1449275  P(node) =0.0006567113
##     class counts:    10    59
##    probabilities: 0.145 0.855 
## 
## Node number 30: 2743 observations,    complexity param=0.04229391
##   predicted class=0  expected loss=0.2391542  P(node) =0.02610665
##     class counts:  2087   656
##    probabilities: 0.761 0.239 
##   left son=60 (2625 obs) right son=61 (118 obs)
##   Primary splits:
##       mth_since_last_serv  < 1.5 to the right, improve=142.75840, (0 missing)
##       num_dealers_visited  < 2.5 to the right, improve= 91.35358, (0 missing)
##       age_of_vehicle_years < 1.5 to the right, improve= 85.79846, (0 missing)
##       sched_serv_warr      < 2.5 to the right, improve= 76.66942, (0 missing)
##       sched_serv_paid      < 2.5 to the right, improve= 75.07761, (0 missing)
## 
## Node number 31: 1152 observations,    complexity param=0.02473118
##   predicted class=1  expected loss=0.2361111  P(node) =0.01096422
##     class counts:   272   880
##    probabilities: 0.236 0.764 
##   left son=62 (103 obs) right son=63 (1049 obs)
##   Primary splits:
##       sched_serv_paid           < 3.5 to the right, improve=81.12717, (0 missing)
##       num_serv_dealer_purchased < 4.5 to the left,  improve=50.37944, (0 missing)
##       total_paid_services       < 1.5 to the right, improve=49.25548, (0 missing)
##       age_of_vehicle_years      < 3.5 to the left,  improve=42.71477, (0 missing)
##       total_services            < 1.5 to the right, improve=37.06786, (0 missing)
## 
## Node number 60: 2625 observations,    complexity param=0.01135006
##   predicted class=0  expected loss=0.2049524  P(node) =0.02498358
##     class counts:  2087   538
##    probabilities: 0.795 0.205 
##   left son=120 (2263 obs) right son=121 (362 obs)
##   Primary splits:
##       num_dealers_visited  < 2.5 to the right, improve=81.55303, (0 missing)
##       age_of_vehicle_years < 1.5 to the right, improve=66.32133, (0 missing)
##       sched_serv_warr      < 2.5 to the right, improve=60.02577, (0 missing)
##       sched_serv_paid      < 2.5 to the right, improve=57.88734, (0 missing)
##       total_services       < 3.5 to the right, improve=56.96500, (0 missing)
##   Surrogate splits:
##       num_serv_dealer_purchased < 3.5 to the right, agree=0.910, adj=0.348, (0 split)
##       car_model                 splits as  LLLLLLLRLL-LLLLLLLL, agree=0.862, adj=0.003, (0 split)
##       car_segment               splits as  LLRL, agree=0.862, adj=0.003, (0 split)
## 
## Node number 61: 118 observations
##   predicted class=1  expected loss=0  P(node) =0.001123072
##     class counts:     0   118
##    probabilities: 0.000 1.000 
## 
## Node number 62: 103 observations
##   predicted class=0  expected loss=0.1650485  P(node) =0.0009803082
##     class counts:    86    17
##    probabilities: 0.835 0.165 
## 
## Node number 63: 1049 observations
##   predicted class=1  expected loss=0.1773117  P(node) =0.009983915
##     class counts:   186   863
##    probabilities: 0.177 0.823 
## 
## Node number 120: 2263 observations,    complexity param=0.01135006
##   predicted class=0  expected loss=0.1551038  P(node) =0.02153823
##     class counts:  1912   351
##    probabilities: 0.845 0.155 
##   left son=240 (2225 obs) right son=241 (38 obs)
##   Primary splits:
##       age_of_vehicle_years      < 1.5 to the right, improve=55.17912, (0 missing)
##       sched_serv_paid           < 2.5 to the right, improve=43.59277, (0 missing)
##       sched_serv_warr           < 2.5 to the right, improve=42.41872, (0 missing)
##       num_serv_dealer_purchased < 6.5 to the left,  improve=38.55943, (0 missing)
##       total_services            < 3.5 to the right, improve=32.83042, (0 missing)
## 
## Node number 121: 362 observations,    complexity param=0.01135006
##   predicted class=1  expected loss=0.4834254  P(node) =0.003445355
##     class counts:   175   187
##    probabilities: 0.483 0.517 
##   left son=242 (255 obs) right son=243 (107 obs)
##   Primary splits:
##       age_of_vehicle_years < 3.5 to the left,  improve=18.953940, (0 missing)
##       total_services       < 5.5 to the right, improve=16.235800, (0 missing)
##       non_sched_serv_warr  < 7.5 to the right, improve=15.901500, (0 missing)
##       sched_serv_warr      < 1.5 to the right, improve= 6.784471, (0 missing)
##       car_model            splits as  LLLLR--RLL-LLLLLRR-, improve= 6.289823, (0 missing)
##   Surrogate splits:
##       non_sched_serv_warr < 9.5 to the left,  agree=0.721, adj=0.056, (0 split)
##       total_services      < 7.5 to the left,  agree=0.718, adj=0.047, (0 split)
##       car_model           splits as  LLRLL--RLL-LLLLLLL-, agree=0.710, adj=0.019, (0 split)
## 
## Node number 240: 2225 observations,    complexity param=0.01027479
##   predicted class=0  expected loss=0.1406742  P(node) =0.02117656
##     class counts:  1912   313
##    probabilities: 0.859 0.141 
##   left son=480 (1366 obs) right son=481 (859 obs)
##   Primary splits:
##       sched_serv_warr           < 2.5 to the right, improve=36.54207, (0 missing)
##       sched_serv_paid           < 2.5 to the right, improve=36.42171, (0 missing)
##       num_serv_dealer_purchased < 6.5 to the left,  improve=33.14668, (0 missing)
##       total_services            < 3.5 to the right, improve=27.34574, (0 missing)
##       mth_since_last_serv       < 2.5 to the right, improve=23.41607, (0 missing)
##   Surrogate splits:
##       sched_serv_paid    < 2.5 to the right, agree=0.819, adj=0.531, (0 split)
##       annualised_mileage < 3.5 to the right, agree=0.641, adj=0.070, (0 split)
##       car_model          splits as  LLRLLRR-RR-LLLLLLLL, agree=0.625, adj=0.029, (0 split)
## 
## Node number 241: 38 observations
##   predicted class=1  expected loss=0  P(node) =0.0003616671
##     class counts:     0    38
##    probabilities: 0.000 1.000 
## 
## Node number 242: 255 observations
##   predicted class=0  expected loss=0.4117647  P(node) =0.002426977
##     class counts:   150   105
##    probabilities: 0.588 0.412 
## 
## Node number 243: 107 observations
##   predicted class=1  expected loss=0.2336449  P(node) =0.001018378
##     class counts:    25    82
##    probabilities: 0.234 0.766 
## 
## Node number 480: 1366 observations
##   predicted class=0  expected loss=0.06881406  P(node) =0.01300098
##     class counts:  1272    94
##    probabilities: 0.931 0.069 
## 
## Node number 481: 859 observations,    complexity param=0.01027479
##   predicted class=0  expected loss=0.2549476  P(node) =0.00817558
##     class counts:   640   219
##    probabilities: 0.745 0.255 
##   left son=962 (643 obs) right son=963 (216 obs)
##   Primary splits:
##       num_serv_dealer_purchased < 6.5 to the left,  improve=35.97829, (0 missing)
##       mth_since_last_serv       < 2.5 to the right, improve=26.90772, (0 missing)
##       non_sched_serv_warr       < 6.5 to the right, improve=25.59131, (0 missing)
##       total_services            < 3.5 to the right, improve=19.35219, (0 missing)
##       age_of_vehicle_years      < 3.5 to the left,  improve=17.44636, (0 missing)
##   Surrogate splits:
##       non_sched_serv_warr  < 8.5 to the left,  agree=0.797, adj=0.194, (0 split)
##       total_services       < 5.5 to the left,  agree=0.787, adj=0.153, (0 split)
##       age_of_vehicle_years < 9.5 to the left,  agree=0.751, adj=0.009, (0 split)
## 
## Node number 962: 643 observations
##   predicted class=0  expected loss=0.1710731  P(node) =0.006119788
##     class counts:   533   110
##    probabilities: 0.829 0.171 
## 
## Node number 963: 216 observations,    complexity param=0.01027479
##   predicted class=1  expected loss=0.4953704  P(node) =0.002055792
##     class counts:   107   109
##    probabilities: 0.495 0.505 
##   left son=1926 (112 obs) right son=1927 (104 obs)
##   Primary splits:
##       total_services      < 4.5 to the right, improve=67.04843, (0 missing)
##       non_sched_serv_warr < 7.5 to the right, improve=66.83889, (0 missing)
##       non_sched_serv_paid < 7.5 to the right, improve=13.37070, (0 missing)
##       total_paid_services < 5.5 to the right, improve=12.08381, (0 missing)
##       annualised_mileage  < 8.5 to the right, improve=10.81422, (0 missing)
##   Surrogate splits:
##       non_sched_serv_warr < 7.5 to the right, agree=0.931, adj=0.856, (0 split)
##       non_sched_serv_paid < 7.5 to the right, agree=0.667, adj=0.308, (0 split)
##       total_paid_services < 5.5 to the right, agree=0.657, adj=0.288, (0 split)
##       annualised_mileage  < 6.5 to the right, agree=0.625, adj=0.221, (0 split)
##       car_model           splits as  LLLRL----L-LRRRLLLL, agree=0.611, adj=0.192, (0 split)
## 
## Node number 1926: 112 observations
##   predicted class=0  expected loss=0.125  P(node) =0.001065966
##     class counts:    98    14
##    probabilities: 0.875 0.125 
## 
## Node number 1927: 104 observations
##   predicted class=1  expected loss=0.08653846  P(node) =0.0009898257
##     class counts:     9    95
##    probabilities: 0.087 0.913
#predict on test data
rpart_predict <- predict(fit,testset[,-typeColNum],type="class")

Part 2 - cost-complexity pruning

#what's the tree size which results in the min cross validated error
opt <- which.min(fit$cptable[,"xerror"])

#value of the complexity parameter (alpha) for that gives a a tree 
#of that size
cp <- fit$cptable[opt, "CP"]

#"prune" the tree using that value of the complexity parameter
pruned_model <- prune(fit,cp)

#plot pruned tree
prp(pruned_model)

#predictions from pruned model
rpart_pruned_predict <- predict(pruned_model,testset[,-typeColNum],type="class")

#Accuracy of pruned model
mean(rpart_pruned_predict==testset$Target)
## [1] 0.9834399
#confusion matrix (PRUNED model)
table(pred=rpart_pruned_predict,true=testset$Target)
##     true
## pred     0     1
##    0 25479   377
##    1    58   354

what about average accuracy over different train/test partitions?

write function to build multiple rpart unpruned and pruned models over different partitions and return accuracies for all models

unpruned (same as in exercise 1)…

source(“multiple_runs_rpart.R”)

multiple_runs_rpart <- function(df,class_variable_name,train_fraction,nruns){
  
  #Purpose:
  #Builds rpart model for nrun data partitions
  
  #Return value:
  #Vector containing nrun accuracies
  
  #Arguments:
  #df: variable containing dataframe
  #class_variable_name: class name as a quoted string. e.g. "Class"
  #train_fraction: fraction of data to be assigned to training set (0<train_fraction<1)
  #nruns: number of data partitions
  
  #find column index of class variable
  typeColNum <- grep(class_variable_name,names(df))
  #initialize accuracy vector
  accuracies <- rep(NA,nruns)
  #set seed (can be any integer)
  set.seed(1)
  for (i in 1:nruns){
    #partition data
    trainset_size <- floor(train_fraction * nrow(df))
    trainset_indices <- sample(seq_len(nrow(df)), size = trainset_size)
    trainset <- df[trainset_indices, ]
    testset <- df[-trainset_indices, ]
    #build model 
    #paste builds formula string and as.formula interprets it as an R formula
    rpart_model <- rpart(as.formula(paste(class_variable_name,"~.")),data = trainset, method="class")
    #predict on test data
    rpart_predict <- predict(rpart_model,testset[,-typeColNum],type="class")
    #accuracy
    accuracies[i] <- mean(rpart_predict==testset[[class_variable_name]])
  }
  
  return(accuracies)
}
#calculate average accuracy and std dev over 30 random partitions
accuracy_results_unpruned <- multiple_runs_rpart(repurchase_tree,"Target",0.8,30)
mean(accuracy_results_unpruned)
## [1] 0.9999987
sd(accuracy_results_unpruned)
## [1] 6.950441e-06

pruned…

#pruned...

#source("multiple_runs_rpart_pruned.R")

multiple_runs_rpart_pruned <- function(df,class_variable_name,train_fraction,nruns){
 
  #Purpose:
  #Builds rpart pruned model for nrun data partitions.
  
  #Return value:
  #Vector containing nrun accuracies
  
  #Arguments:
  #df: variable containing dataframe
  #class_variable_name: class name as a quoted string. e.g. "Class"
  #train_fraction: fraction of data to be assigned to training set (0<train_fraction<1)
  #nruns: number of data partitions  
  
  typeColNum <- grep(class_variable_name,names(df))
  accuracies <- rep(NA,nruns)
  set.seed(1)
  for (i in 1:nruns){
    trainset_size <- floor(train_fraction * nrow(df))
    trainset_indices <- sample(seq_len(nrow(df)), size = trainset_size)
    trainset <- df[trainset_indices, ]
    testset <- df[-trainset_indices, ]
    #build model 
    #paste builds formula string and as.formula interprets it as an R formula  
    rpart_model <- rpart(as.formula(paste(class_variable_name,"~.")),data = trainset, method="class")
    #prune
    opt <- which.min(rpart_model$cptable[,"xerror"])
    cp <- rpart_model$cptable[opt, "CP"]
    pruned_model <- prune(rpart_model,cp)
    #predictions of PRUNED model
    rpart_pruned_predict <- predict(pruned_model,testset[,-typeColNum],type="class")
    #accuracy
    accuracies[i] <- mean(rpart_predict==testset[[class_variable_name]])
  }
  return(accuracies)
}
#calculate average accuracy and std dev over 30 random partitions
accuracy_results_pruned <- multiple_runs_rpart_pruned(repurchase_tree,"Target",0.8,30)

get the mean of pruned model

mean(accuracy_results_pruned)
## [1] 0.9833777
#getthe sd of pruned model
sd(accuracy_results_pruned)
## [1] 0.0001911273

Try the RANDOM FOREST as a comparison

find name of predicted variable

#Build random forest model

repurchase_tree.rf <- randomForest(Target ~.,data = trainset, 
                       importance=TRUE, xtest=testset[,-typeColNum],ntree=1000)
#model summary
summary(repurchase_tree.rf)
##                 Length Class  Mode     
## call                 6 -none- call     
## type                 1 -none- character
## predicted       105069 factor numeric  
## err.rate          3000 -none- numeric  
## confusion            6 -none- numeric  
## votes           210138 matrix numeric  
## oob.times       105069 -none- numeric  
## classes              2 -none- character
## importance          56 -none- numeric  
## importanceSD        42 -none- numeric  
## localImportance      0 -none- NULL     
## proximity            0 -none- NULL     
## ntree                1 -none- numeric  
## mtry                 1 -none- numeric  
## forest               0 -none- NULL     
## y               105069 factor numeric  
## test                 5 -none- list     
## inbag                0 -none- NULL     
## terms                3 terms  call
#variables contained in model 
names(repurchase_tree.rf)
##  [1] "call"            "type"            "predicted"      
##  [4] "err.rate"        "confusion"       "votes"          
##  [7] "oob.times"       "classes"         "importance"     
## [10] "importanceSD"    "localImportance" "proximity"      
## [13] "ntree"           "mtry"            "forest"         
## [16] "y"               "test"            "inbag"          
## [19] "terms"
#predictions for test set
test_predictions_rf <- data.frame(testset,repurchase_tree.rf$test$predicted)

#accuracy for test set
mean(repurchase_tree.rf$test$predicted==testset$Target)
## [1] 1
#confusion matrix
table(repurchase_tree.rf$test$predicted,testset$Target)
##    
##         0     1
##   0 25537     0
##   1     0   731
#quantitative measure of variable importance
importance(repurchase_tree.rf)
##                                    0             1 MeanDecreaseAccuracy
## ID                        173.730938 263.879970389           248.557424
## car_model                   4.904517   1.636390968             4.965468
## car_segment                 1.636027  -0.003547715             1.638161
## age_of_vehicle_years       22.334638  10.957448568            22.567144
## sched_serv_warr            17.400935  14.896001976            17.716216
## non_sched_serv_warr        12.040156  12.156117316            12.171405
## sched_serv_paid            13.795168   9.204049295            13.942654
## non_sched_serv_paid        10.753038   7.562621406            10.824218
## total_paid_services        11.089435   7.414215278            11.135195
## total_services             20.461936  14.029378479            20.645160
## mth_since_last_serv        26.849001  18.045049849            27.017357
## annualised_mileage         20.851312   7.271398160            20.874954
## num_dealers_visited        12.835700  13.821226152            13.057339
## num_serv_dealer_purchased  18.945624  13.915429729            19.372926
##                           MeanDecreaseGini
## ID                             4322.335453
## car_model                        19.897346
## car_segment                       3.056903
## age_of_vehicle_years             97.570276
## sched_serv_warr                  99.390696
## non_sched_serv_warr              46.756112
## sched_serv_paid                  82.384973
## non_sched_serv_paid              33.501232
## total_paid_services              51.981259
## total_services                  118.114533
## mth_since_last_serv             202.354581
## annualised_mileage              153.450574
## num_dealers_visited              57.635085
## num_serv_dealer_purchased       137.892507
#sorted plot of importance
varImpPlot(repurchase_tree.rf)

#Homework: estimate average accuracy for a randomForest model on this dataset. You will need to
#write a function to do this.
AccuracyRF <- function(df,class_variable_name,train_fraction,nruns){
  #Purpose:
  #Builds rpart model for nrun data partitions
  #Return value:
  #Vector containing nrun accuracies
  #Arguments:
  #df: variable containing dataframe
  #class_variable_name: class name as a quoted string. e.g. "Class"
  #train_fraction: fraction of data to be assigned to training set (0<train_fraction<1)
  #nruns: number of data partitions
  
  #Of all the names in the dataframe find me the one that matches class_variable_names and assign
  #its index number to typeColNum
  typeColNum <- grep(class_variable_name,names(df))
  #Creating an empty vector with a size of nruns (the number of runs to test average accuracy) 
  accuracies <- rep(NA,nruns)
  set.seed(1)
  #this loop finds the mean of the testset for each run and adding it to the accuracies vector
  for (i in 1:nruns) {
    #this determines the size of the training set based on the train_fraction
    trainset_size <- floor(train_fraction * nrow(df))
    #samples a number of rows and assigns it to trainset_indices according to the trainset_size
    trainset_indices <- sample(seq_len(nrow(df)), size = trainset_size)
    #the indices defined above beciome part of the trainset
    trainset <- df[trainset_indices, ]
    #the negative of the line above becomes part of the testset
    testset <- df[-trainset_indices, ]
    #rpart_model will model us the trainset data using the classification method
    randomForest_model <- randomForest(as.formula(paste(class_variable_name,"~.")),data = trainset, 
                            importance=TRUE, xtest=testset[,-typeColNum],ntree=1000)
    #predict on test data
    accuracies[i] <- mean(randomForest_model$test$predicted==testset$Type)
     }
  return(accuracies)
}

THIS IS THE BIT TO LOOK AT TO FIX - KEEPS GIVING is.na() applied to non-(list or vector) of type null error

#Accuracy <- AccuracyRF(repurchase_tree,"Target",0.8,1)
#mean(Accuracy)
#sd(Accuracy)
#run the EstAveAccuracy function and assign the results to accuracy_results
#accuracy_results <- EstAveAccuracy(repurchase_tree,"Type",0.8,30)
#find mean of accuracy_results
#mean(accuracy_results)
#find standard deviation of accuracy_results
#sd(accuracy_results)