Target is the predictor we are trying to classify on
#import data
repurchase_tree <- read.csv("raw_data/repurchase_training.csv")
#Explore dataset
nrow(repurchase_tree)
## [1] 131337
ncol(repurchase_tree)
## [1] 17
summary(repurchase_tree)
## ID Target age_band gender
## Min. : 1 Min. :0.00000 NULL :112375 Female:25957
## 1st Qu.: 38563 1st Qu.:0.00000 4. 45 to 54: 4058 Male :36072
## Median : 77132 Median :0.00000 3. 35 to 44: 3833 NULL :69308
## Mean : 77097 Mean :0.02681 2. 25 to 34: 3548
## 3rd Qu.:115668 3rd Qu.:0.00000 5. 55 to 64: 3397
## Max. :154139 Max. :1.00000 6. 65 to 74: 2140
## (Other) : 1986
## car_model car_segment age_of_vehicle_years
## model_2:34491 Large/SUV :52120 Min. : 1.000
## model_5:24674 LCV :24606 1st Qu.: 3.000
## model_3:17074 Other : 58 Median : 5.000
## model_1:15331 Small/Medium:54553 Mean : 5.493
## model_4:15155 3rd Qu.: 8.000
## model_7: 8167 Max. :10.000
## (Other):16445
## sched_serv_warr non_sched_serv_warr sched_serv_paid non_sched_serv_paid
## Min. : 1.000 Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 5.000 Median : 5.000 Median : 5.000 Median : 5.000
## Mean : 5.452 Mean : 5.473 Mean : 5.452 Mean : 5.497
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :10.000 Max. :10.000 Max. :10.000 Max. :10.000
##
## total_paid_services total_services mth_since_last_serv
## Min. : 1.000 Min. : 1.000 Min. : 1.00
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.00
## Median : 5.000 Median : 5.000 Median : 5.00
## Mean : 5.482 Mean : 5.455 Mean : 5.47
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.00
## Max. :10.000 Max. :10.000 Max. :10.00
##
## annualised_mileage num_dealers_visited num_serv_dealer_purchased
## Min. : 1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.: 3.000 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 5.000 Median : 5.000 Median : 5.000
## Mean : 5.503 Mean : 5.485 Mean : 5.481
## 3rd Qu.: 8.000 3rd Qu.: 8.000 3rd Qu.: 8.000
## Max. :10.000 Max. :10.000 Max. :10.000
##
#clean the repurchase_tree data of NAs - excluding age and gender variables due to NA
repurchase_tree <- repurchase_tree %>%
select(ID, Target, car_model, car_segment, age_of_vehicle_years, sched_serv_warr, non_sched_serv_warr, sched_serv_paid, non_sched_serv_paid, total_paid_services, total_services, mth_since_last_serv, annualised_mileage,num_dealers_visited,num_serv_dealer_purchased)
repurchase_tree$Target <- as.factor(repurchase_tree$Target)
#find name of predicted variable
names(repurchase_tree)
## [1] "ID" "Target"
## [3] "car_model" "car_segment"
## [5] "age_of_vehicle_years" "sched_serv_warr"
## [7] "non_sched_serv_warr" "sched_serv_paid"
## [9] "non_sched_serv_paid" "total_paid_services"
## [11] "total_services" "mth_since_last_serv"
## [13] "annualised_mileage" "num_dealers_visited"
## [15] "num_serv_dealer_purchased"
#get index of predicted variable
typeColNum <- grep("Target",names(repurchase_tree))
View(repurchase_tree)
#create training and test sets
## 80% of the sample size, use floor to round down to nearest integer
trainset_size <- floor(0.80 * nrow(repurchase_tree))
#set random seed (try diffe)
set.seed(42)
trainset_indices <- sample(seq_len(nrow(repurchase_tree)), size = trainset_size)
#assign observations to training and testing sets
trainset <- repurchase_tree[trainset_indices, ]
testset <- repurchase_tree[-trainset_indices, ]
#rowcounts to check
nrow(trainset)
## [1] 105069
nrow(testset)
## [1] 26268
nrow(repurchase_tree)
## [1] 131337
Do I need to make Target a factor first?
#default params. This is a classification problem so set method="class" and exclude ID
fit <- rpart(Target~.,data = trainset[,-1], method='class', model=TRUE)
printcp(fit) # display the results
##
## Classification tree:
## rpart(formula = Target ~ ., data = trainset[, -1], method = "class",
## model = TRUE)
##
## Variables actually used in tree construction:
## [1] age_of_vehicle_years annualised_mileage
## [3] mth_since_last_serv num_dealers_visited
## [5] num_serv_dealer_purchased sched_serv_paid
## [7] sched_serv_warr total_paid_services
## [9] total_services
##
## Root node error: 2790/105069 = 0.026554
##
## n= 105069
##
## CP nsplit rel error xerror xstd
## 1 0.054480 0 1.00000 1.00000 0.018679
## 2 0.042294 4 0.78208 0.78208 0.016568
## 3 0.024731 5 0.73978 0.73978 0.016123
## 4 0.024373 6 0.71505 0.72473 0.015961
## 5 0.017563 7 0.69068 0.69068 0.015589
## 6 0.011350 8 0.67312 0.67312 0.015393
## 7 0.010275 11 0.63907 0.64659 0.015092
## 8 0.010000 14 0.60824 0.62294 0.014818
plotcp(fit) # visualize cross-validation results
summary(fit) # detailed summary of splits
## Call:
## rpart(formula = Target ~ ., data = trainset[, -1], method = "class",
## model = TRUE)
## n= 105069
##
## CP nsplit rel error xerror xstd
## 1 0.05448029 0 1.0000000 1.0000000 0.01867901
## 2 0.04229391 4 0.7820789 0.7820789 0.01656785
## 3 0.02473118 5 0.7397849 0.7397849 0.01612288
## 4 0.02437276 6 0.7150538 0.7247312 0.01596124
## 5 0.01756272 7 0.6906810 0.6906810 0.01558896
## 6 0.01135006 8 0.6731183 0.6731183 0.01539314
## 7 0.01027479 11 0.6390681 0.6465950 0.01509223
## 8 0.01000000 14 0.6082437 0.6229391 0.01481831
##
## Variable importance
## mth_since_last_serv total_services
## 27 18
## total_paid_services non_sched_serv_warr
## 13 8
## annualised_mileage sched_serv_paid
## 8 5
## sched_serv_warr num_dealers_visited
## 5 5
## non_sched_serv_paid age_of_vehicle_years
## 5 3
## num_serv_dealer_purchased
## 1
##
## Node number 1: 105069 observations, complexity param=0.05448029
## predicted class=0 expected loss=0.02655398 P(node) =1
## class counts: 102279 2790
## probabilities: 0.973 0.027
## left son=2 (72613 obs) right son=3 (32456 obs)
## Primary splits:
## sched_serv_warr < 3.5 to the right, improve=197.46810, (0 missing)
## sched_serv_paid < 3.5 to the right, improve=183.13970, (0 missing)
## total_services < 3.5 to the right, improve=174.74720, (0 missing)
## mth_since_last_serv < 4.5 to the right, improve=121.36940, (0 missing)
## age_of_vehicle_years < 5.5 to the right, improve= 92.88465, (0 missing)
## Surrogate splits:
## sched_serv_paid < 3.5 to the right, agree=0.907, adj=0.700, (0 split)
## annualised_mileage < 2.5 to the right, agree=0.883, adj=0.622, (0 split)
## total_services < 3.5 to the right, agree=0.874, adj=0.591, (0 split)
## total_paid_services < 2.5 to the right, agree=0.856, adj=0.534, (0 split)
## mth_since_last_serv < 2.5 to the right, agree=0.841, adj=0.485, (0 split)
##
## Node number 2: 72613 observations, complexity param=0.02437276
## predicted class=0 expected loss=0.006059521 P(node) =0.6910982
## class counts: 72173 440
## probabilities: 0.994 0.006
## left son=4 (72545 obs) right son=5 (68 obs)
## Primary splits:
## mth_since_last_serv < 1.5 to the right, improve=134.48270, (0 missing)
## annualised_mileage < 2.5 to the right, improve=120.45060, (0 missing)
## age_of_vehicle_years < 1.5 to the right, improve= 78.79551, (0 missing)
## num_dealers_visited < 2.5 to the right, improve= 14.90362, (0 missing)
## sched_serv_warr < 4.5 to the right, improve= 14.42322, (0 missing)
##
## Node number 3: 32456 observations, complexity param=0.05448029
## predicted class=0 expected loss=0.07240572 P(node) =0.3089018
## class counts: 30106 2350
## probabilities: 0.928 0.072
## left son=6 (20358 obs) right son=7 (12098 obs)
## Primary splits:
## annualised_mileage < 2.5 to the left, improve=231.90760, (0 missing)
## num_serv_dealer_purchased < 6.5 to the left, improve=159.90600, (0 missing)
## mth_since_last_serv < 6.5 to the right, improve= 80.65430, (0 missing)
## non_sched_serv_paid < 3.5 to the left, improve= 57.41369, (0 missing)
## age_of_vehicle_years < 6.5 to the right, improve= 56.40522, (0 missing)
## Surrogate splits:
## total_services < 2.5 to the left, agree=0.870, adj=0.651, (0 split)
## mth_since_last_serv < 2.5 to the left, agree=0.870, adj=0.650, (0 split)
## total_paid_services < 2.5 to the left, agree=0.847, adj=0.589, (0 split)
## num_dealers_visited < 3.5 to the left, agree=0.833, adj=0.551, (0 split)
## non_sched_serv_warr < 3.5 to the left, agree=0.831, adj=0.548, (0 split)
##
## Node number 4: 72545 observations
## predicted class=0 expected loss=0.005127852 P(node) =0.690451
## class counts: 72173 372
## probabilities: 0.995 0.005
##
## Node number 5: 68 observations
## predicted class=1 expected loss=0 P(node) =0.0006471937
## class counts: 0 68
## probabilities: 0.000 1.000
##
## Node number 6: 20358 observations
## predicted class=0 expected loss=0.02632872 P(node) =0.1937584
## class counts: 19822 536
## probabilities: 0.974 0.026
##
## Node number 7: 12098 observations, complexity param=0.05448029
## predicted class=0 expected loss=0.1499421 P(node) =0.1151434
## class counts: 10284 1814
## probabilities: 0.850 0.150
## left son=14 (8203 obs) right son=15 (3895 obs)
## Primary splits:
## mth_since_last_serv < 4.5 to the right, improve=686.3013, (0 missing)
## total_services < 2.5 to the right, improve=586.0275, (0 missing)
## total_paid_services < 1.5 to the right, improve=570.3556, (0 missing)
## age_of_vehicle_years < 1.5 to the right, improve=293.9976, (0 missing)
## num_dealers_visited < 2.5 to the right, improve=279.4856, (0 missing)
## Surrogate splits:
## age_of_vehicle_years < 3.5 to the right, agree=0.717, adj=0.120, (0 split)
## total_paid_services < 1.5 to the right, agree=0.701, adj=0.072, (0 split)
## total_services < 2.5 to the right, agree=0.698, adj=0.063, (0 split)
## num_dealers_visited < 2.5 to the right, agree=0.686, adj=0.025, (0 split)
## car_model splits as LLLLLLLLLR-LLLLLLLL, agree=0.678, adj=0.001, (0 split)
##
## Node number 14: 8203 observations, complexity param=0.01756272
## predicted class=0 expected loss=0.03389004 P(node) =0.0780725
## class counts: 7925 278
## probabilities: 0.966 0.034
## left son=28 (8134 obs) right son=29 (69 obs)
## Primary splits:
## total_paid_services < 1.5 to the right, improve=93.84841, (0 missing)
## total_services < 1.5 to the right, improve=78.80645, (0 missing)
## age_of_vehicle_years < 1.5 to the right, improve=72.02925, (0 missing)
## mth_since_last_serv < 5.5 to the right, improve=39.89778, (0 missing)
## num_dealers_visited < 2.5 to the right, improve=16.73143, (0 missing)
## Surrogate splits:
## total_services < 1.5 to the right, agree=0.993, adj=0.116, (0 split)
##
## Node number 15: 3895 observations, complexity param=0.05448029
## predicted class=0 expected loss=0.3943517 P(node) =0.03707088
## class counts: 2359 1536
## probabilities: 0.606 0.394
## left son=30 (2743 obs) right son=31 (1152 obs)
## Primary splits:
## total_services < 2.5 to the right, improve=446.7662, (0 missing)
## total_paid_services < 1.5 to the right, improve=264.6727, (0 missing)
## sched_serv_paid < 3.5 to the right, improve=263.4874, (0 missing)
## mth_since_last_serv < 2.5 to the right, improve=255.6778, (0 missing)
## non_sched_serv_warr < 5.5 to the right, improve=249.5261, (0 missing)
## Surrogate splits:
## non_sched_serv_warr < 3.5 to the right, agree=0.833, adj=0.435, (0 split)
## non_sched_serv_paid < 3.5 to the right, agree=0.830, adj=0.426, (0 split)
## total_paid_services < 2.5 to the right, agree=0.822, adj=0.398, (0 split)
## mth_since_last_serv < 1.5 to the right, agree=0.714, adj=0.033, (0 split)
## car_model splits as LLLLRLRLLL-LLLLLLLL, agree=0.705, adj=0.003, (0 split)
##
## Node number 28: 8134 observations
## predicted class=0 expected loss=0.02692402 P(node) =0.07741579
## class counts: 7915 219
## probabilities: 0.973 0.027
##
## Node number 29: 69 observations
## predicted class=1 expected loss=0.1449275 P(node) =0.0006567113
## class counts: 10 59
## probabilities: 0.145 0.855
##
## Node number 30: 2743 observations, complexity param=0.04229391
## predicted class=0 expected loss=0.2391542 P(node) =0.02610665
## class counts: 2087 656
## probabilities: 0.761 0.239
## left son=60 (2625 obs) right son=61 (118 obs)
## Primary splits:
## mth_since_last_serv < 1.5 to the right, improve=142.75840, (0 missing)
## num_dealers_visited < 2.5 to the right, improve= 91.35358, (0 missing)
## age_of_vehicle_years < 1.5 to the right, improve= 85.79846, (0 missing)
## sched_serv_warr < 2.5 to the right, improve= 76.66942, (0 missing)
## sched_serv_paid < 2.5 to the right, improve= 75.07761, (0 missing)
##
## Node number 31: 1152 observations, complexity param=0.02473118
## predicted class=1 expected loss=0.2361111 P(node) =0.01096422
## class counts: 272 880
## probabilities: 0.236 0.764
## left son=62 (103 obs) right son=63 (1049 obs)
## Primary splits:
## sched_serv_paid < 3.5 to the right, improve=81.12717, (0 missing)
## num_serv_dealer_purchased < 4.5 to the left, improve=50.37944, (0 missing)
## total_paid_services < 1.5 to the right, improve=49.25548, (0 missing)
## age_of_vehicle_years < 3.5 to the left, improve=42.71477, (0 missing)
## total_services < 1.5 to the right, improve=37.06786, (0 missing)
##
## Node number 60: 2625 observations, complexity param=0.01135006
## predicted class=0 expected loss=0.2049524 P(node) =0.02498358
## class counts: 2087 538
## probabilities: 0.795 0.205
## left son=120 (2263 obs) right son=121 (362 obs)
## Primary splits:
## num_dealers_visited < 2.5 to the right, improve=81.55303, (0 missing)
## age_of_vehicle_years < 1.5 to the right, improve=66.32133, (0 missing)
## sched_serv_warr < 2.5 to the right, improve=60.02577, (0 missing)
## sched_serv_paid < 2.5 to the right, improve=57.88734, (0 missing)
## total_services < 3.5 to the right, improve=56.96500, (0 missing)
## Surrogate splits:
## num_serv_dealer_purchased < 3.5 to the right, agree=0.910, adj=0.348, (0 split)
## car_model splits as LLLLLLLRLL-LLLLLLLL, agree=0.862, adj=0.003, (0 split)
## car_segment splits as LLRL, agree=0.862, adj=0.003, (0 split)
##
## Node number 61: 118 observations
## predicted class=1 expected loss=0 P(node) =0.001123072
## class counts: 0 118
## probabilities: 0.000 1.000
##
## Node number 62: 103 observations
## predicted class=0 expected loss=0.1650485 P(node) =0.0009803082
## class counts: 86 17
## probabilities: 0.835 0.165
##
## Node number 63: 1049 observations
## predicted class=1 expected loss=0.1773117 P(node) =0.009983915
## class counts: 186 863
## probabilities: 0.177 0.823
##
## Node number 120: 2263 observations, complexity param=0.01135006
## predicted class=0 expected loss=0.1551038 P(node) =0.02153823
## class counts: 1912 351
## probabilities: 0.845 0.155
## left son=240 (2225 obs) right son=241 (38 obs)
## Primary splits:
## age_of_vehicle_years < 1.5 to the right, improve=55.17912, (0 missing)
## sched_serv_paid < 2.5 to the right, improve=43.59277, (0 missing)
## sched_serv_warr < 2.5 to the right, improve=42.41872, (0 missing)
## num_serv_dealer_purchased < 6.5 to the left, improve=38.55943, (0 missing)
## total_services < 3.5 to the right, improve=32.83042, (0 missing)
##
## Node number 121: 362 observations, complexity param=0.01135006
## predicted class=1 expected loss=0.4834254 P(node) =0.003445355
## class counts: 175 187
## probabilities: 0.483 0.517
## left son=242 (255 obs) right son=243 (107 obs)
## Primary splits:
## age_of_vehicle_years < 3.5 to the left, improve=18.953940, (0 missing)
## total_services < 5.5 to the right, improve=16.235800, (0 missing)
## non_sched_serv_warr < 7.5 to the right, improve=15.901500, (0 missing)
## sched_serv_warr < 1.5 to the right, improve= 6.784471, (0 missing)
## car_model splits as LLLLR--RLL-LLLLLRR-, improve= 6.289823, (0 missing)
## Surrogate splits:
## non_sched_serv_warr < 9.5 to the left, agree=0.721, adj=0.056, (0 split)
## total_services < 7.5 to the left, agree=0.718, adj=0.047, (0 split)
## car_model splits as LLRLL--RLL-LLLLLLL-, agree=0.710, adj=0.019, (0 split)
##
## Node number 240: 2225 observations, complexity param=0.01027479
## predicted class=0 expected loss=0.1406742 P(node) =0.02117656
## class counts: 1912 313
## probabilities: 0.859 0.141
## left son=480 (1366 obs) right son=481 (859 obs)
## Primary splits:
## sched_serv_warr < 2.5 to the right, improve=36.54207, (0 missing)
## sched_serv_paid < 2.5 to the right, improve=36.42171, (0 missing)
## num_serv_dealer_purchased < 6.5 to the left, improve=33.14668, (0 missing)
## total_services < 3.5 to the right, improve=27.34574, (0 missing)
## mth_since_last_serv < 2.5 to the right, improve=23.41607, (0 missing)
## Surrogate splits:
## sched_serv_paid < 2.5 to the right, agree=0.819, adj=0.531, (0 split)
## annualised_mileage < 3.5 to the right, agree=0.641, adj=0.070, (0 split)
## car_model splits as LLRLLRR-RR-LLLLLLLL, agree=0.625, adj=0.029, (0 split)
##
## Node number 241: 38 observations
## predicted class=1 expected loss=0 P(node) =0.0003616671
## class counts: 0 38
## probabilities: 0.000 1.000
##
## Node number 242: 255 observations
## predicted class=0 expected loss=0.4117647 P(node) =0.002426977
## class counts: 150 105
## probabilities: 0.588 0.412
##
## Node number 243: 107 observations
## predicted class=1 expected loss=0.2336449 P(node) =0.001018378
## class counts: 25 82
## probabilities: 0.234 0.766
##
## Node number 480: 1366 observations
## predicted class=0 expected loss=0.06881406 P(node) =0.01300098
## class counts: 1272 94
## probabilities: 0.931 0.069
##
## Node number 481: 859 observations, complexity param=0.01027479
## predicted class=0 expected loss=0.2549476 P(node) =0.00817558
## class counts: 640 219
## probabilities: 0.745 0.255
## left son=962 (643 obs) right son=963 (216 obs)
## Primary splits:
## num_serv_dealer_purchased < 6.5 to the left, improve=35.97829, (0 missing)
## mth_since_last_serv < 2.5 to the right, improve=26.90772, (0 missing)
## non_sched_serv_warr < 6.5 to the right, improve=25.59131, (0 missing)
## total_services < 3.5 to the right, improve=19.35219, (0 missing)
## age_of_vehicle_years < 3.5 to the left, improve=17.44636, (0 missing)
## Surrogate splits:
## non_sched_serv_warr < 8.5 to the left, agree=0.797, adj=0.194, (0 split)
## total_services < 5.5 to the left, agree=0.787, adj=0.153, (0 split)
## age_of_vehicle_years < 9.5 to the left, agree=0.751, adj=0.009, (0 split)
##
## Node number 962: 643 observations
## predicted class=0 expected loss=0.1710731 P(node) =0.006119788
## class counts: 533 110
## probabilities: 0.829 0.171
##
## Node number 963: 216 observations, complexity param=0.01027479
## predicted class=1 expected loss=0.4953704 P(node) =0.002055792
## class counts: 107 109
## probabilities: 0.495 0.505
## left son=1926 (112 obs) right son=1927 (104 obs)
## Primary splits:
## total_services < 4.5 to the right, improve=67.04843, (0 missing)
## non_sched_serv_warr < 7.5 to the right, improve=66.83889, (0 missing)
## non_sched_serv_paid < 7.5 to the right, improve=13.37070, (0 missing)
## total_paid_services < 5.5 to the right, improve=12.08381, (0 missing)
## annualised_mileage < 8.5 to the right, improve=10.81422, (0 missing)
## Surrogate splits:
## non_sched_serv_warr < 7.5 to the right, agree=0.931, adj=0.856, (0 split)
## non_sched_serv_paid < 7.5 to the right, agree=0.667, adj=0.308, (0 split)
## total_paid_services < 5.5 to the right, agree=0.657, adj=0.288, (0 split)
## annualised_mileage < 6.5 to the right, agree=0.625, adj=0.221, (0 split)
## car_model splits as LLLRL----L-LRRRLLLL, agree=0.611, adj=0.192, (0 split)
##
## Node number 1926: 112 observations
## predicted class=0 expected loss=0.125 P(node) =0.001065966
## class counts: 98 14
## probabilities: 0.875 0.125
##
## Node number 1927: 104 observations
## predicted class=1 expected loss=0.08653846 P(node) =0.0009898257
## class counts: 9 95
## probabilities: 0.087 0.913
#predict on test data
rpart_predict <- predict(fit,testset[,-typeColNum],type="class")
#what's the tree size which results in the min cross validated error
opt <- which.min(fit$cptable[,"xerror"])
#value of the complexity parameter (alpha) for that gives a a tree
#of that size
cp <- fit$cptable[opt, "CP"]
#"prune" the tree using that value of the complexity parameter
pruned_model <- prune(fit,cp)
#plot pruned tree
prp(pruned_model)
#predictions from pruned model
rpart_pruned_predict <- predict(pruned_model,testset[,-typeColNum],type="class")
#Accuracy of pruned model
mean(rpart_pruned_predict==testset$Target)
## [1] 0.9834399
#confusion matrix (PRUNED model)
table(pred=rpart_pruned_predict,true=testset$Target)
## true
## pred 0 1
## 0 25479 377
## 1 58 354
multiple_runs_rpart <- function(df,class_variable_name,train_fraction,nruns){
#Purpose:
#Builds rpart model for nrun data partitions
#Return value:
#Vector containing nrun accuracies
#Arguments:
#df: variable containing dataframe
#class_variable_name: class name as a quoted string. e.g. "Class"
#train_fraction: fraction of data to be assigned to training set (0<train_fraction<1)
#nruns: number of data partitions
#find column index of class variable
typeColNum <- grep(class_variable_name,names(df))
#initialize accuracy vector
accuracies <- rep(NA,nruns)
#set seed (can be any integer)
set.seed(1)
for (i in 1:nruns){
#partition data
trainset_size <- floor(train_fraction * nrow(df))
trainset_indices <- sample(seq_len(nrow(df)), size = trainset_size)
trainset <- df[trainset_indices, ]
testset <- df[-trainset_indices, ]
#build model
#paste builds formula string and as.formula interprets it as an R formula
rpart_model <- rpart(as.formula(paste(class_variable_name,"~.")),data = trainset, method="class")
#predict on test data
rpart_predict <- predict(rpart_model,testset[,-typeColNum],type="class")
#accuracy
accuracies[i] <- mean(rpart_predict==testset[[class_variable_name]])
}
return(accuracies)
}
#calculate average accuracy and std dev over 30 random partitions
accuracy_results_unpruned <- multiple_runs_rpart(repurchase_tree,"Target",0.8,30)
mean(accuracy_results_unpruned)
## [1] 0.9999987
sd(accuracy_results_unpruned)
## [1] 6.950441e-06
#pruned...
#source("multiple_runs_rpart_pruned.R")
multiple_runs_rpart_pruned <- function(df,class_variable_name,train_fraction,nruns){
#Purpose:
#Builds rpart pruned model for nrun data partitions.
#Return value:
#Vector containing nrun accuracies
#Arguments:
#df: variable containing dataframe
#class_variable_name: class name as a quoted string. e.g. "Class"
#train_fraction: fraction of data to be assigned to training set (0<train_fraction<1)
#nruns: number of data partitions
typeColNum <- grep(class_variable_name,names(df))
accuracies <- rep(NA,nruns)
set.seed(1)
for (i in 1:nruns){
trainset_size <- floor(train_fraction * nrow(df))
trainset_indices <- sample(seq_len(nrow(df)), size = trainset_size)
trainset <- df[trainset_indices, ]
testset <- df[-trainset_indices, ]
#build model
#paste builds formula string and as.formula interprets it as an R formula
rpart_model <- rpart(as.formula(paste(class_variable_name,"~.")),data = trainset, method="class")
#prune
opt <- which.min(rpart_model$cptable[,"xerror"])
cp <- rpart_model$cptable[opt, "CP"]
pruned_model <- prune(rpart_model,cp)
#predictions of PRUNED model
rpart_pruned_predict <- predict(pruned_model,testset[,-typeColNum],type="class")
#accuracy
accuracies[i] <- mean(rpart_predict==testset[[class_variable_name]])
}
return(accuracies)
}
#calculate average accuracy and std dev over 30 random partitions
accuracy_results_pruned <- multiple_runs_rpart_pruned(repurchase_tree,"Target",0.8,30)
mean(accuracy_results_pruned)
## [1] 0.9833777
#getthe sd of pruned model
sd(accuracy_results_pruned)
## [1] 0.0001911273
#Build random forest model
repurchase_tree.rf <- randomForest(Target ~.,data = trainset,
importance=TRUE, xtest=testset[,-typeColNum],ntree=1000)
#model summary
summary(repurchase_tree.rf)
## Length Class Mode
## call 6 -none- call
## type 1 -none- character
## predicted 105069 factor numeric
## err.rate 3000 -none- numeric
## confusion 6 -none- numeric
## votes 210138 matrix numeric
## oob.times 105069 -none- numeric
## classes 2 -none- character
## importance 56 -none- numeric
## importanceSD 42 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 0 -none- NULL
## y 105069 factor numeric
## test 5 -none- list
## inbag 0 -none- NULL
## terms 3 terms call
#variables contained in model
names(repurchase_tree.rf)
## [1] "call" "type" "predicted"
## [4] "err.rate" "confusion" "votes"
## [7] "oob.times" "classes" "importance"
## [10] "importanceSD" "localImportance" "proximity"
## [13] "ntree" "mtry" "forest"
## [16] "y" "test" "inbag"
## [19] "terms"
#predictions for test set
test_predictions_rf <- data.frame(testset,repurchase_tree.rf$test$predicted)
#accuracy for test set
mean(repurchase_tree.rf$test$predicted==testset$Target)
## [1] 1
#confusion matrix
table(repurchase_tree.rf$test$predicted,testset$Target)
##
## 0 1
## 0 25537 0
## 1 0 731
#quantitative measure of variable importance
importance(repurchase_tree.rf)
## 0 1 MeanDecreaseAccuracy
## ID 173.730938 263.879970389 248.557424
## car_model 4.904517 1.636390968 4.965468
## car_segment 1.636027 -0.003547715 1.638161
## age_of_vehicle_years 22.334638 10.957448568 22.567144
## sched_serv_warr 17.400935 14.896001976 17.716216
## non_sched_serv_warr 12.040156 12.156117316 12.171405
## sched_serv_paid 13.795168 9.204049295 13.942654
## non_sched_serv_paid 10.753038 7.562621406 10.824218
## total_paid_services 11.089435 7.414215278 11.135195
## total_services 20.461936 14.029378479 20.645160
## mth_since_last_serv 26.849001 18.045049849 27.017357
## annualised_mileage 20.851312 7.271398160 20.874954
## num_dealers_visited 12.835700 13.821226152 13.057339
## num_serv_dealer_purchased 18.945624 13.915429729 19.372926
## MeanDecreaseGini
## ID 4322.335453
## car_model 19.897346
## car_segment 3.056903
## age_of_vehicle_years 97.570276
## sched_serv_warr 99.390696
## non_sched_serv_warr 46.756112
## sched_serv_paid 82.384973
## non_sched_serv_paid 33.501232
## total_paid_services 51.981259
## total_services 118.114533
## mth_since_last_serv 202.354581
## annualised_mileage 153.450574
## num_dealers_visited 57.635085
## num_serv_dealer_purchased 137.892507
#sorted plot of importance
varImpPlot(repurchase_tree.rf)
#Homework: estimate average accuracy for a randomForest model on this dataset. You will need to
#write a function to do this.
AccuracyRF <- function(df,class_variable_name,train_fraction,nruns){
#Purpose:
#Builds rpart model for nrun data partitions
#Return value:
#Vector containing nrun accuracies
#Arguments:
#df: variable containing dataframe
#class_variable_name: class name as a quoted string. e.g. "Class"
#train_fraction: fraction of data to be assigned to training set (0<train_fraction<1)
#nruns: number of data partitions
#Of all the names in the dataframe find me the one that matches class_variable_names and assign
#its index number to typeColNum
typeColNum <- grep(class_variable_name,names(df))
#Creating an empty vector with a size of nruns (the number of runs to test average accuracy)
accuracies <- rep(NA,nruns)
set.seed(1)
#this loop finds the mean of the testset for each run and adding it to the accuracies vector
for (i in 1:nruns) {
#this determines the size of the training set based on the train_fraction
trainset_size <- floor(train_fraction * nrow(df))
#samples a number of rows and assigns it to trainset_indices according to the trainset_size
trainset_indices <- sample(seq_len(nrow(df)), size = trainset_size)
#the indices defined above beciome part of the trainset
trainset <- df[trainset_indices, ]
#the negative of the line above becomes part of the testset
testset <- df[-trainset_indices, ]
#rpart_model will model us the trainset data using the classification method
randomForest_model <- randomForest(as.formula(paste(class_variable_name,"~.")),data = trainset,
importance=TRUE, xtest=testset[,-typeColNum],ntree=1000)
#predict on test data
accuracies[i] <- mean(randomForest_model$test$predicted==testset$Type)
}
return(accuracies)
}
THIS IS THE BIT TO LOOK AT TO FIX - KEEPS GIVING is.na() applied to non-(list or vector) of type null error
#Accuracy <- AccuracyRF(repurchase_tree,"Target",0.8,1)
#mean(Accuracy)
#sd(Accuracy)
#run the EstAveAccuracy function and assign the results to accuracy_results
#accuracy_results <- EstAveAccuracy(repurchase_tree,"Type",0.8,30)
#find mean of accuracy_results
#mean(accuracy_results)
#find standard deviation of accuracy_results
#sd(accuracy_results)