Loading NetGuard Plus Data
str(insured_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 18385 obs. of 9 variables:
## $ InsuredID : chr "574" "748" "6798" "6922" ...
## $ InsuredName : chr "Autohaus Automotive" "Friendly Holding Company" "Lou Bachrodt Chevrolet Co." "Superstore Automotive, Inc." ...
## $ InsuredState : chr "IL" "MI" "IL" "MN" ...
## $ BusinessClass : chr "Auto Dealers" "Auto Dealers" "Auto Dealers" "Auto Dealers" ...
## $ InsuredRegion : chr "Mid West" "Mid West" "Mid West" "Mid West" ...
## $ ClaimStatus : num 0 0 0 0 0 0 0 1 0 0 ...
## $ BusinessRevenue : chr "140000000" "200000000" "80000000" "78164902" ...
## $ GrossPremium : num 3045 25342 8887 7775 39931 ...
## $ DiscretionaryCredit: num 0 0 0 0 0 ...
NetGuard Plus Data (2015 - 2019)
- 18,385 observations with 9 variables:
- Insured ID
- Insured Name
- Insured State
- Business Class
- Insured Region
- Claim Status
- Business Revenue
- Gross Premium
- Discretionary Credit
What we’ll do to the raw Insured data:
- Change the data types.
- Example: Changing “Character” to “Factor”.
- Create new fields using existing fields.
- Example: Revenue Band field.
- Delete outliers (numerical fields only).
- Example: Gross Premium & Discretionary Credit.
##:::::::::::::::::::::::::::::::::
## Change Data Type
##:::::::::::::::::::::::::::::::::
# change data type to factors.
insured_data$InsuredState <- as.factor(insured_data$InsuredState)
insured_data$BusinessClass <- as.factor(insured_data$BusinessClass)
insured_data$InsuredRegion <- as.factor(insured_data$InsuredRegion)
insured_data$ClaimStatus <- as.factor(insured_data$ClaimStatus)
# change data type to numeric.
insured_data$BusinessRevenue <- as.numeric(insured_data$BusinessRevenue)
## Warning: NAs introduced by coercion
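The coercion warning means some BusinessRevenue strings were not valid numbers and became NA. A quick sanity-check sketch of how many values that affects (these observations later fall into the "6. Undefined" revenue band):
# Count how many revenue values are NA after the conversion
sum(is.na(insured_data$BusinessRevenue))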
##:::::::::::::::::::::::::::::::::
## Create new fields
##:::::::::::::::::::::::::::::::::
# Create a new variable in the "insured_data" data set called "RevenueBand".
# Use the *rep()* function to fill it with NA, with length equal to the Business Revenue column.
insured_data$RevenueBand <- rep(NA, length(insured_data$BusinessRevenue))
# Categorize each observation into buckets (groups).
insured_data$RevenueBand[which(insured_data$BusinessRevenue <= 10000000)] <- "1. $0M - 10M"
insured_data$RevenueBand[which(insured_data$BusinessRevenue > 10000000 & insured_data$BusinessRevenue <= 50000000)] <- "2. >$10M - 50M"
insured_data$RevenueBand[which(insured_data$BusinessRevenue > 50000000 & insured_data$BusinessRevenue <= 250000000)] <- "3. >$50M - 250M"
insured_data$RevenueBand[which(insured_data$BusinessRevenue > 250000000 & insured_data$BusinessRevenue <= 1000000000)] <- "4. >$250M - $1B"
insured_data$RevenueBand[which(insured_data$BusinessRevenue > 1000000000)] <- "5. Over $1B"
insured_data$RevenueBand[which(is.na(insured_data$BusinessRevenue))] <- "6. Undefined"
#change data type as Factor.
insured_data$RevenueBand <- as.factor(insured_data$RevenueBand)
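As a side note, the same banding can be written more compactly with base R's cut(). This is just an alternative sketch (the name revenue_band_alt is illustrative), not the approach used above:
# cut() with right-closed intervals reproduces the <= boundaries used above
revenue_band_alt <- cut(insured_data$BusinessRevenue,
                        breaks = c(-Inf, 10e6, 50e6, 250e6, 1e9, Inf),
                        labels = c("1. $0M - 10M", "2. >$10M - 50M", "3. >$50M - 250M",
                                   "4. >$250M - $1B", "5. Over $1B"))
# Label the NAs the same way as the manual approach
revenue_band_alt <- as.character(revenue_band_alt)
revenue_band_alt[is.na(revenue_band_alt)] <- "6. Undefined"
revenue_band_alt <- as.factor(revenue_band_alt)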
##:::::::::::::::::::::::::::::::::
## Eliminate outliers
##:::::::::::::::::::::::::::::::::
#Gross Premium - Outlier Rule of Thumb
#First, create the cutoff for Gross Premium
GrossPremium_cutoff <- quantile(insured_data$GrossPremium, 0.75) + 1.5 * IQR(insured_data$GrossPremium)
#Second, create a group of the outliers that meet the cut-off criteria and save it in a variable: GrossPremium_ROT
GrossPremium_ROT <- which(insured_data$GrossPremium> GrossPremium_cutoff)
#Third, delete the outlier group from the data set and rename the data set. Repeat this for Discretionary Credit.
insured_data_ROT1 <- insured_data[-GrossPremium_ROT, ]
#Discretionary Credit
DiscretionaryCredit_cutoff <- quantile(insured_data_ROT1$DiscretionaryCredit, 0.75) + 1.5 * IQR(insured_data_ROT1$DiscretionaryCredit)
DiscretionaryCredit_ROT <- which(insured_data_ROT1$DiscretionaryCredit> DiscretionaryCredit_cutoff )
insured_data_ROT2 <- insured_data_ROT1[-DiscretionaryCredit_ROT, ]
##:::::::::::::::::::::::::::::::::
## Evaluate updated data set
##:::::::::::::::::::::::::::::::::
str(insured_data_ROT2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 12281 obs. of 10 variables:
## $ InsuredID : chr "574" "748" "6798" "6922" ...
## $ InsuredName : chr "Autohaus Automotive" "Friendly Holding Company" "Lou Bachrodt Chevrolet Co." "Superstore Automotive, Inc." ...
## $ InsuredState : Factor w/ 52 levels "1","AK","AL",..: 16 24 16 25 26 24 37 37 24 37 ...
## $ BusinessClass : Factor w/ 232 levels "Accounting, Auditing, and Bookkeeping",..: 15 15 15 15 15 15 15 232 15 15 ...
## $ InsuredRegion : Factor w/ 4 levels "Mid West","North East",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ ClaimStatus : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 2 ...
## $ BusinessRevenue : num 1.40e+08 2.00e+08 8.00e+07 7.82e+07 1.64e+08 ...
## $ GrossPremium : num 3045 25342 8887 7775 39931 ...
## $ DiscretionaryCredit: num 0 0 0 0 0 0 0 -41.2 0 0 ...
## $ RevenueBand : Factor w/ 6 levels "1. $0M - 10M",..: 3 3 3 3 3 3 3 2 4 3 ...
Before: 18,385 observations with 9 variables
After: 12,281 observations with 10 variables
Delta: 18,385 - 12,281 = 6,104
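A quick hedged check that this delta matches the number of rows dropped by the two outlier rules:
# Rows removed by the Gross Premium and Discretionary Credit outlier filters (should equal 6,104)
nrow(insured_data) - nrow(insured_data_ROT2)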
Split clean data set into Training set & Test set.
# Set seed of 123
set.seed(123)
# Store row numbers for training set: index_train
index_train <- sample(1:nrow(insured_data_ROT2), 2/3*nrow(insured_data_ROT2))
# Create training set: training_set
training_set <- insured_data_ROT2[index_train, ]
# Create test set: test_set
test_set <- insured_data_ROT2[-index_train, ]
Decision Tree
A decision tree is a form of supervised machine learning used to solve regression and classification problems.
Imagine you're playing a game of Twenty Questions.
The point of the game is to guess the subject that your opponent secretly chose.
But since we're limited to 20 questions, we want to ask an effective, broad question first, and then ask more specific questions afterward to narrow down our options.
For example, for the first question, it's better to ask "Is it alive?" instead of "Is it tall?".
- Key point: Some questions have a higher predictive power than others.
This is what the decision tree does for us. We’re going to have R take our clean data set and choose the question that provides the best split. And it’ll keep doing this until all points are considered, and we can easily identify our answer.
The Decision tree structure
All decision trees start with a Root Node, which contains all of the observations. The Root Node is split into two sub-nodes (Decision Nodes), and each of those keeps splitting until the splitting stops.
The last node is called the Terminal Node or a Leaf.
Decision Tree Definitions:
- Root Node: represents the entire population or sample. It gets divided into two or more homogeneous sets.
- Splitting: the process of dividing a node into two or more sub-nodes.
- A sub-node becomes one of two things:
- Decision Node: a sub-node that continues splitting.
- Terminal Node (or Leaf): a sub-node that stops splitting.
- Pruning: removing sub-nodes. (The opposite of pruning is splitting.)
- Branch: a sub-section of the entire tree.
- A node that is divided into sub-nodes is called the parent node, and its sub-nodes are called child nodes.
How are Splitting Decisions made?
Now that we know what a decision tree looks like, let's get into how the decision tree model decides on a decision node and where to split it.
How are decision nodes decided?
- For example, how do we know if “Insured Region = Midwest” is a better decision node for our model compared to “Insured Region = Northeast”?
Answer: We need to calculate the gini impurity & purity gain.
Gini Impurity & Purity Gain
The Gini impurity (also known as the Gini measure) measures the likelihood of an incorrect classification for a specific split in a decision node.
It measures the impurity (inaccuracy) that remains when we split the data set.
Gini Impurity = 0 (perfect split)
The Gini impurity value is at its lowest, zero, when there is no random variation left within the split.
- In other words, when each group produced by the split contains only one class.
- Example: 10 observations, split into one group that contains only claims and another that contains only non-claims.
Using the Gini impurity values, we can measure the Purity Gain.
The purity gain represents how much the decision node improves classification overall: the root's impurity minus the impurity that remains after the split.
Purity gain is highest for a perfect split.
The purity gain value is at its maximum when the decision node's splits (three in total: the root and its two sub-nodes) leave no random variation.
The idea is to calculate the purity gain for every possible case and choose the decision node with the highest (maximum) purity gain.
- The decision node with the highest purity gain will have the lowest remaining Gini impurity.
Before getting into calculating the Gini Impurity & Purity Gain, let’s first go over how to understand a decision node:
Example Decision Node: Insured Region = Midwest
- Total # of observations = 500
- Yes, Insured Region = Midwest: 120 observations
- No, Insured Region is NOT Midwest: 380 observations
- (120 + 380 = 500 observations)
- Out of the 120 "Yes" observations:
- 100 observations have NO claim.
- 20 observations have a past claim.
- Out of the 380 "No" observations:
- 300 observations have NO claim.
- 80 observations have a past claim.
Step-by-step: Calculate Impurity (gini measure) and the Purity gain:
Now that we understand the decision node, let’s calculate the Gini Impurity & Purity Gain.
Remember, we’ll be using the Gini Impurity value from each split in the decision node to calculate the Purity Gain.
Step 1) Calculate the Gini Impurity for all three splits within the decision node.
Gini Impurity Equation
Gini Impurity = \(2 * proportion( ClaimCount ) * proportion( NoClaimCount )\)
- Gini Root Split_1 = 2 * (120 / 500) * (380 / 500)
- = 0.3648
- Gini Node Split_2 = 2 * (100 / 400) * (300 / 400)
- = 0.375
- Gini Node Split_3 = 2 * (20 / 100) * (80 / 100)
- = 0.32
Step 2) Calculate the Purity Gain using the three Gini Impurity values.
Purity Gain Equation:
Purity Gain = \(Gini_{RootS1} - prop(S_2) \cdot Gini_{NodeS2} - prop(S_3) \cdot Gini_{NodeS3}\), where \(prop(S_i)\) is the share of all observations that fall into split \(i\).
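Plugging the example's numbers into these formulas gives the following sketch of the arithmetic (the weights are each sub-split's share of the 500 observations; variable names are just for illustration):
# Gini impurity of the three splits from the example decision node above
gini_root_s1 <- 2 * (120 / 500) * (380 / 500)   # 0.3648
gini_node_s2 <- 2 * (100 / 400) * (300 / 400)   # 0.375
gini_node_s3 <- 2 * (20 / 100) * (80 / 100)     # 0.32
# Purity gain = root impurity minus the weighted impurity remaining after the split
purity_gain <- gini_root_s1 - (400 / 500) * gini_node_s2 - (100 / 500) * gini_node_s3
purity_gain   # 0.0008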
Building Decision Tree using rpart
Imagine if we had to calculate the Gini impurity and purity gain for every possible split by hand; that would take an unimaginable amount of time.
This is where the rpart library comes in.
The rpart library uses a predictive technique called CART (classification and regression trees) and a special algorithm called RPART (Recursive Partitioning And Regression Trees) to calculate and generate a decision tree.
Fact: the inventors (Leo Breiman and colleagues) trademarked the technique as "CART", which is why the R implementation goes by the algorithm name, "RPART".
Just as we learned before, the rpart algorithm works by splitting the dataset recursively, meaning the splitting continues until a predetermined stopping criterion is reached.
Splitting rules are constructed based on the notion of impurity - a measure of the degree of heterogeneity of the leaf nodes. In other words, if a leaf node had no variation and contained only a single class, impurity = 0.
rpart supports two impurity-based criteria for evaluating splits: Entropy and the Gini index.
- Gini impurity and Entropy are both standard metrics for computing "impurity" or "information level".
- They both measure the equality or randomness of the classes when the dataset is split at a node.
The formulas below show that Gini impurity and Entropy are computed in a similar fashion. However, Gini impurity requires much less computation because no logarithmic function is involved. That's the main difference!
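For reference, these are the standard textbook forms, for a node with class proportions \(p_i\):
- Gini Impurity = \(1 - \sum_i p_i^2\), which for two classes reduces to \(2p(1-p)\), the form used in the worked example above.
- Entropy = \(-\sum_i p_i \log_2 p_i\)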
rpart’s warnings!!!
We just learned how useful rpart can be, but there are some pitfalls we’ll run into very fast.
Problem 1) Unbalanced data
One of the main reasons is that our insurance data is extremely unbalanced, due to the low percentage of policies with past claims.
Let’s check out our data, specifically our dependent variable: ClaimStatus.
## 0 1
## 7727 460
## [1] 0.05953151
With less than 6% of past claims in our data set, rpart produces a tree that never splits, so plotting it throws an error instead of drawing a decision tree.
- rpart behaves this way because it recognizes that it achieves the highest accuracy by simply predicting all cases to be "No Claim".
- Similar to what we observed when setting a cut-off in the logistic regression model, the more unbalanced our data set is, the less predictive power the model will have.
library(rpart)
fit_claim <- rpart(ClaimStatus ~ ., method = "class", data = training_set)
plot(fit_claim)
Problem #2) Large & complex decision tree
Along with our problem with unbalanced data, we also face a challenge of simplifying our decision tree.
When decision trees are too complex, they start looking like a spider web, and we call this “over-fitting”.
Over-fitted decisions trees are not only impossible to read, but they produce inferior (insignificant) results when applying the model to the test set.
The best process to simplify an over-fitted decision tree is called pruning.
Lets take it step by step.
We'll first go over how to overcome unbalanced data, then tackle pruning a decision tree in the next section.
Overcoming unbalanced data
There are 3 main techniques used to overcome an unbalanced data set (one where policies with past claims make up a very small share of the data).
- Undersampling or Oversampling
- We can oversample the under-represented group (policies with claims) OR undersample the over-represented group (policies without claims).
- Balancing the training set will have a positive effect on accuracy.
- Note: over/under sampling should ONLY be applied to the training_set (NOT the test_set).
- Changing the Prior Probabilities
- In this technique, we choose our own prior probability proportion of "Policies with Claim" vs. "Policies with No Claim" instead of the proportion observed in the training set.
- Include a Loss Matrix
- When we include a loss matrix in the rpart function, we can have rpart place more weight on the misclassification of "Policies with Claim" (1) as "Policies with No Claim" (0) and penalize it more heavily.
- By increasing the penalty cost of that misclassification, more attention is given to the correct classification of "Policies with Claim" (1).
Lets build a decision tree using each of these three techniques.
Note: We’ll compare and validate each of the models at the end and choose the best.
1. Undersample & Oversample:
Question: Should we oversample or undersample?
- Oversampling is a great technique when we lack detailed data and more has yet to be collected.
- Increases the "Policies with Past claim" count ("Policies with No claim" stays the same).
- Undersampling is used when there's an overabundance of collected data, and we see a lot of this when working with "big data".
- Decreases the "Policies with No claim" count ("Policies with Past claim" stays the same).
For our claims data, we’ll create both an oversample & undersample decision tree and evaluate the two models in the next section.
What we’ll do next:
Step 1. Create an oversample & undersample training set.
Step 2. Construct a decision tree for both oversampled & undersampled training set.
Step 3. Plot both trees.
Note: In the next section, we will simplify and prune the decision tree.
Step 1) create oversample & undersample training set:
Oversample Training Set
# Oversample Training Set
# a. Load the **ROSE** library, and apply ovun.sample() function.
# + Define the dependent variable and include columns from training set.
# + method = "over" for oversampling
# + Define the % amount of "Policies with past claim" we want in our oversampled data set.
# + Make sure to set a seed to make the randomly generated result reproducible.
library(ROSE)
## Warning: package 'ROSE' was built under R version 3.6.3
## Loaded ROSE 0.0-3
oversampled_result <- ovun.sample(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit,
data = training_set,
method = "over",
p = 0.25,
seed = 1234)
oversampled_training_set <- oversampled_result$data
table(oversampled_training_set$ClaimStatus)
##
## 0 1
## 7727 2545
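A quick hedged check that the oversampled set now contains roughly the requested 25% of policies with a past claim:
# Proportion of each ClaimStatus class after oversampling (2545 / 10272 is about 0.25)
prop.table(table(oversampled_training_set$ClaimStatus))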
Undersample Training Set
1a. Identify the observation counts for the "ClaimStatus" field in the training set.
1b. Define & save the percentage amount of “Policies with past claim” we want in our undersampled data set.
1c. Create an undersample dataset using the defined percentage amount of “Policies with past claim”.
# Undersample Training Set
# a. Use the table() function to identify how many "Policies with past claim" & "Policies with no claim" are in the training_set.
# b. Save the "Policies with past claim" count in a variable: n_claim_status.
table(training_set$ClaimStatus)
##
## 0 1
## 7727 460
#0 (No claim) = 7,727
#1 (Yes Claim) = 460
n_claim_status <- 460
# c. Save the % amount of "Policies with past claim" we want in our undersampled data set in variable: new_frac_claim_status, and then divide "n_claim_status" by "new_frac_claim_status", and save the divided value in variable: new_n_total.
new_frac_claim_status <- 0.25
new_n_total <- n_claim_status / new_frac_claim_status
# d. Load the **ROSE** library, and apply ovun.sample() function.
# + Define the dependent variable and include columns from training set.
# + N = new_n_total
# + Make sure to set a seed to make the randomly generated result reproducible.
library(ROSE)
undersampled_result <- ovun.sample(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit,
data = training_set,
method = "under",
N = new_n_total,
seed = 1234)
# d. Save the undersampled dataset in variable: undersampled_training_set, and finally use table() function to check values.
undersampled_training_set <- undersampled_result$data
table(undersampled_training_set$ClaimStatus)
##
## 0 1
## 1380 460
Step 2) Construct a decision tree
- Load the rpart library & construct decision tree for both Oversample training set and Undersample training set.
# Load package rpart in your workspace.
library(rpart)
# Oversample Decision Tree
# Construct decision tree using the oversampled training set. Include "rpart.control" to relax the complexity parameter to 0.001.
tree_oversample <- rpart(ClaimStatus ~ .
, method = "class"
, data = oversampled_training_set
, control = rpart.control(cp = 0.001))
# Undersample Decision Tree
# Construct decision tree using the undersampled training set. Include "rpart.control" to relax the complexity parameter to 0.001.
tree_undersample <- rpart(ClaimStatus ~ .
, method = "class"
, data = undersampled_training_set
, control = rpart.control(cp = 0.001))
Step 3) Plot decision tree.
- Use the plot() function to plot and the text() function to add labels to the decision tree.
#Oversample Decision Tree
# Plot the decision tree
plot(tree_oversample, uniform = TRUE)
# Add labels to the decision tree
text(tree_oversample, use.n = TRUE)
#Undersample Decision Tree
# Plot the decision tree
plot(tree_undersample, uniform = TRUE)
# Add labels to the decision tree
text(tree_undersample, use.n = TRUE)
2. Changing prior probabilities
Prior probability: the proportion of events and non-events in an unbalanced data set.
Changing the prior probabilities in a data set indirectly adjusts the importance of incorrectly classifying observations in each class.
By making the prior probability for "Policies with past claims" bigger, we put more importance on predictions that turn out to be "False Positive" or "False Negative" (i.e., misclassifications).
How to change prior probabilities:
Applying prior probabilities in rpart is very easy.
We use parms, an additional argument inside rpart(), and this argument specifically deals with unbalanced class sizes.
Inside the parms argument, we define the prior proportions in the order of the ClaimStatus factor levels: (proportion of "Policies with No Claim" (0), proportion of "Policies with Past Claim" (1)).
- For our example below, we'll use a proportion of (70% No Claim, 30% Past Claim), which raises the claim prior well above the roughly 6% observed in the training set.
- Note: the prior probabilities should always sum up to 1.
# Load package rpart in your workspace.
library(rpart)
#Step 1) Add the parms argument.
#- The parms argument has the following form:
# + parms = list(prior = c(prop_PoliciesWithNoClaim, prop_PoliciesWithPastClaim)), in the order of the factor levels "0", "1".
# * parms = list(prior = c(0.7, 0.3)).
# * Note: the prior probabilities should always sum up to 1.
# + Include control = rpart.control(cp = 0.001).
# + Save the decision tree in variable: tree_prior.
tree_prior <- rpart(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit
, method = "class"
, data = training_set
, parms = list(prior = c(0.7, 0.3))
, control = rpart.control(cp = 0.001))
#Step 2) Plot the decision tree
plot(tree_prior, uniform = TRUE)
# Add labels to the decision tree
text(tree_prior)
3. Including a Loss Matrix
Our third technique of overcoming unbalanced data is including a Loss Matrix (also known as Cost Matrix) in the rpart() function.
Instead of changing the prior probabilities, we use the parms argument to define the penalization cost of each value in the loss matrix.
By increasing the misclassification costs of the False Negative (Actual data has Past Claim but Model predicted No Claim), more attention is given to the correct classification of True Positive and True Negative.
Defining a penalization cost with a loss matrix improves the quality of the decision tree.
| Loss Matrix | Model Prediction | Actual (test_set) |
|---|---|---|
| True Positive | 1 (Claim) | 1 (Claim) |
| False Positive | 1 (Claim) | 0 (No Claim) |
| False Negative | 0 (No Claim) | 1 (Claim) |
| True Negative | 0 (No Claim) | 0 (No Claim) |
For our example below, let's make the penalty for misclassifying an "Actual Claim as No Claim" 10 times bigger.
Misclassifying an "Actual claim" as "No claim" is called a False Negative, and it is the second element of the loss vector below.
- The parms argument has the following form:
- parms = list(loss = matrix(c(TN, FN, FP, TP), ncol = 2)), filled column-wise, with rows for the actual classes ("0", "1") and columns for the predicted classes.
- Example: parms = list(loss = matrix(c(0, 10, 1, 0), ncol = 2))
#- Adjust the penalty for misclassifying an "Actual Claim as No Claim" to be 10 times bigger.
# + parms = list(loss = matrix(c(TN, FN, FP, TP), ncol = 2))
# + parms = list(loss = matrix(c(0, 10, 1, 0), ncol = 2))
tree_loss_matrix <- rpart(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit
, method = "class"
, data = training_set
, parms = list(loss = matrix(c(0, 10, 1, 0), ncol=2))
, control = rpart.control(cp = 0.001))
# Plot the decision tree
plot(tree_loss_matrix, uniform = TRUE)
# Add labels to the decision tree
text(tree_loss_matrix)
Pruning the Decision Tree
Problem with large decision trees:
- Too complex, cannot read
- Overfitting when applying to test set
When a decision tree is too hard to read and looks more like a spiderweb than a tree, that is a sign of "overfitting".
Overfitting is a modeling error that occurs when a function is too closely fit to a limited set of data points. An overfitted model describes the random error in the data, rather than the relationship between the variables.
- Overfitting occurs when the model is too complex.
Solution: use printcp() and plotcp() for pruning the decision tree.
The rpart package provides tools that make pruning the decision tree fairly easy.
We'll be using the functions printcp() and plotcp() for pruning purposes.
Both printcp() and plotcp() list the possible values of the complexity parameter.
Cp (complexity parameter)
The complexity parameter (Cp) in rpart is the minimum improvement in the model needed at each node.
Just think of Cp value as the minimum benefit that a split must add to the tree. If the split doesn’t yield at least that much benefit (the value of cp), rpart() won’t add an additional node.
The Cp defines how large (complex) the decision tree will be.
The default Cp value is 0.01.
When Cp = 0.00, the decision tree has no restrictions on what a split must add, and it’ll produce the most complex tree possible.
So, what is the best Cp value?
- We want to choose the Cp value that produces the lowest amount of cross validation error.
Cross-Validation is a technique used in model selection to better estimate how our decision tree will perform.
The idea behind cross-validation is to create a number of partitions of sample observations, known as the validation sets, from the training data set, then we measure the performance against each validation set, and then calculate the average error. It gives us a better assessment of how the model will perform when asked to predict for new observations.
The functions printcp() and plotcp() will help us validate and identify the best Cp value for our model.
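rpart runs this cross-validation automatically while growing the tree; the number of folds is controlled by the xval argument of rpart.control() (10 by default). A small sketch of making that explicit:
# 10-fold cross-validation (the default) alongside the relaxed complexity parameter
rpart.control(cp = 0.001, xval = 10)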
printcp()
printcp() function will generate a list of Cp values, and we use this list to find the value that has the least amount of cross-validated error (xerror).
The cp value with the least amount of cross-validated error (xerror) will generate a decision tree with the most efficient amount of decision nodes for our data set.
For our example below, we’ll print the cp list of “tree_prior” (changing prior probability decision tree).
##
## Classification tree:
## rpart(formula = ClaimStatus ~ InsuredRegion + GrossPremium +
## RevenueBand + DiscretionaryCredit, data = training_set, method = "class",
## parms = list(prior = c(0.7, 0.3)), control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] DiscretionaryCredit GrossPremium InsuredRegion
## [4] RevenueBand
##
## Root node error: 2456.1/8187 = 0.3
##
## n= 8187
##
## CP nsplit rel error xerror xstd
## 1 0.0218068 0 1.00000 1.00000 0.045296
## 2 0.0206209 2 0.95639 1.03215 0.041910
## 3 0.0137033 3 0.93577 1.00662 0.039494
## 4 0.0050692 5 0.90836 1.00571 0.040510
## 5 0.0049238 7 0.89822 0.99792 0.040050
## 6 0.0038970 9 0.88837 0.98228 0.039835
## 7 0.0029890 11 0.88058 0.97485 0.039931
## 8 0.0028967 14 0.87074 0.98203 0.040140
## 9 0.0025609 15 0.86784 0.98843 0.040448
## 10 0.0025208 26 0.83408 0.98384 0.040645
## 11 0.0024764 35 0.81077 0.98843 0.040947
## 12 0.0023757 38 0.80334 0.98982 0.040849
## 13 0.0022040 41 0.79621 0.98758 0.040997
## 14 0.0021944 43 0.79181 0.98752 0.040947
## 15 0.0021138 47 0.78281 0.98994 0.040949
## 16 0.0021138 48 0.78069 0.99260 0.040901
## 17 0.0020233 49 0.77858 0.99713 0.040904
## 18 0.0019321 53 0.77049 1.00220 0.040608
## 19 0.0019120 54 0.76855 1.01102 0.040413
## 20 0.0018715 57 0.76282 1.01525 0.040416
## 21 0.0018712 60 0.75720 1.01489 0.040366
## 22 0.0018118 63 0.75002 1.01489 0.040366
## 23 0.0018114 66 0.74428 1.01398 0.040365
## 24 0.0018013 70 0.73583 1.01519 0.040366
## 25 0.0017107 81 0.71083 1.02395 0.040371
## 26 0.0016902 84 0.70570 1.02256 0.040471
## 27 0.0016000 85 0.70401 1.02594 0.040524
## 28 0.0014190 90 0.69537 1.03125 0.040426
## 29 0.0014190 92 0.69254 1.03953 0.040280
## 30 0.0013288 107 0.66156 1.04116 0.040129
## 31 0.0012878 109 0.65890 1.04629 0.040385
## 32 0.0012684 115 0.65118 1.04871 0.040386
## 33 0.0012680 119 0.64610 1.04991 0.040387
## 34 0.0012079 121 0.64357 1.05082 0.040388
## 35 0.0012076 127 0.63632 1.05112 0.040388
## 36 0.0011673 133 0.62787 1.05535 0.040390
## 37 0.0010867 136 0.62436 1.06205 0.040444
## 38 0.0010667 141 0.61700 1.06085 0.040443
## 39 0.0010261 144 0.61380 1.06061 0.040242
## 40 0.0010092 145 0.61277 1.06061 0.040242
## 41 0.0010024 152 0.60571 1.06242 0.040243
## 42 0.0010000 164 0.59357 1.06242 0.040243
We'll also add a couple lines of code to automate finding the Cp value with the lowest xerror.
The printed value of "treeprior_cp" is the Cp value we want to apply in our decision tree.
# Create an index for the row with the minimum xerror
index_prior <- which.min(tree_prior$cptable[ , "xerror"])
treeprior_cp <- tree_prior$cptable[index_prior, "CP"]
treeprior_cp
## [1] 0.002988955
plotcp()
We can also plot the cross-validation error as a function of the complexity parameter and the size of the tree using the function plotcp(). (We'll also load the rpart.plot package here so we can draw the pruned trees with prp().)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.2
# tree_prior is loaded in your workspace
# Plot the cross-validated error rate as a function of the complexity parameter
plotcp(tree_prior)
# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
printcp(tree_prior)
##
## Classification tree:
## rpart(formula = ClaimStatus ~ InsuredRegion + GrossPremium +
## RevenueBand + DiscretionaryCredit, data = training_set, method = "class",
## parms = list(prior = c(0.7, 0.3)), control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] DiscretionaryCredit GrossPremium InsuredRegion
## [4] RevenueBand
##
## Root node error: 2456.1/8187 = 0.3
##
## n= 8187
##
## CP nsplit rel error xerror xstd
## 1 0.0218068 0 1.00000 1.00000 0.045296
## 2 0.0206209 2 0.95639 1.03215 0.041910
## 3 0.0137033 3 0.93577 1.00662 0.039494
## 4 0.0050692 5 0.90836 1.00571 0.040510
## 5 0.0049238 7 0.89822 0.99792 0.040050
## 6 0.0038970 9 0.88837 0.98228 0.039835
## 7 0.0029890 11 0.88058 0.97485 0.039931
## 8 0.0028967 14 0.87074 0.98203 0.040140
## 9 0.0025609 15 0.86784 0.98843 0.040448
## 10 0.0025208 26 0.83408 0.98384 0.040645
## 11 0.0024764 35 0.81077 0.98843 0.040947
## 12 0.0023757 38 0.80334 0.98982 0.040849
## 13 0.0022040 41 0.79621 0.98758 0.040997
## 14 0.0021944 43 0.79181 0.98752 0.040947
## 15 0.0021138 47 0.78281 0.98994 0.040949
## 16 0.0021138 48 0.78069 0.99260 0.040901
## 17 0.0020233 49 0.77858 0.99713 0.040904
## 18 0.0019321 53 0.77049 1.00220 0.040608
## 19 0.0019120 54 0.76855 1.01102 0.040413
## 20 0.0018715 57 0.76282 1.01525 0.040416
## 21 0.0018712 60 0.75720 1.01489 0.040366
## 22 0.0018118 63 0.75002 1.01489 0.040366
## 23 0.0018114 66 0.74428 1.01398 0.040365
## 24 0.0018013 70 0.73583 1.01519 0.040366
## 25 0.0017107 81 0.71083 1.02395 0.040371
## 26 0.0016902 84 0.70570 1.02256 0.040471
## 27 0.0016000 85 0.70401 1.02594 0.040524
## 28 0.0014190 90 0.69537 1.03125 0.040426
## 29 0.0014190 92 0.69254 1.03953 0.040280
## 30 0.0013288 107 0.66156 1.04116 0.040129
## 31 0.0012878 109 0.65890 1.04629 0.040385
## 32 0.0012684 115 0.65118 1.04871 0.040386
## 33 0.0012680 119 0.64610 1.04991 0.040387
## 34 0.0012079 121 0.64357 1.05082 0.040388
## 35 0.0012076 127 0.63632 1.05112 0.040388
## 36 0.0011673 133 0.62787 1.05535 0.040390
## 37 0.0010867 136 0.62436 1.06205 0.040444
## 38 0.0010667 141 0.61700 1.06085 0.040443
## 39 0.0010261 144 0.61380 1.06061 0.040242
## 40 0.0010092 145 0.61277 1.06061 0.040242
## 41 0.0010024 152 0.60571 1.06242 0.040243
## 42 0.0010000 164 0.59357 1.06242 0.040243
# Create an index for the row with the minimum xerror
index <- which.min(tree_prior$cptable[ , "xerror"])
# Create tree_min
tree_min <- tree_prior$cptable[index, "CP"]
# Prune the tree using tree_min
ptree_prior <- prune(tree_prior, cp = tree_min)
# Use prp() to plot the pruned tree
prp(ptree_prior, extra = 1)
# Plot the cross-validated error rate as a function of the complexity parameter
plotcp(tree_oversample)
# Use printcp() to identify for which complexity parameter the cross-validated error rate is minimized.
printcp(tree_oversample)
##
## Classification tree:
## rpart(formula = ClaimStatus ~ ., data = oversampled_training_set,
## method = "class", control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] DiscretionaryCredit GrossPremium InsuredRegion
## [4] RevenueBand
##
## Root node error: 2545/10272 = 0.24776
##
## n= 10272
##
## CP nsplit rel error xerror xstd
## 1 0.0083824 0 1.00000 1.00000 0.017192
## 2 0.0038310 5 0.94774 0.95560 0.016929
## 3 0.0037328 13 0.91513 0.93006 0.016770
## 4 0.0036346 15 0.90766 0.92849 0.016760
## 5 0.0035363 22 0.86758 0.92849 0.016760
## 6 0.0033399 24 0.86051 0.92141 0.016715
## 7 0.0030452 28 0.84597 0.91631 0.016682
## 8 0.0029470 32 0.83379 0.90845 0.016632
## 9 0.0027505 35 0.82436 0.90098 0.016583
## 10 0.0025540 41 0.80786 0.89862 0.016567
## 11 0.0024885 45 0.79764 0.88841 0.016500
## 12 0.0021611 48 0.79018 0.87348 0.016399
## 13 0.0019646 52 0.78153 0.85815 0.016294
## 14 0.0017682 62 0.75756 0.83576 0.016137
## 15 0.0015717 127 0.61807 0.79136 0.015811
## 16 0.0013752 146 0.58153 0.72888 0.015319
## 17 0.0013098 160 0.55599 0.71827 0.015232
## 18 0.0012443 164 0.54971 0.68723 0.014968
## 19 0.0011788 170 0.54224 0.68527 0.014951
## 20 0.0011002 188 0.52063 0.66758 0.014796
## 21 0.0010478 193 0.51513 0.63969 0.014544
## 22 0.0010000 227 0.46326 0.63733 0.014522
printcp(tree_undersample)
##
## Classification tree:
## rpart(formula = ClaimStatus ~ ., data = undersampled_training_set,
## method = "class", control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] DiscretionaryCredit GrossPremium InsuredRegion
## [4] RevenueBand
##
## Root node error: 460/1840 = 0.25
##
## n= 1840
##
## CP nsplit rel error xerror xstd
## 1 0.0095652 0 1.00000 1.0000 0.040379
## 2 0.0086957 6 0.93261 1.0239 0.040695
## 3 0.0072464 8 0.91522 1.0261 0.040724
## 4 0.0065217 13 0.87826 1.0283 0.040752
## 5 0.0054348 14 0.87174 1.0283 0.040752
## 6 0.0043478 16 0.86087 1.0239 0.040695
## 7 0.0032609 18 0.85217 1.0391 0.040892
## 8 0.0030435 24 0.83261 1.0630 0.041192
## 9 0.0026087 31 0.81087 1.0848 0.041457
## 10 0.0021739 36 0.79783 1.1022 0.041663
## 11 0.0010870 51 0.76304 1.1043 0.041689
## 12 0.0010000 57 0.75652 1.1196 0.041864
printcp(tree_prior)
##
## Classification tree:
## rpart(formula = ClaimStatus ~ InsuredRegion + GrossPremium +
## RevenueBand + DiscretionaryCredit, data = training_set, method = "class",
## parms = list(prior = c(0.7, 0.3)), control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] DiscretionaryCredit GrossPremium InsuredRegion
## [4] RevenueBand
##
## Root node error: 2456.1/8187 = 0.3
##
## n= 8187
##
## CP nsplit rel error xerror xstd
## 1 0.0218068 0 1.00000 1.00000 0.045296
## 2 0.0206209 2 0.95639 1.03215 0.041910
## 3 0.0137033 3 0.93577 1.00662 0.039494
## 4 0.0050692 5 0.90836 1.00571 0.040510
## 5 0.0049238 7 0.89822 0.99792 0.040050
## 6 0.0038970 9 0.88837 0.98228 0.039835
## 7 0.0029890 11 0.88058 0.97485 0.039931
## 8 0.0028967 14 0.87074 0.98203 0.040140
## 9 0.0025609 15 0.86784 0.98843 0.040448
## 10 0.0025208 26 0.83408 0.98384 0.040645
## 11 0.0024764 35 0.81077 0.98843 0.040947
## 12 0.0023757 38 0.80334 0.98982 0.040849
## 13 0.0022040 41 0.79621 0.98758 0.040997
## 14 0.0021944 43 0.79181 0.98752 0.040947
## 15 0.0021138 47 0.78281 0.98994 0.040949
## 16 0.0021138 48 0.78069 0.99260 0.040901
## 17 0.0020233 49 0.77858 0.99713 0.040904
## 18 0.0019321 53 0.77049 1.00220 0.040608
## 19 0.0019120 54 0.76855 1.01102 0.040413
## 20 0.0018715 57 0.76282 1.01525 0.040416
## 21 0.0018712 60 0.75720 1.01489 0.040366
## 22 0.0018118 63 0.75002 1.01489 0.040366
## 23 0.0018114 66 0.74428 1.01398 0.040365
## 24 0.0018013 70 0.73583 1.01519 0.040366
## 25 0.0017107 81 0.71083 1.02395 0.040371
## 26 0.0016902 84 0.70570 1.02256 0.040471
## 27 0.0016000 85 0.70401 1.02594 0.040524
## 28 0.0014190 90 0.69537 1.03125 0.040426
## 29 0.0014190 92 0.69254 1.03953 0.040280
## 30 0.0013288 107 0.66156 1.04116 0.040129
## 31 0.0012878 109 0.65890 1.04629 0.040385
## 32 0.0012684 115 0.65118 1.04871 0.040386
## 33 0.0012680 119 0.64610 1.04991 0.040387
## 34 0.0012079 121 0.64357 1.05082 0.040388
## 35 0.0012076 127 0.63632 1.05112 0.040388
## 36 0.0011673 133 0.62787 1.05535 0.040390
## 37 0.0010867 136 0.62436 1.06205 0.040444
## 38 0.0010667 141 0.61700 1.06085 0.040443
## 39 0.0010261 144 0.61380 1.06061 0.040242
## 40 0.0010092 145 0.61277 1.06061 0.040242
## 41 0.0010024 152 0.60571 1.06242 0.040243
## 42 0.0010000 164 0.59357 1.06242 0.040243
printcp(tree_loss_matrix)
##
## Classification tree:
## rpart(formula = ClaimStatus ~ InsuredRegion + GrossPremium +
## RevenueBand + DiscretionaryCredit, data = training_set, method = "class",
## parms = list(loss = matrix(c(0, 10, 1, 0), ncol = 2)), control = rpart.control(cp = 0.001))
##
## Variables actually used in tree construction:
## [1] DiscretionaryCredit GrossPremium InsuredRegion
## [4] RevenueBand
##
## Root node error: 4600/8187 = 0.56187
##
## n= 8187
##
## CP nsplit rel error xerror xstd
## 1 0.1393478 0 1.00000 0.1000 0.0045296
## 2 0.0069565 1 0.86065 4.0254 0.0817016
## 3 0.0054348 6 0.82587 3.8167 0.0801069
## 4 0.0028261 11 0.79739 3.3617 0.0762870
## 5 0.0026087 12 0.79457 3.1628 0.0743987
## 6 0.0025000 17 0.78152 3.0900 0.0736909
## 7 0.0023188 26 0.75196 3.0337 0.0731121
## 8 0.0021304 42 0.70478 3.0057 0.0728332
## 9 0.0020652 48 0.69130 2.9909 0.0726809
## 10 0.0019565 53 0.68043 2.9543 0.0723104
## 11 0.0017391 55 0.67652 2.9870 0.0726362
## 12 0.0016304 60 0.66696 3.1313 0.0740574
## 13 0.0015942 62 0.66370 3.1313 0.0740574
## 14 0.0015217 66 0.65435 3.1011 0.0737657
## 15 0.0014783 72 0.64500 3.1035 0.0737858
## 16 0.0014493 91 0.61174 3.1035 0.0737858
## 17 0.0014130 99 0.59652 3.0887 0.0736379
## 18 0.0013768 107 0.58522 3.0502 0.0732562
## 19 0.0013043 114 0.57522 3.0204 0.0729557
## 20 0.0012319 126 0.55826 2.9954 0.0726941
## 21 0.0011957 129 0.55457 2.9954 0.0726941
## 22 0.0010870 133 0.54978 3.0096 0.0728193
## 23 0.0010326 140 0.54217 3.0300 0.0730089
## 24 0.0010145 152 0.52674 3.0320 0.0730310
## 25 0.0010000 160 0.51609 3.0320 0.0730310
# Create an index for the row with the minimum xerror
index_undersample <- which.min(tree_undersample$cptable[ , "xerror"])
index_prior <- which.min(tree_prior$cptable[ , "xerror"])
# Create tree_min (the cp value at that row)
tree_min_undersample <- round(tree_undersample$cptable[index_undersample, "CP"], digits = 7)
tree_min_prior <- round(tree_prior$cptable[index_prior, "CP"], digits = 7)
# Prune the trees using tree_min.
# For the oversampled and loss-matrix trees, a manually chosen cp is used instead of the minimum-xerror value.
ptree_oversample <- prune(tree_oversample, cp = 0.0037)
ptree_undersample <- prune(tree_undersample, cp = tree_min_undersample)
ptree_prior <- prune(tree_prior, cp = tree_min_prior)
ptree_loss_matrix <- prune(tree_loss_matrix, cp = 0.003)
# Use prp() to plot the pruned tree
prp(ptree_oversample, extra = 1)
Validating the Predictions using the Decision Tree
We need to evaluate each of the pruned decision trees.
# Make predictions for each of the pruned trees using the test set.
pred_oversample <- predict(ptree_oversample, newdata = test_set, type = "class")
pred_undersample <- predict(ptree_undersample, newdata = test_set, type = "class")
pred_prior <- predict(ptree_prior, newdata = test_set, type = "class")
pred_loss_matrix <- predict(ptree_loss_matrix, newdata = test_set, type = "class")
# construct confusion matrices using the predictions.
confmat_oversample <- table(test_set$ClaimStatus, pred_oversample)
confmat_undersample <- table(test_set$ClaimStatus, pred_undersample)
confmat_prior <- table(test_set$ClaimStatus, pred_prior)
confmat_loss_matrix <- table(test_set$ClaimStatus, pred_loss_matrix)
# Compute the accuracies
acc_oversample <- sum(diag(confmat_oversample)) / nrow(test_set)
acc_undersample <- sum(diag(confmat_undersample)) / nrow(test_set)
acc_prior <- sum(diag(confmat_prior)) / nrow(test_set)
acc_loss_matrix <- sum(diag(confmat_loss_matrix)) / nrow(test_set)
print(acc_oversample)
## [1] 0.91915
print(acc_undersample)
## [1] 0.9015633
print(acc_prior)
## [1] 0.8727406
print(acc_loss_matrix)
## [1] 0.8004397
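Because the data is unbalanced, accuracy alone can flatter a model that rarely predicts a claim. A hedged companion check is the share of actual claims each tree catches (its sensitivity), computed from the confusion matrices above; the sens_* names are just for illustration:
# Sensitivity: actual claims (row "1") that each model also predicts as claims (column "1")
sens_oversample  <- confmat_oversample["1", "1"] / sum(confmat_oversample["1", ])
sens_undersample <- confmat_undersample["1", "1"] / sum(confmat_undersample["1", ])
sens_prior       <- confmat_prior["1", "1"] / sum(confmat_prior["1", ])
sens_loss_matrix <- confmat_loss_matrix["1", "1"] / sum(confmat_loss_matrix["1", ])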
Calculate % of claims by choosing % of acceptance
We learned how to compute the bad rate (the percentage of policies with claims) in the insured portfolio when given two pieces of information:
- a specific model
- the acceptance rate
Compute the bad rate that an insurance company can expect when using the pruned tree "ptree_prior" and an acceptance rate of 80%.
Instructions:
When you use the predict() function on a tree, the probability of a claim is in the second column, so [, 2] is appended to the predict() call.
Obtain the cut-off that leads to an acceptance rate of 80% using prob_claim_prior. You can use the quantile() function to do this, setting the second argument to 0.8. Assign the name cutoff_80.
Turn the predicted probabilities into binary claim predictions (0 or 1) using ifelse(). Name the object bin_pred_prior.
Select the claim indicators of test_set for the accepted policies according to an 80% acceptance rate.
Compute the percentage of claims (the "bad rate") for the accepted policies. This is the number of occurrences of 1 in accepted_status_prior, divided by the total number of instances in this vector. Print the result to your R console.
#Goal: Calculate the percentage of claims (bad rate)
#step1) Create the prediction using the pruned decision tree.
#step2) Create the cut-off for the predicted values.
#step3) Change the predicted values to binary values (1 = yes, 0 = no) using the cut-off.
#step4) Isolate the actual claim status of the accepted observations (predicted 0).
#step5) Calculate sum(claims) / total # of accepted observations.
# Make predictions for the probability of a claim using the pruned tree and the test set.
prob_claim_prior <- predict(ptree_prior, newdata = test_set)[ ,2]
# Obtain the cutoff for an acceptance rate of 80%
cutoff_80 <- quantile(prob_claim_prior, 0.8)
# Obtain the binary predictions.
bin_pred_prior <- ifelse(prob_claim_prior > cutoff_80, 1, 0)
# Obtain the actual claim status for the accepted policies
accepted_status_prior <- test_set$ClaimStatus[bin_pred_prior == 0]
# Obtain the bad rate for the accepted policies
# (ClaimStatus is a factor, so count the "1" values rather than summing the factor)
sum(accepted_status_prior == 1) / length(accepted_status_prior)
ROC Curve
# Fit the logit, probit and cloglog-link logistic regression models
log_model_logit <- glm(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit,
family = binomial(link = logit), data = training_set)
log_model_probit <- glm(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit,
family = binomial(link = probit), data = training_set)
log_model_cloglog <- glm(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit,
family = binomial(link = cloglog), data = training_set)
log_model_all_full <- glm(ClaimStatus ~ InsuredRegion + GrossPremium + RevenueBand + DiscretionaryCredit + InsuredState,
family = binomial, data = training_set)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Make predictions for all models using the test set
predictions_logit <- predict(log_model_logit, newdata = test_set, type = "response")
predictions_probit <- predict(log_model_probit, newdata = test_set, type = "response")
predictions_cloglog <- predict(log_model_cloglog, newdata = test_set, type = "response")
predictions_all_full <- predict(log_model_all_full, newdata = test_set, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type
## == : prediction from a rank-deficient fit may be misleading
# Use a cut-off of 14% to make binary predictions-vectors
cutoff <- 0.14
class_pred_logit <- ifelse(predictions_logit > cutoff, 1, 0)
class_pred_probit <- ifelse(predictions_probit > cutoff, 1, 0)
class_pred_cloglog <- ifelse(predictions_cloglog > cutoff, 1, 0)
class_pred_all_full <- ifelse(predictions_all_full > cutoff, 1, 0)
# Load the pROC package for the ROC curves
library(pROC)
## Warning: package 'pROC' was built under R version 3.6.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Construct the objects containing ROC-information
ROC_logit <- roc(test_set$ClaimStatus, predictions_logit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_probit <- roc(test_set$ClaimStatus, predictions_probit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_cloglog <- roc(test_set$ClaimStatus, predictions_cloglog)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_all_full <- roc(test_set$ClaimStatus, predictions_all_full)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# Draw all ROCs on one plot
plot(ROC_logit)
lines(ROC_probit, col="blue")
lines(ROC_cloglog, col="red")
lines(ROC_all_full, col="green")
## Area under the curve: 0.6819
## Area under the curve: 0.6841
## Area under the curve: 0.6813
## Area under the curve: 0.65
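The "Area under the curve" lines above are the AUC of each ROC object; a minimal sketch of how they can be printed with pROC, assuming the same object names as above:
# AUC for each of the four link functions
auc(ROC_logit)
auc(ROC_probit)
auc(ROC_cloglog)
auc(ROC_all_full)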
# Probability of a claim from the pruned undersample tree (second column of the probability matrix)
prob_undersample <- predict(ptree_undersample, newdata = test_set)[ , 2]
# Cut-off corresponding to an 80% acceptance rate
cutoff_undersample <- quantile(prob_undersample, 0.8)
strategy_bank <- function(prob_of_def){
  cutoff = rep(NA, 21)
  bad_rate = rep(NA, 21)
  accept_rate = seq(1, 0, by = -0.05)
  for (i in 1:21){
    cutoff[i] = quantile(prob_of_def, accept_rate[i])
    pred_i = ifelse(prob_of_def > cutoff[i], 1, 0)
    pred_as_good = test_set$ClaimStatus[pred_i == 0]
    # ClaimStatus is a factor, so count the "1" values instead of summing the factor
    bad_rate[i] = sum(pred_as_good == 1) / length(pred_as_good)}
  table = cbind(accept_rate, cutoff = round(cutoff, 4), bad_rate = round(bad_rate, 4))
  return(list(table = table, bad_rate = bad_rate, accept_rate = accept_rate, cutoff = cutoff)) }
# Pass the predicted claim probabilities (second column of the probability matrix), not the tree object itself
strategy_loss_matrix <- strategy_bank(predict(ptree_loss_matrix, newdata = test_set)[ , 2])
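A hedged usage sketch for the result: look at the table and plot the bad rate against the acceptance rate (the axis labels are illustrative):
# Acceptance rate vs. expected claim (bad) rate for the loss-matrix tree
strategy_loss_matrix$table
plot(strategy_loss_matrix$accept_rate, strategy_loss_matrix$bad_rate,
     type = "l", xlab = "Acceptance rate", ylab = "Bad rate (share of claims)")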