#1. Introduction
In this project, I take the role of an analyst who uses historical data to model two outcomes. First, as a risk analyst for a credit card company, I examine customer characteristics to estimate the likelihood of default, which feeds into the company's credit risk calculations. Second, as an analyst for the government, I study wage growth patterns and related economic factors to support policy decisions on the labor force and the economic agenda.
#1. Data Loading:
library(rpart)
library(rpart.plot)
# Read the two project data sets. `file_path` is reused for both files, so
# after this block it holds the path of the economic data.

# Credit card customer records (source for `credit_default`).
file_path <- "/Users/arthurrichardson/Documents/Documents - Arthur’s MacBook Pro/College/MAT 303/CSV/creditcard.csv"
credit_default <- read.csv(file = file_path)

# Economic indicators (source for `economic`).
file_path <- "/Users/arthurrichardson/Documents/Documents - Arthur’s MacBook Pro/College/MAT 303/CSV/economic.csv"
economic <- read.csv(file = file_path)
## Rows: 600
## Columns: 8
## age sex education marriage assets missed_payment credit_utilize
## 1 28 male college married house no 0.191
## 2 31 female postgraduate married car_house no 0.196
## 3 48 female college married house yes 1.000
## 4 35 male high_school married car yes 1.000
## 5 30 female high_school unmarried car no 0.374
## 6 23 female high_school unmarried house yes 0.970
## default
## 1 no
## 2 no
## 3 yes
## 4 yes
## 5 yes
## 6 yes
#2. Data Preparation
# Fix the RNG seed so any random sampling that follows is reproducible, then
# size the training set at 70% of the credit default rows, rounded down.
set.seed(6751342)
samp.size <- floor(nrow(credit_default) * 0.70)
## [1] "Number of rows for the training set"
## Credit Default Training Data Rows: 420
## Credit Default Training Data Columns: 8
##. Credit Default Testing set
## [1] "Number of rows for the test set"
## Credit Default Test Set Rows: 180
## Credit Default Test Set Columns: 8
#3. Create a Model
# Grow a classification tree for `default` from missed payments, credit
# utilization and assets. minsplit = 10 lets a node be considered for
# splitting once it holds at least 10 observations. The seed is reset first
# so the cross-validated error column reported by printcp() is reproducible.
set.seed(6751342)
model <- rpart(
  default ~ missed_payment + credit_utilize + assets,
  data = training.data,
  method = "class",
  control = rpart.control(minsplit = 10)
)

# Complexity-parameter table, used to choose the cp value for pruning.
printcp(model)
##
## Classification tree:
## rpart(formula = default ~ missed_payment + credit_utilize + assets,
## data = training.data, method = "class", control = rpart.control(minsplit = 10))
##
## Variables actually used in tree construction:
## [1] assets credit_utilize
##
## Root node error: 194/420 = 0.4619
##
## n= 420
##
## CP nsplit rel error xerror xstd
## 1 0.788660 0 1.00000 1.00000 0.052666
## 2 0.051546 1 0.21134 0.21649 0.031692
## 3 0.010000 3 0.10825 0.10825 0.023023
##
## Classification tree:
## rpart(formula = default ~ missed_payment + credit_utilize + assets,
## data = training.data, method = "class", control = rpart.control(cp = 0.021))
##
## Variables actually used in tree construction:
## [1] assets credit_utilize
##
## Root node error: 194/420 = 0.4619
##
## n= 420
##
## CP nsplit rel error xerror xstd
## 1 0.788660 0 1.00000 1.00000 0.052666
## 2 0.051546 1 0.21134 0.21649 0.031692
## 3 0.021000 3 0.10825 0.10825 0.023023
## Make predictions on the test data
# Classify every row of the held-out test set with the pruned tree;
# type = "class" asks for predicted labels rather than class probabilities.
pred <- predict(pruned_model, newdata = testing.data, type = "class")
## [1] "Confusion Matrix"
## pred
## Prediction default : no Prediction default : yes
## Actual default : no "77" " 5"
## Actual default : yes " 3" "95"
## [1] "Prediction for defaulting (yes or no): missed_payment='no', credit_utilize = .30, assets='car_house' "
# Scenario 1: no missed payment, 30% credit utilization, owns a car and a house.
newdata1 <- data.frame(
  missed_payment = "no",
  credit_utilize = 0.30,
  assets = "car_house"
)
## 1
## no
## Levels: no yes
## [1] "Prediction for defaulting (yes or no): missed_payment='yes', credit_utilize = .30, asset='none' "
# Scenario 2: a missed payment, 30% credit utilization, no assets.
# The scenario label printed for this prediction states missed_payment='yes',
# so the input row has to use "yes" as well (it previously used "no").
newdata2 <- data.frame(
  missed_payment = "yes",
  credit_utilize = 0.30,
  assets = "none"
)
## 1
## yes
## Levels: no yes
#4. Regression Decision Tree
# Preview the first six rows of the economic data set.
head(economic, n = 6)
## wage_growth inflation unemployment economy education gdp
## 1 7.30 4.49 3.56 no_recession college 6.27
## 2 9.05 9.59 2.42 no_recession college 9.44
## 3 10.08 11.36 1.23 no_recession post_graduate 18.29
## 4 10.98 9.55 1.18 no_recession post_graduate 19.96
## 5 8.54 8.63 2.54 no_recession high_school 8.43
## 6 9.75 8.26 2.22 no_recession college 17.85
# Fix the RNG seed so any random sampling that follows is reproducible.
set.seed(507690)
## [1] "Number of rows for the training set"
## [1] 79
## [1] "Number of rows for the testing set"
## [1] 20
# Grow a regression tree (method = "anova") for wage growth from the economy
# indicator, inflation and GDP. minsplit = 10 lets a node be considered for
# splitting once it holds at least 10 observations. The seed is reset first
# so the cross-validated error column reported by printcp() is reproducible.
set.seed(507690)
model2 <- rpart(
  wage_growth ~ economy + inflation + gdp,
  data = train.data2,
  method = "anova",
  control = rpart.control(minsplit = 10)
)

# Complexity-parameter table, used to choose the cp value for pruning.
printcp(model2)
##
## Regression tree:
## rpart(formula = wage_growth ~ economy + inflation + gdp, data = train.data2,
## method = "anova", control = rpart.control(minsplit = 10))
##
## Variables actually used in tree construction:
## [1] gdp inflation
##
## Root node error: 549.06/79 = 6.9502
##
## n= 79
##
## CP nsplit rel error xerror xstd
## 1 0.656007 0 1.000000 1.02825 0.147202
## 2 0.145357 1 0.343993 0.42856 0.055392
## 3 0.057143 2 0.198636 0.27566 0.042452
## 4 0.028626 3 0.141493 0.21520 0.027677
## 5 0.017301 4 0.112867 0.18980 0.023291
## 6 0.010715 5 0.095566 0.18342 0.024280
## 7 0.010491 6 0.084851 0.17213 0.023401
## 8 0.010000 7 0.074360 0.16836 0.023428
# Print the node-by-node structure of the full regression tree. This uses the
# already-fitted `model2` instead of fitting the identical tree a second time
# just to display it, so the listing cannot drift from the model.
print(model2)
## n= 79
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 79 549.06280000 7.0849370
## 2) gdp< 4.48 20 52.75978000 3.4175000
## 4) inflation< 2.42 10 11.01965000 2.1650000
## 8) gdp< 1.95 3 1.48006700 0.9933333 *
## 9) gdp>=1.95 7 3.65614300 2.6671430 *
## 5) inflation>=2.42 10 10.36500000 4.6700000 *
## 3) gdp>=4.48 59 136.11410000 8.3281360
## 6) gdp< 8.705 23 18.67129000 6.8730430
## 12) inflation< 4.24 7 0.98728570 5.9014290 *
## 13) inflation>=4.24 16 8.18464400 7.2981250 *
## 7) gdp>=8.705 36 37.63262000 9.2577780
## 14) inflation< 7.73 12 10.14187000 8.3233330
## 28) gdp< 13.4 9 4.34140000 7.9233330 *
## 29) gdp>=13.4 3 0.04046667 9.5233330 *
## 15) inflation>=7.73 24 11.77340000 9.7250000 *
# Fit the smaller regression tree by raising the complexity parameter to
# cp = 0.019, so splits that improve the fit by less than that are dropped.
# NOTE(review): this refit passes only `cp`, so it does not carry over the
# minsplit = 10 used when fitting `model2` - confirm that is intended.
set.seed(507690)
pruned_model2 <- rpart(
  wage_growth ~ economy + inflation + gdp,
  data = train.data2,
  method = "anova",
  control = rpart.control(cp = 0.019)
)

# Complexity-parameter table for the smaller tree.
printcp(pruned_model2)
##
## Regression tree:
## rpart(formula = wage_growth ~ economy + inflation + gdp, data = train.data2,
## method = "anova", control = rpart.control(cp = 0.019))
##
## Variables actually used in tree construction:
## [1] gdp inflation
##
## Root node error: 549.06/79 = 6.9502
##
## n= 79
##
## CP nsplit rel error xerror xstd
## 1 0.656007 0 1.00000 1.02825 0.147202
## 2 0.145357 1 0.34399 0.42856 0.055392
## 3 0.057143 2 0.19864 0.27566 0.042452
## 4 0.028626 3 0.14149 0.22761 0.029377
## 5 0.019000 4 0.11287 0.19861 0.027348
# Print the node-by-node structure of the pruned regression tree. This uses
# the already-fitted `pruned_model2` instead of fitting the identical tree a
# second time just to display it.
print(pruned_model2)
## n= 79
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 79 549.06280 7.084937
## 2) gdp< 4.48 20 52.75978 3.417500
## 4) inflation< 2.42 10 11.01965 2.165000 *
## 5) inflation>=2.42 10 10.36500 4.670000 *
## 3) gdp>=4.48 59 136.11410 8.328136
## 6) gdp< 8.705 23 18.67129 6.873043 *
## 7) gdp>=8.705 36 37.63262 9.257778
## 14) inflation< 7.73 12 10.14187 8.323333 *
## 15) inflation>=7.73 24 11.77340 9.725000 *
# Draw the pruned regression tree.
rpart.plot(x = pruned_model2)
# Root mean squared error between predictions and observations.
#
# pred: numeric vector of predicted values.
# obs:  numeric vector of observed values, matched to `pred` by position.
#
# Returns a single number: the square root of the summed squared differences
# divided by the number of predictions.
RMSE <- function(pred, obs) {
  squared_errors <- (pred - obs)^2
  sqrt(sum(squared_errors) / length(pred))
}
## [1] "Root Mean Squared Error"
## [1] 1.2334
## [1] "Predicted wage growth: economy='no_recession', inflation=2.10, gdp=2.5"
# Scenario: no recession, inflation 2.10, GDP 2.5.
newdata3 <- data.frame(
  economy = "no_recession",
  inflation = 2.10,
  gdp = 2.5
)

# Predicted wage growth from the pruned regression tree, shown to 4 decimals.
predicted_wage_growth <- predict(pruned_model2, newdata = newdata3, type = "vector")
round(predicted_wage_growth, digits = 4)
## 1
## 2.165
## [1] "Predicted wage growth: economy='no_recession', inflation=3.50, gdp=6.8"
# Scenario: no recession, inflation 3.50, GDP 6.8.
newdata4 <- data.frame(
  economy = "no_recession",
  inflation = 3.50,
  gdp = 6.8
)

# Predicted wage growth from the pruned regression tree, shown to 4 decimals.
predicted_wage_growth <- predict(pruned_model2, newdata = newdata4, type = "vector")
round(predicted_wage_growth, digits = 4)
## 1
## 6.873