library(C50)
## Warning: package 'C50' was built under R version 3.5.1
library(gmodels)
## Warning: package 'gmodels' was built under R version 3.5.1
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.5.1
## Loading required package: rpart
library(ca)
## Warning: package 'ca' was built under R version 3.5.1
library(class)
## Warning: package 'class' was built under R version 3.5.1
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
library(rpart)
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.1
library(bnlearn)
## Warning: package 'bnlearn' was built under R version 3.5.1
##
## Attaching package: 'bnlearn'
## The following object is masked from 'package:stats':
##
## sigma
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(e1071)
## Warning: package 'e1071' was built under R version 3.5.1
##
## Attaching package: 'e1071'
## The following object is masked from 'package:bnlearn':
##
## impute
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
library(SnowballC)
library(klaR)
## Warning: package 'klaR' was built under R version 3.5.1
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.5.1
## corrplot 0.84 loaded
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
library(leaps)
## Warning: package 'leaps' was built under R version 3.5.1
library(colorspace)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(naivebayes)
library(psych)
## Warning: package 'psych' was built under R version 3.5.1
##
## Attaching package: 'psych'
## The following object is masked from 'package:kernlab':
##
## alpha
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
DATA COLLECTION
credit <- read.csv("C:/Users/charl/Downloads/ANLY 530--Lab 2--creditData.csv")
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ Creditability : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Account.Balance : int 1 1 2 1 1 1 1 1 4 2 ...
## $ Duration.of.Credit..month. : int 18 9 12 12 12 10 8 6 18 24 ...
## $ Payment.Status.of.Previous.Credit: int 4 4 2 4 4 4 4 4 4 2 ...
## $ Purpose : int 2 0 9 0 0 0 0 0 3 3 ...
## $ Credit.Amount : int 1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
## $ Value.Savings.Stocks : int 1 1 2 1 1 1 1 1 1 3 ...
## $ Length.of.current.employment : int 2 3 4 3 3 2 4 2 1 1 ...
## $ Instalment.per.cent : int 4 2 2 3 4 1 1 2 4 1 ...
## $ Sex...Marital.Status : int 2 3 2 3 3 3 3 3 2 2 ...
## $ Guarantors : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Duration.in.Current.address : int 4 2 4 2 4 3 4 4 4 4 ...
## $ Most.valuable.available.asset : int 2 1 1 1 2 1 1 1 3 4 ...
## $ Age..years. : int 21 36 23 39 38 48 39 40 65 23 ...
## $ Concurrent.Credits : int 3 3 3 3 1 3 3 3 3 3 ...
## $ Type.of.apartment : int 1 1 1 1 2 1 2 2 2 1 ...
## $ No.of.Credits.at.this.Bank : int 1 2 1 2 2 2 2 1 2 1 ...
## $ Occupation : int 3 3 2 2 2 2 2 2 1 1 ...
## $ No.of.dependents : int 1 2 1 2 1 2 1 2 1 1 ...
## $ Telephone : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Foreign.Worker : int 1 1 1 2 2 2 2 2 1 1 ...
Note that “Account Balance” is ranked 1, “Duration of Credit…” is #2, and “Credit Amount” is ranked #3
Exploring the Data
summary(credit$Value.Savings.Stocks)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 1.000 2.105 3.000 5.000
Exploring the DATA and Randomizing the DATA
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
Divide DATA to Train and Test Sets
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
Train and Test Sets- A good randomization should show Close percentages between Splits
prop.table(table(credit_train$Value.Savings.Stocks))
##
## 1 2 3 4 5
## 0.59333333 0.10933333 0.05733333 0.04533333 0.19466667
prop.table(table(credit_test$Value.Savings.Stocks))
##
## 1 2 3 4 5
## 0.632 0.084 0.080 0.056 0.148
Training the Model
m.rpart <- rpart(Value.Savings.Stocks ~., data = credit_train)
summary(m.rpart)
## Call:
## rpart(formula = Value.Savings.Stocks ~ ., data = credit_train)
## n= 750
##
## CP nsplit rel error xerror xstd
## 1 0.05254483 0 1.0000000 1.0027940 0.04052979
## 2 0.01732279 1 0.9474552 0.9524548 0.04102228
## 3 0.01578068 4 0.8954868 0.9731137 0.04338439
## 4 0.01058861 7 0.8481447 0.9509179 0.04564234
## 5 0.01000000 9 0.8269675 0.9494655 0.04591892
##
## Variable importance
## Account.Balance Credit.Amount
## 26 20
## Duration.of.Credit..month. Duration.in.Current.address
## 15 9
## Length.of.current.employment Purpose
## 6 4
## Instalment.per.cent Creditability
## 4 4
## Most.valuable.available.asset Age..years.
## 2 2
## Type.of.apartment Payment.Status.of.Previous.Credit
## 2 2
## Sex...Marital.Status Telephone
## 1 1
##
## Node number 1: 750 observations, complexity param=0.05254483
## mean=2.138667, MSE=2.564772
## left son=2 (470 obs) right son=3 (280 obs)
## Primary splits:
## Account.Balance < 3.5 to the left, improve=0.05254483, (0 missing)
## Creditability < 0.5 to the left, improve=0.03261021, (0 missing)
## Length.of.current.employment < 3.5 to the left, improve=0.01380736, (0 missing)
## Guarantors < 2.5 to the right, improve=0.01366281, (0 missing)
## Credit.Amount < 6296.5 to the left, improve=0.01260530, (0 missing)
## Surrogate splits:
## Duration.of.Credit..month. < 5.5 to the right, agree=0.632, adj=0.014, (0 split)
##
## Node number 2: 470 observations, complexity param=0.01578068
## mean=1.855319, MSE=2.119493
## left son=4 (216 obs) right son=5 (254 obs)
## Primary splits:
## Account.Balance < 1.5 to the left, improve=0.024843930, (0 missing)
## Creditability < 0.5 to the left, improve=0.023522030, (0 missing)
## Guarantors < 2.5 to the right, improve=0.017754970, (0 missing)
## Credit.Amount < 6443.5 to the left, improve=0.012432060, (0 missing)
## Duration.in.Current.address < 1.5 to the left, improve=0.008485768, (0 missing)
## Surrogate splits:
## Purpose < 2.5 to the left, agree=0.574, adj=0.074, (0 split)
## Duration.in.Current.address < 2.5 to the right, agree=0.574, adj=0.074, (0 split)
## Age..years. < 42.5 to the right, agree=0.570, adj=0.065, (0 split)
## Instalment.per.cent < 3.5 to the right, agree=0.560, adj=0.042, (0 split)
## Type.of.apartment < 1.5 to the left, agree=0.555, adj=0.032, (0 split)
##
## Node number 3: 280 observations, complexity param=0.01732279
## mean=2.614286, MSE=2.951224
## left son=6 (197 obs) right son=7 (83 obs)
## Primary splits:
## Length.of.current.employment < 4.5 to the left, improve=0.03485971, (0 missing)
## Duration.in.Current.address < 3.5 to the left, improve=0.03356328, (0 missing)
## Age..years. < 35.5 to the left, improve=0.02963182, (0 missing)
## Credit.Amount < 4643 to the left, improve=0.02581992, (0 missing)
## Purpose < 1.5 to the right, improve=0.01820727, (0 missing)
## Surrogate splits:
## Age..years. < 44.5 to the left, agree=0.750, adj=0.157, (0 split)
## Type.of.apartment < 2.5 to the left, agree=0.718, adj=0.048, (0 split)
## No.of.Credits.at.this.Bank < 2.5 to the left, agree=0.714, adj=0.036, (0 split)
## Credit.Amount < 12135 to the left, agree=0.711, adj=0.024, (0 split)
## Most.valuable.available.asset < 3.5 to the left, agree=0.707, adj=0.012, (0 split)
##
## Node number 4: 216 observations
## mean=1.606481, MSE=1.784958
##
## Node number 5: 254 observations, complexity param=0.01578068
## mean=2.066929, MSE=2.306544
## left son=10 (100 obs) right son=11 (154 obs)
## Primary splits:
## Creditability < 0.5 to the left, improve=0.03388434, (0 missing)
## Duration.of.Credit..month. < 46.5 to the left, improve=0.02273516, (0 missing)
## Guarantors < 2.5 to the right, improve=0.01779559, (0 missing)
## Credit.Amount < 6435.5 to the left, improve=0.01658153, (0 missing)
## Age..years. < 34.5 to the left, improve=0.01290754, (0 missing)
## Surrogate splits:
## Credit.Amount < 9340.5 to the right, agree=0.657, adj=0.13, (0 split)
## Payment.Status.of.Previous.Credit < 1.5 to the left, agree=0.630, adj=0.06, (0 split)
## Duration.of.Credit..month. < 43.5 to the right, agree=0.626, adj=0.05, (0 split)
## Most.valuable.available.asset < 3.5 to the right, agree=0.626, adj=0.05, (0 split)
## Purpose < 9.5 to the right, agree=0.614, adj=0.02, (0 split)
##
## Node number 6: 197 observations, complexity param=0.01732279
## mean=2.406091, MSE=2.789405
## left son=12 (97 obs) right son=13 (100 obs)
## Primary splits:
## Credit.Amount < 2178.5 to the left, improve=0.05734661, (0 missing)
## Purpose < 1.5 to the right, improve=0.05117379, (0 missing)
## Duration.in.Current.address < 3.5 to the left, improve=0.05102136, (0 missing)
## Duration.of.Credit..month. < 19 to the left, improve=0.02200405, (0 missing)
## No.of.dependents < 1.5 to the left, improve=0.01654588, (0 missing)
## Surrogate splits:
## Duration.of.Credit..month. < 19 to the left, agree=0.736, adj=0.464, (0 split)
## Purpose < 1.5 to the right, agree=0.614, adj=0.216, (0 split)
## Instalment.per.cent < 3.5 to the right, agree=0.614, adj=0.216, (0 split)
## Telephone < 1.5 to the left, agree=0.594, adj=0.175, (0 split)
## Sex...Marital.Status < 3.5 to the right, agree=0.589, adj=0.165, (0 split)
##
## Node number 7: 83 observations
## mean=3.108434, MSE=2.988242
##
## Node number 10: 100 observations
## mean=1.72, MSE=1.4216
##
## Node number 11: 154 observations, complexity param=0.01578068
## mean=2.292208, MSE=2.752277
## left son=22 (143 obs) right son=23 (11 obs)
## Primary splits:
## Duration.of.Credit..month. < 45 to the left, improve=0.10962830, (0 missing)
## Credit.Amount < 6435.5 to the left, improve=0.05072811, (0 missing)
## Guarantors < 2.5 to the right, improve=0.03605021, (0 missing)
## Instalment.per.cent < 3.5 to the left, improve=0.02954390, (0 missing)
## Sex...Marital.Status < 3.5 to the right, improve=0.02244850, (0 missing)
## Surrogate splits:
## Credit.Amount < 7527 to the left, agree=0.955, adj=0.364, (0 split)
## Purpose < 9.5 to the left, agree=0.942, adj=0.182, (0 split)
##
## Node number 12: 97 observations
## mean=2, MSE=2.041237
##
## Node number 13: 100 observations, complexity param=0.01732279
## mean=2.8, MSE=3.2
## left son=26 (63 obs) right son=27 (37 obs)
## Primary splits:
## Duration.in.Current.address < 3.5 to the left, improve=0.12389530, (0 missing)
## Purpose < 1.5 to the right, improve=0.05654187, (0 missing)
## Payment.Status.of.Previous.Credit < 2.5 to the right, improve=0.05442834, (0 missing)
## No.of.dependents < 1.5 to the left, improve=0.03915989, (0 missing)
## Age..years. < 28.5 to the left, improve=0.03654746, (0 missing)
## Surrogate splits:
## Type.of.apartment < 1.5 to the right, agree=0.70, adj=0.189, (0 split)
## Age..years. < 24.5 to the right, agree=0.68, adj=0.135, (0 split)
## Duration.of.Credit..month. < 57 to the left, agree=0.65, adj=0.054, (0 split)
## Credit.Amount < 2203.5 to the right, agree=0.65, adj=0.054, (0 split)
## Sex...Marital.Status < 2.5 to the right, agree=0.64, adj=0.027, (0 split)
##
## Node number 22: 143 observations
## mean=2.13986, MSE=2.525894
##
## Node number 23: 11 observations
## mean=4.272727, MSE=1.471074
##
## Node number 26: 63 observations
## mean=2.31746, MSE=2.788108
##
## Node number 27: 37 observations, complexity param=0.01058861
## mean=3.621622, MSE=2.829803
## left son=54 (25 obs) right son=55 (12 obs)
## Primary splits:
## Credit.Amount < 3130 to the right, improve=0.08591981, (0 missing)
## Type.of.apartment < 1.5 to the right, improve=0.08591981, (0 missing)
## Purpose < 1.5 to the right, improve=0.05952200, (0 missing)
## Length.of.current.employment < 3.5 to the right, improve=0.04210967, (0 missing)
## Age..years. < 26.5 to the left, improve=0.03510928, (0 missing)
## Surrogate splits:
## Duration.of.Credit..month. < 13.5 to the right, agree=0.703, adj=0.083, (0 split)
## Most.valuable.available.asset < 1.5 to the right, agree=0.703, adj=0.083, (0 split)
##
## Node number 54: 25 observations, complexity param=0.01058861
## mean=3.28, MSE=3.1616
## left son=108 (10 obs) right son=109 (15 obs)
## Primary splits:
## Credit.Amount < 4643 to the left, improve=0.40156880, (0 missing)
## Type.of.apartment < 1.5 to the right, improve=0.10627900, (0 missing)
## Purpose < 1.5 to the right, improve=0.10295300, (0 missing)
## Telephone < 1.5 to the left, improve=0.09337396, (0 missing)
## Length.of.current.employment < 3.5 to the right, improve=0.09055728, (0 missing)
## Surrogate splits:
## Instalment.per.cent < 2.5 to the right, agree=0.76, adj=0.4, (0 split)
## Most.valuable.available.asset < 1.5 to the left, agree=0.72, adj=0.3, (0 split)
## Duration.of.Credit..month. < 9 to the left, agree=0.68, adj=0.2, (0 split)
## Payment.Status.of.Previous.Credit < 3.5 to the right, agree=0.68, adj=0.2, (0 split)
## Purpose < 1.5 to the right, agree=0.64, adj=0.1, (0 split)
##
## Node number 55: 12 observations
## mean=4.333333, MSE=1.388889
##
## Node number 108: 10 observations
## mean=1.9, MSE=1.89
##
## Node number 109: 15 observations
## mean=4.2, MSE=1.893333
Visualization
rpart.plot(m.rpart, digits = 3, type = 1)

Let’s look at the relation between Account.Balance and Value.Savings.Stocks
xtabs(~ Account.Balance + Value.Savings.Stocks, data = credit)
## Value.Savings.Stocks
## Account.Balance 1 2 3 4 5
## 1 219 12 8 6 29
## 2 152 47 11 14 45
## 3 41 5 4 3 10
## 4 191 39 40 25 99
credit$Value <- as.factor(credit$Value.Savings.Stocks)
credit$Account <- as.factor(credit$Account.Balance)
Let’s now create a diversity plot using Age..years and Account.Balance:
credit %>% ggplot(aes(x=Age..years., fill = Account.Balance)) +
geom_density(alpha=0.8, color = "blue") + ggtitle("Density Plot")

Notice that the account balance was greatest around age 25 years.
Data Partition
Let us use the naive_bayes() function on our earlier training data to build the Naive Bayes Classifier:
sum(is.na(credit))
## [1] 0
convert_counts <- function(credit_train) {credit_train <- ifelse (credit_train > 0, "Yes","No")}
Use the Naive Bayes function on our earlier Training Data
set.seed(12345)
credit_rand <- credit[order(runif(1000)), ]
credit_train <- credit_rand[1:750, ]
credit_test <- credit_rand[751:1000, ]
Naive Bayes Model
model <- naive_bayes(credit$Account ~ credit$Value, data = credit_train)
model
## ===================== Naive Bayes =====================
## Call:
## naive_bayes.formula(formula = credit$Account ~ credit$Value,
## data = credit_train)
##
## A priori probabilities:
##
## 1 2 3 4
## 0.274 0.269 0.063 0.394
##
## Tables:
##
## credit$Value 1 2 3 4
## 1 0.79927007 0.56505576 0.65079365 0.48477157
## 2 0.04379562 0.17472119 0.07936508 0.09898477
## 3 0.02919708 0.04089219 0.06349206 0.10152284
## 4 0.02189781 0.05204461 0.04761905 0.06345178
## 5 0.10583942 0.16728625 0.15873016 0.25126904
Plot model:
plot(model)

Show Confusion Matrix of Credit Predictions: