# Entropy
# Two-class entropy in bits, -(p1 * log2(p1) + p2 * log2(p2)),
# for class proportions 1/3 and 2/3.
p_g <- c(1 / 3, 2 / 3)
ColorG <- -(p_g[1] * log2(p_g[1]) + p_g[2] * log2(p_g[2]))
ColorG
## [1] 0.9182958
# Same formula for class proportions 8/13 and 5/13.
p_y <- c(8 / 13, 5 / 13)
ColorY <- -(p_y[1] * log2(p_y[1]) + p_y[2] * log2(p_y[2]))
ColorY
## [1] 0.9612366
## Credit Data Frame
# Import data
# NOTE(review): readr is attached here, but the import below uses read.csv();
# no readr function is called in the code visible in this section.
library(readr)
# Read the credit data; character columns are converted to factors.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this file exists at exactly this location.
credit <- read.csv("C:/Users/angel/Documents/RPractice/creditS.csv", stringsAsFactors = TRUE)
# View() opens a data viewer, so only call it in an interactive session;
# scripted / headless runs skip it instead of stopping on a viewer error.
if (interactive()) {
  View(credit)
}
# Attribute entropy
# Inspect the structure of the imported data set
str(credit)
## 'data.frame': 1001 obs. of 21 variables:
## $ checking_balance : Factor w/ 5 levels "< 0 DM","> 200 DM",..: 5 3 1 1 1 3 5 1 3 5 ...
## $ months_loan_duration: int 6 9 6 12 6 6 6 12 15 12 ...
## $ credit_history : Factor w/ 6 levels "","critical",..: 2 6 2 5 6 2 6 2 6 5 ...
## $ purpose : Factor w/ 11 levels "","business",..: 3 3 9 11 5 3 9 9 6 9 ...
## $ amount : int 250 276 338 339 343 362 368 385 392 409 ...
## $ savings_balance : Factor w/ 6 levels "","< 100 DM",..: 3 2 5 2 2 4 6 2 2 3 ...
## $ employment_length : Factor w/ 6 levels "","> 7 yrs","0 - 1 yrs",..: 4 4 2 2 3 4 2 5 3 4 ...
## $ installment_rate : int 2 4 4 4 4 4 4 4 4 3 ...
## $ personal_status : Factor w/ 5 levels "","divorced male",..: 3 4 5 4 3 3 5 3 3 3 ...
## $ other_debtors : Factor w/ 4 levels "","co-applicant",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ residence_history : int 2 4 4 1 1 4 4 3 4 3 ...
## $ property : Factor w/ 5 levels "","building society savings",..: 4 4 3 3 4 3 2 4 2 4 ...
## $ age : int 41 22 52 45 27 52 38 58 23 42 ...
## $ installment_plan : Factor w/ 4 levels "","bank","none",..: 2 3 3 2 3 3 3 3 3 3 ...
## $ housing : Factor w/ 4 levels "","for free",..: 3 4 3 3 3 3 3 3 4 4 ...
## $ existing_credits : int 2 1 2 1 1 2 1 4 1 2 ...
## $ default : int 1 1 1 1 1 1 1 1 1 1 ...
## $ dependents : int 1 1 1 1 1 1 1 1 1 1 ...
## $ telephone : Factor w/ 3 levels "","none","yes": 2 2 2 2 2 2 2 3 3 2 ...
## $ foreign_worker : Factor w/ 3 levels "","no","yes": 3 3 3 3 3 3 3 3 3 3 ...
## $ job : Factor w/ 5 levels "","mangement self-employed",..: 5 5 3 5 3 5 3 5 3 3 ...
# Drop row 1001 (str() above reports 1001 observations, so this is the last
# row). Factor levels are left as they are; unused ones stay attached.
row_to_drop <- 1001
credit <- credit[-row_to_drop, ]
# Overwrite the checking_balance value (first column) of row 155 with an
# existing level, then echo the cell to confirm the change.
credit[155, "checking_balance"] <- "1 - 200 DM"
credit[155, "checking_balance"]
## [1] 1 - 200 DM
## Levels: < 0 DM > 200 DM 1 - 200 DM checking_balance unknown
# Counts per checking_balance level, then the same counts as proportions
checking_counts <- table(credit[["checking_balance"]])
checking_counts
##
## < 0 DM > 200 DM 1 - 200 DM checking_balance
## 274 63 269 0
## unknown
## 394
prop.table(checking_counts)
##
## < 0 DM > 200 DM 1 - 200 DM checking_balance
## 0.274 0.063 0.269 0.000
## unknown
## 0.394
str(credit[["checking_balance"]])
## Factor w/ 5 levels "< 0 DM","> 200 DM",..: 5 3 1 1 1 3 5 1 3 5 ...
# Counts per savings_balance level, then the same counts as proportions
savings_counts <- table(credit[["savings_balance"]])
savings_counts
##
## < 100 DM > 1000 DM 101 - 500 DM 501 - 1000 DM
## 0 603 48 103 63
## unknown
## 183
prop.table(savings_counts)
##
## < 100 DM > 1000 DM 101 - 500 DM 501 - 1000 DM
## 0.000 0.603 0.048 0.103 0.063
## unknown
## 0.183
# Five-number summary plus mean of the loan duration column
summary(credit[["months_loan_duration"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 12.0 18.0 20.9 24.0 72.0
# Five-number summary plus mean of the loan amount column
summary(credit[["amount"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# The class column is read in as an integer ...
str(credit[["default"]])
## int [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
# ... so convert it to a factor (levels "1" and "2") before modelling.
credit[["default"]] <- factor(credit[["default"]])
str(credit[["default"]])
## Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
# Class counts and class proportions
default_counts <- table(credit[["default"]])
default_counts
##
## 1 2
## 700 300
prop.table(default_counts)
##
## 1 2
## 0.7 0.3
# Create a random training and test dataset
# Shuffle the rows by ordering them on one uniform random draw per row.
# The number of draws is taken from the data instead of a literal 1000, so a
# different row count can no longer produce NA rows or silently drop rows.
# With the current 1000 rows the permutation is unchanged.
set.seed(12345)
n_rows <- nrow(credit)
credit_rand <- credit[order(runif(n_rows)), ]
head(credit_rand)
## checking_balance months_loan_duration credit_history purpose
## 14 > 200 DM 6 fully repaid this bank education
## 448 unknown 6 critical car (new)
## 697 < 0 DM 12 critical radio/tv
## 32 1 - 200 DM 12 repaid car (new)
## 196 unknown 24 repaid business
## 83 unknown 18 repaid radio/tv
## amount savings_balance employment_length installment_rate personal_status
## 14 433 > 1000 DM 0 - 1 yrs 4 female
## 448 2080 501 - 1000 DM 1 - 4 yrs 1 married male
## 697 3573 < 100 DM 1 - 4 yrs 1 female
## 32 640 < 100 DM 1 - 4 yrs 4 divorced male
## 196 1258 < 100 DM 4 - 7 yrs 4 single male
## 83 866 < 100 DM 1 - 4 yrs 4 married male
## other_debtors residence_history property age
## 14 none 2 building society savings 24
## 448 none 2 other 24
## 697 none 1 real estate 23
## 32 none 2 real estate 49
## 196 none 1 real estate 25
## 83 guarantor 2 real estate 25
## installment_plan housing existing_credits default dependents telephone
## 14 bank rent 1 2 2 none
## 448 none own 1 1 1 none
## 697 none own 1 1 1 none
## 32 none own 1 1 1 none
## 196 none own 1 1 1 yes
## 83 none own 1 1 1 none
## foreign_worker job
## 14 yes skilled employee
## 448 yes skilled employee
## 697 yes unskilled resident
## 32 yes unskilled resident
## 196 yes skilled employee
## 83 yes unskilled resident
# training and test
# The first 900 shuffled rows train the model; every remaining row is held
# out for testing. The upper bound comes from the data rather than a literal
# 1000, and the guard stops early if there would be no test rows at all.
n_train <- 900
stopifnot(nrow(credit_rand) > n_train)
credit_train <- credit_rand[seq_len(n_train), ]
credit_test <- credit_rand[(n_train + 1):nrow(credit_rand), ]
# Bring c50 package
# Usage: m <- C5.0(train, class, trials = 1, costs = NULL)
library(C50)

# Returns `col` without its blank ("") factor level when no observation uses
# that level; non-factor columns and all other levels are returned untouched.
drop_unused_blank_level <- function(col) {
  has_blank_level <- is.factor(col) && "" %in% levels(col)
  if (has_blank_level && !any(col == "", na.rm = TRUE)) {
    col <- factor(col, levels = setdiff(levels(col), ""))
  }
  col
}

# Build the predictor frame:
#  * exclude the class column `default` -- it was previously passed inside
#    `x`, so the outcome itself was offered to the tree as a predictor;
#  * strip unused blank factor levels, which are still attached to the factor
#    columns and are a likely cause of the C5.0 backend aborting.
# NOTE(review): the earlier run failed with "c50 code called exit with value
# 1" and a tree of size 0; confirm the model now fits on the real data.
predictor_names <- setdiff(names(credit_train), "default")
credit_predictors <- credit_train[, predictor_names, drop = FALSE]
credit_predictors[] <- lapply(credit_predictors, drop_unused_blank_level)

m <- C5.0(credit_predictors, credit_train$default, trials = 1, costs = NULL)
m
# NOTE(review): the output previously pasted here (21 predictors, tree size 0)
# belonged to the failing call and was removed; re-run to regenerate it.