# Entropy
# Shannon entropy, in bits, of a discrete distribution.
#   p: numeric vector of class probabilities.
# Returns -sum(p * log2(p)). Zero probabilities are skipped so they contribute
# nothing instead of producing NaN (0 * log2(0) evaluates to NaN in R).
entropy <- function(p) {
  p <- p[p > 0]
  -sum(p * log2(p))
}

# Entropy of a two-class split with proportions 1/3 and 2/3
ColorG <- entropy(c(1 / 3, 2 / 3))
ColorG
## [1] 0.9182958
# Entropy of a two-class split with proportions 8/13 and 5/13
ColorY <- entropy(c(8 / 13, 5 / 13))
ColorY
## [1] 0.9612366
## Credit Data Frame
# Import data
library(readr)
# Read the credit data; text columns are converted to factors.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where the file exists at exactly this location.
credit <- read.csv(
  "C:/Users/angel/Documents/RPractice/creditS.csv",
  stringsAsFactors = TRUE
)
# View() opens a spreadsheet-style viewer, which is only meaningful in an
# interactive session; skip it when the script is run non-interactively.
if (interactive()) {
  View(credit)
}

# Attribute entropy
# exploring data set
str(credit)
## 'data.frame':    1001 obs. of  21 variables:
##  $ checking_balance    : Factor w/ 5 levels "< 0 DM","> 200 DM",..: 5 3 1 1 1 3 5 1 3 5 ...
##  $ months_loan_duration: int  6 9 6 12 6 6 6 12 15 12 ...
##  $ credit_history      : Factor w/ 6 levels "","critical",..: 2 6 2 5 6 2 6 2 6 5 ...
##  $ purpose             : Factor w/ 11 levels "","business",..: 3 3 9 11 5 3 9 9 6 9 ...
##  $ amount              : int  250 276 338 339 343 362 368 385 392 409 ...
##  $ savings_balance     : Factor w/ 6 levels "","< 100 DM",..: 3 2 5 2 2 4 6 2 2 3 ...
##  $ employment_length   : Factor w/ 6 levels "","> 7 yrs","0 - 1 yrs",..: 4 4 2 2 3 4 2 5 3 4 ...
##  $ installment_rate    : int  2 4 4 4 4 4 4 4 4 3 ...
##  $ personal_status     : Factor w/ 5 levels "","divorced male",..: 3 4 5 4 3 3 5 3 3 3 ...
##  $ other_debtors       : Factor w/ 4 levels "","co-applicant",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ residence_history   : int  2 4 4 1 1 4 4 3 4 3 ...
##  $ property            : Factor w/ 5 levels "","building society savings",..: 4 4 3 3 4 3 2 4 2 4 ...
##  $ age                 : int  41 22 52 45 27 52 38 58 23 42 ...
##  $ installment_plan    : Factor w/ 4 levels "","bank","none",..: 2 3 3 2 3 3 3 3 3 3 ...
##  $ housing             : Factor w/ 4 levels "","for free",..: 3 4 3 3 3 3 3 3 4 4 ...
##  $ existing_credits    : int  2 1 2 1 1 2 1 4 1 2 ...
##  $ default             : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ dependents          : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ telephone           : Factor w/ 3 levels "","none","yes": 2 2 2 2 2 2 2 3 3 2 ...
##  $ foreign_worker      : Factor w/ 3 levels "","no","yes": 3 3 3 3 3 3 3 3 3 3 ...
##  $ job                 : Factor w/ 5 levels "","mangement self-employed",..: 5 5 3 5 3 5 3 5 3 3 ...
# Exclude row 1001 (a negative index removes that row and keeps the rest).
credit <- credit[-1001, ]
# Overwrite the checking_balance value (first column) of row 155, then echo
# the cell to confirm the new value.
credit[155, "checking_balance"] <- factor("1 - 200 DM")
credit[155, "checking_balance"]
## [1] 1 - 200 DM
## Levels: < 0 DM > 200 DM 1 - 200 DM checking_balance unknown
# Counts and shares of each checking_balance category
checking_counts <- table(credit[["checking_balance"]])
checking_counts
## 
##           < 0 DM         > 200 DM       1 - 200 DM checking_balance 
##              274               63              269                0 
##          unknown 
##              394
prop.table(checking_counts)
## 
##           < 0 DM         > 200 DM       1 - 200 DM checking_balance 
##            0.274            0.063            0.269            0.000 
##          unknown 
##            0.394
str(credit[["checking_balance"]])
##  Factor w/ 5 levels "< 0 DM","> 200 DM",..: 5 3 1 1 1 3 5 1 3 5 ...
# Counts and shares of each savings_balance category
savings_counts <- table(credit[["savings_balance"]])
savings_counts
## 
##                    < 100 DM     > 1000 DM  101 - 500 DM 501 - 1000 DM 
##             0           603            48           103            63 
##       unknown 
##           183
prop.table(savings_counts)
## 
##                    < 100 DM     > 1000 DM  101 - 500 DM 501 - 1000 DM 
##         0.000         0.603         0.048         0.103         0.063 
##       unknown 
##         0.183
# Five-number summaries (plus mean) of the two numeric loan features
summary(credit[["months_loan_duration"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0    12.0    18.0    20.9    24.0    72.0
summary(credit[["amount"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424
# The class label is stored as an integer; inspect it, recode it as a factor,
# then check the resulting class distribution.
str(credit[["default"]])
##  int [1:1000] 1 1 1 1 1 1 1 1 1 1 ...
credit[["default"]] <- factor(credit[["default"]])
str(credit[["default"]])
##  Factor w/ 2 levels "1","2": 1 1 1 1 1 1 1 1 1 1 ...
default_counts <- table(credit[["default"]])
default_counts
## 
##   1   2 
## 700 300
prop.table(default_counts)
## 
##   1   2 
## 0.7 0.3
# Create a random training and test dataset
# Shuffle the rows: draw one uniform random number per row and reorder the
# rows by the sort order of those draws. The fixed seed makes the permutation
# reproducible.
set.seed(12345)
# Draw exactly nrow(credit) numbers instead of a literal count, so every row
# is shuffled exactly once even if the number of rows changes.
credit_rand <- credit[order(runif(nrow(credit))), ]
head(credit_rand)
##     checking_balance months_loan_duration         credit_history   purpose
## 14          > 200 DM                    6 fully repaid this bank education
## 448          unknown                    6               critical car (new)
## 697           < 0 DM                   12               critical  radio/tv
## 32        1 - 200 DM                   12                 repaid car (new)
## 196          unknown                   24                 repaid  business
## 83           unknown                   18                 repaid  radio/tv
##     amount savings_balance employment_length installment_rate personal_status
## 14     433       > 1000 DM         0 - 1 yrs                4          female
## 448   2080   501 - 1000 DM         1 - 4 yrs                1    married male
## 697   3573        < 100 DM         1 - 4 yrs                1          female
## 32     640        < 100 DM         1 - 4 yrs                4   divorced male
## 196   1258        < 100 DM         4 - 7 yrs                4     single male
## 83     866        < 100 DM         1 - 4 yrs                4    married male
##     other_debtors residence_history                 property age
## 14           none                 2 building society savings  24
## 448          none                 2                    other  24
## 697          none                 1              real estate  23
## 32           none                 2              real estate  49
## 196          none                 1              real estate  25
## 83      guarantor                 2              real estate  25
##     installment_plan housing existing_credits default dependents telephone
## 14              bank    rent                1       2          2      none
## 448             none     own                1       1          1      none
## 697             none     own                1       1          1      none
## 32              none     own                1       1          1      none
## 196             none     own                1       1          1       yes
## 83              none     own                1       1          1      none
##     foreign_worker                job
## 14             yes   skilled employee
## 448            yes   skilled employee
## 697            yes unskilled resident
## 32             yes unskilled resident
## 196            yes   skilled employee
## 83             yes unskilled resident
# training and test
# The first 900 shuffled rows form the training set; every remaining row is
# held out for testing. The test rows are derived from the data rather than a
# literal upper bound, so no row is silently dropped or padded with NA.
n_train <- 900
stopifnot(nrow(credit_rand) > n_train)
credit_train <- credit_rand[seq_len(n_train), ]
credit_test <- credit_rand[-seq_len(n_train), ]

# Bring c50 package
# m <- C5.0(train, class, trials = 1, costs = NULL)
library(C50)
# Predictors are every column except the class label `default`. Passing the
# whole data frame as `x` would hand the label to the model as a predictor
# (target leakage).
predictor_cols <- setdiff(names(credit_train), "default")
# Drop factor levels that no training row uses. Several factors carry such
# levels (e.g. the empty string "" and "checking_balance").
# NOTE(review): these empty levels are the likely cause of the earlier
# "exit with value 1" failure -- confirm by re-running.
# NOTE(review): a level that occurs only in credit_test would also be dropped
# here -- verify before predicting on the test set.
train_predictors <- droplevels(credit_train[predictor_cols])
m <- C5.0(train_predictors, credit_train$default, trials = 1, costs = NULL)
m
# The earlier call, C5.0(credit_train, credit_train$default, ...), reported
# "c50 code called exit with value 1" and printed an empty tree
# (21 predictors, Tree size: 0). Re-run this section to record the output of
# the corrected call.