Decision Tree

data(iris)
View(iris)
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.2
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris )
fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin=0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v= 2.45, col='orange')
abline(h= 1.75, col='blue')

predicted <- predict(fit, iris[,-5], type= 'class')
table(iris[,5], predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45

Logistic Regression

iris.data <- iris[iris$Species != 'setosa', ]
str(iris.data$Species)
##  Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
iris.data$Species <- factor(iris.data$Species)
fit <- glm(Species ~ ., family = 'binomial', data = iris.data)
fit
## 
## Call:  glm(formula = Species ~ ., family = "binomial", data = iris.data)
## 
## Coefficients:
##  (Intercept)  Sepal.Length   Sepal.Width  Petal.Length   Petal.Width  
##      -42.638        -2.465        -6.681         9.429        18.286  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  95 Residual
## Null Deviance:       138.6 
## Residual Deviance: 11.9  AIC: 21.9

SVM

library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
data(iris)
fit <- svm(Species ~ . , data = iris,family=binomial)
?svm
## starting httpd help server ... done
summary(fit)
## 
## Call:
## svm(formula = Species ~ ., data = iris, family = binomial)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.25 
## 
## Number of Support Vectors:  51
## 
##  ( 8 22 21 )
## 
## 
## Number of Classes:  3 
## 
## Levels: 
##  setosa versicolor virginica
predicted <- predict(fit, iris[,-5])
table(iris[,5], predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48

Creditability Prediction

trainset <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/Training50.csv')
head(trainset)
##     X Creditability Account.Balance Duration.of.Credit..month.
## 1 497             1               3                          6
## 2 756             0               1                         15
## 3 580             0               1                         42
## 4 833             0               3                         36
## 5 602             1               3                         24
## 6 734             1               1                         15
##   Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1                                 2       3          2108
## 2                                 1       4           950
## 3                                 2       3          7174
## 4                                 3       4          7980
## 5                                 3       2          2028
## 6                                 2       4          2511
##   Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1                    1                            3                   2
## 2                    1                            4                   4
## 3                    4                            3                   4
## 4                    4                            1                   4
## 5                    1                            3                   2
## 6                    1                            1                   1
##   Sex...Marital.Status Guarantors Duration.in.Current.address
## 1                    3          1                           2
## 2                    2          1                           3
## 3                    1          1                           3
## 4                    2          1                           4
## 5                    2          1                           2
## 6                    1          1                           4
##   Most.valuable.available.asset Age..years. Concurrent.Credits
## 1                             1          29                  2
## 2                             3          33                  2
## 3                             3          30                  2
## 4                             3          27                  2
## 5                             2          30                  2
## 6                             3          23                  2
##   Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1                 1                          1          1                1
## 2                 1                          2          1                2
## 3                 2                          1          1                1
## 4                 1                          2          1                1
## 5                 2                          2          1                1
## 6                 1                          1          1                1
##   Telephone Foreign.Worker
## 1         1              1
## 2         1              1
## 3         2              1
## 4         2              1
## 5         1              1
## 6         1              1
library(rpart)
fmla <- as.formula(paste("Creditability ~ ", paste(names(trainset), collapse= "+")))

model <- rpart(Creditability ~ Account.Balance+Duration.of.Credit..month.+Payment.Status.of.Previous.Credit+Purpose+Credit.Amount+Value.Savings.Stocks+Length.of.current.employment+Instalment.per.cent+Sex...Marital.Status+Guarantors+Duration.in.Current.address+Most.valuable.available.asset+Age..years.+Concurrent.Credits+Type.of.apartment+No.of.Credits.at.this.Bank+Occupation+No.of.dependents+Telephone, data=trainset, method = 'class') 


plot(model, margin = 0.1)
text(model)

testset <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/Test50.csv')

predicted <- predict(model, testset, type='class')

## Accuracy
sum(predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.71
## Confusion Matrix
table(testset$Creditability, predicted)
##    predicted
##       0   1
##   0  64  93
##   1  52 291
predicted <- predict(model, testset, type='prob' )
head(predicted)
##           0         1
## 1 0.2134831 0.7865169
## 2 0.2134831 0.7865169
## 3 0.8000000 0.2000000
## 4 0.2134831 0.7865169
## 5 0.2134831 0.7865169
## 6 0.2134831 0.7865169
## ROC Curve
fpr_ary <- c(0)
tpr_ary <- c(0)
for (thresh in seq(0,1,0.1)){
  res <- as.factor(ifelse(predicted[,1]>= thresh, 0, 1 ))
  tb  <- table(testset$Creditability, res)
  if (ncol(tb) == 2){
    tp  <- tb[1,1]
    fp  <- tb[1,2]
    fn  <- tb[2,1]
    tn  <- tb[2,2]
    FPR <- fp / (fp + tn)
    TPR <- tp / (tp + fn)
    fpr_ary <- c(fpr_ary, FPR)
    tpr_ary <- c(tpr_ary, TPR)
  }
}
fpr_ary <- c(fpr_ary, 1)
tpr_ary <- c(tpr_ary, 1)
plot(fpr_ary, tpr_ary, type= 'b')

使用ROCR

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(model, testset, type="prob")
pred.to.roc <- predictions[, 2] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability)) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

使用RandomForest

library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
head(trainset)
##     X Creditability Account.Balance Duration.of.Credit..month.
## 1 497             1               3                          6
## 2 756             0               1                         15
## 3 580             0               1                         42
## 4 833             0               3                         36
## 5 602             1               3                         24
## 6 734             1               1                         15
##   Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1                                 2       3          2108
## 2                                 1       4           950
## 3                                 2       3          7174
## 4                                 3       4          7980
## 5                                 3       2          2028
## 6                                 2       4          2511
##   Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1                    1                            3                   2
## 2                    1                            4                   4
## 3                    4                            3                   4
## 4                    4                            1                   4
## 5                    1                            3                   2
## 6                    1                            1                   1
##   Sex...Marital.Status Guarantors Duration.in.Current.address
## 1                    3          1                           2
## 2                    2          1                           3
## 3                    1          1                           3
## 4                    2          1                           4
## 5                    2          1                           2
## 6                    1          1                           4
##   Most.valuable.available.asset Age..years. Concurrent.Credits
## 1                             1          29                  2
## 2                             3          33                  2
## 3                             3          30                  2
## 4                             3          27                  2
## 5                             2          30                  2
## 6                             3          23                  2
##   Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1                 1                          1          1                1
## 2                 1                          2          1                2
## 3                 2                          1          1                1
## 4                 1                          2          1                1
## 5                 2                          2          1                1
## 6                 1                          1          1                1
##   Telephone Foreign.Worker
## 1         1              1
## 2         1              1
## 3         2              1
## 4         2              1
## 5         1              1
## 6         1              1
trainset$X <- NULL
trainset$Creditability <- as.factor(trainset$Creditability)

forest <- randomForest(Creditability ~. , data = trainset, ntree = 200, proximity = TRUE, importance = TRUE)

forest.predicted <- predict(forest, testset, type = 'class')


## Accuracy
sum(testset$Creditability == forest.predicted) / length(testset$Creditability)
## [1] 0.734
## Confusion Matirx
table(testset$Creditability, forest.predicted)
##    forest.predicted
##       0   1
##   0  53 104
##   1  29 314

 比較ROC Curve

## Tree Model
model <- rpart(Creditability ~ ., data=trainset, method = 'class') 
predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2] 
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability)) 
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

# Forest Model
forest <- randomForest(Creditability ~. , data = trainset, ntree = 200, proximity = TRUE, importance = TRUE)
predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2] 
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability)) 
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")

plot(perf.tpr.rocr1,main='ROC Curve', col='blue')
plot(perf.tpr.rocr2, col='red', add=TRUE)
legend(0.7, 0.2, c('rpart', 'randomforest'), c('blue', 'red'))

forest$importance
##                                               0             1
## Account.Balance                    0.0432052823  0.0231470071
## Duration.of.Credit..month.         0.0256940366  0.0144040693
## Payment.Status.of.Previous.Credit  0.0161351180  0.0125862252
## Purpose                            0.0046621694  0.0087867339
## Credit.Amount                      0.0091725847  0.0146548502
## Value.Savings.Stocks               0.0069316613  0.0075492841
## Length.of.current.employment       0.0027370048  0.0020513463
## Instalment.per.cent                0.0011021482  0.0044041472
## Sex...Marital.Status               0.0029031263  0.0035456861
## Guarantors                         0.0001390153  0.0050687831
## Duration.in.Current.address        0.0027588414  0.0039364569
## Most.valuable.available.asset     -0.0030726926  0.0077956527
## Age..years.                        0.0007688531  0.0036482221
## Concurrent.Credits                 0.0043023228  0.0027989500
## Type.of.apartment                 -0.0017054390  0.0033841708
## No.of.Credits.at.this.Bank        -0.0029604113  0.0033911064
## Occupation                         0.0000000000  0.0000000000
## No.of.dependents                   0.0020504720  0.0009278489
## Telephone                          0.0010695400 -0.0012892742
## Foreign.Worker                    -0.0002461548 -0.0007122646
##                                   MeanDecreaseAccuracy MeanDecreaseGini
## Account.Balance                           0.0288508902        18.191664
## Duration.of.Credit..month.                0.0174423016        22.105354
## Payment.Status.of.Previous.Credit         0.0137004723        13.018677
## Purpose                                   0.0076342221        10.954497
## Credit.Amount                             0.0129798424        28.897949
## Value.Savings.Stocks                      0.0075703192         9.462563
## Length.of.current.employment              0.0020922541        10.289176
## Instalment.per.cent                       0.0033943828         9.517532
## Sex...Marital.Status                      0.0033880621         7.190769
## Guarantors                                0.0036134740         3.499224
## Duration.in.Current.address               0.0035755590        10.762701
## Most.valuable.available.asset             0.0046105562        10.505516
## Age..years.                               0.0025807659        22.431389
## Concurrent.Credits                        0.0031393798         4.898788
## Type.of.apartment                         0.0018763603         5.366175
## No.of.Credits.at.this.Bank                0.0015616739         4.062593
## Occupation                                0.0000000000         0.000000
## No.of.dependents                          0.0012139988         3.843355
## Telephone                                -0.0007166337         4.304294
## Foreign.Worker                           -0.0005826014         0.818816