Decision Tree
data(iris)
View(iris)
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.2
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris )
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin=0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v= 2.45, col='orange')
abline(h= 1.75, col='blue')

predicted <- predict(fit, iris[,-5], type= 'class')
table(iris[,5], predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
Logistic Regression
iris.data <- iris[iris$Species != 'setosa', ]
str(iris.data$Species)
## Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
iris.data$Species <- factor(iris.data$Species)
fit <- glm(Species ~ ., family = 'binomial', data = iris.data)
fit
##
## Call: glm(formula = Species ~ ., family = "binomial", data = iris.data)
##
## Coefficients:
## (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
## -42.638 -2.465 -6.681 9.429 18.286
##
## Degrees of Freedom: 99 Total (i.e. Null); 95 Residual
## Null Deviance: 138.6
## Residual Deviance: 11.9 AIC: 21.9
SVM
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
data(iris)
fit <- svm(Species ~ . , data = iris,family=binomial)
?svm
## starting httpd help server ... done
summary(fit)
##
## Call:
## svm(formula = Species ~ ., data = iris, family = binomial)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 51
##
## ( 8 22 21 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
predicted <- predict(fit, iris[,-5])
table(iris[,5], predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
Creditability Prediction
trainset <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/Training50.csv')
head(trainset)
## X Creditability Account.Balance Duration.of.Credit..month.
## 1 497 1 3 6
## 2 756 0 1 15
## 3 580 0 1 42
## 4 833 0 3 36
## 5 602 1 3 24
## 6 734 1 1 15
## Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1 2 3 2108
## 2 1 4 950
## 3 2 3 7174
## 4 3 4 7980
## 5 3 2 2028
## 6 2 4 2511
## Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1 1 3 2
## 2 1 4 4
## 3 4 3 4
## 4 4 1 4
## 5 1 3 2
## 6 1 1 1
## Sex...Marital.Status Guarantors Duration.in.Current.address
## 1 3 1 2
## 2 2 1 3
## 3 1 1 3
## 4 2 1 4
## 5 2 1 2
## 6 1 1 4
## Most.valuable.available.asset Age..years. Concurrent.Credits
## 1 1 29 2
## 2 3 33 2
## 3 3 30 2
## 4 3 27 2
## 5 2 30 2
## 6 3 23 2
## Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1 1 1 1 1
## 2 1 2 1 2
## 3 2 1 1 1
## 4 1 2 1 1
## 5 2 2 1 1
## 6 1 1 1 1
## Telephone Foreign.Worker
## 1 1 1
## 2 1 1
## 3 2 1
## 4 2 1
## 5 1 1
## 6 1 1
library(rpart)
fmla <- as.formula(paste("Creditability ~ ", paste(names(trainset), collapse= "+")))
model <- rpart(Creditability ~ Account.Balance+Duration.of.Credit..month.+Payment.Status.of.Previous.Credit+Purpose+Credit.Amount+Value.Savings.Stocks+Length.of.current.employment+Instalment.per.cent+Sex...Marital.Status+Guarantors+Duration.in.Current.address+Most.valuable.available.asset+Age..years.+Concurrent.Credits+Type.of.apartment+No.of.Credits.at.this.Bank+Occupation+No.of.dependents+Telephone, data=trainset, method = 'class')
plot(model, margin = 0.1)
text(model)

testset <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/Test50.csv')
predicted <- predict(model, testset, type='class')
## Accuracy
sum(predicted == testset$Creditability) / length(testset$Creditability)
## [1] 0.71
## Confusion Matrix
table(testset$Creditability, predicted)
## predicted
## 0 1
## 0 64 93
## 1 52 291
predicted <- predict(model, testset, type='prob' )
head(predicted)
## 0 1
## 1 0.2134831 0.7865169
## 2 0.2134831 0.7865169
## 3 0.8000000 0.2000000
## 4 0.2134831 0.7865169
## 5 0.2134831 0.7865169
## 6 0.2134831 0.7865169
## ROC Curve
fpr_ary <- c(0)
tpr_ary <- c(0)
for (thresh in seq(0,1,0.1)){
res <- as.factor(ifelse(predicted[,1]>= thresh, 0, 1 ))
tb <- table(testset$Creditability, res)
if (ncol(tb) == 2){
tp <- tb[1,1]
fp <- tb[1,2]
fn <- tb[2,1]
tn <- tb[2,2]
FPR <- fp / (fp + tn)
TPR <- tp / (tp + fn)
fpr_ary <- c(fpr_ary, FPR)
tpr_ary <- c(tpr_ary, TPR)
}
}
fpr_ary <- c(fpr_ary, 1)
tpr_ary <- c(tpr_ary, 1)
plot(fpr_ary, tpr_ary, type= 'b')

使用ROCR
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(model, testset, type="prob")
pred.to.roc <- predictions[, 2]
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

使用RandomForest
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
head(trainset)
## X Creditability Account.Balance Duration.of.Credit..month.
## 1 497 1 3 6
## 2 756 0 1 15
## 3 580 0 1 42
## 4 833 0 3 36
## 5 602 1 3 24
## 6 734 1 1 15
## Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1 2 3 2108
## 2 1 4 950
## 3 2 3 7174
## 4 3 4 7980
## 5 3 2 2028
## 6 2 4 2511
## Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1 1 3 2
## 2 1 4 4
## 3 4 3 4
## 4 4 1 4
## 5 1 3 2
## 6 1 1 1
## Sex...Marital.Status Guarantors Duration.in.Current.address
## 1 3 1 2
## 2 2 1 3
## 3 1 1 3
## 4 2 1 4
## 5 2 1 2
## 6 1 1 4
## Most.valuable.available.asset Age..years. Concurrent.Credits
## 1 1 29 2
## 2 3 33 2
## 3 3 30 2
## 4 3 27 2
## 5 2 30 2
## 6 3 23 2
## Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1 1 1 1 1
## 2 1 2 1 2
## 3 2 1 1 1
## 4 1 2 1 1
## 5 2 2 1 1
## 6 1 1 1 1
## Telephone Foreign.Worker
## 1 1 1
## 2 1 1
## 3 2 1
## 4 2 1
## 5 1 1
## 6 1 1
trainset$X <- NULL
trainset$Creditability <- as.factor(trainset$Creditability)
forest <- randomForest(Creditability ~. , data = trainset, ntree = 200, proximity = TRUE, importance = TRUE)
forest.predicted <- predict(forest, testset, type = 'class')
## Accuracy
sum(testset$Creditability == forest.predicted) / length(testset$Creditability)
## [1] 0.734
## Confusion Matirx
table(testset$Creditability, forest.predicted)
## forest.predicted
## 0 1
## 0 53 104
## 1 29 314
比較ROC Curve
## Tree Model
model <- rpart(Creditability ~ ., data=trainset, method = 'class')
predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
# Forest Model
forest <- randomForest(Creditability ~. , data = trainset, ntree = 200, proximity = TRUE, importance = TRUE)
predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
plot(perf.tpr.rocr1,main='ROC Curve', col='blue')
plot(perf.tpr.rocr2, col='red', add=TRUE)
legend(0.7, 0.2, c('rpart', 'randomforest'), c('blue', 'red'))

forest$importance
## 0 1
## Account.Balance 0.0432052823 0.0231470071
## Duration.of.Credit..month. 0.0256940366 0.0144040693
## Payment.Status.of.Previous.Credit 0.0161351180 0.0125862252
## Purpose 0.0046621694 0.0087867339
## Credit.Amount 0.0091725847 0.0146548502
## Value.Savings.Stocks 0.0069316613 0.0075492841
## Length.of.current.employment 0.0027370048 0.0020513463
## Instalment.per.cent 0.0011021482 0.0044041472
## Sex...Marital.Status 0.0029031263 0.0035456861
## Guarantors 0.0001390153 0.0050687831
## Duration.in.Current.address 0.0027588414 0.0039364569
## Most.valuable.available.asset -0.0030726926 0.0077956527
## Age..years. 0.0007688531 0.0036482221
## Concurrent.Credits 0.0043023228 0.0027989500
## Type.of.apartment -0.0017054390 0.0033841708
## No.of.Credits.at.this.Bank -0.0029604113 0.0033911064
## Occupation 0.0000000000 0.0000000000
## No.of.dependents 0.0020504720 0.0009278489
## Telephone 0.0010695400 -0.0012892742
## Foreign.Worker -0.0002461548 -0.0007122646
## MeanDecreaseAccuracy MeanDecreaseGini
## Account.Balance 0.0288508902 18.191664
## Duration.of.Credit..month. 0.0174423016 22.105354
## Payment.Status.of.Previous.Credit 0.0137004723 13.018677
## Purpose 0.0076342221 10.954497
## Credit.Amount 0.0129798424 28.897949
## Value.Savings.Stocks 0.0075703192 9.462563
## Length.of.current.employment 0.0020922541 10.289176
## Instalment.per.cent 0.0033943828 9.517532
## Sex...Marital.Status 0.0033880621 7.190769
## Guarantors 0.0036134740 3.499224
## Duration.in.Current.address 0.0035755590 10.762701
## Most.valuable.available.asset 0.0046105562 10.505516
## Age..years. 0.0025807659 22.431389
## Concurrent.Credits 0.0031393798 4.898788
## Type.of.apartment 0.0018763603 5.366175
## No.of.Credits.at.this.Bank 0.0015616739 4.062593
## Occupation 0.0000000000 0.000000
## No.of.dependents 0.0012139988 3.843355
## Telephone -0.0007166337 4.304294
## Foreign.Worker -0.0005826014 0.818816