data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.2
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)
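# For a more readable tree diagram, the optional rpart.plot package can be used
# instead of the base plot/text pair; a minimal sketch, assuming the package
# has been installed (install.packages('rpart.plot')):
library(rpart.plot)
rpart.plot(fit)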
plot(Petal.Width ~ Petal.Length, data= iris, col=Species)
abline(v = 2.45, col = 'orange')
abline(h = 1.75, col = 'blue')
predicted <- predict(fit, iris, type='class')
sum(iris$Species == predicted) / length(iris$Species)
## [1] 0.96
tb <- table(iris$Species, predicted)
tb
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
library(caret)
## Warning: package 'caret' was built under R version 3.4.2
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.36
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9074 0.9783
## Specificity 1.0000 0.9896 0.9519
## Pos Pred Value 1.0000 0.9800 0.9000
## Neg Pred Value 1.0000 0.9500 0.9900
## Prevalence 0.3333 0.3600 0.3067
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9485 0.9651
set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36 2
set.seed(123)
idx <- sample.int(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
dim(trainset)
## [1] 106 5
dim(testset)
## [1] 44 5
fit2 <- rpart(Species ~ . , data = trainset)
predicted2 <- predict(fit2, testset, type='class')
table(testset$Species, predicted2)
## predicted2
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 4
## virginica 0 1 14
sum(testset$Species == predicted2) / length(testset$Species)
## [1] 0.8863636
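# The holdout estimate above depends on one particular random split. As a rough
# illustration (not part of the original notes), repeating the split a few
# times and averaging the test accuracy gives a more stable estimate:
accs <- sapply(1:10, function(i) {
  idx <- sample.int(2, nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
  fit.i <- rpart(Species ~ ., data = iris[idx == 1, ])
  pred.i <- predict(fit.i, iris[idx == 2, ], type = 'class')
  mean(pred.i == iris$Species[idx == 2])
})
mean(accs)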
# install.packages('party')
library(party)
## Warning: package 'party' was built under R version 3.4.2
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.4.2
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.4.2
fit <- ctree(Species ~ ., data = iris)
plot(fit)
predicted <- predict(fit, iris)
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
plot(Petal.Width ~ Petal.Length, data= iris, col=Species)
abline(h = 1.7, col = 'orange')
abline(v = 1.9, col = 'blue')
abline(v = 4.8, col = 'red')
## Logistic Regression
dataset <- iris[iris$Species != 'setosa', ]
dataset$Species <- factor(dataset$Species)
fit <- glm(Species ~ ., data = dataset, family='binomial')
summary(fit)
##
## Call:
## glm(formula = Species ~ ., family = "binomial", data = dataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.01105 -0.00541 -0.00001 0.00677 1.78065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.638 25.707 -1.659 0.0972 .
## Sepal.Length -2.465 2.394 -1.030 0.3032
## Sepal.Width -6.681 4.480 -1.491 0.1359
## Petal.Length 9.429 4.737 1.991 0.0465 *
## Petal.Width 18.286 9.743 1.877 0.0605 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.629 on 99 degrees of freedom
## Residual deviance: 11.899 on 95 degrees of freedom
## AIC: 21.899
##
## Number of Fisher Scoring iterations: 10
predict(fit, dataset)
## 51 52 53 54 55 56
## -11.3544818 -9.9326130 -6.7253803 -10.0730364 -6.5638417 -9.1918314
## (output truncated)
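# The values above are on the log-odds (link) scale. To get probabilities and
# class labels, use type = 'response' and threshold at 0.5; a short sketch
# (the 0.5 cutoff is the usual default, not a tuned value). 'virginica' is the
# second level of dataset$Species, so the model estimates P(Species == 'virginica').
prob <- predict(fit, dataset, type = 'response')
predicted.glm <- ifelse(prob > 0.5, 'virginica', 'versicolor')
table(dataset$Species, predicted.glm)
mean(predicted.glm == dataset$Species)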
#install.packages('e1071')
library(e1071)
fit <- svm(Species ~ ., data = iris)
predicted <- predict(fit, iris)
sum(predicted == iris$Species) / length(iris$Species)
## [1] 0.9733333
table(iris$Species, predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
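# e1071 also ships tune.svm() for a simple grid search over the kernel
# parameters; a minimal sketch (the gamma/cost ranges below are only an
# example, not values from the original notes):
tuned <- tune.svm(Species ~ ., data = iris, gamma = 10^(-2:0), cost = 10^(0:2))
summary(tuned)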
#install.packages('C50')
library(C50)
## Warning: package 'C50' was built under R version 3.4.2
data(churn)
head(churnTrain)
## state account_length area_code international_plan voice_mail_plan
## 1 KS 128 area_code_415 no yes
## 2 OH 107 area_code_415 no yes
## 3 NJ 137 area_code_415 no no
## 4 OH 84 area_code_408 yes no
## 5 OK 75 area_code_415 yes no
## 6 AL 118 area_code_510 yes no
## number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1 25 265.1 110 45.07
## 2 26 161.6 123 27.47
## 3 0 243.4 114 41.38
## 4 0 299.4 71 50.90
## 5 0 166.7 113 28.34
## 6 0 223.4 98 37.98
## total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1 197.4 99 16.78 244.7
## 2 195.5 103 16.62 254.4
## 3 121.2 110 10.30 162.6
## 4 61.9 88 5.26 196.9
## 5 148.3 122 12.61 186.9
## 6 220.6 101 18.75 203.9
## total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1 91 11.01 10.0 3
## 2 103 11.45 13.7 3
## 3 104 7.32 12.2 5
## 4 89 8.86 6.6 7
## 5 121 8.41 10.1 3
## 6 118 9.18 6.3 6
## total_intl_charge number_customer_service_calls churn
## 1 2.70 1 no
## 2 3.70 1 no
## 3 3.29 0 no
## 4 1.78 2 no
## 5 2.73 3 no
## 6 1.70 0 no
churnTrain <- churnTrain[,! names(churnTrain) %in% c('state', 'account_length', 'area_code')]
head(churnTrain)
## international_plan voice_mail_plan number_vmail_messages
## 1 no yes 25
## 2 no yes 26
## 3 no no 0
## 4 yes no 0
## 5 yes no 0
## 6 yes no 0
## total_day_minutes total_day_calls total_day_charge total_eve_minutes
## 1 265.1 110 45.07 197.4
## 2 161.6 123 27.47 195.5
## 3 243.4 114 41.38 121.2
## 4 299.4 71 50.90 61.9
## 5 166.7 113 28.34 148.3
## 6 223.4 98 37.98 220.6
## total_eve_calls total_eve_charge total_night_minutes total_night_calls
## 1 99 16.78 244.7 91
## 2 103 16.62 254.4 103
## 3 110 10.30 162.6 104
## 4 88 5.26 196.9 89
## 5 122 12.61 186.9 121
## 6 101 18.75 203.9 118
## total_night_charge total_intl_minutes total_intl_calls total_intl_charge
## 1 11.01 10.0 3 2.70
## 2 11.45 13.7 3 3.70
## 3 7.32 12.2 5 3.29
## 4 8.86 6.6 7 1.78
## 5 8.41 10.1 3 2.73
## 6 9.18 6.3 6 1.70
## number_customer_service_calls churn
## 1 1 no
## 2 1 no
## 3 0 no
## 4 2 no
## 5 3 no
## 6 0 no
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob=c(0.7,0.3))
trainset <- churnTrain[idx == 1,]
testset <- churnTrain[idx == 2,]
dim(trainset)
## [1] 2315 17
dim(testset)
## [1] 1018 17
library(rpart)
fit <- rpart(churn ~ ., data = trainset)
plot(fit, margin = 0.1)
text(fit)
predicted <- predict(fit, testset, type= 'class')
table(testset$churn, predicted)
## predicted
## yes no
## yes 100 41
## no 18 859
sum(predicted == testset$churn) / length(testset$churn)
## [1] 0.9420432
table(testset$churn)
##
## yes no
## 141 877
# Baseline: the accuracy of always predicting the majority class 'no'
877 / (141 + 877)
## [1] 0.8614931
library(e1071)
fit2 <- svm(churn ~., data = trainset, kernel='polynomial')
#?svm
predicted2 <- predict(fit2, testset)
table(testset$churn, predicted2)
## predicted2
## yes no
## yes 67 74
## no 8 869
sum(testset$churn == predicted2) / length(testset$churn)
## [1] 0.9194499
library(rpart)
fit <- rpart(churn ~ ., data = trainset)
plot(fit, margin = 0.1)
text(fit)
predicted <- predict(fit, testset, type= 'class')
tb <- table(testset$churn, predicted)
sum(predicted == testset$churn) / length(testset$churn)
## [1] 0.9420432
library(caret)
confusionMatrix(tb)
## Confusion Matrix and Statistics
##
## predicted
## yes no
## yes 100 41
## no 18 859
##
## Accuracy : 0.942
## 95% CI : (0.9259, 0.9556)
## No Information Rate : 0.8841
## P-Value [Acc > NIR] : 2.052e-10
##
## Kappa : 0.7393
## Mcnemar's Test P-Value : 0.004181
##
## Sensitivity : 0.84746
## Specificity : 0.95444
## Pos Pred Value : 0.70922
## Neg Pred Value : 0.97948
## Prevalence : 0.11591
## Detection Rate : 0.09823
## Detection Prevalence : 0.13851
## Balanced Accuracy : 0.90095
##
## 'Positive' Class : yes
##
library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model <- train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
predicted <- predict(model, testset, type='prob')
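# The fitted caret model can be inspected directly, and class predictions
# (rather than probabilities) come from predict() without type = 'prob';
# a short sketch:
print(model)
predicted.class <- predict(model, testset)
table(testset$churn, predicted.class)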
library(rpart)
fit <- rpart(churn ~ ., data = trainset)
plot(fit, margin = 0.1)
text(fit)
predicted <- predict(fit, testset)
#head(predicted)
predicted[,1]
## 2 5 6 8 13 16
## 0.02877238 0.05084746 0.05084746 0.05084746 0.02877238 0.95522388
## (output truncated)
# Classify as 'yes' when the predicted churn probability exceeds 0.2
res <- ifelse(predicted[,1] > 0.2, 'yes', 'no')
res <- factor(res, levels = c('yes', 'no'), labels = c('yes', 'no'))
#res
tb <- table(testset$churn,res)
# table(testset$churn, res) puts the actual classes in rows and the predictions
# in columns, so the cells map to the confusion-matrix terms as follows:
TP <- tb[1,1]  # actual yes, predicted yes
FN <- tb[1,2]  # actual yes, predicted no
FP <- tb[2,1]  # actual no,  predicted yes
TN <- tb[2,2]  # actual no,  predicted no
FPR <- FP / (FP + TN)
TPR <- TP / (TP + FN)
FPR
## [1] 0.02052452
TPR
## [1] 0.7092199
# Sweep the classification threshold from 0 to 1 and collect the (FPR, TPR)
# pair at each step to trace out the ROC curve.
x <- c(0)
y <- c(0)
t <- c(0)
for (threshold in seq(0, 1, 0.01)){
  res <- ifelse(predicted[,1] >= threshold, 'yes', 'no')
  res <- factor(res, levels = c('yes', 'no'), labels = c('yes', 'no'))
  tb <- table(testset$churn, res)
  # rows are actual classes, columns are predictions
  TP <- tb[1,1]
  FN <- tb[1,2]
  FP <- tb[2,1]
  TN <- tb[2,2]
  FPR <- FP / (FP + TN)
  TPR <- TP / (TP + FN)
  if (!is.na(FPR)){
    x <- c(x, FPR)
    y <- c(y, TPR)
    t <- c(t, threshold)
  }
}
x <- c(x, 1)
y <- c(y, 1)
t <- c(t, 1)
t[order(y / x, decreasing = TRUE)]
## [1] 0.16 0.12 0.13 0.14 0.15 0.17 0.18 0.19 0.20 0.21 0.22 0.23 0.24 0.25
## [15] 0.26 0.27 0.28 0.29 0.30 0.31 0.32 0.33 0.34 0.35 0.36 0.37 0.38 0.39
## [29] 0.40 0.41 0.42 0.43 0.44 0.45 0.46 0.47 0.48 0.49 0.50 0.51 0.52 0.53
## [43] 0.54 0.55 0.56 0.57 0.58 0.59 0.60 0.61 0.62 0.63 0.64 0.65 0.06 0.07
## [57] 0.08 0.09 0.10 0.11 0.66 0.67 0.68 0.69 0.70 0.71 0.72 0.03 0.04 0.05
## [71] 0.73 0.74 0.75 0.76 0.77 0.78 0.79 0.80 0.81 0.82 0.83 0.84 0.85 0.86
## [85] 0.87 0.88 0.89 0.90 0.91 0.92 0.93 0.94 0.95 0.96 0.97 0.98 0.99 1.00
## [99] 1.00 0.00
plot(x, y, type = 'b', xlab = 'FPR', ylab ='TPR', main = 'ROC Curve', col='blue', xlim = c(0,1), ylim=c(0,1))
lines(c(0,1), c(0,1), col='red')
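# A rough AUC can be read off these (FPR, TPR) points with the trapezoidal
# rule; a quick sketch (my own addition, sorting by FPR first):
ord <- order(x)
xs <- x[ord]
ys <- y[ord]
sum(diff(xs) * (head(ys, -1) + tail(ys, -1)) / 2)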
set.seed(23)
#.Random.seed
sample.int(42,6)
## [1] 25 10 14 28 32 16
sample.int(42,6)
## [1] 41 42 34 39 33 26
sample.int(42,6)
## [1] 17 13 34 6 20 22
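# Re-setting the same seed restarts the generator, so the first draw is
# reproduced exactly; a quick check:
set.seed(23)
sample.int(42, 6)  # same as the first draw above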
# Tree Model
library(rpart)
fit1 <- rpart(churn ~ ., data = trainset)
# Random Forest
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
fit2 <- randomForest(churn ~., data = trainset)
# Confusion Matrix
## tree
predicted.tree <- predict(fit1, testset, type='class')
table(testset$churn,predicted.tree)
## predicted.tree
## yes no
## yes 100 41
## no 18 859
## forest
predicted.forest <- predict(fit2, testset)
table(testset$churn,predicted.forest)
## predicted.forest
## yes no
## yes 109 32
## no 6 871
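# Quick side-by-side accuracy of the two models on the same test set, before
# comparing their ROC curves:
mean(predicted.tree == testset$churn)
mean(predicted.forest == testset$churn)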
# ROC Curve
### ROC Generation Function
ROCCurve <- function(model){
  # Predicted probability of 'yes' for each test observation
  predicted <- predict(model, testset, type='prob')
  x1 <- c(0)
  y1 <- c(0)
  for (threshold in seq(0, 1, 0.01)){
    res <- ifelse(predicted[,1] >= threshold, 'yes', 'no')
    res <- factor(res, levels = c('yes', 'no'), labels = c('yes', 'no'))
    tb <- table(testset$churn, res)
    # rows are actual classes, columns are predictions
    TP <- tb[1,1]
    FN <- tb[1,2]
    FP <- tb[2,1]
    TN <- tb[2,2]
    FPR <- FP / (FP + TN)
    TPR <- TP / (TP + FN)
    if (!is.na(FPR)){
      x1 <- c(x1, FPR)
      y1 <- c(y1, TPR)
    }
  }
  x1 <- c(x1, 1)
  y1 <- c(y1, 1)
  return(list(x = x1, y = y1))
}
### Compare Models
tree <- ROCCurve(fit1)
forest <- ROCCurve(fit2)
plot(c(0,1),c(0,1), type= 'n',xlab = 'FPR', ylab ='TPR', main = 'ROC Curve', col='blue', xlim = c(0,1), ylim=c(0,1))
lines(tree$x, tree$y, col='red')
lines(forest$x, forest$y, col='orange')
legend(0.7,0.2, legend = c('tree', 'forest'), col = c('red', 'orange'),lwd = 3)
## Using ROCR Package
#install.packages('ROCR')
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.4.2
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predicted.tree <- predict(fit1, testset, type='prob')
tree.to.roc <- predicted.tree[,1]
tree.pred.rocr <- prediction(tree.to.roc, testset$churn)
tree.perf.rocr <- performance(tree.pred.rocr, measure = "auc", x.measure = "cutoff")
tree.perf.tpr.rocr <- performance(tree.pred.rocr, "tpr","fpr")
predicted.forest <- predict(fit2, testset, type='prob')
forest.to.roc <- predicted.forest[,1]
forest.pred.rocr <- prediction(forest.to.roc, testset$churn)
forest.perf.rocr <- performance(forest.pred.rocr, measure = "auc", x.measure = "cutoff")
#forest.perf.rocr
forest.perf.tpr.rocr <- performance(forest.pred.rocr, "tpr","fpr")
plot(tree.perf.tpr.rocr, col="red",main="ROC Comparison")
plot(forest.perf.tpr.rocr, col="blue", add=TRUE)
legend(0.7,0.2, legend = c(paste0('tree:',as.character(round(tree.perf.rocr@y.values[[1]],2))), paste0('forest:',as.character(round(forest.perf.rocr@y.values[[1]],2) ))), col = c('red', 'blue'),lwd = 3)
## Find Most Important Variable
#install.packages('rminer')
library(rminer)
## Warning: package 'rminer' was built under R version 3.4.2
##
## Attaching package: 'rminer'
## The following object is masked from 'package:party':
##
## fit
## The following object is masked from 'package:modeltools':
##
## fit
model <- fit(churn~., data = trainset,model="svm")
VariableImportance <- Importance(model,trainset,method="sensv")
L <- list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)
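# As a cross-check, the random forest fitted earlier reports its own variable
# importance scores; a minimal sketch using fit2 from above:
importance(fit2)
varImpPlot(fit2)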
## Text Classification - https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
# Download applenews.RData from the URL above, then load it into the session:
load('applenews.RData')
#head(applenews)
# Keep only the two categories of interest: '社會' (Society) and '財經' (Finance)
apple.subset <- applenews[applenews$category %in% c('社會', '財經'), ]
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(e) segment(e, jiebar = mixseg))
library(tm)
corpus <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(corpus)
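# The document-term matrix is typically very sparse; tm's removeSparseTerms()
# can drop rare terms before modelling. A short sketch (the 0.99 cutoff is
# only an example, not a value from the original notes):
dtm.small <- removeSparseTerms(dtm, 0.99)
dim(dtm.small)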
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(dtm, MARGIN = 2, convert_counts)
#dtm.count[1:100,1:100]
m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]
traintag <- apple.subset[idx == 1, "category"]
testtag <- apple.subset[idx == 2, "category"]
library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred <- predict(model, testset)
tb <- table(pred, as.factor(testtag))
tb
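# Overall accuracy of the Naive Bayes classifier, read off the diagonal of
# the confusion table:
sum(diag(tb)) / sum(tb)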