使用rpart 做分類

data(iris)
View(iris)
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data= iris)
fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)

結果驗證

plot(iris$Petal.Length, iris$Petal.Width)

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)
fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
abline(v= 2.45, col="blue")
abline(h= 1.75, col="orange")

產生預測結果

predict(fit, data.frame(Petal.Length = 1, Petal.Width = 1, Sepal.Length = 1, Sepal.Width = 1))
##   setosa versicolor virginica
## 1      1          0         0
# 類別
#predict(fit, iris[,-5], type='class')

# 機率
#predict(fit, iris[,-5], type='prob')


cm <- table(iris[,5], predict(fit, iris[,-5], type='class'))
cm
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
# Build Confusion Matrix
t <- c(1,2,1,2,1,0,0,1,2,1)
table(t)
## t
## 0 1 2 
## 2 5 3
p <- c(1,2,2,1,2,0,0,1,2,0)
table(t,p)
##    p
## t   0 1 2
##   0 2 0 0
##   1 1 2 2
##   2 0 1 2

計算預測準確率

ac = (50 + 49 +45 ) / (50+50+50)
#install.pacakges('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.36           
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9074           0.9783
## Specificity                 1.0000            0.9896           0.9519
## Pos Pred Value              1.0000            0.9800           0.9000
## Neg Pred Value              1.0000            0.9500           0.9900
## Prevalence                  0.3333            0.3600           0.3067
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9485           0.9651
#https://en.wikipedia.org/wiki/Sensitivity_and_specificity

分為訓練與測試資料集

#.Random.seed
set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36  2
sample.int(42,6)
## [1] 23 37 42 18 41 17
sample.int(42,6)
## [1] 29 24  5 36 10  2
set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36  2
sample.int(42,6)
## [1] 23 37 42 18 41 17
sample.int(42,6)
## [1] 29 24  5 36 10  2
set.seed(123)
ind = sample(c(1,2), nrow(iris), replace=TRUE, prob = c(0.7,0.3))
#ind == 1
trainset = iris[ind == 1 , ]
testset  = iris[ind == 2 , ]
dim(trainset)
## [1] 106   5
dim(testset)
## [1] 44  5

使用訓練資料集建立模型

fit2 <- rpart(Species ~ . , data = trainset)
plot(fit2, margin =0.1)
text(fit2)

plot(trainset$Petal.Length, trainset$Petal.Width)

plot(trainset$Petal.Length, trainset$Petal.Width, col=trainset$Species)
fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
abline(v= 2.45, col="blue")
abline(v= 4.75, col="orange")

使用測試資料集驗證模型

trainpred <- predict(fit2, trainset[,-5], type= "class")
traincm <- table(trainpred, trainset[,5])
confusionMatrix(traincm)
## Confusion Matrix and Statistics
## 
##             
## trainpred    setosa versicolor virginica
##   setosa         35          0         0
##   versicolor      0         34         0
##   virginica       0          2        35
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9811          
##                  95% CI : (0.9335, 0.9977)
##     No Information Rate : 0.3396          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9717          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9444           1.0000
## Specificity                 1.0000            1.0000           0.9718
## Pos Pred Value              1.0000            1.0000           0.9459
## Neg Pred Value              1.0000            0.9722           1.0000
## Prevalence                  0.3302            0.3396           0.3302
## Detection Rate              0.3302            0.3208           0.3302
## Detection Prevalence        0.3302            0.3208           0.3491
## Balanced Accuracy           1.0000            0.9722           0.9859
testpred <- predict(fit2, testset[,-5], type= "class")
testcm <- table(testpred, testset[,5])
confusionMatrix(testcm)
## Confusion Matrix and Statistics
## 
##             
## testpred     setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977

使用caret 套件

library(caret)
fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           repeats = 10)
rpartFit <- train(Species ~ ., data = iris,
                 method = "rpart",
                 trControl = fitControl)
rpartFit
## CART 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa
##   0.00  0.9346667  0.902
##   0.44  0.7906667  0.686
##   0.50  0.3333333  0.000
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.
cm = table(predict(rpartFit, iris), iris[,5])
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450

安裝caret, e1071


install.packages('caret', repo='http://nbcgib.uesc.br/mirrors/cran/')

install.packages('e1071', repo='http://nbcgib.uesc.br/mirrors/cran/')

客戶流失分析

讀取資料

#install.packages("C50")
library(C50)
## Warning: package 'C50' was built under R version 3.2.5
data(churn)
str(churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

將資料區分成訓練與測試資料集

churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset = churnTrain[ind == 1,]
testset = churnTrain[ind == 2,]

建立分類樹

library(rpart)
churn.rp <- rpart(churn ~ ., data=trainset)

plot(churn.rp, margin= 0.1)
text(churn.rp, all=TRUE, use.n = TRUE)

產生Confusion Matrix

pred <- predict(churn.rp, testset, type="class")
cm = table(testset$churn, pred)

library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##      pred
##       yes  no
##   yes 100  41
##   no   18 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8841          
##     P-Value [Acc > NIR] : 2.052e-10       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.84746         
##             Specificity : 0.95444         
##          Pos Pred Value : 0.70922         
##          Neg Pred Value : 0.97948         
##              Prevalence : 0.11591         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.13851         
##       Balanced Accuracy : 0.90095         
##                                           
##        'Positive' Class : yes             
## 
table(testset$churn)
## 
## yes  no 
## 141 877
877 / (141 + 877)
## [1] 0.8614931

進行剪枝

min(churn.rp$cptable[,"xerror"])
## [1] 0.4707602
which.min(churn.rp$cptable[,"xerror"])
## 7 
## 7
churn.cp = churn.rp$cptable[7,"CP"]
prune.tree = prune(churn.rp, cp= churn.cp)
plot(prune.tree, margin= 0.1)
text(prune.tree, all=TRUE , use.n=TRUE)

手工製做ROC Curve

head(predict(churn.rp, testset, type="prob"))
##           yes         no
## 2  0.02877238 0.97122762
## 5  0.05084746 0.94915254
## 6  0.05084746 0.94915254
## 8  0.05084746 0.94915254
## 13 0.02877238 0.97122762
## 16 0.95522388 0.04477612
x = c()
y = c()
for (threshold in seq(0.1,0.9,0.1)){
yes = predict(churn.rp, testset, type="prob")[,1]
pred = ifelse(yes > threshold, 0, 1)
pred = factor(pred, labels =c('yes', 'no'))
ct = table(pred, testset$churn)
cm = confusionMatrix(ct)
y = c(y, cm$byClass[1])
x = c(x, 1 - cm$byClass[2])
}

 plot(x,y)
 lines(x,y, col="red")

使用套件製作ROC Curve

 library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(churn.rp, testset, type="prob")
yes <- predictions[, 1] 
pred.rocr <- prediction(yes, as.factor(testset[,(dim(testset)[[2]])])) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

各種分類

#rpart
fit = rpart(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"),  iris$Species)
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
#svm
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
fit = svm(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"),  iris$Species)
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48
#naive bayes
library(e1071)
fit = naiveBayes(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"),  iris$Species)
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          3        47

新聞分類

讀取資料集

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/appledaily.RData', 'appledaily.RData')

load('appledaily.RData')

apple.subset = appledaily[appledaily$category %in% c('財經', '娛樂', '社會'),]

table(apple.subset$category)
## 
## 社會 娛樂 財經 
##  194  113  121

使用 Jieba斷詞

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
mixseg = worker()

apple.seg =lapply(apple.subset$content, function(e)segment(code=e, jiebar=mixseg))

建立詞頻矩陣

source('https://github.com/ywchiu/rtibame/raw/master/Lib/CNCorpus.R')
## Warning: package 'tm' was built under R version 3.2.5
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
doc=CNCorpus(apple.seg)
doc=unlist(tm_map(doc,jieba_tokenizer),recursive=F)
doc=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)
dim(dtm)
## [1]   428 18107

篩選詞頻矩陣

ft <- findFreqTerms(dtm, 5)

control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer,dictionary =ft)

new.dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)

dim(new.dtm)
## [1]  428 3168

矩陣轉換

#as.character(apple.subset[1,])
#inspect(new.dtm[1,])
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)
#dtm.count[1,]

將資料列為訓練跟測試資料集

m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]

traintitle <- apple.subset[idx==1,"title"]
testtitle <- apple.subset[idx==2,"title"]

traintag <- apple.subset[idx==1,"category"]
testtag <-apple.subset[idx==2,"category"]

traintag = as.factor(traintag)
testtag = as.factor(testtag)

dim(trainset)
## [1]  284 3168
dim(testset)
## [1]  144 3168

使用Naive Bayes 建立模型

library(e1071)
model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)
cm <- table(pred, testtag)
cm
##       testtag
## pred   社會 娛樂 財經
##   社會   66    1    1
##   娛樂    0   29    3
##   財經    0    0   44
testtitle[pred != testtag]
## [1] "【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA"   
## [2] "璩美鳳老公說她是石油公司副總 公司卻發聲明..."
## [3] "【壹週刊】從貴婦到攤販 看見台灣媳婦的堅毅"   
## [4] "【開了等於沒開?】浩鼎秘密會議 立委怒了"     
## [5] "大倒角握感如何? 網友試機HTC 10"
pred[pred != testtag]
## [1] 社會 娛樂 娛樂 社會 娛樂
## Levels: 社會 娛樂 財經
testtag[pred != testtag]
## [1] 娛樂 財經 財經 財經 財經
## Levels: 社會 娛樂 財經