使用rpart 做分類

data(iris)
View(iris)
library(rpart)

## Warning: package 'rpart' was built under R version 3.2.5

fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data= iris)
fit

## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *

plot(fit, margin = 0.1)
text(fit)

結果驗證

plot(iris$Petal.Length, iris$Petal.Width)

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)
fit

## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *

abline(v= 2.45, col="blue")
abline(h= 1.75, col="orange")

產生預測結果

predict(fit, data.frame(Petal.Length = 1, Petal.Width = 1, Sepal.Length = 1, Sepal.Width = 1))

##   setosa versicolor virginica
## 1      1          0         0

# 類別
#predict(fit, iris[,-5], type='class')

# 機率
#predict(fit, iris[,-5], type='prob')


cm <- table(iris[,5], predict(fit, iris[,-5], type='class'))
cm

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45

# Build Confusion Matrix
t <- c(1,2,1,2,1,0,0,1,2,1)
table(t)

## t
## 0 1 2 
## 2 5 3

p <- c(1,2,2,1,2,0,0,1,2,0)
table(t,p)

##    p
## t   0 1 2
##   0 2 0 0
##   1 1 2 2
##   2 0 1 2

計算預測準確率

ac = (50 + 49 +45 ) / (50+50+50)
#install.pacakges('caret')
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.36           
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9074           0.9783
## Specificity                 1.0000            0.9896           0.9519
## Pos Pred Value              1.0000            0.9800           0.9000
## Neg Pred Value              1.0000            0.9500           0.9900
## Prevalence                  0.3333            0.3600           0.3067
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3333           0.3333
## Balanced Accuracy           1.0000            0.9485           0.9651

#https://en.wikipedia.org/wiki/Sensitivity_and_specificity

分為訓練與測試資料集

#.Random.seed
set.seed(123)
sample.int(42,6)

## [1] 13 33 17 35 36  2

sample.int(42,6)

## [1] 23 37 42 18 41 17

sample.int(42,6)

## [1] 29 24  5 36 10  2

set.seed(123)
sample.int(42,6)

## [1] 13 33 17 35 36  2

sample.int(42,6)

## [1] 23 37 42 18 41 17

sample.int(42,6)

## [1] 29 24  5 36 10  2

set.seed(123)
ind = sample(c(1,2), nrow(iris), replace=TRUE, prob = c(0.7,0.3))
#ind == 1
trainset = iris[ind == 1 , ]
testset  = iris[ind == 2 , ]
dim(trainset)

## [1] 106   5

dim(testset)

## [1] 44  5

使用訓練資料集建立模型

fit2 <- rpart(Species ~ . , data = trainset)
plot(fit2, margin =0.1)
text(fit2)

plot(trainset$Petal.Length, trainset$Petal.Width)

plot(trainset$Petal.Length, trainset$Petal.Width, col=trainset$Species)
fit

## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *

abline(v= 2.45, col="blue")
abline(v= 4.75, col="orange")

使用測試資料集驗證模型

trainpred <- predict(fit2, trainset[,-5], type= "class")
traincm <- table(trainpred, trainset[,5])
confusionMatrix(traincm)

## Confusion Matrix and Statistics
## 
##             
## trainpred    setosa versicolor virginica
##   setosa         35          0         0
##   versicolor      0         34         0
##   virginica       0          2        35
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9811          
##                  95% CI : (0.9335, 0.9977)
##     No Information Rate : 0.3396          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9717          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9444           1.0000
## Specificity                 1.0000            1.0000           0.9718
## Pos Pred Value              1.0000            1.0000           0.9459
## Neg Pred Value              1.0000            0.9722           1.0000
## Prevalence                  0.3302            0.3396           0.3302
## Detection Rate              0.3302            0.3208           0.3302
## Detection Prevalence        0.3302            0.3208           0.3491
## Balanced Accuracy           1.0000            0.9722           0.9859

testpred <- predict(fit2, testset[,-5], type= "class")
testcm <- table(testpred, testset[,5])
confusionMatrix(testcm)

## Confusion Matrix and Statistics
## 
##             
## testpred     setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977

使用caret 套件

library(caret)
fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           repeats = 10)
rpartFit <- train(Species ~ ., data = iris,
                 method = "rpart",
                 trControl = fitControl)
rpartFit

## CART 
## 
## 150 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times) 
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ... 
## Resampling results across tuning parameters:
## 
##   cp    Accuracy   Kappa
##   0.00  0.9346667  0.902
##   0.44  0.7906667  0.686
##   0.50  0.3333333  0.000
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.

cm = table(predict(rpartFit, iris), iris[,5])
confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450

安裝caret, e1071


install.packages('caret', repo='http://nbcgib.uesc.br/mirrors/cran/')

install.packages('e1071', repo='http://nbcgib.uesc.br/mirrors/cran/')

客戶流失分析

讀取資料

#install.packages("C50")
library(C50)

## Warning: package 'C50' was built under R version 3.2.5

data(churn)
str(churnTrain)

## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

將資料區分成訓練與測試資料集

churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset = churnTrain[ind == 1,]
testset = churnTrain[ind == 2,]

建立分類樹

library(rpart)
churn.rp <- rpart(churn ~ ., data=trainset)

plot(churn.rp, margin= 0.1)
text(churn.rp, all=TRUE, use.n = TRUE)

產生Confusion Matrix

pred <- predict(churn.rp, testset, type="class")
cm = table(testset$churn, pred)

library(caret)
confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##      pred
##       yes  no
##   yes 100  41
##   no   18 859
##                                           
##                Accuracy : 0.942           
##                  95% CI : (0.9259, 0.9556)
##     No Information Rate : 0.8841          
##     P-Value [Acc > NIR] : 2.052e-10       
##                                           
##                   Kappa : 0.7393          
##  Mcnemar's Test P-Value : 0.004181        
##                                           
##             Sensitivity : 0.84746         
##             Specificity : 0.95444         
##          Pos Pred Value : 0.70922         
##          Neg Pred Value : 0.97948         
##              Prevalence : 0.11591         
##          Detection Rate : 0.09823         
##    Detection Prevalence : 0.13851         
##       Balanced Accuracy : 0.90095         
##                                           
##        'Positive' Class : yes             
##

table(testset$churn)

## 
## yes  no 
## 141 877

877 / (141 + 877)

## [1] 0.8614931

進行剪枝

min(churn.rp$cptable[,"xerror"])

## [1] 0.4707602

which.min(churn.rp$cptable[,"xerror"])

## 7 
## 7

churn.cp = churn.rp$cptable[7,"CP"]
prune.tree = prune(churn.rp, cp= churn.cp)
plot(prune.tree, margin= 0.1)
text(prune.tree, all=TRUE , use.n=TRUE)

手工製做ROC Curve

head(predict(churn.rp, testset, type="prob"))

##           yes         no
## 2  0.02877238 0.97122762
## 5  0.05084746 0.94915254
## 6  0.05084746 0.94915254
## 8  0.05084746 0.94915254
## 13 0.02877238 0.97122762
## 16 0.95522388 0.04477612

x = c()
y = c()
for (threshold in seq(0.1,0.9,0.1)){
yes = predict(churn.rp, testset, type="prob")[,1]
pred = ifelse(yes > threshold, 0, 1)
pred = factor(pred, labels =c('yes', 'no'))
ct = table(pred, testset$churn)
cm = confusionMatrix(ct)
y = c(y, cm$byClass[1])
x = c(x, 1 - cm$byClass[2])
}

 plot(x,y)
 lines(x,y, col="red")

使用套件製作ROC Curve

 library(ROCR)

## Warning: package 'ROCR' was built under R version 3.2.5

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.2.5

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

predictions <- predict(churn.rp, testset, type="prob")
yes <- predictions[, 1] 
pred.rocr <- prediction(yes, as.factor(testset[,(dim(testset)[[2]])])) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

各種分類

#rpart
fit = rpart(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"),  iris$Species)

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45

#svm
library(e1071)

## Warning: package 'e1071' was built under R version 3.2.5

fit = svm(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"),  iris$Species)

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48

#naive bayes
library(e1071)
fit = naiveBayes(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"),  iris$Species)

##             
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         47         3
##   virginica       0          3        47

新聞分類

讀取資料集

download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/appledaily.RData', 'appledaily.RData')

load('appledaily.RData')

apple.subset = appledaily[appledaily$category %in% c('財經', '娛樂', '社會'),]

table(apple.subset$category)

## 
## 社會 娛樂 財經 
##  194  113  121

使用 Jieba斷詞

library(jiebaR)

## Warning: package 'jiebaR' was built under R version 3.2.5

## Loading required package: jiebaRD

## Warning: package 'jiebaRD' was built under R version 3.2.5

mixseg = worker()

apple.seg =lapply(apple.subset$content, function(e)segment(code=e, jiebar=mixseg))

建立詞頻矩陣

source('https://github.com/ywchiu/rtibame/raw/master/Lib/CNCorpus.R')

## Warning: package 'tm' was built under R version 3.2.5

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

doc=CNCorpus(apple.seg)
doc=unlist(tm_map(doc,jieba_tokenizer),recursive=F)
doc=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)
dim(dtm)

## [1]   428 18107

篩選詞頻矩陣

ft <- findFreqTerms(dtm, 5)

control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer,dictionary =ft)

new.dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)

dim(new.dtm)

## [1]  428 3168

矩陣轉換

#as.character(apple.subset[1,])
#inspect(new.dtm[1,])
convert_counts <- function(x) {
  x <- ifelse(x > 0, 1, 0)
  x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
  return(x)
}
dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)
#dtm.count[1,]

將資料列為訓練跟測試資料集

m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]

traintitle <- apple.subset[idx==1,"title"]
testtitle <- apple.subset[idx==2,"title"]

traintag <- apple.subset[idx==1,"category"]
testtag <-apple.subset[idx==2,"category"]

traintag = as.factor(traintag)
testtag = as.factor(testtag)

dim(trainset)

## [1]  284 3168

dim(testset)

## [1]  144 3168

使用Naive Bayes 建立模型

library(e1071)
model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)
cm <- table(pred, testtag)
cm

##       testtag
## pred   社會 娛樂 財經
##   社會   66    1    1
##   娛樂    0   29    3
##   財經    0    0   44

testtitle[pred != testtag]

## [1] "【唱新聞】詐騙嗎？R.O.C.有CHINA但不是CHINA"   
## [2] "璩美鳳老公說她是石油公司副總　公司卻發聲明..."
## [3] "【壹週刊】從貴婦到攤販　看見台灣媳婦的堅毅"   
## [4] "【開了等於沒開？】浩鼎秘密會議　立委怒了"     
## [5] "大倒角握感如何？　網友試機HTC 10"

pred[pred != testtag]

## [1] 社會 娛樂 娛樂 社會 娛樂
## Levels: 社會 娛樂 財經

testtag[pred != testtag]

## [1] 娛樂 財經 財經 財經 財經
## Levels: 社會 娛樂 財經

Demo20160814_2

David Chiu

2016年8月14日

使用rpart 做分類

結果驗證

產生預測結果

計算預測準確率

分為訓練與測試資料集

使用訓練資料集建立模型

使用測試資料集驗證模型

使用caret 套件

安裝caret, e1071

客戶流失分析

讀取資料

將資料區分成訓練與測試資料集

建立分類樹

產生Confusion Matrix

進行剪枝

手工製做ROC Curve

使用套件製作ROC Curve

各種分類

新聞分類

讀取資料集

使用 Jieba斷詞

建立詞頻矩陣

篩選詞頻矩陣

矩陣轉換

將資料列為訓練跟測試資料集

使用Naive Bayes 建立模型