使用rpart 做分類
data(iris)
View(iris)
library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data= iris)
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)

結果驗證
plot(iris$Petal.Length, iris$Petal.Width)

plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species)
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
abline(v= 2.45, col="blue")
abline(h= 1.75, col="orange")

產生預測結果
predict(fit, data.frame(Petal.Length = 1, Petal.Width = 1, Sepal.Length = 1, Sepal.Width = 1))
## setosa versicolor virginica
## 1 1 0 0
# 類別
#predict(fit, iris[,-5], type='class')
# 機率
#predict(fit, iris[,-5], type='prob')
cm <- table(iris[,5], predict(fit, iris[,-5], type='class'))
cm
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
# Build Confusion Matrix
t <- c(1,2,1,2,1,0,0,1,2,1)
table(t)
## t
## 0 1 2
## 2 5 3
p <- c(1,2,2,1,2,0,0,1,2,0)
table(t,p)
## p
## t 0 1 2
## 0 2 0 0
## 1 1 2 2
## 2 0 1 2
計算預測準確率
ac = (50 + 49 +45 ) / (50+50+50)
#install.pacakges('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.36
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9074 0.9783
## Specificity 1.0000 0.9896 0.9519
## Pos Pred Value 1.0000 0.9800 0.9000
## Neg Pred Value 1.0000 0.9500 0.9900
## Prevalence 0.3333 0.3600 0.3067
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3333 0.3333
## Balanced Accuracy 1.0000 0.9485 0.9651
#https://en.wikipedia.org/wiki/Sensitivity_and_specificity
分為訓練與測試資料集
#.Random.seed
set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36 2
sample.int(42,6)
## [1] 23 37 42 18 41 17
sample.int(42,6)
## [1] 29 24 5 36 10 2
set.seed(123)
sample.int(42,6)
## [1] 13 33 17 35 36 2
sample.int(42,6)
## [1] 23 37 42 18 41 17
sample.int(42,6)
## [1] 29 24 5 36 10 2
set.seed(123)
ind = sample(c(1,2), nrow(iris), replace=TRUE, prob = c(0.7,0.3))
#ind == 1
trainset = iris[ind == 1 , ]
testset = iris[ind == 2 , ]
dim(trainset)
## [1] 106 5
dim(testset)
## [1] 44 5
使用訓練資料集建立模型
fit2 <- rpart(Species ~ . , data = trainset)
plot(fit2, margin =0.1)
text(fit2)

plot(trainset$Petal.Length, trainset$Petal.Width)

plot(trainset$Petal.Length, trainset$Petal.Width, col=trainset$Species)
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
abline(v= 2.45, col="blue")
abline(v= 4.75, col="orange")

使用測試資料集驗證模型
trainpred <- predict(fit2, trainset[,-5], type= "class")
traincm <- table(trainpred, trainset[,5])
confusionMatrix(traincm)
## Confusion Matrix and Statistics
##
##
## trainpred setosa versicolor virginica
## setosa 35 0 0
## versicolor 0 34 0
## virginica 0 2 35
##
## Overall Statistics
##
## Accuracy : 0.9811
## 95% CI : (0.9335, 0.9977)
## No Information Rate : 0.3396
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9717
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9444 1.0000
## Specificity 1.0000 1.0000 0.9718
## Pos Pred Value 1.0000 1.0000 0.9459
## Neg Pred Value 1.0000 0.9722 1.0000
## Prevalence 0.3302 0.3396 0.3302
## Detection Rate 0.3302 0.3208 0.3302
## Detection Prevalence 0.3302 0.3208 0.3491
## Balanced Accuracy 1.0000 0.9722 0.9859
testpred <- predict(fit2, testset[,-5], type= "class")
testcm <- table(testpred, testset[,5])
confusionMatrix(testcm)
## Confusion Matrix and Statistics
##
##
## testpred setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 1
## virginica 0 4 14
##
## Overall Statistics
##
## Accuracy : 0.8864
## 95% CI : (0.7544, 0.9621)
## No Information Rate : 0.3409
## P-Value [Acc > NIR] : 8.552e-14
##
## Kappa : 0.8291
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.7143 0.9333
## Specificity 1.0000 0.9667 0.8621
## Pos Pred Value 1.0000 0.9091 0.7778
## Neg Pred Value 1.0000 0.8788 0.9615
## Prevalence 0.3409 0.3182 0.3409
## Detection Rate 0.3409 0.2273 0.3182
## Detection Prevalence 0.3409 0.2500 0.4091
## Balanced Accuracy 1.0000 0.8405 0.8977
使用caret 套件
library(caret)
fitControl <- trainControl(## 10-fold CV
method = "repeatedcv",
number = 10,
repeats = 10)
rpartFit <- train(Species ~ ., data = iris,
method = "rpart",
trControl = fitControl)
rpartFit
## CART
##
## 150 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 135, 135, 135, 135, 135, 135, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.00 0.9346667 0.902
## 0.44 0.7906667 0.686
## 0.50 0.3333333 0.000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.
cm = table(predict(rpartFit, iris), iris[,5])
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
安裝caret, e1071
install.packages('caret', repo='http://nbcgib.uesc.br/mirrors/cran/')
install.packages('e1071', repo='http://nbcgib.uesc.br/mirrors/cran/')
客戶流失分析
讀取資料
#install.packages("C50")
library(C50)
## Warning: package 'C50' was built under R version 3.2.5
data(churn)
str(churnTrain)
## 'data.frame': 3333 obs. of 20 variables:
## $ state : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
## $ account_length : int 128 107 137 84 75 118 121 147 117 141 ...
## $ area_code : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
將資料區分成訓練與測試資料集
churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]
set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset = churnTrain[ind == 1,]
testset = churnTrain[ind == 2,]
建立分類樹
library(rpart)
churn.rp <- rpart(churn ~ ., data=trainset)
plot(churn.rp, margin= 0.1)
text(churn.rp, all=TRUE, use.n = TRUE)

產生Confusion Matrix
pred <- predict(churn.rp, testset, type="class")
cm = table(testset$churn, pred)
library(caret)
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
## pred
## yes no
## yes 100 41
## no 18 859
##
## Accuracy : 0.942
## 95% CI : (0.9259, 0.9556)
## No Information Rate : 0.8841
## P-Value [Acc > NIR] : 2.052e-10
##
## Kappa : 0.7393
## Mcnemar's Test P-Value : 0.004181
##
## Sensitivity : 0.84746
## Specificity : 0.95444
## Pos Pred Value : 0.70922
## Neg Pred Value : 0.97948
## Prevalence : 0.11591
## Detection Rate : 0.09823
## Detection Prevalence : 0.13851
## Balanced Accuracy : 0.90095
##
## 'Positive' Class : yes
##
table(testset$churn)
##
## yes no
## 141 877
877 / (141 + 877)
## [1] 0.8614931
進行剪枝
min(churn.rp$cptable[,"xerror"])
## [1] 0.4707602
which.min(churn.rp$cptable[,"xerror"])
## 7
## 7
churn.cp = churn.rp$cptable[7,"CP"]
prune.tree = prune(churn.rp, cp= churn.cp)
plot(prune.tree, margin= 0.1)
text(prune.tree, all=TRUE , use.n=TRUE)

手工製做ROC Curve
head(predict(churn.rp, testset, type="prob"))
## yes no
## 2 0.02877238 0.97122762
## 5 0.05084746 0.94915254
## 6 0.05084746 0.94915254
## 8 0.05084746 0.94915254
## 13 0.02877238 0.97122762
## 16 0.95522388 0.04477612
x = c()
y = c()
for (threshold in seq(0.1,0.9,0.1)){
yes = predict(churn.rp, testset, type="prob")[,1]
pred = ifelse(yes > threshold, 0, 1)
pred = factor(pred, labels =c('yes', 'no'))
ct = table(pred, testset$churn)
cm = confusionMatrix(ct)
y = c(y, cm$byClass[1])
x = c(x, 1 - cm$byClass[2])
}
plot(x,y)
lines(x,y, col="red")

使用套件製作ROC Curve
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(churn.rp, testset, type="prob")
yes <- predictions[, 1]
pred.rocr <- prediction(yes, as.factor(testset[,(dim(testset)[[2]])]))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

各種分類
#rpart
fit = rpart(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
#svm
library(e1071)
## Warning: package 'e1071' was built under R version 3.2.5
fit = svm(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48
#naive bayes
library(e1071)
fit = naiveBayes(Species ~., data = iris)
table(predict(fit,iris[,-5], type="class"), iris$Species)
##
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 47 3
## virginica 0 3 47
新聞分類
讀取資料集
download.file('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/appledaily.RData', 'appledaily.RData')
load('appledaily.RData')
apple.subset = appledaily[appledaily$category %in% c('財經', '娛樂', '社會'),]
table(apple.subset$category)
##
## 社會 娛樂 財經
## 194 113 121
使用 Jieba斷詞
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.2.5
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.2.5
mixseg = worker()
apple.seg =lapply(apple.subset$content, function(e)segment(code=e, jiebar=mixseg))
建立詞頻矩陣
source('https://github.com/ywchiu/rtibame/raw/master/Lib/CNCorpus.R')
## Warning: package 'tm' was built under R version 3.2.5
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
doc=CNCorpus(apple.seg)
doc=unlist(tm_map(doc,jieba_tokenizer),recursive=F)
doc=lapply(doc,function(d)paste(d,collapse=' '))
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer)
dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)
dim(dtm)
## [1] 428 18107
篩選詞頻矩陣
ft <- findFreqTerms(dtm, 5)
control.list=list(wordLengths=c(2,Inf),tokenize=space_tokenizer,dictionary =ft)
new.dtm=DocumentTermMatrix(Corpus(VectorSource(doc)),control=control.list)
dim(new.dtm)
## [1] 428 3168
矩陣轉換
#as.character(apple.subset[1,])
#inspect(new.dtm[1,])
convert_counts <- function(x) {
x <- ifelse(x > 0, 1, 0)
x <- factor(x, levels = c(0, 1), labels = c("No", "Yes"))
return(x)
}
dtm.count <- apply(new.dtm, MARGIN = 2, convert_counts)
#dtm.count[1,]
將資料列為訓練跟測試資料集
m <- as.data.frame(dtm.count)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx==1,]
testset <- m[idx==2,]
traintitle <- apple.subset[idx==1,"title"]
testtitle <- apple.subset[idx==2,"title"]
traintag <- apple.subset[idx==1,"category"]
testtag <-apple.subset[idx==2,"category"]
traintag = as.factor(traintag)
testtag = as.factor(testtag)
dim(trainset)
## [1] 284 3168
dim(testset)
## [1] 144 3168
使用Naive Bayes 建立模型
library(e1071)
model <- naiveBayes(trainset, traintag)
pred <- predict(model, testset)
cm <- table(pred, testtag)
cm
## testtag
## pred 社會 娛樂 財經
## 社會 66 1 1
## 娛樂 0 29 3
## 財經 0 0 44
testtitle[pred != testtag]
## [1] "【唱新聞】詐騙嗎?R.O.C.有CHINA但不是CHINA"
## [2] "璩美鳳老公說她是石油公司副總 公司卻發聲明..."
## [3] "【壹週刊】從貴婦到攤販 看見台灣媳婦的堅毅"
## [4] "【開了等於沒開?】浩鼎秘密會議 立委怒了"
## [5] "大倒角握感如何? 網友試機HTC 10"
pred[pred != testtag]
## [1] 社會 娛樂 娛樂 社會 娛樂
## Levels: 社會 娛樂 財經
testtag[pred != testtag]
## [1] 娛樂 財經 財經 財經 財經
## Levels: 社會 娛樂 財經