Decision Tree

data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)

fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col= "orange")
abline(h = 1.75, col= "blue")


predicted <- predict(fit, iris[,1:4], type= 'class')

table(predicted, iris[,5])
##             
## predicted    setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
testing_data <- data.frame(Sepal.Length=5, Sepal.Width = 3, Petal.Length=2, Petal.Width = 0.3)

predict(fit, testing_data, type= "class")
##      1 
## setosa 
## Levels: setosa versicolor virginica
save(x=fit, file= 'classfication_tree.RData')
load('classfication_tree.RData')

testing_data <- data.frame(Sepal.Length=5, Sepal.Width = 3, Petal.Length=2, Petal.Width = 0.3)

predict(fit, testing_data, type= "class")
##      1 
## setosa 
## Levels: setosa versicolor virginica
# Using Caret to get precision rate
predicted <- predict(fit, iris[,1:4], type= 'class')
cm <- table(predicted, iris[,5])

library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3

confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##             
## predicted    setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450

測試模型

set.seed(123)
sample.int(46, 6, replace=FALSE)
## [1] 14 36 18 38 40  2
set.seed(123)

idx <- sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7,0.3))
table(idx)
## idx
##   1   2 
## 106  44
trainset <- iris[idx == 1,]
testset  <- iris[idx == 2,]

library(rpart)
fit <- rpart(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
## 
##             
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977
plot(fit, margin = 0.1)
text(fit)

plot(testset$Petal.Length, testset$Petal.Width, col = testset$Species)
abline(v = 2.45, col= "orange")
abline(v = 4.75, col= "blue")

## Change Classfication Model

library(rpart)
fit <- rpart(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
tb
##             
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
# install.packages('party')
library(party)
## Warning: package 'party' was built under R version 3.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.3.3
fit <- ctree(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'response'), testset[,5])
tb
##             
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         13         2
##   virginica       0          1        13
# install.packages('e1071')
library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
fit <- naiveBayes(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
tb
##             
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         11         2
##   virginica       0          3        13
library(e1071)
fit <- svm(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
tb
##             
##              setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         11         1
##   virginica       0          3        14

News Classfication

# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')
dim(applenews)
## [1] 1500    5
apple.subset <- applenews[applenews$category %in% c('社會', '財經', '娛樂'), ]

apple.subset$category <-  factor(apple.subset$category)

library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.3.3
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.3.3
mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(e) segment(e, jiebar = mixseg))

library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
doc <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(doc, control = list(removePunctuation=TRUE, removeNumbers=TRUE ))
dtm.remove <- removeSparseTerms(dtm, 0.99)

dim(dtm.remove)
## [1]  428 1940
dtm.mat <- as.matrix(dtm.remove)


convert_count <- function(e){
  x <- ifelse(e > 0, 1, 0)
  x <- factor(x, levels = c(0,1), labels = c('no', 'yes'))
  return(x)
}

dtm.count  <- apply(dtm.mat, MARGIN=2, convert_count)
dim(dtm.count)
## [1]  428 1940
m <- as.data.frame(dtm.count)
#View(m)

idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx == 1, ]
testset  <- m[idx == 2, ]
traintag <- apple.subset[idx==1, 'category']
testtag  <- apple.subset[idx==2, 'category']

traintitle <- apple.subset[idx==1, 'title']
testtitle <- apple.subset[idx==2, 'title']


library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred  <- predict(model, testset)
tb    <- table(testtag, pred)
tb
##        pred
## testtag 社會 娛樂 財經
##    社會   60    3    0
##    娛樂    0   29    0
##    財經    1    1   40
library(caret)
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
## 
##        pred
## testtag 社會 娛樂 財經
##    社會   60    3    0
##    娛樂    0   29    0
##    財經    1    1   40
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9627          
##                  95% CI : (0.9151, 0.9878)
##     No Information Rate : 0.4552          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.9416          
##  Mcnemar's Test P-Value : 0.1718          
## 
## Statistics by Class:
## 
##                      Class: 社會 Class: 娛樂 Class: 財經
## Sensitivity               0.9836      0.8788      1.0000
## Specificity               0.9589      1.0000      0.9787
## Pos Pred Value            0.9524      1.0000      0.9524
## Neg Pred Value            0.9859      0.9619      1.0000
## Prevalence                0.4552      0.2463      0.2985
## Detection Rate            0.4478      0.2164      0.2985
## Detection Prevalence      0.4701      0.2164      0.3134
## Balanced Accuracy         0.9713      0.9394      0.9894