data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
library(rpart)
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
fit
## n= 150
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
## 7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col= "orange")
abline(h = 1.75, col= "blue")
predicted <- predict(fit, iris[,1:4], type= 'class')
table(predicted, iris[,5])
##
## predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
testing_data <- data.frame(Sepal.Length=5, Sepal.Width = 3, Petal.Length=2, Petal.Width = 0.3)
predict(fit, testing_data, type= "class")
## 1
## setosa
## Levels: setosa versicolor virginica
save(x=fit, file= 'classfication_tree.RData')
load('classfication_tree.RData')
testing_data <- data.frame(Sepal.Length=5, Sepal.Width = 3, Petal.Length=2, Petal.Width = 0.3)
predict(fit, testing_data, type= "class")
## 1
## setosa
## Levels: setosa versicolor virginica
# Using Caret to get precision rate
predicted <- predict(fit, iris[,1:4], type= 'class')
cm <- table(predicted, iris[,5])
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
confusionMatrix(cm)
## Confusion Matrix and Statistics
##
##
## predicted setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 5
## virginica 0 1 45
##
## Overall Statistics
##
## Accuracy : 0.96
## 95% CI : (0.915, 0.9852)
## No Information Rate : 0.3333
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.94
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.9800 0.9000
## Specificity 1.0000 0.9500 0.9900
## Pos Pred Value 1.0000 0.9074 0.9783
## Neg Pred Value 1.0000 0.9896 0.9519
## Prevalence 0.3333 0.3333 0.3333
## Detection Rate 0.3333 0.3267 0.3000
## Detection Prevalence 0.3333 0.3600 0.3067
## Balanced Accuracy 1.0000 0.9650 0.9450
set.seed(123)
sample.int(46, 6, replace=FALSE)
## [1] 14 36 18 38 40 2
set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7,0.3))
table(idx)
## idx
## 1 2
## 106 44
trainset <- iris[idx == 1,]
testset <- iris[idx == 2,]
library(rpart)
fit <- rpart(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
##
##
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 1
## virginica 0 4 14
##
## Overall Statistics
##
## Accuracy : 0.8864
## 95% CI : (0.7544, 0.9621)
## No Information Rate : 0.3409
## P-Value [Acc > NIR] : 8.552e-14
##
## Kappa : 0.8291
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.7143 0.9333
## Specificity 1.0000 0.9667 0.8621
## Pos Pred Value 1.0000 0.9091 0.7778
## Neg Pred Value 1.0000 0.8788 0.9615
## Prevalence 0.3409 0.3182 0.3409
## Detection Rate 0.3409 0.2273 0.3182
## Detection Prevalence 0.3409 0.2500 0.4091
## Balanced Accuracy 1.0000 0.8405 0.8977
plot(fit, margin = 0.1)
text(fit)
plot(testset$Petal.Length, testset$Petal.Width, col = testset$Species)
abline(v = 2.45, col= "orange")
abline(v = 4.75, col= "blue")
## Change Classfication Model
library(rpart)
fit <- rpart(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
tb
##
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 10 1
## virginica 0 4 14
# install.packages('party')
library(party)
## Warning: package 'party' was built under R version 3.3.3
## Loading required package: grid
## Loading required package: mvtnorm
## Loading required package: modeltools
## Loading required package: stats4
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 3.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 3.3.3
fit <- ctree(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'response'), testset[,5])
tb
##
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 13 2
## virginica 0 1 13
# install.packages('e1071')
library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
fit <- naiveBayes(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
tb
##
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 11 2
## virginica 0 3 13
library(e1071)
fit <- svm(Species ~ ., data = trainset)
tb <- table(predict(fit, testset, type= 'class'), testset[,5])
tb
##
## setosa versicolor virginica
## setosa 15 0 0
## versicolor 0 11 1
## virginica 0 3 14
# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')
dim(applenews)
## [1] 1500 5
apple.subset <- applenews[applenews$category %in% c('社會', '財經', '娛樂'), ]
apple.subset$category <- factor(apple.subset$category)
library(jiebaR)
## Warning: package 'jiebaR' was built under R version 3.3.3
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 3.3.3
mixseg <- worker()
apple.seg <- lapply(apple.subset$content, function(e) segment(e, jiebar = mixseg))
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
doc <- Corpus(VectorSource(apple.seg))
dtm <- DocumentTermMatrix(doc, control = list(removePunctuation=TRUE, removeNumbers=TRUE ))
dtm.remove <- removeSparseTerms(dtm, 0.99)
dim(dtm.remove)
## [1] 428 1940
dtm.mat <- as.matrix(dtm.remove)
convert_count <- function(e){
x <- ifelse(e > 0, 1, 0)
x <- factor(x, levels = c(0,1), labels = c('no', 'yes'))
return(x)
}
dtm.count <- apply(dtm.mat, MARGIN=2, convert_count)
dim(dtm.count)
## [1] 428 1940
m <- as.data.frame(dtm.count)
#View(m)
idx <- sample.int(2, nrow(m), replace=TRUE, prob=c(0.7,0.3))
trainset <- m[idx == 1, ]
testset <- m[idx == 2, ]
traintag <- apple.subset[idx==1, 'category']
testtag <- apple.subset[idx==2, 'category']
traintitle <- apple.subset[idx==1, 'title']
testtitle <- apple.subset[idx==2, 'title']
library(e1071)
model <- naiveBayes(trainset, as.factor(traintag))
pred <- predict(model, testset)
tb <- table(testtag, pred)
tb
## pred
## testtag 社會 娛樂 財經
## 社會 60 3 0
## 娛樂 0 29 0
## 財經 1 1 40
library(caret)
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
##
## pred
## testtag 社會 娛樂 財經
## 社會 60 3 0
## 娛樂 0 29 0
## 財經 1 1 40
##
## Overall Statistics
##
## Accuracy : 0.9627
## 95% CI : (0.9151, 0.9878)
## No Information Rate : 0.4552
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9416
## Mcnemar's Test P-Value : 0.1718
##
## Statistics by Class:
##
## Class: 社會 Class: 娛樂 Class: 財經
## Sensitivity 0.9836 0.8788 1.0000
## Specificity 0.9589 1.0000 0.9787
## Pos Pred Value 0.9524 1.0000 0.9524
## Neg Pred Value 0.9859 0.9619 1.0000
## Prevalence 0.4552 0.2463 0.2985
## Detection Rate 0.4478 0.2164 0.2985
## Detection Prevalence 0.4701 0.2164 0.3134
## Balanced Accuracy 0.9713 0.9394 0.9894