作業五

# Fetch oneday2.csv into the working directory and read it back, keeping
# text columns as character rather than factor.
download.file(
  url = "https://github.com/ywchiu/rtibame/raw/master/Data/oneday2.csv",
  destfile = "oneday.csv"
)
oneday <- read.csv(file = "oneday.csv", stringsAsFactors = FALSE)

# Quick look at the first rows and at the column types.
head(oneday)
str(oneday)

library(jiebaR)

# Segment the `content` column with a default jiebaR worker and count how
# often each token occurs.
mixseg <- worker()
oneday.seg <- segment(oneday$content, mixseg)
tb <- table(oneday.seg)

# Keep only tokens that are at least two characters long, occur at least
# ten times, and contain a character in the range U+4E00..U+9FA5.
tb <- tb[
  nchar(names(tb)) >= 2 &
    tb >= 10 &
    grepl("[\u4e00-\u9fa5]+", x = names(tb))
]
tb

library(wordcloud2)

# Render the filtered term frequencies as a star-shaped word cloud.
wordcloud2(data = tb, shape = "star")

Iris classification

# Load the built-in iris data and grow a classification tree that predicts
# Species from the four sepal/petal measurements.
data(iris)
library(rpart)
fit <- rpart(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)

# Print the fitted tree (one line per node).
fit
n= 150 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
  2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
  3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
    6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
    7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
# Draw the fitted tree (margin leaves extra white space around it), then add
# the node labels on top of the drawing.
plot(fit, margin=0.1)
text(fit)

# Scatter the two petal measurements coloured by species, then overlay the
# split thresholds reported in the printed tree: Petal.Length = 2.45 as a
# vertical line and Petal.Width = 1.75 as a horizontal line.
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col="orange")
abline(h = 1.75, col="blue")

# Score one hand-made observation. No `type` is given, so the result is one
# column per species rather than a single class label.
predict(fit, data.frame(Petal.Length = 2, Petal.Width= 3, Sepal.Length = 2, Sepal.Width = 2))
  setosa versicolor virginica
1      1          0         0
# Predict a class label for every row of the data the tree was fitted on.
predicted <- predict(fit, iris, type = "class")

# Cross-tabulate predictions (rows) against the true species (columns).
tb <- table(predicted, iris$Species)

# Accuracy = correctly classified / total. It is read off the table itself
# rather than typed in as fixed counts, so it stays correct if the model or
# the data change.
sum(diag(tb)) / sum(tb)
[1] 0.96
# One-time setup if the packages are missing:
#install.packages('caret')
#install.packages('e1071')
library(caret)
# Summarise the prediction-vs-actual table `tb` and print the report.
cm <- confusionMatrix(tb)
cm
Confusion Matrix and Statistics

            
predicted    setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         49         5
  virginica       0          1        45

Overall Statistics
                                         
               Accuracy : 0.96           
                 95% CI : (0.915, 0.9852)
    No Information Rate : 0.3333         
    P-Value [Acc > NIR] : < 2.2e-16      
                                         
                  Kappa : 0.94           
 Mcnemar's Test P-Value : NA             

Statistics by Class:

                     Class: setosa Class: versicolor
Sensitivity                 1.0000            0.9800
Specificity                 1.0000            0.9500
Pos Pred Value              1.0000            0.9074
Neg Pred Value              1.0000            0.9896
Prevalence                  0.3333            0.3333
Detection Rate              0.3333            0.3267
Detection Prevalence        0.3333            0.3600
Balanced Accuracy           1.0000            0.9650
                     Class: virginica
Sensitivity                    0.9000
Specificity                    0.9900
Pos Pred Value                 0.9783
Neg Pred Value                 0.9519
Prevalence                     0.3333
Detection Rate                 0.3000
Detection Prevalence           0.3067
Balanced Accuracy              0.9450
# Data sampling: fix the seed so the random draws below are repeatable.
set.seed(123)
# Draw 6 integers from 1..42.
sample.int(42, 6)
[1] 13 33 17 35 36  2
# The seed is not reset here, so this draw continues the random stream and
# returns different values from the previous call.
sample.int(42, 6)
[1] 23 37 42 18 41 17
# Third draw from the same stream; a value can reappear across separate calls.
sample.int(42, 6)
[1] 29 24  5 36 10  2
# Mask-style subsetting: keep the elements of `a` whose matching flag in
# `ix` equals 1.
a <- c(1, 2, 3, 4, 5)
ix <- c(1, 0, 1, 0, 1)
a[ix == 1]
[1] 1 3 5
# Split the data into a training set and a test set.
# First check how many rows are available to split.
nrow(iris)
[1] 150
# Reproducible random split: each row independently receives label 1
# (training) with probability 0.7 or label 2 (test) with probability 0.3, so
# the resulting sizes are only approximately 70/30.
set.seed(123)
idx <- sample.int(n = 2, size = nrow(iris), replace = TRUE, prob = c(0.7, 0.3))
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
# Build model: first check the size of the training split.
dim(trainset)
[1] 106   5
# Size of the held-out test split.
dim(testset)
[1] 44  5
# Fit the tree on the training rows only, with every other column of
# `trainset` as a candidate predictor of Species, then print it.
fit <- rpart(Species ~ ., data = trainset)
fit
n= 106 

node), split, n, loss, yval, (yprob)
      * denotes terminal node

1) root 106 70 versicolor (0.33018868 0.33962264 0.33018868)  
  2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
  3) Petal.Length>=2.45 71 35 versicolor (0.00000000 0.50704225 0.49295775)  
    6) Petal.Length< 4.75 34  0 versicolor (0.00000000 1.00000000 0.00000000) *
    7) Petal.Length>=4.75 37  2 virginica (0.00000000 0.05405405 0.94594595) *
# Draw the tree fitted on the training set and add its node labels.
plot(fit, margin = 0.1)
text(fit)

# Apply the model to the held-out test set.
predicted <- predict(fit, testset, type = "class")

# confusionMatrix() reads a table as predictions in rows and true classes in
# columns, which is the orientation already used for the full-data table
# earlier in this script. With the two arguments swapped, accuracy is
# unaffected but the per-class statistics (sensitivity, specificity,
# prevalence, ...) are computed against the wrong margin.
tb <- table(predicted, testset$Species)
cm <- confusionMatrix(tb)
# NOTE: the report recorded below this chunk was produced with the swapped
# table; re-run to refresh its per-class figures.
cm
Confusion Matrix and Statistics

            predicted
             setosa versicolor virginica
  setosa         15          0         0
  versicolor      0         10         4
  virginica       0          1        14

Overall Statistics
                                          
               Accuracy : 0.8864          
                 95% CI : (0.7544, 0.9621)
    No Information Rate : 0.4091          
    P-Value [Acc > NIR] : 6.207e-11       
                                          
                  Kappa : 0.8291          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: setosa Class: versicolor
Sensitivity                 1.0000            0.9091
Specificity                 1.0000            0.8788
Pos Pred Value              1.0000            0.7143
Neg Pred Value              1.0000            0.9667
Prevalence                  0.3409            0.2500
Detection Rate              0.3409            0.2273
Detection Prevalence        0.3409            0.3182
Balanced Accuracy           1.0000            0.8939
                     Class: virginica
Sensitivity                    0.7778
Specificity                    0.9615
Pos Pred Value                 0.9333
Neg Pred Value                 0.8621
Prevalence                     0.4091
Detection Rate                 0.3182
Detection Prevalence           0.3409
Balanced Accuracy              0.8697
# Apply the model back to the training set, to compare with the test result.
predicted2 <- predict(fit, trainset, type = "class")

# Predictions go in the rows and true classes in the columns, as
# confusionMatrix() expects for a table; the reverse order mislabels the
# per-class statistics.
tb2 <- table(predicted2, trainset$Species)
cm2 <- confusionMatrix(tb2)
# NOTE: the report recorded below this chunk was produced with the swapped
# table; re-run to refresh its per-class figures.
cm2
Confusion Matrix and Statistics

            predicted2
             setosa versicolor virginica
  setosa         35          0         0
  versicolor      0         34         2
  virginica       0          0        35

Overall Statistics
                                          
               Accuracy : 0.9811          
                 95% CI : (0.9335, 0.9977)
    No Information Rate : 0.3491          
    P-Value [Acc > NIR] : < 2.2e-16       
                                          
                  Kappa : 0.9717          
 Mcnemar's Test P-Value : NA              

Statistics by Class:

                     Class: setosa Class: versicolor
Sensitivity                 1.0000            1.0000
Specificity                 1.0000            0.9722
Pos Pred Value              1.0000            0.9444
Neg Pred Value              1.0000            1.0000
Prevalence                  0.3302            0.3208
Detection Rate              0.3302            0.3208
Detection Prevalence        0.3302            0.3396
Balanced Accuracy           1.0000            0.9861
                     Class: virginica
Sensitivity                    0.9459
Specificity                    1.0000
Pos Pred Value                 1.0000
Neg Pred Value                 0.9718
Prevalence                     0.3491
Detection Rate                 0.3302
Detection Prevalence           0.3302
Balanced Accuracy              0.9730

Churn rate analysis

#install.packages('C50')
library(C50)

# Load the churn data; the code below expects it to provide `churnTrain`.
data(churn)

# Remove the state, area_code and account_length columns before modelling.
churnTrain <- churnTrain[, !(names(churnTrain) %in%
  c("state", "area_code", "account_length"))]

# Reproducible random split: label 1 (probability 0.7) is the training set,
# label 2 (probability 0.3) is the test set.
set.seed(2)
idx <- sample.int(n = 2, size = nrow(churnTrain), replace = TRUE, prob = c(0.7, 0.3))
trainset <- churnTrain[idx == 1, ]
testset <- churnTrain[idx == 2, ]

# Fit a classification tree for `churn` on all remaining columns and draw it.
churn.rp <- rpart(churn ~ ., data = trainset)
plot(churn.rp, margin = 0.1)
text(churn.rp)

Verify Classification Result

# NOTE(review): neither mgraph() nor `L` is defined or loaded anywhere in the
# visible script, which matches the "could not find function" error recorded
# below. Presumably a package providing mgraph() has to be attached and `L`
# built from a fitted model first -- confirm before running.
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)
Error: could not find function "mgraph"

Distance

# Two binary vectors used to illustrate distance measures.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)

# Euclidean distance: square root of the summed squared differences.
sqrt(sum((x - y)^2))
[1] 1.414214
# Same Euclidean distance via dist(), with x and y stacked as two rows.
dist(rbind(x,y), method = 'euclidean')
         x
y 1.414214
# Manhattan distance: sum of the absolute coordinate differences.
sum(abs(x - y))
[1] 2
# Same Manhattan distance via dist(), with x and y stacked as two rows.
dist(rbind(x,y), method = 'manhattan')
  x
y 2

Clustering

# Hierarchical clustering of the iris measurements.
data(iris)

# Leave out column 5 so that only the remaining columns enter the
# Euclidean distance matrix.
dist.iris <- dist(iris[, -5], method = "euclidean")
hc <- hclust(dist.iris, method = "ward.D2")
plot(hc)

# Cut the dendrogram into three clusters and show each row's cluster id.
fit <- cutree(hc, k = 3)
fit
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [30] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
 [59] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2
 [88] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3 3 3 2 2 3
[117] 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3
[146] 3 2 3 3 2
# Redraw the dendrogram with smaller labels and outline the three clusters
# in red.
plot(hc, hang =-0.01, cex=0.7)
rect.hclust(hc, k =3, border="red")
# NOTE: identical to the earlier cutree(hc, k = 3) call, so `fit` holds the
# same cluster ids as before.
fit <- cutree(hc, k = 3)
fit
  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 [30] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
 [59] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2
 [88] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3 3 3 2 2 3
[117] 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3
[146] 3 2 3 3 2
# Compare the true species with the cluster assignment side by side.
# par() returns the previous settings; keep them so the 1x2 layout can be
# undone afterwards and later plots get the whole device again.
old.par <- par(mfrow = c(1, 2))

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, main = "with species")
plot(iris$Petal.Length, iris$Petal.Width, col = fit, main = "clustering result")

# Restore the layout that was active before this comparison.
par(old.par)

Article Clustering

# Two term-count vectors used to illustrate cosine similarity.
a <- c(1, 2, 2, 1, 1, 1, 0)
b <- c(1, 2, 2, 1, 1, 2, 1)

# Cosine similarity: dot product divided by the product of the two norms.
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
# Cross-check with the proxy package. The result is subtracted from 1 here,
# presumably because proxy reports "cosine" as a distance rather than a
# similarity -- confirm against the hand-computed value above.
#install.packages('proxy')
1 - proxy::dist(rbind(a,b), method="cosine")

# Download and load data.
# An .RData file is binary, so request a binary transfer explicitly; a
# text-mode download can corrupt it on Windows.
download.file(
  "https://raw.githubusercontent.com/ywchiu/rtibame/master/History/Class1/news_big5.RData",
  "new.RData",
  mode = "wb"
)
# NOTE(review): load() restores whatever objects the downloaded file contains
# into the workspace; the code that follows expects it to provide `news`.
load("new.RData")

# Data preprocessing: name the three columns and make sure the two text
# columns are plain character vectors.
names(news) <- c("title", "content", "articleid")
news[c("title", "content")] <- lapply(news[c("title", "content")], as.character)

library(jiebaR)
mixseg <- worker()

# Segment each article on its own so every document keeps a separate token
# vector (one list element per article).
news.seg <- lapply(news$content, segment, mixseg)

# NOTE(review): this runs R code fetched at run time from a remote branch
# that can change; consider keeping a reviewed local copy instead.
source("https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R")

# Build a corpus from the segmented articles. CNCorpus() and space_tokenizer
# are not defined in this script; presumably they come from the CNCorpus.R
# file sourced earlier -- confirm.
corpus <- CNCorpus(news.seg)

# Term options: keep terms of length 2 or more, tokenised by space_tokenizer.
control.list <- list(wordLengths = c(2, Inf), tokenize = space_tokenizer)

# Strip numbers, then build the document-term matrix. The options are passed
# by name with `=`; writing `control <- control.list` inside the call would
# create a stray global variable and only work by positional accident.
doc <- tm_map(corpus, removeNumbers)
dtm <- DocumentTermMatrix(doc, control = control.list)

# Drop the sparsest terms (sparsity threshold 0.99) and show a summary.
dtm.remove <- removeSparseTerms(dtm, 0.99)
dtm.remove
#dtm$dimnames$Terms


# Pairwise cosine distance between documents, also as a full square matrix
# so a single article's row can be looked up.
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = "cosine")
dtm.mat <- as.matrix(dtm.dist)

# Row index of the article to find neighbours for, and how many to list.
query.id <- 127
top.n <- 10

# Smallest distances first. head() returns fewer rows instead of padding
# with NA when the corpus holds fewer than top.n documents.
idx <- head(order(dtm.mat[query.id, ]), top.n)

# Titles of the closest articles next to their distance to the query article.
cbind(news$title[idx], dtm.mat[query.id, idx])



# Article clustering: hierarchical clustering (ward.D2) on the cosine
# distances between documents, drawn as a dendrogram.
dtm.cluster <- hclust(dtm.dist, method = "ward.D2")
plot(dtm.cluster, hang=-1)
# Cut the tree into six clusters; `fit` holds one cluster id per article.
fit <- cutree(dtm.cluster, k = 6)

# List the titles of the articles assigned to cluster 5.
news$title[fit ==5]
