作業五
# Fetch the one-day news sample and load it, keeping text columns as character.
download.file(
  'https://github.com/ywchiu/rtibame/raw/master/Data/oneday2.csv',
  'oneday.csv'
)
oneday <- read.csv('oneday.csv', stringsAsFactors = FALSE)

# Quick look at the first rows and the column structure.
head(oneday)
str(oneday)

# Segment every article body into words with a default jiebaR worker.
library(jiebaR)
mixseg <- worker()
oneday.seg <- segment(oneday$content, mixseg)

# Count term frequencies, then keep only terms that contain a character in
# the U+4E00..U+9FA5 range, occur at least 10 times, and are at least two
# characters long.
tb <- table(oneday.seg)
tb <- tb[
  grepl('[\u4e00-\u9fa5]+', x = names(tb)) &
    (tb >= 10) &
    (nchar(names(tb)) >= 2)
]
tb

# Render the surviving terms as a star-shaped word cloud.
library(wordcloud2)
wordcloud2(tb, shape = "star")
iris classification
# Load the built-in iris measurements.
data(iris)
#iris
library(rpart)
# Grow a tree that predicts Species from the four flower measurements.
fit <- rpart(
  Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width,
  data = iris
)
fit
n= 150
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)
2) Petal.Length< 2.45 50 0 setosa (1.00000000 0.00000000 0.00000000) *
3) Petal.Length>=2.45 100 50 versicolor (0.00000000 0.50000000 0.50000000)
6) Petal.Width< 1.75 54 5 versicolor (0.00000000 0.90740741 0.09259259) *
7) Petal.Width>=1.75 46 1 virginica (0.00000000 0.02173913 0.97826087) *
# Draw the fitted tree and label its splits.
plot(fit, margin = 0.1)
text(fit)

# Scatter the petal measurements coloured by species and overlay the two
# split thresholds reported by the tree (Petal.Length 2.45, Petal.Width 1.75).
plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species)
abline(v = 2.45, col = "orange")
abline(h = 1.75, col = "blue")

# Class probabilities for a single hand-written observation.
predict(
  fit,
  data.frame(
    Sepal.Length = 2,
    Sepal.Width = 2,
    Petal.Length = 2,
    Petal.Width = 3
  )
)
setosa versicolor virginica
1 1 0 0
#predict(fit, iris)
# Predict a class label for every row of iris with the fitted tree.
predicted <- predict(fit, iris, type = 'class')
# Cross-tabulate predictions (rows) against the true species (columns).
tb <- table(predicted, iris$Species)
# accuracy: correctly classified rows (the table diagonal) over all rows,
# derived from the table itself rather than from hand-copied counts.
sum(diag(tb)) / sum(tb)
[1] 0.96
#install.packages('caret')
#install.packages('e1071')
library(caret)
# Summarise the prediction table: overall accuracy plus per-class statistics.
cm <- confusionMatrix(data = tb)
cm
Confusion Matrix and Statistics
predicted setosa versicolor virginica
setosa 50 0 0
versicolor 0 49 5
virginica 0 1 45
Overall Statistics
Accuracy : 0.96
95% CI : (0.915, 0.9852)
No Information Rate : 0.3333
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.94
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor
Sensitivity 1.0000 0.9800
Specificity 1.0000 0.9500
Pos Pred Value 1.0000 0.9074
Neg Pred Value 1.0000 0.9896
Prevalence 0.3333 0.3333
Detection Rate 0.3333 0.3267
Detection Prevalence 0.3333 0.3600
Balanced Accuracy 1.0000 0.9650
Class: virginica
Sensitivity 0.9000
Specificity 0.9900
Pos Pred Value 0.9783
Neg Pred Value 0.9519
Prevalence 0.3333
Detection Rate 0.3000
Detection Prevalence 0.3067
Balanced Accuracy 0.9450
# data sampling
# Fix the RNG seed so the draws below are reproducible.
set.seed(123)
# Draw 6 integers from 1..42; each call advances the RNG, so the three
# consecutive calls return different draws.
sample.int(42, 6)
[1] 13 33 17 35 36 2
sample.int(42, 6)
[1] 23 37 42 18 41 17
sample.int(42, 6)
[1] 29 24 5 36 10 2
# Select the elements of a whose parallel indicator in ix equals 1.
a <- c(1,2,3,4,5)
ix <- c(1,0,1,0,1)
a[ix == 1]
[1] 1 3 5
# split data into trainset and testset
nrow(iris)
[1] 150
set.seed(123)
# Label each row 1 or 2, drawn with replacement with probabilities 0.7 / 0.3;
# rows labelled 1 form the training set and rows labelled 2 the test set.
idx <- sample.int(2, nrow(iris), replace = TRUE, prob=c(0.7,0.3))
trainset <- iris[idx == 1, ]
testset <- iris[idx == 2, ]
# build model
# Check the size of each split before fitting.
dim(trainset)
[1] 106 5
dim(testset)
[1] 44 5
# Fit the tree on the training rows only, using every other column as a predictor.
fit <- rpart(Species ~ ., data = trainset)
fit
n= 106
node), split, n, loss, yval, (yprob)
* denotes terminal node
1) root 106 70 versicolor (0.33018868 0.33962264 0.33018868)
2) Petal.Length< 2.45 35 0 setosa (1.00000000 0.00000000 0.00000000) *
3) Petal.Length>=2.45 71 35 versicolor (0.00000000 0.50704225 0.49295775)
6) Petal.Length< 4.75 34 0 versicolor (0.00000000 1.00000000 0.00000000) *
7) Petal.Length>=4.75 37 2 virginica (0.00000000 0.05405405 0.94594595) *
# Draw the tree fitted on the training set and label its splits.
plot(fit, margin = 0.1)
text(fit)

# apply model on testset
predicted <- predict(fit, testset, type = "class")
# caret::confusionMatrix() reads a table as predictions in rows and reference
# (true) classes in columns, so the predictions must be the first factor --
# the same orientation as the full-data table built earlier in this file.
# NOTE(review): the recorded output that follows was produced with the two
# factors swapped; re-run to refresh the table and the per-class statistics.
tb <- table(predicted, testset$Species)
cm <- confusionMatrix(tb)
cm
Confusion Matrix and Statistics
predicted
setosa versicolor virginica
setosa 15 0 0
versicolor 0 10 4
virginica 0 1 14
Overall Statistics
Accuracy : 0.8864
95% CI : (0.7544, 0.9621)
No Information Rate : 0.4091
P-Value [Acc > NIR] : 6.207e-11
Kappa : 0.8291
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor
Sensitivity 1.0000 0.9091
Specificity 1.0000 0.8788
Pos Pred Value 1.0000 0.7143
Neg Pred Value 1.0000 0.9667
Prevalence 0.3409 0.2500
Detection Rate 0.3409 0.2273
Detection Prevalence 0.3409 0.3182
Balanced Accuracy 1.0000 0.8939
Class: virginica
Sensitivity 0.7778
Specificity 0.9615
Pos Pred Value 0.9333
Neg Pred Value 0.8621
Prevalence 0.4091
Detection Rate 0.3182
Detection Prevalence 0.3409
Balanced Accuracy 0.8697
# apply model on trainset
predicted2 <- predict(fit, trainset, type = "class")
# Predictions go first: caret::confusionMatrix() treats table rows as
# predictions and table columns as the reference (true) classes.
# NOTE(review): the recorded output that follows was produced with the two
# factors swapped; re-run to refresh the table and the per-class statistics.
tb2 <- table(predicted2, trainset$Species)
cm2 <- confusionMatrix(tb2)
cm2
Confusion Matrix and Statistics
predicted2
setosa versicolor virginica
setosa 35 0 0
versicolor 0 34 2
virginica 0 0 35
Overall Statistics
Accuracy : 0.9811
95% CI : (0.9335, 0.9977)
No Information Rate : 0.3491
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9717
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: setosa Class: versicolor
Sensitivity 1.0000 1.0000
Specificity 1.0000 0.9722
Pos Pred Value 1.0000 0.9444
Neg Pred Value 1.0000 1.0000
Prevalence 0.3302 0.3208
Detection Rate 0.3302 0.3208
Detection Prevalence 0.3302 0.3396
Balanced Accuracy 1.0000 0.9861
Class: virginica
Sensitivity 0.9459
Specificity 1.0000
Pos Pred Value 1.0000
Neg Pred Value 0.9718
Prevalence 0.3491
Detection Rate 0.3302
Detection Prevalence 0.3302
Balanced Accuracy 0.9730
Churn rate analysis
#install.packages('C50')
library(C50)
data(churn)
#churnTrain
# Drop the state, area_code and account_length columns; keep everything else.
churnTrain <- churnTrain[, setdiff(
  names(churnTrain),
  c("state", "area_code", "account_length")
)]

# Label each row 1 or 2 with probabilities 0.7 / 0.3 and split on that label.
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace = TRUE, prob = c(0.7, 0.3))
trainset <- churnTrain[idx == 1, ]
testset <- churnTrain[idx == 2, ]

# Fit a tree for churn on all remaining columns, then draw and label it.
churn.rp <- rpart(churn ~ ., data = trainset)
plot(churn.rp, margin = 0.1)
text(churn.rp)

Verify Classification Result
# NOTE(review): this call fails, as the recorded error below shows -- no
# package providing mgraph() is loaded (presumably rminer::mgraph -- TODO
# confirm), and `L` is not defined anywhere in the visible part of this file.
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)
Error: could not find function "mgraph"
Distance
# Two binary vectors that differ in exactly two positions.
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
# euclidean: square root of the summed squared differences
sqrt(sum((x - y) ^ 2))
[1] 1.414214
# Same distance via dist(), treating x and y as the two rows of a matrix.
dist(rbind(x,y), method = 'euclidean')
x
y 1.414214
# manhattan: sum of the absolute differences
sum(abs(x - y))
[1] 2
# Same distance via dist().
dist(rbind(x,y), method = 'manhattan')
x
y 2
Clustering
# Hierarchical clustering of the iris measurements (Species column excluded).
data(iris)
dist.iris <- dist(iris[, names(iris) != "Species"], method = 'euclidean')
hc <- hclust(dist.iris, method = "ward.D2")
plot(hc)

# Cut the dendrogram into three clusters; fit holds one cluster label per row.
fit <- cutree(hc, k = 3)
fit
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[30] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
[59] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2
[88] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3 3 3 2 2 3
[117] 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3
[146] 3 2 3 3 2
# Redraw the dendrogram with smaller labels and outline the three clusters.
plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3, border = "red")
# Cluster membership for k = 3, one label per row of iris.
fit <- cutree(hc, k = 3)
fit
[1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[30] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2
[59] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2
[88] 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3 3 2 3 3 3 3 3 3 2 2 3
[117] 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3 3 3 2 3 3
[146] 3 2 3 3 2
# Show the true species and the clustering result side by side.
# par() returns the previous settings, so keep them and restore the layout
# once both panels are drawn; otherwise every later plot in this script
# would still be drawn into a 1 x 2 grid.
op <- par(mfrow = c(1, 2))

plot(iris$Petal.Length, iris$Petal.Width, col = iris$Species, main = 'with species')
plot(iris$Petal.Length, iris$Petal.Width, col = fit, main = 'clustering result')
par(op)

Article Clustering
# Cosine similarity between two small count vectors, computed by hand ...
a <- c(1, 2, 2, 1, 1, 1, 0)
b <- c(1, 2, 2, 1, 1, 2, 1)
sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
#install.packages('proxy')
# ... and via proxy::dist(); the result is subtracted from 1 to turn the
# cosine distance back into a similarity.
1 - proxy::dist(rbind(a, b), method = "cosine")

# Download and load data
# .RData is a binary format, so request a binary transfer explicitly.
download.file(
  'https://raw.githubusercontent.com/ywchiu/rtibame/master/History/Class1/news_big5.RData',
  'new.RData',
  mode = 'wb'
)
# Presumably this defines the `news` data frame used below -- TODO confirm.
load('new.RData')

# Data Preprocessing
names(news) <- c('title', 'content', 'articleid')
news$title <- as.character(news$title)
news$content <- as.character(news$content)

# Segment each article body separately so the result is one word vector
# per article.
library(jiebaR)
mixseg <- worker()
news.seg <- lapply(news$content, function(e) segment(e, mixseg))

# NOTE(review): source() executes code fetched from the network at run time;
# consider vendoring this helper or pinning the URL to a specific commit.
# CNCorpus() and space_tokenizer are presumably defined by it, and the tm
# functions used below are presumably attached by it -- TODO confirm.
source('https://raw.githubusercontent.com/ywchiu/rtibame/master/Lib/CNCorpus.R')
corpus <- CNCorpus(news.seg)
control.list <- list(wordLengths = c(2, Inf), tokenize = space_tokenizer)
doc <- tm_map(corpus, removeNumbers)
# Pass the options as a named argument. Writing `control <- control.list`
# inside the call would create a stray global `control` and hand the value
# over positionally instead.
dtm <- DocumentTermMatrix(doc, control = control.list)

# Drop sparse terms (sparsity threshold 0.99) to shrink the matrix.
dtm.remove <- removeSparseTerms(dtm, 0.99)
dtm.remove
#dtm$dimnames$Terms

# Pairwise cosine distances between articles, as a full square matrix.
dtm.dist <- proxy::dist(as.matrix(dtm.remove), method = "cosine")
dtm.mat <- as.matrix(dtm.dist)
# Titles and distances of the ten articles with the smallest distance to
# article 127.
idx <- order(dtm.mat[127, ])[1:10]
cbind(news$title[idx], dtm.mat[127, idx])

# article clustering
dtm.cluster <- hclust(dtm.dist, method = "ward.D2")
plot(dtm.cluster, hang = -1)
# Cut into six clusters and list the titles assigned to cluster 5.
fit <- cutree(dtm.cluster, k = 6)
news$title[fit == 5]
