#install.packages('C50')
library(C50)
## Warning: package 'C50' was built under R version 3.3.3
data(churn)
dim(churnTrain)
## [1] 3333 20
View(churnTrain)
churnTrain <- churnTrain[ , ! names(churnTrain) %in% c('state', 'account_length', 'area_code')]
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob= c(0.7, 0.3))
table(idx)
## idx
## 1 2
## 2315 1018
trainset <- churnTrain[idx == 1, ]
testset <- churnTrain[idx == 2, ]
library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)
plot(churn.rp, margin = 0.1)
text(churn.rp)
prediction <- predict(churn.rp, testset, type = 'class')
table(testset$churn, prediction)
## prediction
## yes no
## yes 100 41
## no 18 859
tb <- table(prediction, testset$churn)
library(caret)
## Warning: package 'caret' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
cm <- confusionMatrix(tb)
cm
## Confusion Matrix and Statistics
##
##
## prediction yes no
## yes 100 18
## no 41 859
##
## Accuracy : 0.942
## 95% CI : (0.9259, 0.9556)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7393
## Mcnemar's Test P-Value : 0.004181
##
## Sensitivity : 0.70922
## Specificity : 0.97948
## Pos Pred Value : 0.84746
## Neg Pred Value : 0.95444
## Prevalence : 0.13851
## Detection Rate : 0.09823
## Detection Prevalence : 0.11591
## Balanced Accuracy : 0.84435
##
## 'Positive' Class : yes
##
set.seed(2)
idx <- sample.int(10, nrow(churnTrain), replace=TRUE)
for (i in 1:10){
print(i)
trainset <- churnTrain[idx != i, ]
testset <- churnTrain[idx == i, ]
churn.rp <- rpart(churn ~ ., data = trainset)
prediction <- predict(churn.rp, testset, type = 'class')
tb <- table(prediction, testset$churn)
cm <- confusionMatrix(tb)
print(cm$overall[1])
}
## [1] 1
## Accuracy
## 0.9425982
## [1] 2
## Accuracy
## 0.9354839
## [1] 3
## Accuracy
## 0.95
## [1] 4
## Accuracy
## 0.9333333
## [1] 5
## Accuracy
## 0.9382716
## [1] 6
## Accuracy
## 0.9241983
## [1] 7
## Accuracy
## 0.9345794
## [1] 8
## Accuracy
## 0.9446254
## [1] 9
## Accuracy
## 0.9498607
## [1] 10
## Accuracy
## 0.9517045
library(caret)
control = trainControl(method="repeatedcv", number=10, repeats=3)
#?train
#names(getModelInfo())
model = train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model
## CART
##
## 2981 samples
## 16 predictor
## 2 classes: 'yes', 'no'
##
## Pre-processing: scaled (16)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2682, 2684, 2683, 2684, 2683, 2682, ...
## Resampling results across tuning parameters:
##
## cp Accuracy Kappa
## 0.05069124 0.9143537 0.5865270
## 0.07949309 0.8690623 0.2109233
## 0.08064516 0.8664873 0.1830921
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was cp = 0.05069124.
#predict(model, testset)
set.seed(2)
idx <- sample.int(2, nrow(churnTrain), replace=TRUE, prob= c(0.7, 0.3))
trainset <- churnTrain[idx == 1, ]
testset <- churnTrain[idx == 2, ]
fit <- glm(churn ~ ., data = trainset, family=binomial)
summary(fit)
##
## Call:
## glm(formula = churn ~ ., family = binomial, data = trainset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1519 0.1983 0.3460 0.5186 2.1284
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.3462866 0.8364914 9.978 < 2e-16 ***
## international_planyes -2.0534243 0.1726694 -11.892 < 2e-16 ***
## voice_mail_planyes 1.3445887 0.6618905 2.031 0.042211 *
## number_vmail_messages -0.0155101 0.0209220 -0.741 0.458496
## total_day_minutes 0.2398946 3.9168466 0.061 0.951163
## total_day_calls -0.0014003 0.0032769 -0.427 0.669141
## total_day_charge -1.4855284 23.0402950 -0.064 0.948592
## total_eve_minutes 0.3600678 1.9349825 0.186 0.852379
## total_eve_calls -0.0028484 0.0033061 -0.862 0.388928
## total_eve_charge -4.3204432 22.7644698 -0.190 0.849475
## total_night_minutes 0.4431210 1.0478105 0.423 0.672367
## total_night_calls 0.0003978 0.0033188 0.120 0.904588
## total_night_charge -9.9162795 23.2836376 -0.426 0.670188
## total_intl_minutes 0.4587114 6.3524560 0.072 0.942435
## total_intl_calls 0.1065264 0.0304318 3.500 0.000464 ***
## total_intl_charge -2.0803428 23.5262100 -0.088 0.929538
## number_customer_service_calls -0.5109077 0.0476289 -10.727 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1938.8 on 2314 degrees of freedom
## Residual deviance: 1515.3 on 2298 degrees of freedom
## AIC: 1549.3
##
## Number of Fisher Scoring iterations: 6
pred <- predict(fit,testset, type="response")
Class = pred >.5
summary(Class)
## Mode FALSE TRUE NA's
## logical 58 960 0
tb = table(testset$churn,Class)
tb
## Class
## FALSE TRUE
## yes 29 112
## no 29 848
churn.mod = ifelse(testset$churn == "yes", 1, 0)
pred_class = churn.mod
pred_class[pred<=.5] = 1- pred_class[pred<=.5]
ctb = table(churn.mod, pred_class)
ctb
## pred_class
## churn.mod 0 1
## 0 848 29
## 1 29 112
confusionMatrix(ctb)
## Confusion Matrix and Statistics
##
## pred_class
## churn.mod 0 1
## 0 848 29
## 1 29 112
##
## Accuracy : 0.943
## 95% CI : (0.927, 0.9565)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7613
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9669
## Specificity : 0.7943
## Pos Pred Value : 0.9669
## Neg Pred Value : 0.7943
## Prevalence : 0.8615
## Detection Rate : 0.8330
## Detection Prevalence : 0.8615
## Balanced Accuracy : 0.8806
##
## 'Positive' Class : 0
##
library(e1071)
## Warning: package 'e1071' was built under R version 3.3.3
model.tuned <- svm(churn~., data = trainset, gamma = 10^-2, cost = 100)
summary(model.tuned)
##
## Call:
## svm(formula = churn ~ ., data = trainset, gamma = 10^-2, cost = 100)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 100
## gamma: 0.01
##
## Number of Support Vectors: 547
##
## ( 304 243 )
##
##
## Number of Classes: 2
##
## Levels:
## yes no
svm.tuned.pred <- predict(model.tuned, testset[, !names(testset) %in% c("churn")])
svm.tuned.table=table(svm.tuned.pred, testset$churn)
svm.tuned.table
##
## svm.tuned.pred yes no
## yes 95 24
## no 46 853
classAgreement(svm.tuned.table)
## $diag
## [1] 0.9312377
##
## $kappa
## [1] 0.691678
##
## $rand
## [1] 0.871806
##
## $crand
## [1] 0.6303615
confusionMatrix(svm.tuned.table)
## Confusion Matrix and Statistics
##
##
## svm.tuned.pred yes no
## yes 95 24
## no 46 853
##
## Accuracy : 0.9312
## 95% CI : (0.9139, 0.946)
## No Information Rate : 0.8615
## P-Value [Acc > NIR] : 1.56e-12
##
## Kappa : 0.6917
## Mcnemar's Test P-Value : 0.01207
##
## Sensitivity : 0.67376
## Specificity : 0.97263
## Pos Pred Value : 0.79832
## Neg Pred Value : 0.94883
## Prevalence : 0.13851
## Detection Rate : 0.09332
## Detection Prevalence : 0.11690
## Balanced Accuracy : 0.82320
##
## 'Positive' Class : yes
##
library(rpart)
churn.rp <- rpart(churn ~ ., data = trainset)
prediction <- predict(churn.rp, testset)
roc_x <- c(0)
roc_y <- c(0)
for(i in seq(0.1,1,0.01)){
res <- as.factor(ifelse(prediction[,1] >= i, 'yes', 'no'))
res <- factor(res,levels(res)[c(2,1)])
tb <- table(testset$churn, res)
cm <- confusionMatrix(tb)
x <- 1 - cm$byClass[2]
y <- cm$byClass[1]
roc_x <- c(roc_x, x)
roc_y <- c(roc_y, y)
}
roc_x <- c(roc_x, 1)
roc_y <- c(roc_y, 1)
plot(roc_x, roc_y, type='b')
## Using ROCR
#install.packages('ROCR')
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.3.3
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.3.3
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1]
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])]))
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))
## 找出最重要的變數
#install.packages("rminer")
library(rminer)
## Warning: package 'rminer' was built under R version 3.3.3
model <- fit(churn~.,trainset,model="svm")
VariableImportance <- Importance(model,trainset,method="sensv")
L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)
?dist
## starting httpd help server ...
## done
x <- c(0, 0, 1, 1, 1, 1)
y <- c(1, 0, 1, 1, 0, 1)
sqrt(sum((x - y) ^ 2))
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
## x
## y 1.414214
dist(rbind(x,y), method = 'minkowski', p = 2)
## x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
## x
## y 2
dist(rbind(x,y), method = 'minkowski', p = 1)
## x
## y 2
data(iris)
iris.dist <- dist(iris[, -5], 'euclidean')
hc <- hclust(iris.dist,method = 'ward.D2')
plot(hc)
plot(hc, hang = -0.01, cex = 0.7)
plot(hc, hang = -0.01, cex = 0.7)
rect.hclust(hc, k = 3, border = 'red')
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species )
fit <- cutree(hc, k= 3)
fit
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2
plot(iris$Petal.Length, iris$Petal.Width, col=fit )
par(mfrow=c(1,2))
plot(iris$Petal.Length, iris$Petal.Width, col=iris$Species )
plot(iris$Petal.Length, iris$Petal.Width, col=fit )
## Applenews 分群
# https://github.com/ywchiu/rtibame/blob/master/data/applenews.RData
load('applenews.RData')
dim(applenews)
head(applenews)
library(jiebaR)
mixseg <- worker()
apple.seg <- lapply(applenews$content, function(e) segment(e, jiebar = mixseg ))
library(tm)
s.corpus <- Corpus(VectorSource(apple.seg))
doc <- tm_map(s.corpus, removePunctuation)
doc <- tm_map(doc, removeNumbers)
dtm <- DocumentTermMatrix(doc)
dtm.remove <- removeSparseTerms(dtm, 0.99)
library(proxy)
dist.mat <- proxy::dist(as.matrix(dtm.remove), method = 'cosine')
mat <- as.matrix(dist.mat)
# Find nearest article
applenews$title[order(mat[20,])][1:10]
# Articel Clustering
hc <- hclust(dist.mat, method = 'ward.D2')
plot(hc)
rect.hclust(hc, 10)
fit <- cutree(hc, 20)
table(fit)
applenews$title[fit == 6]
download.file('https://github.com/ywchiu/rtibame/raw/master/data/customer.csv', 'customer.csv')
customer <- read.csv('customer.csv')
customer <- scale(customer[,-1])
set.seed(22)
fit <- kmeans(customer, 4 )
fit
## K-means clustering with 4 clusters of sizes 8, 11, 16, 25
##
## Cluster means:
## Visit.Time Average.Expense Sex Age
## 1 1.3302016 1.0155226 -1.4566845 0.5591307
## 2 -0.7771737 -0.5178412 -1.4566845 -0.4774599
## 3 0.8571173 0.9887331 0.6750489 1.0505015
## 4 -0.6322632 -0.7299063 0.6750489 -0.6411604
##
## Clustering vector:
## [1] 2 2 1 2 1 2 1 1 2 2 2 1 1 2 2 2 1 2 1 3 4 3 4 3 3 4 4 3 4 4 4 3 3 3 4
## [36] 4 3 4 4 4 4 4 4 4 3 3 4 4 4 3 4 3 3 4 4 4 3 4 4 3
##
## Within cluster sum of squares by cluster:
## [1] 5.90040 11.97454 22.58236 20.89159
## (between_SS / total_SS = 74.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
#plot(customer$Visit.Time, customer$Average.Expense, col = fit$cluster)
library(cluster)
set.seed(22)
km <- kmeans(customer, 4)
kms <- silhouette(km$cluster,dist(customer))
summary(kms)
## Silhouette of 60 units in 4 clusters from silhouette.default(x = km$cluster, dist = dist(customer)) :
## Cluster sizes and average silhouette widths:
## 8 11 16 25
## 0.5464597 0.4080823 0.3794910 0.5164434
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1931 0.4030 0.4890 0.4641 0.5422 0.6333
plot(kms)
# install.packages("fpc")
library(fpc)
## Warning: package 'fpc' was built under R version 3.3.3
nk <- 2:10
set.seed(22)
sw <- sapply(nk , function(k){
cluster.stats(dist(customer[,-1]), kmeans(customer[,-1], centers=k)$cluster)$avg.silwidth
})
plot(nk, sw, type="l", xlab="number of clusers", ylab="average silhouette width")
nk = 2:10
set.seed(22)
WSS = sapply(nk, function(k) {
kmeans(customer, centers=k)$tot.withinss
})
WSS
## [1] 123.49224 88.07028 61.34890 48.76431 47.20813 45.48114 29.58014
## [8] 28.87519 23.21331
plot(nk, WSS, type="l", xlab= "number of k", ylab="within sum of squares")
single_c <- hclust(dist(customer), method="single")
hc_single <- cutree(single_c, k = 4)
complete_c <- hclust(dist(customer), method="complete")
hc_complete <- cutree(complete_c, k = 4)
set.seed(22)
km <- kmeans(customer, 4)
cs <- cluster.stats(dist(customer), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 61.3489
##
## $avg.silwidth
## [1] 0.4640587
sapply(list(kmeans = km$cluster, hc_single = hc_single, hc_complete = hc_complete), function(c) cluster.stats(dist(customer), c)[c("within.cluster.ss","avg.silwidth")])
## kmeans hc_single hc_complete
## within.cluster.ss 61.3489 136.0092 65.94076
## avg.silwidth 0.4640587 0.2481926 0.4255961
data(iris)
ind <- sample(2, nrow(iris), replace = TRUE, prob=c(0.7, 0.3))
trainset <- iris[ind == 1,]
testset <- iris[ind == 2,]
#install.packages("neuralnet")
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.3.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
##
## prediction
#trainset
trainset$setosa <- trainset$Species == "setosa"
trainset$virginica <- trainset$Species == "virginica"
trainset$versicolor <- trainset$Species == "versicolor"
network <- neuralnet(versicolor + virginica + setosa ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, trainset, hidden=3)
#network
#network$result.matrix
#head(network$generalized.weights[[1]])
plot(network)
res <- compute(network, testset[-5])$net.result
res
## [,1] [,2] [,3]
## 1 -0.000112341503913 -0.0011361437077 1.0011210699510
## 2 0.000171924625309 -0.0008509294624 1.0005516770555
## 4 0.000198773395306 -0.0008239911434 1.0004978982288
## 7 0.000179335424060 -0.0008434939464 1.0005368330212
## 10 -0.000064210732419 -0.0010878524051 1.0010246626885
## 20 -0.000071233625953 -0.0010948987222 1.0010387297379
## 21 -0.000008221233853 -0.0010316761641 1.0009125141789
## 22 0.000230977162540 -0.0007916799666 1.0004333931950
## 24 0.003906276182324 0.0028958773195 0.9930716679109
## 29 -0.000088440803922 -0.0011121632914 1.0010731961899
## 31 0.000245145141199 -0.0007774647333 1.0004050143431
## 33 -0.000206623645893 -0.0012307403100 1.0013099196813
## 47 -0.000144232500131 -0.0011681410703 1.0011849484958
## 53 0.500037999419108 0.5006823542983 -0.0006938547066
## 57 0.500045009008528 0.5006893872669 -0.0007078951075
## 61 0.499017716292572 0.4996586682100 0.0013498005304
## 62 0.500009004169413 0.5006532623405 -0.0006357764220
## 68 0.497459823928507 0.4980955797899 0.0044703018950
## 71 0.500066805776966 0.5007112567344 -0.0007515546358
## 79 0.500041171387181 0.5006855368459 -0.0007002082462
## 81 0.499252465332441 0.4998942002110 0.0008795917303
## 82 0.497719480946298 0.4983561028447 0.0039502017282
## 83 0.499403135404458 0.5000453728146 0.0005777954213
## 85 0.500041231118495 0.5006855967765 -0.0007003278896
## 87 0.500027401100188 0.5006717206307 -0.0006726259816
## 92 0.499995391557168 0.5006396043259 -0.0006085099843
## 95 0.499922893434478 0.5005668643994 -0.0004632942454
## 99 0.496127036434409 0.4967583470346 0.0071399119976
## 105 0.500069119464522 0.5007135781388 -0.0007561890158
## 109 0.500069038524807 0.5007134969291 -0.0007560268913
## 110 0.500069125946325 0.5007135846422 -0.0007562019990
## 112 0.500068990358979 0.5007134486026 -0.0007559304138
## 113 0.500069087483520 0.5007135460511 -0.0007561249569
## 119 0.500069127713582 0.5007135864154 -0.0007562055388
## 122 0.500069016010607 0.5007134743398 -0.0007559817947
## 124 0.500068458765984 0.5007129152366 -0.0007548656184
## 127 0.500068138888507 0.5007125942923 -0.0007542248950
## 128 0.500067845585739 0.5007123000112 -0.0007536374014
## 130 0.500066996657084 0.5007114482511 -0.0007519369739
## 131 0.500069068924737 0.5007135274304 -0.0007560877832
## 136 0.500069124669376 0.5007135833610 -0.0007561994412
## 146 0.500069116068220 0.5007135747312 -0.0007561822129
## 150 0.500068316190427 0.5007127721855 -0.0007545800356
prediction <- c("versicolor", "virginica", "setosa")[apply(res, 1, which.max)]
predict.table <- table(testset$Species, prediction)
predict.table
## prediction
## setosa virginica
## setosa 13 0
## versicolor 0 15
## virginica 0 15
prediction
## [1] "setosa" "setosa" "setosa" "setosa" "setosa"
## [6] "setosa" "setosa" "setosa" "setosa" "setosa"
## [11] "setosa" "setosa" "setosa" "virginica" "virginica"
## [16] "virginica" "virginica" "virginica" "virginica" "virginica"
## [21] "virginica" "virginica" "virginica" "virginica" "virginica"
## [26] "virginica" "virginica" "virginica" "virginica" "virginica"
## [31] "virginica" "virginica" "virginica" "virginica" "virginica"
## [36] "virginica" "virginica" "virginica" "virginica" "virginica"
## [41] "virginica" "virginica" "virginica"