library(C50)
data(churn)
head(churnTrain)
## state account_length area_code international_plan voice_mail_plan
## 1 KS 128 area_code_415 no yes
## 2 OH 107 area_code_415 no yes
## 3 NJ 137 area_code_415 no no
## 4 OH 84 area_code_408 yes no
## 5 OK 75 area_code_415 yes no
## 6 AL 118 area_code_510 yes no
## number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1 25 265.1 110 45.07
## 2 26 161.6 123 27.47
## 3 0 243.4 114 41.38
## 4 0 299.4 71 50.90
## 5 0 166.7 113 28.34
## 6 0 223.4 98 37.98
## total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1 197.4 99 16.78 244.7
## 2 195.5 103 16.62 254.4
## 3 121.2 110 10.30 162.6
## 4 61.9 88 5.26 196.9
## 5 148.3 122 12.61 186.9
## 6 220.6 101 18.75 203.9
## total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1 91 11.01 10.0 3
## 2 103 11.45 13.7 3
## 3 104 7.32 12.2 5
## 4 89 8.86 6.6 7
## 5 121 8.41 10.1 3
## 6 118 9.18 6.3 6
## total_intl_charge number_customer_service_calls churn
## 1 2.70 1 no
## 2 3.70 1 no
## 3 3.29 0 no
## 4 1.78 2 no
## 5 2.73 3 no
## 6 1.70 0 no
colnames(churnTrain)
## [1] "state" "account_length"
## [3] "area_code" "international_plan"
## [5] "voice_mail_plan" "number_vmail_messages"
## [7] "total_day_minutes" "total_day_calls"
## [9] "total_day_charge" "total_eve_minutes"
## [11] "total_eve_calls" "total_eve_charge"
## [13] "total_night_minutes" "total_night_calls"
## [15] "total_night_charge" "total_intl_minutes"
## [17] "total_intl_calls" "total_intl_charge"
## [19] "number_customer_service_calls" "churn"
dataset <- churnTrain[,4:20]
str(dataset)
## 'data.frame': 3333 obs. of 17 variables:
## $ international_plan : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
## $ voice_mail_plan : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
## $ number_vmail_messages : int 25 26 0 0 0 0 24 0 0 37 ...
## $ total_day_minutes : num 265 162 243 299 167 ...
## $ total_day_calls : int 110 123 114 71 113 98 88 79 97 84 ...
## $ total_day_charge : num 45.1 27.5 41.4 50.9 28.3 ...
## $ total_eve_minutes : num 197.4 195.5 121.2 61.9 148.3 ...
## $ total_eve_calls : int 99 103 110 88 122 101 108 94 80 111 ...
## $ total_eve_charge : num 16.78 16.62 10.3 5.26 12.61 ...
## $ total_night_minutes : num 245 254 163 197 187 ...
## $ total_night_calls : int 91 103 104 89 121 118 118 96 90 97 ...
## $ total_night_charge : num 11.01 11.45 7.32 8.86 8.41 ...
## $ total_intl_minutes : num 10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
## $ total_intl_calls : int 3 3 5 7 3 6 7 6 4 5 ...
## $ total_intl_charge : num 2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
## $ number_customer_service_calls: int 1 1 0 2 3 0 3 0 1 0 ...
## $ churn : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...
library(rpart)
fit <- rpart(churn ~., data = dataset)
plot(fit, margin=0.1)
text(fit)

library(rpart.plot)
rpart.plot(fit)

predicted <- predict(fit, data = dataset, type = 'class')
sum(predicted == dataset$churn) / length(dataset$churn)
## [1] 0.9528953
table(dataset$churn)
##
## yes no
## 483 2850
2850 / (2850+483)
## [1] 0.8550855
table(dataset$churn, predicted)
## predicted
## yes no
## yes 361 122
## no 35 2815
TOTAL_LOSS <- 1500 * 483 *24
GIFT <- 396 * 2000
TOTAL_LOSS
## [1] 17388000
GIFT
## [1] 792000
A <- c('Y' ,'N', 'Y')
P <- c('Y', 'Y', 'Y')
sum(as.numeric(A == P))
## [1] 2
sum(A == P) / length(A)
## [1] 0.6666667
set.seed(2)
sample(1:42, 6)
## [1] 8 29 23 7 36 35
length(dataset$churn)
## [1] 3333
set.seed(2)
ind <- sample(2 , 3333,replace = TRUE, prob=c(0.7,0.3))
table(ind)
## ind
## 1 2
## 2315 1018
trainset <- dataset[ind == 1, ]
testset <- dataset[ind == 2, ]
nrow(trainset)
## [1] 2315
nrow(testset)
## [1] 1018
fit <- rpart(churn ~., data = trainset )
train_predicted <- predict(fit, trainset, type = 'class')
sum(train_predicted == trainset$churn) / length(trainset$churn)
## [1] 0.9434125
test_predicted <- predict(fit, testset, type = 'class')
sum(test_predicted == testset$churn) / length(testset$churn)
## [1] 0.9420432
10 Fold Cross Validation
set.seed(42)
ind <- sample(10 , 3333,replace = TRUE)
table(ind)
## ind
## 1 2 3 4 5 6 7 8 9 10
## 346 371 301 324 339 321 311 349 322 349
acc <- c()
for (i in 1:10){
trainset <- dataset[ind != i,]
testset <- dataset[ind == i,]
fit <- rpart(churn ~., data = trainset)
predicted <- predict(fit, testset, type = 'class')
val <- sum(predicted == testset$churn) / length(testset$churn)
acc <- c(acc, val)
}
acc
## [1] 0.9595376 0.9514825 0.9401993 0.9320988 0.9498525 0.9376947 0.9517685
## [8] 0.9111748 0.9316770 0.9570201
mean(acc) # 1 - bias
## [1] 0.9422506
sd(acc) # variance
## [1] 0.01474266
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
control <- trainControl(method="repeatedcv", number=10, repeats=3)
model = train(churn~., data=dataset, method="rpart"
,preProcess="scale", trControl=control)
predicted <- predict(model, dataset)
Confusion Matrix
predicted <- predict(model, dataset)
tb <- table(dataset$churn, predicted)
tb
## predicted
## yes no
## yes 121 362
## no 37 2813
TP <- tb[1,1]
FP <- tb[1,2]
FN <- tb[2,1]
TN <- tb[2,2]
TPR <- TP / (TP + FN)
TPR
## [1] 0.7658228
FPR <- FP / (FP + TN)
FPR
## [1] 0.1140157
predicted <- predict(model, dataset, type = 'prob')
predicted2 <- ifelse(predicted[,1] > 0.85, 'yes', 'no')
predicted2 <-factor(predicted2, levels = c('yes', 'no'))
table(dataset$churn,predicted2)
## predicted2
## yes no
## yes 0 483
## no 0 2850
TPR_ARY <- c(0)
FPR_ARY <- c(0)
for(i in seq(0,1,0.01)){
print(i)
predicted2 <- ifelse(predicted[,1] > i, 'yes', 'no')
predicted2 <-factor(predicted2, levels = c('yes', 'no'))
tb <- table(dataset$churn,predicted2)
TP <- tb[1,1]
FP <- tb[1,2]
FN <- tb[2,1]
TN <- tb[2,2]
TPR <- TP / (TP + FN)
FPR <- FP / (FP + TN)
if ((is.na(TPR) == FALSE) & (is.na(FPR) == FALSE)){
TPR_ARY <- c(TPR_ARY, TPR)
FPR_ARY <- c(FPR_ARY, FPR)
}
}
## [1] 0
## [1] 0.01
## [1] 0.02
## [1] 0.03
## [1] 0.04
## [1] 0.05
## [1] 0.06
## [1] 0.07
## [1] 0.08
## [1] 0.09
## [1] 0.1
## [1] 0.11
## [1] 0.12
## [1] 0.13
## [1] 0.14
## [1] 0.15
## [1] 0.16
## [1] 0.17
## [1] 0.18
## [1] 0.19
## [1] 0.2
## [1] 0.21
## [1] 0.22
## [1] 0.23
## [1] 0.24
## [1] 0.25
## [1] 0.26
## [1] 0.27
## [1] 0.28
## [1] 0.29
## [1] 0.3
## [1] 0.31
## [1] 0.32
## [1] 0.33
## [1] 0.34
## [1] 0.35
## [1] 0.36
## [1] 0.37
## [1] 0.38
## [1] 0.39
## [1] 0.4
## [1] 0.41
## [1] 0.42
## [1] 0.43
## [1] 0.44
## [1] 0.45
## [1] 0.46
## [1] 0.47
## [1] 0.48
## [1] 0.49
## [1] 0.5
## [1] 0.51
## [1] 0.52
## [1] 0.53
## [1] 0.54
## [1] 0.55
## [1] 0.56
## [1] 0.57
## [1] 0.58
## [1] 0.59
## [1] 0.6
## [1] 0.61
## [1] 0.62
## [1] 0.63
## [1] 0.64
## [1] 0.65
## [1] 0.66
## [1] 0.67
## [1] 0.68
## [1] 0.69
## [1] 0.7
## [1] 0.71
## [1] 0.72
## [1] 0.73
## [1] 0.74
## [1] 0.75
## [1] 0.76
## [1] 0.77
## [1] 0.78
## [1] 0.79
## [1] 0.8
## [1] 0.81
## [1] 0.82
## [1] 0.83
## [1] 0.84
## [1] 0.85
## [1] 0.86
## [1] 0.87
## [1] 0.88
## [1] 0.89
## [1] 0.9
## [1] 0.91
## [1] 0.92
## [1] 0.93
## [1] 0.94
## [1] 0.95
## [1] 0.96
## [1] 0.97
## [1] 0.98
## [1] 0.99
## [1] 1
TPR_ARY <- c(TPR_ARY, 1)
FPR_ARY <- c(FPR_ARY, 1)
plot(FPR_ARY, TPR_ARY, type = 'l')

library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
churn.rp <- rpart(churn ~., data = trainset)
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1]
pred.rocr <- prediction(pred.to.roc, testset$churn)
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")
perf.tpr.rocr
## An object of class "performance"
## Slot "x.name":
## [1] "False positive rate"
##
## Slot "y.name":
## [1] "True positive rate"
##
## Slot "alpha.name":
## [1] "Cutoff"
##
## Slot "x.values":
## [[1]]
## [1] 0.000000000 0.000000000 0.003355705 0.003355705 0.003355705
## [6] 0.006711409 0.006711409 0.010067114 0.013422819 0.023489933
## [11] 0.053691275 0.197986577 0.231543624 0.278523490 1.000000000
##
##
## Slot "y.values":
## [[1]]
## [1] 0.0000000 0.1764706 0.3333333 0.3529412 0.3725490 0.4117647 0.7450980
## [8] 0.7647059 0.7647059 0.7843137 0.7843137 0.9019608 0.9215686 0.9215686
## [15] 1.0000000
##
##
## Slot "alpha.values":
## [[1]]
## [1] Inf 1.00000000 0.95652174 0.90322581 0.86666667 0.85714286
## [7] 0.84705882 0.76470588 0.14285714 0.14285714 0.13636364 0.11186441
## [13] 0.10588235 0.04402516 0.02797203
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

RandomForest
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
forest <- randomForest(churn ~., data = trainset, ntree=200, importance=T, proximity=T)
predicted <- predict(forest, testset, type = 'class')
table(testset$churn, predicted)
## predicted
## yes no
## yes 37 14
## no 1 297
# Decision Tree
predictions1 <- predict(churn.rp, testset, type="prob")
pred.to.roc1 <- predictions1[, 1]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$churn))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")
# Random Forest
predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 1]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$churn))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")
rpart_auc <- round(as.numeric(perf.rocr1@y.values),2)
forest_auc <- round(as.numeric(perf.rocr2@y.values),2)
forest_auc
## [1] 0.94
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2,
c(paste("rpart:",rpart_auc),
paste("forest:",forest_auc)), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

挑選變數
library(rminer)
model<-fit(churn~.,trainset,model="rpart")
VariableImportance<-Importance(model,trainset,method="sensv")
VariableImportance$imp
## [1] 0.0006418937 0.0000000000 0.0000000000 0.9907184657 0.0000000000
## [6] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## [11] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## [16] 0.0086396406 0.0000000000
colnames(trainset)
## [1] "international_plan" "voice_mail_plan"
## [3] "number_vmail_messages" "total_day_minutes"
## [5] "total_day_calls" "total_day_charge"
## [7] "total_eve_minutes" "total_eve_calls"
## [9] "total_eve_charge" "total_night_minutes"
## [11] "total_night_calls" "total_night_charge"
## [13] "total_intl_minutes" "total_intl_calls"
## [15] "total_intl_charge" "number_customer_service_calls"
## [17] "churn"
barplot(VariableImportance$imp, names.arg = colnames(trainset))

L<- list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

計算距離
x <- c(0,0,1,1,1,1) # meter
y <- c(1,0,1,1,0,1) # meter
sum(abs(x - y)) # meter
## [1] 2
sum((x - y ) ^ 2) # meter ^ 2
## [1] 2
sqrt(sum((x - y ) ^ 2)) # meter
## [1] 1.414214
dist(rbind(x,y), method = 'euclidean')
## x
## y 1.414214
dist(rbind(x,y), method ="minkowski", p=2)
## x
## y 1.414214
sum(abs(x - y))
## [1] 2
dist(rbind(x,y), method = 'manhattan')
## x
## y 2
dist(rbind(x,y), method ="minkowski", p=1)
## x
## y 2
分群
data(iris)
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2' )
plot(hc)
fit <- cutree(hc, k = 3)
plot(hc)
rect.hclust(hc, k = 3, border = 'red')

par(mfrow = c(1,2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species, main = 'Original Classfication')
plot(Petal.Length ~ Petal.Width, data = iris, col = fit, main = 'Clustered Result')

KMeans分群
fit <- kmeans(iris[, -5], centers = 3)
fit$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 6.850000 3.073684 5.742105 2.071053
## 3 5.901613 2.748387 4.393548 1.433871
#fit$cluster
plot(Petal.Length ~ Petal.Width, data = iris, col = fit$cluster)
points(fit$centers[,4], fit$centers[,3], col= 'orange', pch = 19, cex = 3)

barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")

par(mfrow = c(1,2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species, main = 'Original Classfication')
plot(Petal.Length ~ Petal.Width, data = iris, col = fit$cluster, main = 'KMeans Clustering')

Silhouette
library(cluster)
set.seed(22)
km <- kmeans(iris[, -5], centers = 3)
kms <- silhouette(km$cluster, dist(iris[,-5]))
plot(kms)

WSS
set.seed(22)
WSS <- c()
for(k in 2:10){
km <- kmeans(iris[, -5], centers = k)
WSS <- c(WSS, km$tot.withinss)
}
plot(2:10, WSS, type = 'l')
abline(v = 3, col = 'red')

Silhouette
library(cluster)
set.seed(22)
SW <- c()
for(k in 2:10){
km <- kmeans(iris[, -5], centers = k)
kms <- silhouette(km$cluster, dist(iris[,-5]))
sil <- mean(kms[,3])
SW <- c(SW, sil)
}
plot(2:10, SW, type = 'l')
abline(v = 2, col = 'red')

比較分群演算法
single_c <- hclust(dist(iris[,-5]), method="single")
hc_single <- cutree(single_c, k = 3)
plot(single_c)

complete_c <- hclust(dist(iris[,-5]), method="complete")
hc_complete <- cutree(complete_c, k = 3)
plot(complete_c)

ward_c <- hclust(dist(iris[,-5]), method="ward.D2")
hc_ward <- cutree(ward_c, k = 3)
plot(ward_c)

set.seed(42)
km <- kmeans(iris[,-5], 3)
library(fpc)
cs <- cluster.stats(dist(iris[,-5]), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]
## $within.cluster.ss
## [1] 78.85144
##
## $avg.silwidth
## [1] 0.552819
sapply(
list(kmeans = km$cluster,
hc_single = hc_single,
hc_complete = hc_complete,
hc_ward = hc_ward),
function(k){
cs <- cluster.stats(dist(iris[,-5]), k)
cs[c("within.cluster.ss","avg.silwidth")]
}
)
## kmeans hc_single hc_complete hc_ward
## within.cluster.ss 78.85144 142.4794 89.52501 79.29713
## avg.silwidth 0.552819 0.5121108 0.5135953 0.5543237
set.seed(51)
sample(42,6)
## [1] 33 9 12 39 41 31
sample(42,6)
## [1] 32 17 39 5 20 26
sample(42,6)
## [1] 8 11 1 21 6 10
客戶分群
customers <- read.csv('https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/customers.csv')
customers <- customers[,4:5]
set.seed(31)
kc <- kmeans(customers, centers = 5)
plot(customers[,1], customers[,2], xlab= 'Income', ylab= 'Spending Score', col = kc$cluster)
points(kc$centers, col= 'orange', pch = 19, cex = 3)

library(cluster)
kcs <- silhouette(kc$cluster,dist(customers))
plot(kcs)

library(cluster)
set.seed(123)
SW <- c()
for(k in 2:10){
km <- kmeans(customers, centers = k)
kms <- silhouette(km$cluster, dist(customers))
sil <- mean(kms[,3])
SW <- c(SW, sil)
}
plot(2:10, SW, type = 'l')
abline(v = 5, col = 'red')

迴歸分析
data(anscombe)
head(anscombe)
## x1 x2 x3 x4 y1 y2 y3 y4
## 1 10 10 10 8 8.04 9.14 7.46 6.58
## 2 8 8 8 8 6.95 8.14 6.77 5.76
## 3 13 13 13 8 7.58 8.74 12.74 7.71
## 4 9 9 9 8 8.81 8.77 7.11 8.84
## 5 11 11 11 8 8.33 9.26 7.81 8.47
## 6 14 14 14 8 9.96 8.10 8.84 7.04
plot(y1 ~ x1 , data = anscombe)
fit <- lm(y1 ~ x1, data = anscombe)
plot(y1 ~ x1 , data = anscombe)
abline(fit, col = 'red')

plot(y2 ~ x1 , data = anscombe)
fit <- lm(y2 ~ x1, data = anscombe)
abline(fit, col = 'red')

plot(y2 ~ x1 , data = anscombe)

lmfit <- lm(y2 ~ poly(x1, 2), data = anscombe)
lmfit
##
## Call:
## lm(formula = y2 ~ poly(x1, 2), data = anscombe)
##
## Coefficients:
## (Intercept) poly(x1, 2)1 poly(x1, 2)2
## 7.501 5.244 -3.712
# y = -3.712 x ^ 2 + 5.244 * x + 7.501
anscombe$x1
## [1] 10 8 13 9 11 14 6 4 12 7 5
lmfit$fitted.values
## 1 2 3 4 5 6 7 8
## 9.141329 8.141329 8.740629 8.768042 9.261189 8.100210 6.127622 3.100210
## 9 10 11
## 9.127622 7.261189 4.740629
plot(anscombe$x1, lmfit$fitted.values, type = 'p')

plot(anscombe$x1, lmfit$fitted.values, type = 'l')

plot(y2 ~ x1 , data = anscombe)
lines(sort(anscombe$x1), lmfit$fitted.values[order(anscombe$x1)], type = 'l', col = 'red')

fit <- lm(y3~x1, data = anscombe)
plot(y3~x1, data = anscombe)
abline(fit, col = 'red')

library(MASS)
fit <- rlm(y3~x1, data = anscombe)
plot(y3~x1, data = anscombe)
abline(fit, col = 'red')

預測房價
house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/house-prices.csv')
head(house_prices)
## Home Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 1 114300 1790 2 2 2 No East
## 2 2 114200 2030 4 2 3 No East
## 3 3 114800 1740 3 2 1 No East
## 4 4 94700 1980 3 2 3 No East
## 5 5 119800 2130 3 3 3 No East
## 6 6 114600 1780 3 2 2 No North
fit <- lm(Price ~ SqFt, data = house_prices)
plot(Price ~ SqFt, data = house_prices)
abline(fit, col = 'red')

fit
##
## Call:
## lm(formula = Price ~ SqFt, data = house_prices)
##
## Coefficients:
## (Intercept) SqFt
## -10091.13 70.23
library(jsonlite)
1749 / 30
## [1] 58.3
dfall <- data.frame()
for (i in 0:1){
p <- i * 30
df <- fromJSON(paste0('https://sale.591.com.tw/home/search/list?type=2&&shType=list§ion=5®ionid=1&kind=9&firstRow=',p,'&totalRows=1749×tamp=1535791689399'))
dfall <- rbind(dfall, df$data$house_list[, c('area', 'price')])
}
write.csv(dfall, file= '591daan.csv')
dfall <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/591daan.csv')
fit <- lm(price ~ area, dfall)
plot(price ~ area, dfall)
abline(fit, col ='red')

predict(fit, data.frame(area = 20))
## 1
## 1521.443
summary(fit)
##
## Call:
## lm(formula = price ~ area, data = dfall)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11412 -610 -22 610 37384
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1061.919 72.773 -14.59 <2e-16 ***
## area 129.168 1.269 101.76 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1702 on 1750 degrees of freedom
## Multiple R-squared: 0.8554, Adjusted R-squared: 0.8553
## F-statistic: 1.035e+04 on 1 and 1750 DF, p-value: < 2.2e-16