Demo20180901

library(C50)
data(churn)
head(churnTrain)

##   state account_length     area_code international_plan voice_mail_plan
## 1    KS            128 area_code_415                 no             yes
## 2    OH            107 area_code_415                 no             yes
## 3    NJ            137 area_code_415                 no              no
## 4    OH             84 area_code_408                yes              no
## 5    OK             75 area_code_415                yes              no
## 6    AL            118 area_code_510                yes              no
##   number_vmail_messages total_day_minutes total_day_calls total_day_charge
## 1                    25             265.1             110            45.07
## 2                    26             161.6             123            27.47
## 3                     0             243.4             114            41.38
## 4                     0             299.4              71            50.90
## 5                     0             166.7             113            28.34
## 6                     0             223.4              98            37.98
##   total_eve_minutes total_eve_calls total_eve_charge total_night_minutes
## 1             197.4              99            16.78               244.7
## 2             195.5             103            16.62               254.4
## 3             121.2             110            10.30               162.6
## 4              61.9              88             5.26               196.9
## 5             148.3             122            12.61               186.9
## 6             220.6             101            18.75               203.9
##   total_night_calls total_night_charge total_intl_minutes total_intl_calls
## 1                91              11.01               10.0                3
## 2               103              11.45               13.7                3
## 3               104               7.32               12.2                5
## 4                89               8.86                6.6                7
## 5               121               8.41               10.1                3
## 6               118               9.18                6.3                6
##   total_intl_charge number_customer_service_calls churn
## 1              2.70                             1    no
## 2              3.70                             1    no
## 3              3.29                             0    no
## 4              1.78                             2    no
## 5              2.73                             3    no
## 6              1.70                             0    no

colnames(churnTrain)

##  [1] "state"                         "account_length"               
##  [3] "area_code"                     "international_plan"           
##  [5] "voice_mail_plan"               "number_vmail_messages"        
##  [7] "total_day_minutes"             "total_day_calls"              
##  [9] "total_day_charge"              "total_eve_minutes"            
## [11] "total_eve_calls"               "total_eve_charge"             
## [13] "total_night_minutes"           "total_night_calls"            
## [15] "total_night_charge"            "total_intl_minutes"           
## [17] "total_intl_calls"              "total_intl_charge"            
## [19] "number_customer_service_calls" "churn"

dataset <- churnTrain[,4:20]

str(dataset)

## 'data.frame':    3333 obs. of  17 variables:
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

library(rpart)
fit <- rpart(churn ~., data = dataset)
plot(fit, margin=0.1)
text(fit)

library(rpart.plot)
rpart.plot(fit)

predicted <- predict(fit, data = dataset, type = 'class')
sum(predicted == dataset$churn) / length(dataset$churn)

## [1] 0.9528953

table(dataset$churn)

## 
##  yes   no 
##  483 2850

2850 / (2850+483)

## [1] 0.8550855

table(dataset$churn, predicted)

##      predicted
##        yes   no
##   yes  361  122
##   no    35 2815

TOTAL_LOSS <- 1500 * 483 *24
GIFT       <- 396 * 2000
TOTAL_LOSS

## [1] 17388000

GIFT

## [1] 792000

A <- c('Y' ,'N', 'Y')
P <- c('Y', 'Y', 'Y')
sum(as.numeric(A == P))

## [1] 2

sum(A == P) / length(A)

## [1] 0.6666667

set.seed(2)
sample(1:42, 6)

## [1]  8 29 23  7 36 35

length(dataset$churn)

## [1] 3333

set.seed(2)
ind <- sample(2 , 3333,replace = TRUE, prob=c(0.7,0.3))
table(ind)

## ind
##    1    2 
## 2315 1018

trainset <- dataset[ind == 1, ]
testset <- dataset[ind == 2, ]
nrow(trainset)

## [1] 2315

nrow(testset)

## [1] 1018

fit <- rpart(churn ~., data = trainset )
train_predicted <- predict(fit, trainset, type = 'class')

sum(train_predicted == trainset$churn) / length(trainset$churn)

## [1] 0.9434125

test_predicted <- predict(fit, testset, type = 'class')
sum(test_predicted == testset$churn) / length(testset$churn)

## [1] 0.9420432

10 Fold Cross Validation

set.seed(42)
ind <- sample(10 , 3333,replace = TRUE)
table(ind)

## ind
##   1   2   3   4   5   6   7   8   9  10 
## 346 371 301 324 339 321 311 349 322 349

acc <- c()
for (i in 1:10){
  trainset <- dataset[ind != i,]
  testset  <- dataset[ind == i,]
  fit <- rpart(churn ~., data = trainset)
  predicted <- predict(fit, testset, type = 'class')
  val <- sum(predicted == testset$churn) / length(testset$churn)
  acc <- c(acc, val)
}
acc

##  [1] 0.9595376 0.9514825 0.9401993 0.9320988 0.9498525 0.9376947 0.9517685
##  [8] 0.9111748 0.9316770 0.9570201

mean(acc) # 1 - bias

## [1] 0.9422506

sd(acc)   # variance

## [1] 0.01474266

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

control <- trainControl(method="repeatedcv", number=10, repeats=3)
model = train(churn~., data=dataset, method="rpart"
,preProcess="scale", trControl=control)

predicted <- predict(model, dataset)

Confusion Matrix

predicted <- predict(model, dataset)
tb <- table(dataset$churn, predicted)
tb

##      predicted
##        yes   no
##   yes  121  362
##   no    37 2813

TP <- tb[1,1]
FP <- tb[1,2]
FN <- tb[2,1]
TN <- tb[2,2]

TPR <- TP / (TP + FN)
TPR

## [1] 0.7658228

FPR <- FP / (FP + TN)
FPR

## [1] 0.1140157

predicted <-  predict(model, dataset, type = 'prob')

predicted2 <- ifelse(predicted[,1] > 0.85, 'yes', 'no')
predicted2 <-factor(predicted2, levels = c('yes', 'no'))
table(dataset$churn,predicted2)

##      predicted2
##        yes   no
##   yes    0  483
##   no     0 2850

TPR_ARY <- c(0)
FPR_ARY <- c(0)
for(i in seq(0,1,0.01)){
  print(i)
  predicted2 <- ifelse(predicted[,1] > i, 'yes', 'no')
  predicted2 <-factor(predicted2, levels = c('yes', 'no'))
  tb <- table(dataset$churn,predicted2)
  
  TP <- tb[1,1]
  FP <- tb[1,2]
  FN <- tb[2,1]
  TN <- tb[2,2]
  TPR <- TP / (TP + FN)
  FPR <- FP / (FP + TN)
  if ((is.na(TPR) == FALSE) & (is.na(FPR) == FALSE)){
  TPR_ARY <- c(TPR_ARY, TPR)
  FPR_ARY <- c(FPR_ARY, FPR)
  }
}

## [1] 0
## [1] 0.01
## [1] 0.02
## [1] 0.03
## [1] 0.04
## [1] 0.05
## [1] 0.06
## [1] 0.07
## [1] 0.08
## [1] 0.09
## [1] 0.1
## [1] 0.11
## [1] 0.12
## [1] 0.13
## [1] 0.14
## [1] 0.15
## [1] 0.16
## [1] 0.17
## [1] 0.18
## [1] 0.19
## [1] 0.2
## [1] 0.21
## [1] 0.22
## [1] 0.23
## [1] 0.24
## [1] 0.25
## [1] 0.26
## [1] 0.27
## [1] 0.28
## [1] 0.29
## [1] 0.3
## [1] 0.31
## [1] 0.32
## [1] 0.33
## [1] 0.34
## [1] 0.35
## [1] 0.36
## [1] 0.37
## [1] 0.38
## [1] 0.39
## [1] 0.4
## [1] 0.41
## [1] 0.42
## [1] 0.43
## [1] 0.44
## [1] 0.45
## [1] 0.46
## [1] 0.47
## [1] 0.48
## [1] 0.49
## [1] 0.5
## [1] 0.51
## [1] 0.52
## [1] 0.53
## [1] 0.54
## [1] 0.55
## [1] 0.56
## [1] 0.57
## [1] 0.58
## [1] 0.59
## [1] 0.6
## [1] 0.61
## [1] 0.62
## [1] 0.63
## [1] 0.64
## [1] 0.65
## [1] 0.66
## [1] 0.67
## [1] 0.68
## [1] 0.69
## [1] 0.7
## [1] 0.71
## [1] 0.72
## [1] 0.73
## [1] 0.74
## [1] 0.75
## [1] 0.76
## [1] 0.77
## [1] 0.78
## [1] 0.79
## [1] 0.8
## [1] 0.81
## [1] 0.82
## [1] 0.83
## [1] 0.84
## [1] 0.85
## [1] 0.86
## [1] 0.87
## [1] 0.88
## [1] 0.89
## [1] 0.9
## [1] 0.91
## [1] 0.92
## [1] 0.93
## [1] 0.94
## [1] 0.95
## [1] 0.96
## [1] 0.97
## [1] 0.98
## [1] 0.99
## [1] 1

TPR_ARY <- c(TPR_ARY, 1)
FPR_ARY <- c(FPR_ARY, 1)
plot(FPR_ARY, TPR_ARY, type = 'l')

library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

churn.rp <- rpart(churn ~., data = trainset)

predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1]
pred.rocr <- prediction(pred.to.roc, testset$churn)
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr")

perf.tpr.rocr

## An object of class "performance"
## Slot "x.name":
## [1] "False positive rate"
## 
## Slot "y.name":
## [1] "True positive rate"
## 
## Slot "alpha.name":
## [1] "Cutoff"
## 
## Slot "x.values":
## [[1]]
##  [1] 0.000000000 0.000000000 0.003355705 0.003355705 0.003355705
##  [6] 0.006711409 0.006711409 0.010067114 0.013422819 0.023489933
## [11] 0.053691275 0.197986577 0.231543624 0.278523490 1.000000000
## 
## 
## Slot "y.values":
## [[1]]
##  [1] 0.0000000 0.1764706 0.3333333 0.3529412 0.3725490 0.4117647 0.7450980
##  [8] 0.7647059 0.7647059 0.7843137 0.7843137 0.9019608 0.9215686 0.9215686
## [15] 1.0000000
## 
## 
## Slot "alpha.values":
## [[1]]
##  [1]        Inf 1.00000000 0.95652174 0.90322581 0.86666667 0.85714286
##  [7] 0.84705882 0.76470588 0.14285714 0.14285714 0.13636364 0.11186441
## [13] 0.10588235 0.04402516 0.02797203

plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

RandomForest

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

forest <- randomForest(churn ~., data = trainset, ntree=200, importance=T, proximity=T)


predicted <- predict(forest, testset, type = 'class')
table(testset$churn, predicted)

##      predicted
##       yes  no
##   yes  37  14
##   no    1 297

# Decision Tree
predictions1 <- predict(churn.rp, testset, type="prob")
pred.to.roc1 <- predictions1[, 1]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$churn))
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

# Random Forest
predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 1]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$churn))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")

rpart_auc  <- round(as.numeric(perf.rocr1@y.values),2)
forest_auc <- round(as.numeric(perf.rocr2@y.values),2)
forest_auc

## [1] 0.94

plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, 
       c(paste("rpart:",rpart_auc),
         paste("forest:",forest_auc)), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

挑選變數

library(rminer)
model<-fit(churn~.,trainset,model="rpart")
VariableImportance<-Importance(model,trainset,method="sensv")
VariableImportance$imp

##  [1] 0.0006418937 0.0000000000 0.0000000000 0.9907184657 0.0000000000
##  [6] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## [11] 0.0000000000 0.0000000000 0.0000000000 0.0000000000 0.0000000000
## [16] 0.0086396406 0.0000000000

colnames(trainset)

##  [1] "international_plan"            "voice_mail_plan"              
##  [3] "number_vmail_messages"         "total_day_minutes"            
##  [5] "total_day_calls"               "total_day_charge"             
##  [7] "total_eve_minutes"             "total_eve_calls"              
##  [9] "total_eve_charge"              "total_night_minutes"          
## [11] "total_night_calls"             "total_night_charge"           
## [13] "total_intl_minutes"            "total_intl_calls"             
## [15] "total_intl_charge"             "number_customer_service_calls"
## [17] "churn"

barplot(VariableImportance$imp, names.arg = colnames(trainset))

L<- list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)
mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

計算距離

x <- c(0,0,1,1,1,1) # meter
y <- c(1,0,1,1,0,1) # meter

sum(abs(x - y)) # meter

## [1] 2

sum((x - y ) ^ 2) # meter ^ 2

## [1] 2

sqrt(sum((x - y ) ^ 2)) # meter

## [1] 1.414214

dist(rbind(x,y), method = 'euclidean')

##          x
## y 1.414214

dist(rbind(x,y), method ="minkowski", p=2)

##          x
## y 1.414214

sum(abs(x - y))

## [1] 2

dist(rbind(x,y), method = 'manhattan')

##   x
## y 2

dist(rbind(x,y), method ="minkowski", p=1)

##   x
## y 2

分群

data(iris)
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2' )

plot(hc)

fit <- cutree(hc, k = 3)

plot(hc)
rect.hclust(hc, k = 3, border = 'red')

par(mfrow = c(1,2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species, main = 'Original Classfication')
plot(Petal.Length ~ Petal.Width, data = iris, col = fit, main = 'Clustered Result')

KMeans分群

fit <- kmeans(iris[, -5], centers = 3)

fit$centers

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     5.006000    3.428000     1.462000    0.246000
## 2     6.850000    3.073684     5.742105    2.071053
## 3     5.901613    2.748387     4.393548    1.433871

#fit$cluster
plot(Petal.Length ~ Petal.Width, data = iris, col = fit$cluster)
points(fit$centers[,4], fit$centers[,3], col= 'orange', pch = 19, cex = 3)

barplot(t(fit$centers), beside = TRUE,xlab="cluster", ylab="value")

par(mfrow = c(1,2))
plot(Petal.Length ~ Petal.Width, data = iris, col = iris$Species, main = 'Original Classfication')
plot(Petal.Length ~ Petal.Width, data = iris, col = fit$cluster, main = 'KMeans Clustering')

Silhouette

library(cluster)
set.seed(22)
km <- kmeans(iris[, -5], centers = 3)

kms <- silhouette(km$cluster, dist(iris[,-5]))
plot(kms)

WSS

set.seed(22)
WSS <- c()
for(k in 2:10){
  km <- kmeans(iris[, -5], centers = k)
  WSS <- c(WSS, km$tot.withinss)
}

plot(2:10, WSS, type = 'l')
abline(v = 3, col = 'red')

Silhouette

library(cluster)

set.seed(22)
SW <- c()
for(k in 2:10){
  km  <- kmeans(iris[, -5], centers = k)
  kms <- silhouette(km$cluster, dist(iris[,-5]))
  sil <- mean(kms[,3])
  SW  <- c(SW, sil)
}

plot(2:10, SW, type = 'l')
abline(v = 2, col = 'red')

比較分群演算法

single_c  <- hclust(dist(iris[,-5]), method="single")
hc_single <- cutree(single_c, k = 3)
plot(single_c)

complete_c <- hclust(dist(iris[,-5]), method="complete")
hc_complete <- cutree(complete_c, k = 3)
plot(complete_c)

ward_c  <- hclust(dist(iris[,-5]), method="ward.D2")
hc_ward <- cutree(ward_c, k = 3)
plot(ward_c)

set.seed(42)
km <- kmeans(iris[,-5], 3)

library(fpc)

cs <- cluster.stats(dist(iris[,-5]), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]

## $within.cluster.ss
## [1] 78.85144
## 
## $avg.silwidth
## [1] 0.552819

sapply(
list(kmeans      = km$cluster, 
     hc_single   = hc_single, 
     hc_complete = hc_complete,
     hc_ward     = hc_ward), 
  function(k){
    cs <- cluster.stats(dist(iris[,-5]), k)
    cs[c("within.cluster.ss","avg.silwidth")]    
  }
)

##                   kmeans   hc_single hc_complete hc_ward  
## within.cluster.ss 78.85144 142.4794  89.52501    79.29713 
## avg.silwidth      0.552819 0.5121108 0.5135953   0.5543237

set.seed(51)
sample(42,6)

## [1] 33  9 12 39 41 31

sample(42,6)

## [1] 32 17 39  5 20 26

sample(42,6)

## [1]  8 11  1 21  6 10

客戶分群

customers <- read.csv('https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/customers.csv')

customers <- customers[,4:5]
set.seed(31)

kc <- kmeans(customers, centers = 5)

plot(customers[,1], customers[,2], xlab= 'Income', ylab= 'Spending Score', col = kc$cluster)
points(kc$centers, col= 'orange', pch = 19, cex = 3)

library(cluster)
kcs <- silhouette(kc$cluster,dist(customers))
plot(kcs)

library(cluster)

set.seed(123)
SW <- c()
for(k in 2:10){
  km  <- kmeans(customers, centers = k)
  kms <- silhouette(km$cluster, dist(customers))
  sil <- mean(kms[,3])
  SW  <- c(SW, sil)
}

plot(2:10, SW, type = 'l')
abline(v = 5, col = 'red')

Clustering Method

http://scikit-learn.org/stable/modules/clustering.html

迴歸分析

data(anscombe)
head(anscombe)

##   x1 x2 x3 x4   y1   y2    y3   y4
## 1 10 10 10  8 8.04 9.14  7.46 6.58
## 2  8  8  8  8 6.95 8.14  6.77 5.76
## 3 13 13 13  8 7.58 8.74 12.74 7.71
## 4  9  9  9  8 8.81 8.77  7.11 8.84
## 5 11 11 11  8 8.33 9.26  7.81 8.47
## 6 14 14 14  8 9.96 8.10  8.84 7.04

plot(y1 ~ x1 , data = anscombe)
fit <- lm(y1 ~ x1, data = anscombe)
plot(y1 ~ x1 , data = anscombe)
abline(fit, col = 'red')

plot(y2 ~ x1 , data = anscombe)
fit <- lm(y2 ~ x1, data = anscombe)
abline(fit, col = 'red')

plot(y2 ~ x1 , data = anscombe)

lmfit <- lm(y2 ~ poly(x1, 2), data = anscombe)
lmfit

## 
## Call:
## lm(formula = y2 ~ poly(x1, 2), data = anscombe)
## 
## Coefficients:
##  (Intercept)  poly(x1, 2)1  poly(x1, 2)2  
##        7.501         5.244        -3.712

# y = -3.712 x ^ 2 + 5.244 * x + 7.501
anscombe$x1

##  [1] 10  8 13  9 11 14  6  4 12  7  5

lmfit$fitted.values

##        1        2        3        4        5        6        7        8 
## 9.141329 8.141329 8.740629 8.768042 9.261189 8.100210 6.127622 3.100210 
##        9       10       11 
## 9.127622 7.261189 4.740629

plot(anscombe$x1, lmfit$fitted.values, type = 'p')

plot(anscombe$x1, lmfit$fitted.values, type = 'l')

plot(y2 ~ x1 , data = anscombe)
lines(sort(anscombe$x1), lmfit$fitted.values[order(anscombe$x1)], type = 'l', col = 'red')

fit <- lm(y3~x1, data = anscombe)
plot(y3~x1, data = anscombe)
abline(fit, col = 'red')

library(MASS)
fit <- rlm(y3~x1, data = anscombe)
plot(y3~x1, data = anscombe)
abline(fit, col = 'red')

預測房價

house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/tibamepy/master/data/house-prices.csv')

head(house_prices)

##   Home  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1    1 114300 1790        2         2      2    No         East
## 2    2 114200 2030        4         2      3    No         East
## 3    3 114800 1740        3         2      1    No         East
## 4    4  94700 1980        3         2      3    No         East
## 5    5 119800 2130        3         3      3    No         East
## 6    6 114600 1780        3         2      2    No        North

fit <- lm(Price ~ SqFt, data = house_prices)
plot(Price ~ SqFt, data = house_prices)
abline(fit, col = 'red')

fit

## 
## Call:
## lm(formula = Price ~ SqFt, data = house_prices)
## 
## Coefficients:
## (Intercept)         SqFt  
##   -10091.13        70.23

library(jsonlite)
1749 / 30

## [1] 58.3

dfall <- data.frame()
for (i in 0:1){ 
  p <- i * 30
df <- fromJSON(paste0('https://sale.591.com.tw/home/search/list?type=2&&shType=list&section=5&regionid=1&kind=9&firstRow=',p,'&totalRows=1749&timestamp=1535791689399'))
dfall <- rbind(dfall, df$data$house_list[, c('area', 'price')])
}


write.csv(dfall, file= '591daan.csv')

dfall <- read.csv('https://raw.githubusercontent.com/ywchiu/rtibame/master/Data/591daan.csv')

fit <- lm(price ~ area, dfall)
plot(price ~ area, dfall)
abline(fit, col ='red')

predict(fit, data.frame(area = 20))

##        1 
## 1521.443

summary(fit)

## 
## Call:
## lm(formula = price ~ area, data = dfall)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -11412   -610    -22    610  37384 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1061.919     72.773  -14.59   <2e-16 ***
## area          129.168      1.269  101.76   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1702 on 1750 degrees of freedom
## Multiple R-squared:  0.8554, Adjusted R-squared:  0.8553 
## F-statistic: 1.035e+04 on 1 and 1750 DF,  p-value: < 2.2e-16

Demo20180901

David Chiu

2018年9月1日

10 Fold Cross Validation

Confusion Matrix

RandomForest

挑選變數

計算距離

分群

KMeans分群

Silhouette

WSS

Silhouette

比較分群演算法

客戶分群

Clustering Method

迴歸分析

預測房價