套用迴歸分析在房價資料上

lvr_data <- read.csv('lvr_prices_windows.csv')

簡單線性迴歸

lvr_new <- lvr_data[
     lvr_data$total_price < 100000000 &     
     lvr_data$building_sqmeter< 1000 & 
     lvr_data$area == '大安區' &
     lvr_data$building_sqmeter > 0,]

plot(total_price ~ building_sqmeter, data = lvr_new)

fit <- lm(total_price ~ building_sqmeter, data = lvr_new)

abline(fit, col="red")

根據finish_ymd 以及trading_ymd 計算屋齡(年),並將屋齡以名稱(house_age) 加入到該Data Frame 的欄位中

class(lvr_data$trading_ymd)
## [1] "factor"
lvr_data$trading_ymd <- as.Date(lvr_data$trading_ymd)
lvr_data$finish_ymd  <- as.Date(lvr_data$finish_ymd)
lvr_data$house_age   <- as.numeric((lvr_data$trading_ymd - lvr_data$finish_ymd)/ 365)

Mutiple Regression (Sample)

str(lvr_data)
fit2 <- lm(total_price ~ management + compartment + parking_type + parking_sqmeter + house_age + room + living_room + bath + building_sqmeter + main_purpose + built_with  + trading_target + land_sqmeter + city_land_type + area, data = lvr_data)

step(fit2)

使用rpart 分類 iris 資料

library(rpart)
## Warning: package 'rpart' was built under R version 3.2.5
data(iris)

fit <- rpart(formula= Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris )

fit
## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *
plot(fit, margin = 0.1)
text(fit)

顯示分類結果

plot(Petal.Length ~ Petal.Width, data=iris, col=Species)

abline(h = 2.45, col="blue")
abline(v = 1.75, col="orange")

驗證分類結果

predicted_prob <- predict(fit, data = iris)
predicted <- predict(fit, data = iris, type ="class")
table(predicted, iris$Species)
##             
## predicted    setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45

使用caret 套件找出準確率

#install.packages('caret')
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
cm <- table(predicted, iris$Species)
confusionMatrix(cm)
## Confusion Matrix and Statistics
## 
##             
## predicted    setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450

測試模型

set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7,0.3))
table(idx)
## idx
##   1   2 
## 106  44
trainset <- iris[idx == 1,]
testset  <- iris[idx == 2,]

dim(trainset)
## [1] 106   5
dim(testset)
## [1] 44  5
a <- c(1,2,3,4,5)
idx2 <- c(1,0,1,0,1)
idx2 == 1
## [1]  TRUE FALSE  TRUE FALSE  TRUE
a[idx2 == 1]
## [1] 1 3 5

使用訓練資料集建立模型

# . 代表所有變數
fit2       <- rpart(Species ~. , data = trainset)
predicted2 <- predict(fit2, testset, type="class")
cm2 <- table(predicted2, testset$Species)
confusionMatrix(cm2)
## Confusion Matrix and Statistics
## 
##             
## predicted2   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977
fit2
## n= 106 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 106 70 versicolor (0.33018868 0.33962264 0.33018868)  
##   2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 71 35 versicolor (0.00000000 0.50704225 0.49295775)  
##     6) Petal.Length< 4.75 34  0 versicolor (0.00000000 1.00000000 0.00000000) *
##     7) Petal.Length>=4.75 37  2 virginica (0.00000000 0.05405405 0.94594595) *
plot(fit2, margin = 0.1)
text(fit2)

plot(Petal.Length ~ Petal.Width, data=iris, col=Species)

abline(h = 2.45, col="blue")
abline(h = 4.75, col="orange")

顧客流失分析

讀取資料

#install.packages("C50")
library(C50)
## Warning: package 'C50' was built under R version 3.2.5
data(churn)
str(churnTrain)
## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

將資料處理後分為訓練與測試資料集

churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]

set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset  <- churnTrain[ind == 2,]

dim(trainset)
## [1] 2315   17
dim(testset)
## [1] 1018   17

set.seed example

set.seed(2)
sample(42,6)
## [1]  8 29 23  7 36 35
sample(42,6)
## [1]  6 35 19 22 39  9
sample(42,6)
## [1] 32  8 17 34 38  9

建立預測模型

churn.rp <- rpart(churn ~ ., data=trainset)
plot(churn.rp, margin= 0.1)
text(churn.rp, all=TRUE, use.n = TRUE)

Tree Prune

min_error_split <- which.min(churn.rp$cptable[,'xerror'])
min_error_split <- 3
churn.cp <- churn.rp$cptable[min_error_split ,"CP"]

prune.tree <- prune(churn.rp, cp= churn.cp)

plot(prune.tree, margin = 0.1)
text(prune.tree)

Make churn prediction

predictions <- predict(prune.tree, testset, type="class")
churn_table <- table(predictions, testset$churn)
confusionMatrix(churn_table)
## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes  64  10
##         no   77 867
##                                          
##                Accuracy : 0.9145         
##                  95% CI : (0.8957, 0.931)
##     No Information Rate : 0.8615         
##     P-Value [Acc > NIR] : 1.266e-07      
##                                          
##                   Kappa : 0.5527         
##  Mcnemar's Test P-Value : 1.484e-12      
##                                          
##             Sensitivity : 0.45390        
##             Specificity : 0.98860        
##          Pos Pred Value : 0.86486        
##          Neg Pred Value : 0.91843        
##              Prevalence : 0.13851        
##          Detection Rate : 0.06287        
##    Detection Prevalence : 0.07269        
##       Balanced Accuracy : 0.72125        
##                                          
##        'Positive' Class : yes            
## 
table(testset$churn)
## 
## yes  no 
## 141 877
877 / (877 + 141)
## [1] 0.8614931

Cross validation

idx <- sample.int(10, nrow(churnTrain), replace=TRUE)

accu_vector = c()
for (i in seq(1,10)){
  trainset <- churnTrain[idx == i,]
  testset  <- churnTrain[idx != i,]
  fit <- rpart(churn ~., trainset)
  predictions <- predict(fit, testset, type = 'class')
  accu_vector <- c(accu_vector, confusionMatrix(predictions, testset$churn)$overall[1])
}

Use caret package

library(caret)
control = trainControl(method="repeatedcv", number=10, repeats=3)
model = train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model
## CART 
## 
## 348 samples
##  16 predictor
##   2 classes: 'yes', 'no' 
## 
## Pre-processing: scaled (16) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 314, 313, 313, 313, 313, 313, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.03333333  0.8687395  0.4797714
##   0.08333333  0.8802241  0.4647340
##   0.35000000  0.8553501  0.2702799
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.08333333.

ROC Curve

set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset  <- churnTrain[ind == 2,]

fit <- rpart(churn ~., trainset)

library(caret)
predictions <- predict(fit, testset)

x <- c()
y <- c()
for (t in seq(0,1,0.1)){
  predict_over_threshold <- ifelse(predictions[,1] > t, '0', '1' )
  predict_over_threshold <- as.factor(predict_over_threshold)
  levels(predict_over_threshold) <- c('yes', 'no')
  
  cm <- table(predict_over_threshold, testset$churn)
  y <- c(y, confusionMatrix(cm)$byClass[1])
  x <- c(x, 1- confusionMatrix(cm)$byClass[2])
}
plot(x,y, type='p')
lines(sort(x),y[order(x)], col="red")

## use rocr

library(ROCR)
## Warning: package 'ROCR' was built under R version 3.2.5
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.2.5
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])])) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))