套用迴歸分析在房價資料上

lvr_data <- read.csv('lvr_prices_windows.csv')

簡單線性迴歸

lvr_new <- lvr_data[
     lvr_data$total_price < 100000000 &     
     lvr_data$building_sqmeter< 1000 & 
     lvr_data$area == '大安區' &
     lvr_data$building_sqmeter > 0,]

plot(total_price ~ building_sqmeter, data = lvr_new)

fit <- lm(total_price ~ building_sqmeter, data = lvr_new)

abline(fit, col="red")

根據finish_ymd 以及trading_ymd 計算屋齡(年)，並將屋齡以名稱(house_age) 加入到該Data Frame 的欄位中

class(lvr_data$trading_ymd)

## [1] "factor"

lvr_data$trading_ymd <- as.Date(lvr_data$trading_ymd)
lvr_data$finish_ymd  <- as.Date(lvr_data$finish_ymd)
lvr_data$house_age   <- as.numeric((lvr_data$trading_ymd - lvr_data$finish_ymd)/ 365)

Mutiple Regression (Sample)

str(lvr_data)
fit2 <- lm(total_price ~ management + compartment + parking_type + parking_sqmeter + house_age + room + living_room + bath + building_sqmeter + main_purpose + built_with  + trading_target + land_sqmeter + city_land_type + area, data = lvr_data)

step(fit2)

使用rpart 分類 iris 資料

library(rpart)

## Warning: package 'rpart' was built under R version 3.2.5

data(iris)

fit <- rpart(formula= Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris )

fit

## n= 150 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 150 100 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 50   0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 100  50 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Width< 1.75 54   5 versicolor (0.00000000 0.90740741 0.09259259) *
##     7) Petal.Width>=1.75 46   1 virginica (0.00000000 0.02173913 0.97826087) *

plot(fit, margin = 0.1)
text(fit)

顯示分類結果

plot(Petal.Length ~ Petal.Width, data=iris, col=Species)

abline(h = 2.45, col="blue")
abline(v = 1.75, col="orange")

驗證分類結果

predicted_prob <- predict(fit, data = iris)
predicted <- predict(fit, data = iris, type ="class")
table(predicted, iris$Species)

##             
## predicted    setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45

使用caret 套件找出準確率

#install.packages('caret')
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

cm <- table(predicted, iris$Species)
confusionMatrix(cm)

## Confusion Matrix and Statistics
## 
##             
## predicted    setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         5
##   virginica       0          1        45
## 
## Overall Statistics
##                                          
##                Accuracy : 0.96           
##                  95% CI : (0.915, 0.9852)
##     No Information Rate : 0.3333         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.94           
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.9800           0.9000
## Specificity                 1.0000            0.9500           0.9900
## Pos Pred Value              1.0000            0.9074           0.9783
## Neg Pred Value              1.0000            0.9896           0.9519
## Prevalence                  0.3333            0.3333           0.3333
## Detection Rate              0.3333            0.3267           0.3000
## Detection Prevalence        0.3333            0.3600           0.3067
## Balanced Accuracy           1.0000            0.9650           0.9450

測試模型

set.seed(123)
idx <- sample.int(2, nrow(iris), replace=TRUE, prob=c(0.7,0.3))
table(idx)

## idx
##   1   2 
## 106  44

trainset <- iris[idx == 1,]
testset  <- iris[idx == 2,]

dim(trainset)

## [1] 106   5

dim(testset)

## [1] 44  5

a <- c(1,2,3,4,5)
idx2 <- c(1,0,1,0,1)
idx2 == 1

## [1]  TRUE FALSE  TRUE FALSE  TRUE

a[idx2 == 1]

## [1] 1 3 5

使用訓練資料集建立模型

# . 代表所有變數
fit2       <- rpart(Species ~. , data = trainset)
predicted2 <- predict(fit2, testset, type="class")
cm2 <- table(predicted2, testset$Species)
confusionMatrix(cm2)

## Confusion Matrix and Statistics
## 
##             
## predicted2   setosa versicolor virginica
##   setosa         15          0         0
##   versicolor      0         10         1
##   virginica       0          4        14
## 
## Overall Statistics
##                                           
##                Accuracy : 0.8864          
##                  95% CI : (0.7544, 0.9621)
##     No Information Rate : 0.3409          
##     P-Value [Acc > NIR] : 8.552e-14       
##                                           
##                   Kappa : 0.8291          
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.7143           0.9333
## Specificity                 1.0000            0.9667           0.8621
## Pos Pred Value              1.0000            0.9091           0.7778
## Neg Pred Value              1.0000            0.8788           0.9615
## Prevalence                  0.3409            0.3182           0.3409
## Detection Rate              0.3409            0.2273           0.3182
## Detection Prevalence        0.3409            0.2500           0.4091
## Balanced Accuracy           1.0000            0.8405           0.8977

fit2

## n= 106 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 106 70 versicolor (0.33018868 0.33962264 0.33018868)  
##   2) Petal.Length< 2.45 35  0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 71 35 versicolor (0.00000000 0.50704225 0.49295775)  
##     6) Petal.Length< 4.75 34  0 versicolor (0.00000000 1.00000000 0.00000000) *
##     7) Petal.Length>=4.75 37  2 virginica (0.00000000 0.05405405 0.94594595) *

plot(fit2, margin = 0.1)
text(fit2)

plot(Petal.Length ~ Petal.Width, data=iris, col=Species)

abline(h = 2.45, col="blue")
abline(h = 4.75, col="orange")

顧客流失分析

讀取資料

#install.packages("C50")
library(C50)

## Warning: package 'C50' was built under R version 3.2.5

data(churn)
str(churnTrain)

## 'data.frame':    3333 obs. of  20 variables:
##  $ state                        : Factor w/ 51 levels "AK","AL","AR",..: 17 36 32 36 37 2 20 25 19 50 ...
##  $ account_length               : int  128 107 137 84 75 118 121 147 117 141 ...
##  $ area_code                    : Factor w/ 3 levels "area_code_408",..: 2 2 2 1 2 3 3 2 1 2 ...
##  $ international_plan           : Factor w/ 2 levels "no","yes": 1 1 1 2 2 2 1 2 1 2 ...
##  $ voice_mail_plan              : Factor w/ 2 levels "no","yes": 2 2 1 1 1 1 2 1 1 2 ...
##  $ number_vmail_messages        : int  25 26 0 0 0 0 24 0 0 37 ...
##  $ total_day_minutes            : num  265 162 243 299 167 ...
##  $ total_day_calls              : int  110 123 114 71 113 98 88 79 97 84 ...
##  $ total_day_charge             : num  45.1 27.5 41.4 50.9 28.3 ...
##  $ total_eve_minutes            : num  197.4 195.5 121.2 61.9 148.3 ...
##  $ total_eve_calls              : int  99 103 110 88 122 101 108 94 80 111 ...
##  $ total_eve_charge             : num  16.78 16.62 10.3 5.26 12.61 ...
##  $ total_night_minutes          : num  245 254 163 197 187 ...
##  $ total_night_calls            : int  91 103 104 89 121 118 118 96 90 97 ...
##  $ total_night_charge           : num  11.01 11.45 7.32 8.86 8.41 ...
##  $ total_intl_minutes           : num  10 13.7 12.2 6.6 10.1 6.3 7.5 7.1 8.7 11.2 ...
##  $ total_intl_calls             : int  3 3 5 7 3 6 7 6 4 5 ...
##  $ total_intl_charge            : num  2.7 3.7 3.29 1.78 2.73 1.7 2.03 1.92 2.35 3.02 ...
##  $ number_customer_service_calls: int  1 1 0 2 3 0 3 0 1 0 ...
##  $ churn                        : Factor w/ 2 levels "yes","no": 2 2 2 2 2 2 2 2 2 2 ...

將資料處理後分為訓練與測試資料集

churnTrain = churnTrain[,! names(churnTrain) %in% c("state", "area_code", "account_length") ]

set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset  <- churnTrain[ind == 2,]

dim(trainset)

## [1] 2315   17

dim(testset)

## [1] 1018   17

set.seed example

set.seed(2)
sample(42,6)

## [1]  8 29 23  7 36 35

sample(42,6)

## [1]  6 35 19 22 39  9

sample(42,6)

## [1] 32  8 17 34 38  9

建立預測模型

churn.rp <- rpart(churn ~ ., data=trainset)
plot(churn.rp, margin= 0.1)
text(churn.rp, all=TRUE, use.n = TRUE)

Tree Prune

min_error_split <- which.min(churn.rp$cptable[,'xerror'])
min_error_split <- 3
churn.cp <- churn.rp$cptable[min_error_split ,"CP"]

prune.tree <- prune(churn.rp, cp= churn.cp)

plot(prune.tree, margin = 0.1)
text(prune.tree)

Make churn prediction

predictions <- predict(prune.tree, testset, type="class")
churn_table <- table(predictions, testset$churn)
confusionMatrix(churn_table)

## Confusion Matrix and Statistics
## 
##            
## predictions yes  no
##         yes  64  10
##         no   77 867
##                                          
##                Accuracy : 0.9145         
##                  95% CI : (0.8957, 0.931)
##     No Information Rate : 0.8615         
##     P-Value [Acc > NIR] : 1.266e-07      
##                                          
##                   Kappa : 0.5527         
##  Mcnemar's Test P-Value : 1.484e-12      
##                                          
##             Sensitivity : 0.45390        
##             Specificity : 0.98860        
##          Pos Pred Value : 0.86486        
##          Neg Pred Value : 0.91843        
##              Prevalence : 0.13851        
##          Detection Rate : 0.06287        
##    Detection Prevalence : 0.07269        
##       Balanced Accuracy : 0.72125        
##                                          
##        'Positive' Class : yes            
##

table(testset$churn)

## 
## yes  no 
## 141 877

877 / (877 + 141)

## [1] 0.8614931

Cross validation

idx <- sample.int(10, nrow(churnTrain), replace=TRUE)

accu_vector = c()
for (i in seq(1,10)){
  trainset <- churnTrain[idx == i,]
  testset  <- churnTrain[idx != i,]
  fit <- rpart(churn ~., trainset)
  predictions <- predict(fit, testset, type = 'class')
  accu_vector <- c(accu_vector, confusionMatrix(predictions, testset$churn)$overall[1])
}

Use caret package

library(caret)
control = trainControl(method="repeatedcv", number=10, repeats=3)
model = train(churn~., data=trainset, method="rpart", preProcess="scale", trControl=control)
model

## CART 
## 
## 348 samples
##  16 predictor
##   2 classes: 'yes', 'no' 
## 
## Pre-processing: scaled (16) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 314, 313, 313, 313, 313, 313, ... 
## Resampling results across tuning parameters:
## 
##   cp          Accuracy   Kappa    
##   0.03333333  0.8687395  0.4797714
##   0.08333333  0.8802241  0.4647340
##   0.35000000  0.8553501  0.2702799
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final value used for the model was cp = 0.08333333.

ROC Curve

set.seed(2)
ind <- sample(2, nrow(churnTrain), replace = TRUE, prob=c(0.7, 0.3))
trainset <- churnTrain[ind == 1,]
testset  <- churnTrain[ind == 2,]

fit <- rpart(churn ~., trainset)

library(caret)
predictions <- predict(fit, testset)

x <- c()
y <- c()
for (t in seq(0,1,0.1)){
  predict_over_threshold <- ifelse(predictions[,1] > t, '0', '1' )
  predict_over_threshold <- as.factor(predict_over_threshold)
  levels(predict_over_threshold) <- c('yes', 'no')
  
  cm <- table(predict_over_threshold, testset$churn)
  y <- c(y, confusionMatrix(cm)$byClass[1])
  x <- c(x, 1- confusionMatrix(cm)$byClass[2])
}
plot(x,y, type='p')
lines(sort(x),y[order(x)], col="red")

## use rocr

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.2.5

## Loading required package: gplots

## Warning: package 'gplots' was built under R version 3.2.5

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

predictions <- predict(churn.rp, testset, type="prob")
pred.to.roc <- predictions[, 1] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset[,(dim(testset)[[2]])])) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

Demo20161022

David Chiu

2016年10月22日

套用迴歸分析在房價資料上

簡單線性迴歸

根據finish_ymd 以及trading_ymd 計算屋齡(年)，並將屋齡以名稱(house_age) 加入到該Data Frame 的欄位中

Mutiple Regression (Sample)

使用rpart 分類 iris 資料

顯示分類結果

驗證分類結果

使用caret 套件找出準確率

測試模型

使用訓練資料集建立模型

顧客流失分析

讀取資料

將資料處理後分為訓練與測試資料集

set.seed example

建立預測模型

Tree Prune

Make churn prediction

Cross validation

Use caret package

ROC Curve