packages = c(
  "dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly", "xgboost",
  "magrittr","caTools","ROCR","corrplot", "rpart", "rpart.plot",
  "doParallel", "caret", "glmnet", "Matrix", "e1071", "randomForest"
  )
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Sys.setlocale("LC_ALL",'C') #清除國家偵測設定
[1] "C/C/C/C/C/en_US.UTF-8"
rm(list=ls(all=T))
options(digits=4, scipen=12) #控制小數點, scipen=12讓科學記號不那麼常出現
library(dplyr)

1. Suprime Court Decision

1.1 Prepare Data

D = read.csv("data/stevens.csv")
library(caTools)
set.seed(3000)
spl = sample.split(D$Reverse, SplitRatio = 0.7)
TR = subset(D, spl)
TS = subset(D, !spl)

1.2 CART (Classification & Regression Tree) - rpart::rpart()

library(rpart)
rpart1 = rpart(Reverse ~ ., TR[,3:9],          # simplify the formula
               method="class", minbucket=25)   # class是讓他結果輸出類別, prob則是機率, 都不選也是機率

1.3 Plot Decision Tree

library(rpart.plot)
prp(rpart1)
Bad 'data' field in model 'call' field.
         To make this warning go away:
             Call prp with roundint=FALSE,
             or rebuild the rpart model with model=TRUE.

rpart.plot(rpart1,cex=0.6)
Bad 'data' field in model 'call' field.
         To make this warning go away:
             Call rpart.plot with roundint=FALSE,
             or rebuild the rpart model with model=TRUE.

1.4 Make Prediction

pred = predict(rpart1, TS, type = "class")  # predict classes
x = table(actual = TS$Reverse, pred); x
      pred
actual  0  1
     0 41 36
     1 22 71
sum(diag(x))/sum(x)                         # .65882
[1] 0.6588

1.5 ROC & AUC

library(ROCR)
PredictROC = predict(rpart1, TS)              # predict prob.
head(PredictROC)
        0      1
1  0.3036 0.6964
3  0.3036 0.6964
4  0.4000 0.6000
6  0.4000 0.6000
8  0.4000 0.6000
21 0.3036 0.6964
perf = prediction(PredictROC[,2], TS$Reverse)
perf = performance(perf, "tpr", "fpr")
plot(perf)

pred = predict(rpart1, TS)[,2]     # prob. of Reverse = 1         
colAUC(pred, TS$Reverse, T)        # AUC = 0.6927
          [,1]
0 vs. 1 0.6927

1.6 DPP of Decision Tree

source("DPP.R")
auc = DPP(pred, TS$Reverse, 1)     # AUC = 0.6927

【QUIZ-1】比較以上prp()rpart.plot()DPP()這幾張圖形

  • 一棵決策樹的形狀, 和由它所產生的預測機率分布之間,有什麼關係呢 ? 以上DPP 圖中的分布點數、位置和高度,分別會對應到決策樹的什麼特徵呢?

DPP的分佈點數(0/1)是決策樹紅/綠之分別;DPP分佈位置代表決策樹的預測機率(第二個數字);DPP高度為決策樹中的各水桶佔總資料的比例(第三個數字)。


1.7 The effect of minbucket

t5 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=5)
prp(t5)
Bad 'data' field in model 'call' field.
         To make this warning go away:
             Call prp with roundint=FALSE,
             or rebuild the rpart model with model=TRUE.

t100 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=100)
prp(t100)
Bad 'data' field in model 'call' field.
         To make this warning go away:
             Call prp with roundint=FALSE,
             or rebuild the rpart model with model=TRUE.

【QUIZ-2】Based on the 2 plots above, what is the effect of minbucket?

  • minbucket的數字愈大,決策樹就會長得愈小(簡單);反之,數字愈小,樹就愈大(複雜)。


1.8 Random Forest Method - randomForest::randomForest()

library(randomForest)
TR$Reverse = as.factor(TR$Reverse)  # Convert outcome to factor
TS$Reverse = as.factor(TS$Reverse)
# Build model
rf1 = randomForest(Reverse ~ ., TR[,3:9], ntree=200, nodesize=25)
# Make predictions
pred = predict(rf1, TS)
x = table(TS$Reverse, pred)
sum(diag(x)) / sum(x)          # 0.6824  #diag是對角線
[1] 0.6824
pred = predict(rf1, TS, type='prob')[,2]  #Y=的機率
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.6882

1.9 Cross Validation & Parameter Tuning - caret package

pred = predict(rf1, TS, type='prob')[,2]   # 可以預測兩類別的時候要看第二行,所以用[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}   # 簡潔寫法(同上面的Accuracy算法)
[1] 0.68824

【QUIZ-3】Which cp which we should we choose? Why?

  • 我們應選擇0.18的cp。因為cp愈高,模型的複雜度就愈低;若Acc都一樣(呈水平線),我們選擇其中cp最大的。


1.10 The Final Model

rpart2 = rpart(Reverse ~ ., TR[,3:9], method="class", cp=0.19)
pred = predict(rpart2, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.)) / sum(.)}  # 0.7235
[1] 0.7235
rpart.plot(rpart2)
Bad 'data' field in model 'call' field.
         To make this warning go away:
             Call rpart.plot with roundint=FALSE,
             or rebuild the rpart model with model=TRUE.

【討論:】

  • Classification & Regression Tree rpart

決策樹可以做到類別與連續變數的分析。當要用的是類別時,方法則要選擇“class”;而當我們需要連續變化的機率,就使用“prob”。

  • Tree Bagging - Random Forest randomForest

在做Tree Bagging時,我們使用ntree來調整隨機森林中樹的大小,由此來告訴R做到多少就不要再往下長了。

  • Overfitting : Curse of Complexity

當模型做得太複雜會導致過度適配Trianing的模型,但這樣會遠離Testing的模型,且我們真正想推估的是未來可能發生的狀況,若一心追求模型的準確度,將達不到我們真正的目的。

  • Method Parameter : Cost of Compexity

那要如何對抗因過度適配產生的複雜度成本,就要靠方法參數(alpha,lamda,…),而非直接選取變數(x)來選擇模型。

  • Parameter Tuning by CV (caret package)

所謂方法參數,就是用cp來控制,但我們不知道到底哪個cp值最好,這時就是不斷地用caret來做cross validation,值到做出我們滿意的模型。

  • The power of machine vs. The value of humanity

因為需要不斷重複嘗試不同的參數值,就體現了機器計算高效的重要性;但是最終在作決策時,還是要靠人們對議題的敏感性及背景知識才能做判斷,機器終究只是幫忙建模型,而模型建完才是真正的開始。


2 D2HawkEye

2.1 Prepare & Examine Data

rm(list=ls(all=TRUE))
D = read.csv("data/ClaimsData.csv")
# Percentage of patients in each cost bucket
table(D$bucket2009)/nrow(D)

       1        2        3        4        5 
0.671268 0.190170 0.089466 0.043325 0.005771 
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(D$bucket2009, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
# Check Percentages Among Subsets
library(dplyr)
table(TR$bucket2009) %>% prop.table

       1        2        3        4        5 
0.671266 0.190169 0.089468 0.043326 0.005771 
table(TS$bucket2009) %>% prop.table

      1       2       3       4       5 
0.67127 0.19017 0.08946 0.04332 0.00577 
table(D$bucket2009) %>% prop.table

       1        2        3        4        5 
0.671268 0.190170 0.089466 0.043325 0.005771 
# sapply(): Apply Function & Aggregate Output
sapply(list(D, TR, TS), function(x) 
  table(x$bucket2009) %>% prop.table) %>% t  #x隨著前面的變數而變(D,TR,TS) #sapply不只是做出來 還會彙整成表
          1      2       3       4        5
[1,] 0.6713 0.1902 0.08947 0.04332 0.005771
[2,] 0.6713 0.1902 0.08947 0.04333 0.005771
[3,] 0.6713 0.1902 0.08946 0.04332 0.005770
# Data Exploration 
mean(TR$age)                 # 72.64
[1] 72.64
mean(TR$diabetes)            # .3809 
[1] 0.3809

2.2 Baseline Model

# Baseline Accuracy
acc.base = table(TS$bucket2009, TS$bucket2008) %>% 
  {sum(diag(.)) / sum(.)}    # .6838
# Penalty Matrix
Penalty = matrix(c(
  0,1,2,3,4,
  2,0,1,2,3,
  4,2,0,1,2,
  6,4,2,0,1,
  8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty
     [,1] [,2] [,3] [,4] [,5]
[1,]    0    1    2    3    4
[2,]    2    0    1    2    3
[3,]    4    2    0    1    2
[4,]    6    4    2    0    1
[5,]    8    6    4    2    0
# Penalty Error of Baseline Method
cm = as.matrix(table(TS$bucket2009, TS$bucket2008))
pen.base = sum(cm*Penalty) / nrow(TS)    # pen: .7386
#因為有5*5的matrix(五個類別),不能做AUC(僅兩個類別)
#把全部的cm*payoff加總並平均,得出每個平均的penalty
# if we use the most popular class as baseline
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1]                 # acc: .6713 
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS)  # pen: 1.044

CART Model

# Dumb baseline - Using the most frequent class
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1]                 
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS)  
c(acc.dumb, pen.dumb)   # 0.6713 1.0443 (accuracy差不多,但penalty差很多,表示這個方式較好)
      1         
0.67127 1.04430 
# Make predictions
pred = predict(rpart1, newdata = TS, type = "class")
acc.rpart1 = table(TS$bucket2009, pred) %>% 
  { sum(diag(.)) /nrow(TS)}            # acc: .71267 <- .68381
# Penalty Error
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart1 = sum(cm*Penalty)/nrow(TS)  # pen: .75789 <- .7386

CART Model with Customized Loss Matrix

rpart2 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)], 
               method="class", cp=0.00005, 
               parms=list(loss=Penalty))
pred = predict(rpart2, TS, type = "class")
acc.rpart2 = table(TS$bucket2009, pred) %>% 
  {sum(diag(.))/sum(.)}                  # acc.rpart2: .64727   
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart2 = sum(cm*Penalty)/nrow(TS)    # pen.rpart2: .64182 
# Summary
data.frame(
  accuracy = c(acc.dumb, acc.base, acc.rpart1, acc.rpart2),
  penalty = c(pen.dumb, pen.base, pen.rpart1, pen.rpart2),
  row.names = c("dumb","base","CART","CART w/LossMx"))

【討論】

  • 5 classes: 5X5 confussion/penalty(payoff) matrices
cm
   pred
        1     2     3     4     5
  1 94310 25295  3087   286     0
  2  7176 18942  8079   643     0
  3  3590  7706  4692   401     1
  4  1304  3193  2803   636     1
  5   135   356   408   156     2
Penalty
     [,1] [,2] [,3] [,4] [,5]
[1,]    0    1    2    3    4
[2,]    2    0    1    2    3
[3,]    4    2    0    1    2
[4,]    6    4    2    0    1
[5,]    8    6    4    2    0
cm*Penalty #最壞的決策是高估(3,1)=14360。
   pred
        1     2     3     4     5
  1     0 25295  6174   858     0
  2 14352     0  8079  1286     0
  3 14360 15412     0   401     2
  4  7824 12772  5606     0     1
  5  1080  2136  1632   312     0
  • dumb vs. smart baseline

dumb baseline是將Y都猜1,準確率也不會與一般的baseline差太多,方便快速,然而penalty會大幅提升(acc=0.6731,penalty=1.044);而smart baseline是一種進階版的baseline,也就是想辦法拉高底線,此例是用2008年的類別直接預測2009年的類別(acc=0.6838,penalty=0.7386),相較之下,acc差不多,但penalty明顯下降,故應選smart baseline。

  • model1: accuracy up, penalty up

全部都猜1的模型,完全沒有在區分類別,即使正確性很高,沒有辨識率,就不是一個好的模型。

  • model2: accuracy down, penalty down

第二種模型是以今年的資料去預測明年,至少還有在分辨不同類別,因此penalty下降了。

  • accuracy != profitability

綜合上述,我們比較在意降低penalty而非增加acc。

  • “customized” model optimize criteria in classification methods (not always available)

客製化模型在做模型的當下,就已告訴決策樹預設好的penalty(parms=list(loss=Penalty)),給模型做一個加權數,模型預測只要沒猜中,類別就算錯,沒有猜多猜少(高估低估)的差別,最終做出來的樹不是acc最高的樹,而是penalty最低的樹(acc=0.6472,penalty=0.6418)。但不是每種模型都可以透過matrix加入penalty做這種最佳化。

  • This CV/PT process is over simplified

這例題中並沒有做到cross validation,簡化了他如何選擇cp的過程。

  • we will cover a better process next time

3 Boston House Price

rm(list=ls(all=TRUE))
D = read.csv("data/boston.csv")

3.1 Examine Data

# Plot observations
plot(D$LON, D$LAT)
# Tracts alongside the Charles River
with(D, 
     points(LON[CHAS==1], LAT[CHAS==1], col="blue", pch=19))
# Plot MIT
with(D, 
     points(LON[TRACT==3531], LAT[TRACT==3531], col="red", pch=19) )
# Plot polution
summary(D$NOX)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.385   0.449   0.538   0.555   0.624   0.871 
points(D$LON[D$NOX>=0.55], D$LAT[D$NOX>=0.55], 
       col="green", pch=20)

# Plot prices
plot(D$LON, D$LAT)
summary(D$MEDV)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    5.0    17.0    21.2    22.5    25.0    50.0 
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], 
       col="red", pch=20)

Linear Regression using LAT and LON

latlonlm = lm(MEDV ~ LAT + LON, data=D)
summary(latlonlm)

Call:
lm(formula = MEDV ~ LAT + LON, data = D)

Residuals:
   Min     1Q Median     3Q    Max 
-16.46  -5.59  -1.30   3.69  28.13 

Coefficients:
            Estimate Std. Error t value          Pr(>|t|)    
(Intercept) -3178.47     484.94   -6.55 0.000000000138619 ***
LAT             8.05       6.33    1.27               0.2    
LON           -40.27       5.18   -7.77 0.000000000000045 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.69 on 503 degrees of freedom
Multiple R-squared:  0.107, Adjusted R-squared:  0.104 
F-statistic: 30.2 on 2 and 503 DF,  p-value: 0.000000000000416

Visualize regression output

plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], 
       col="red", pch=20)
# latlonlm$fitted.values
points(D$LON[latlonlm$fitted.values >= 21.2], 
       D$LAT[latlonlm$fitted.values >= 21.2], 
       col="blue", pch="x")

Load CART packages

library(rpart)
library(rpart.plot)
# CART model
latlontree = rpart(MEDV ~ LAT + LON, data=D)
prp(latlontree)

# Visualize output
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], 
       col="red", pch=20)
fittedvalues = predict(latlontree)
points(D$LON[fittedvalues>21.2], 
       D$LAT[fittedvalues>=21.2], col="blue", pch="x")

Simplify tree by increasing minbucket

latlontree = rpart(MEDV ~ LAT + LON, data=D, minbucket=50)
rpart.plot(latlontree)

# Visualize Output
plot(D$LON,D$LAT)
abline(v=-71.07)
abline(h=42.21)
abline(h=42.17)
points(D$LON[D$MEDV>=21.2], 
       D$LAT[D$MEDV>=21.2], col="red", pch=20)

Use all the variables

# Split the data
library(caTools)
set.seed(123)
split = sample.split(D$MEDV, SplitRatio = 0.7)
train = subset(D, split==TRUE)
test = subset(D, split==FALSE)
# Create linear regression
linreg = lm(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
              AGE + DIS + RAD + TAX + PTRATIO, data=train)
summary(linreg)

Call:
lm(formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + 
    RM + AGE + DIS + RAD + TAX + PTRATIO, data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-14.51  -2.71  -0.68   1.79  36.88 

Coefficients:
              Estimate Std. Error t value    Pr(>|t|)    
(Intercept) -252.29051  436.71008   -0.58       0.564    
LAT            1.54396    5.19196    0.30       0.766    
LON           -2.98711    4.78583   -0.62       0.533    
CRIM          -0.18080    0.04391   -4.12 0.000047725 ***
ZN             0.03250    0.01877    1.73       0.084 .  
INDUS         -0.04297    0.08473   -0.51       0.612    
CHAS           2.90418    1.22005    2.38       0.018 *  
NOX          -21.61279    5.41371   -3.99 0.000079778 ***
RM             6.28440    0.48270   13.02     < 2e-16 ***
AGE           -0.04430    0.01785   -2.48       0.014 *  
DIS           -1.57736    0.28418   -5.55 0.000000056 ***
RAD            0.24509    0.09728    2.52       0.012 *  
TAX           -0.01112    0.00545   -2.04       0.042 *  
PTRATIO       -0.98349    0.19389   -5.07 0.000000638 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.6 on 350 degrees of freedom
Multiple R-squared:  0.665, Adjusted R-squared:  0.653 
F-statistic: 53.4 on 13 and 350 DF,  p-value: <2e-16
# Make predictions
linreg.pred = predict(linreg, newdata=test)
linreg.sse = sum((linreg.pred - test$MEDV)^2)
linreg.sse
[1] 3037

Create a CART model

tree = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
               AGE + DIS + RAD + TAX + PTRATIO, data=train)
prp(tree)

# Make predictions
tree.pred = predict(tree, newdata=test)
tree.sse = sum((tree.pred - test$MEDV)^2)
tree.sse
[1] 4329

Load libraries for cross-validation

library(caret)
# Number of folds
tr.control = trainControl(method = "cv", number = 10)
# cp values
cp.grid = expand.grid( .cp = (0:10)*0.001)
# Cross-validation
tr = train(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
             AGE + DIS + RAD + TAX + PTRATIO, 
           data = train, method = "rpart", 
           trControl = tr.control, tuneGrid = cp.grid)
tr
CART 

364 samples
 13 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 327, 326, 328, 329, 328, 327, ... 
Resampling results across tuning parameters:

  cp     RMSE   Rsquared  MAE  
  0.000  4.729  0.7598    3.068
  0.001  4.745  0.7584    3.105
  0.002  4.756  0.7569    3.091
  0.003  4.807  0.7491    3.151
  0.004  4.876  0.7418    3.230
  0.005  4.917  0.7368    3.281
  0.006  4.954  0.7315    3.322
  0.007  4.899  0.7360    3.290
  0.008  4.933  0.7297    3.323
  0.009  4.906  0.7320    3.308
  0.010  4.906  0.7320    3.308

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.
# Extract tree
best.tree = tr$finalModel
prp(best.tree)
Bad 'data' field in model 'call' field.
         To make this warning go away:
             Call prp with roundint=FALSE,
             or rebuild the rpart model with model=TRUE.

# Make predictions
best.tree.pred = predict(best.tree, newdata=test)
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
[1] 3660

Ensembling

en.pred = (best.tree.pred + linreg.pred)/2
sum((en.pred - test$MEDV)^2)
[1] 2599
#       METHOD   SSE
#           lm  3037
#         rpart 4329
#      rpart.cv 3660
#   lm+rpart.cv 2599  

【討論】

  • drawing map with spatial data

  • tree in spatial data

分析在空間資料的決策樹,經緯度會是一個很重要的變數。

  • tree vs. linear regression

最後結果得出,決策樹的預測比線性回歸模型的預測還要差,因此我們知道,並非所有議題都適合用決策樹來分,有時候回歸模型的表現會更好(SSE較小)。

  • the concept of ensembling

Model ensembling是將兩種模型合併起來做平均,其的預測能力將能打敗其他單一種預測方式的模型。







---
title: "AS4-0"
author: "GROUP5——施采彣、陳怡安、楊凱倫、唐思琪、凌偉誠"
output: html_notebook
---

```{r}
packages = c(
  "dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly", "xgboost",
  "magrittr","caTools","ROCR","corrplot", "rpart", "rpart.plot",
  "doParallel", "caret", "glmnet", "Matrix", "e1071", "randomForest"
  )
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
```

```{r echo=T, message=F, cache=F, warning=F}
Sys.setlocale("LC_ALL",'C') #清除國家偵測設定
rm(list=ls(all=T))
options(digits=4, scipen=12) #控制小數點, scipen=12讓科學記號不那麼常出現
library(dplyr)
```

- - -

### 1. Suprime Court Decision

#### 1.1 Prepare Data
```{r}
D = read.csv("data/stevens.csv")

library(caTools)
set.seed(3000)
spl = sample.split(D$Reverse, SplitRatio = 0.7)
TR = subset(D, spl)
TS = subset(D, !spl)
```

#### 1.2 CART (Classification & Regression Tree) - `rpart::rpart()`
```{r}
library(rpart)
rpart1 = rpart(Reverse ~ ., TR[,3:9],          # simplify the formula
               method="class", minbucket=25)   # class是讓他結果輸出類別, prob則是機率, 都不選也是機率
```

#### 1.3 Plot Decision Tree
```{r}
library(rpart.plot)
prp(rpart1)
rpart.plot(rpart1,cex=0.6)
```

#### 1.4 Make Prediction
```{r}
pred = predict(rpart1, TS, type = "class")  # predict classes
x = table(actual = TS$Reverse, pred); x
sum(diag(x))/sum(x)                         # .65882
```

#### 1.5 ROC & AUC
```{r}
library(ROCR)
PredictROC = predict(rpart1, TS)              # predict prob.
head(PredictROC)
perf = prediction(PredictROC[,2], TS$Reverse)
perf = performance(perf, "tpr", "fpr")
plot(perf)
```

```{r}
pred = predict(rpart1, TS)[,2]     # prob. of Reverse = 1         
colAUC(pred, TS$Reverse, T)        # AUC = 0.6927
```


#### 1.6 DPP of Decision Tree
```{r}
source("DPP.R")
auc = DPP(pred, TS$Reverse, 1)     # AUC = 0.6927
```

#### 【QUIZ-1】比較以上`prp()`、`rpart.plot()`和`DPP()`這幾張圖形

+ 一棵決策樹的形狀， 和由它所產生的預測機率分布之間，有什麼關係呢 ?  以上DPP 圖中的分布點數、位置和高度，分別會對應到決策樹的什麼特徵呢?

DPP的分佈點數(0/1)是決策樹紅/綠之分別;DPP分佈位置代表決策樹的預測機率(第二個數字);DPP高度為決策樹中的各水桶佔總資料的比例(第三個數字)。


<br>

#### 1.7 The effect of `minbucket`
```{r}
t5 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=5)
prp(t5)
```

```{r fig.height=2}
t100 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=100)
prp(t100)
```


####【QUIZ-2】Based on the 2 plots above, what is the effect of `minbucket`? 

+ minbucket的數字愈大,決策樹就會長得愈小(簡單);反之,數字愈小,樹就愈大(複雜)。

<br>

#### 1.8 Random Forest Method - `randomForest::randomForest()`
```{r}
library(randomForest)
TR$Reverse = as.factor(TR$Reverse)  # Convert outcome to factor
TS$Reverse = as.factor(TS$Reverse)

# Build model
rf1 = randomForest(Reverse ~ ., TR[,3:9], ntree=200, nodesize=25)

# Make predictions
pred = predict(rf1, TS)
x = table(TS$Reverse, pred)
sum(diag(x)) / sum(x)          # 0.6824  #diag是對角線
```

```{r}
pred = predict(rf1, TS, type='prob')[,2]  #Y=的機率
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}
```


#### 1.9 Cross Validation & Parameter Tuning - `caret` package
```{r}
library(caret)
numFolds = trainControl(method="cv", number=10) # 10 fold CV
cpGrid = expand.grid(cp = seq(0.01,0.5,0.01))   # parameter combination

cv1 = train(Reverse ~ ., TR[,3:9], method = "rpart", 
            trControl=numFolds, tuneGrid=cpGrid)
cv1; plot(cv1)
```

####【QUIZ-3】Which `cp` which we should we choose? Why? 

+ 我們應選擇0.18的cp。因為cp愈高,模型的複雜度就愈低;若Acc都一樣(呈水平線),我們選擇其中cp最大的。

<br>

#### 1.10 The Final Model
```{r}
rpart2 = rpart(Reverse ~ ., TR[,3:9], method="class", cp=0.19)
pred = predict(rpart2, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.)) / sum(.)}  # 0.7235
```

```{r fig.width=3}
rpart.plot(rpart2)
```

####【討論：】

+ Classification & Regression Tree  `rpart`

  決策樹可以做到類別與連續變數的分析。當要用的是類別時,方法則要選擇"class";而當我們需要連續變化的機率,就使用"prob"。

+ Tree Bagging - Random Forest `randomForest`  

  在做Tree Bagging時,我們使用ntree來調整隨機森林中樹的大小,由此來告訴R做到多少就不要再往下長了。

+ Overfitting : Curse of Complexity

  當模型做得太複雜會導致過度適配Trianing的模型,但這樣會遠離Testing的模型,且我們真正想推估的是未來可能發生的狀況,若一心追求模型的準確度,將達不到我們真正的目的。

+ Method Parameter : Cost of Compexity 

  那要如何對抗因過度適配產生的複雜度成本,就要靠方法參數(alpha,lamda,...),而非直接選取變數(x)來選擇模型。

+ Parameter Tuning by CV (`caret` package)

  所謂方法參數,就是用cp來控制,但我們不知道到底哪個cp值最好,這時就是不斷地用caret來做cross validation,值到做出我們滿意的模型。

+ The power of machine vs. The value of humanity

  因為需要不斷重複嘗試不同的參數值,就體現了機器計算高效的重要性;但是最終在作決策時,還是要靠人們對議題的敏感性及背景知識才能做判斷,機器終究只是幫忙建模型,而模型建完才是真正的開始。

- - -

### 2 D2HawkEye

#### 2.1 Prepare & Examine Data 
```{r}
rm(list=ls(all=TRUE))
D = read.csv("data/ClaimsData.csv")

# Percentage of patients in each cost bucket
table(D$bucket2009)/nrow(D)
```

```{r}
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(D$bucket2009, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
```

```{r}
# Check Percentages Among Subsets
library(dplyr)
table(TR$bucket2009) %>% prop.table
table(TS$bucket2009) %>% prop.table
table(D$bucket2009) %>% prop.table

# sapply(): Apply Function & Aggregate Output
sapply(list(D, TR, TS), function(x) 
  table(x$bucket2009) %>% prop.table) %>% t  #x隨著前面的變數而變(D,TR,TS) #sapply不只是做出來 還會彙整成表
```

```{r}
# Data Exploration 
mean(TR$age)                 # 72.64
mean(TR$diabetes)            # .3809 
```

#### 2.2 Baseline Model 
```{r}
# Baseline Accuracy
acc.base = table(TS$bucket2009, TS$bucket2008) %>% 
  {sum(diag(.)) / sum(.)}    # .6838
```

```{r}
# Penalty Matrix
Penalty = matrix(c(
  0,1,2,3,4,
  2,0,1,2,3,
  4,2,0,1,2,
  6,4,2,0,1,
  8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty
```

```{r}
# Penalty Error of Baseline Method
cm = as.matrix(table(TS$bucket2009, TS$bucket2008))
pen.base = sum(cm*Penalty) / nrow(TS)    # pen: .7386
#因為有5*5的matrix(五個類別),不能做AUC(僅兩個類別)
#把全部的cm*payoff加總並平均,得出每個平均的penalty
```

```{r}
# if we use the most popular class as baseline
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1]                 # acc: .6713 
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS)  # pen: 1.044
```

#### CART Model
```{r}
library(rpart)
library(rpart.plot)
rpart1 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)], 
               method="class", cp=0.00005)
prp(rpart1)
```

```{r}
# Make predictions
pred = predict(rpart1, newdata = TS, type = "class")
acc.rpart1 = table(TS$bucket2009, pred) %>% 
  { sum(diag(.)) /nrow(TS)}            # acc: .71267 <- .68381

# Penalty Error
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart1 = sum(cm*Penalty)/nrow(TS)  # pen: .75789 <- .7386
```

#### CART Model with Customized Loss Matrix
```{r}
rpart2 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)], 
               method="class", cp=0.00005, 
               parms=list(loss=Penalty))

pred = predict(rpart2, TS, type = "class")
acc.rpart2 = table(TS$bucket2009, pred) %>% 
  {sum(diag(.))/sum(.)}                  # acc.rpart2: .64727   

cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart2 = sum(cm*Penalty)/nrow(TS)    # pen.rpart2: .64182 
```

```{r}
# Summary
data.frame(
  accuracy = c(acc.dumb, acc.base, acc.rpart1, acc.rpart2),
  penalty = c(pen.dumb, pen.base, pen.rpart1, pen.rpart2),
  row.names = c("dumb","base","CART","CART w/LossMx"))
```

#### 【討論】

+ 5 classes: 5X5 confussion/penalty(payoff) matrices 
```{r}
cm
Penalty
cm*Penalty #最壞的決策是高估(3,1)=14360。
```

+ dumb vs. smart baseline

  dumb baseline是將Y都猜1,準確率也不會與一般的baseline差太多,方便快速,然而penalty會大幅提升(acc=0.6731,penalty=1.044);而smart baseline是一種進階版的baseline,也就是想辦法拉高底線,此例是用2008年的類別直接預測2009年的類別(acc=0.6838,penalty=0.7386),相較之下,acc差不多,但penalty明顯下降,故應選smart baseline。

+ model1: accuracy up, penalty up

  全部都猜1的模型,完全沒有在區分類別,即使正確性很高,沒有辨識率,就不是一個好的模型。

+ model2: accuracy down, penalty down

  第二種模型是以今年的資料去預測明年,至少還有在分辨不同類別,因此penalty下降了。

+ accuracy != profitability

  綜合上述,我們比較在意降低penalty而非增加acc。

+ "customized" model optimize criteria in classification methods (not always available)

  客製化模型在做模型的當下,就已告訴決策樹預設好的penalty(parms=list(loss=Penalty)),給模型做一個加權數,模型預測只要沒猜中,類別就算錯,沒有猜多猜少(高估低估)的差別,最終做出來的樹不是acc最高的樹,而是penalty最低的樹(acc=0.6472,penalty=0.6418)。但不是每種模型都可以透過matrix加入penalty做這種最佳化。

+ This CV/PT process is over simplified

  這例題中並沒有做到cross validation,簡化了他如何選擇cp的過程。

+ we will cover a better process next time
  

- - -

### 3 Boston House Price

```{r}
rm(list=ls(all=TRUE))
D = read.csv("data/boston.csv")
```

#### 3.1 Examine Data
```{r}
# Plot observations
plot(D$LON, D$LAT)

# Tracts alongside the Charles River
with(D, 
     points(LON[CHAS==1], LAT[CHAS==1], col="blue", pch=19))

# Plot MIT
with(D, 
     points(LON[TRACT==3531], LAT[TRACT==3531], col="red", pch=19) )

# Plot polution
summary(D$NOX)
points(D$LON[D$NOX>=0.55], D$LAT[D$NOX>=0.55], 
       col="green", pch=20)

# Plot prices
plot(D$LON, D$LAT)
summary(D$MEDV)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], 
       col="red", pch=20)
```

#### Linear Regression using LAT and LON
```{r}
latlonlm = lm(MEDV ~ LAT + LON, data=D)
summary(latlonlm)
```

#### Visualize regression output
```{r}
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], 
       col="red", pch=20)

# latlonlm$fitted.values
points(D$LON[latlonlm$fitted.values >= 21.2], 
       D$LAT[latlonlm$fitted.values >= 21.2], 
       col="blue", pch="x")
```

# Load CART packages
```{r}
library(rpart)
library(rpart.plot)
# CART model
latlontree = rpart(MEDV ~ LAT + LON, data=D)
prp(latlontree)

# Visualize output
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], 
       col="red", pch=20)

fittedvalues = predict(latlontree)
points(D$LON[fittedvalues>21.2], 
       D$LAT[fittedvalues>=21.2], col="blue", pch="x")
```

#### Simplify tree by increasing minbucket
```{r}
latlontree = rpart(MEDV ~ LAT + LON, data=D, minbucket=50)
rpart.plot(latlontree)
```

```{r}
# Visualize Output
plot(D$LON,D$LAT)
abline(v=-71.07)
abline(h=42.21)
abline(h=42.17)
points(D$LON[D$MEDV>=21.2], 
       D$LAT[D$MEDV>=21.2], col="red", pch=20)
```

#### Use all the variables
```{r}
# Split the data
library(caTools)
set.seed(123)
split = sample.split(D$MEDV, SplitRatio = 0.7)
train = subset(D, split==TRUE)
test = subset(D, split==FALSE)

# Create linear regression
linreg = lm(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
              AGE + DIS + RAD + TAX + PTRATIO, data=train)
summary(linreg)

# Make predictions
linreg.pred = predict(linreg, newdata=test)
linreg.sse = sum((linreg.pred - test$MEDV)^2)
linreg.sse
```

#### Create a CART model
```{r}
tree = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
               AGE + DIS + RAD + TAX + PTRATIO, data=train)
prp(tree)

# Make predictions
tree.pred = predict(tree, newdata=test)
tree.sse = sum((tree.pred - test$MEDV)^2)
tree.sse
```

#### Load libraries for cross-validation
```{r}
library(caret)

# Number of folds
tr.control = trainControl(method = "cv", number = 10)

# cp values
cp.grid = expand.grid( .cp = (0:10)*0.001)

# Cross-validation
tr = train(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
             AGE + DIS + RAD + TAX + PTRATIO, 
           data = train, method = "rpart", 
           trControl = tr.control, tuneGrid = cp.grid)
tr

# Extract tree
best.tree = tr$finalModel
prp(best.tree)

# Make predictions
best.tree.pred = predict(best.tree, newdata=test)
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
```

#### Ensembling
```{r}
en.pred = (best.tree.pred + linreg.pred)/2
sum((en.pred - test$MEDV)^2)
```


```{r}
#       METHOD	 SSE
#           lm	3037
#         rpart	4329
#      rpart.cv	3660
#   lm+rpart.cv	2599  
```


#### 【討論】

+ drawing map with spatial data 

+ tree in spatial data

  分析在空間資料的決策樹,經緯度會是一個很重要的變數。

+ tree vs. linear regression

  最後結果得出,決策樹的預測比線性回歸模型的預測還要差,因此我們知道,並非所有議題都適合用決策樹來分,有時候回歸模型的表現會更好(SSE較小)。

+ the concept of ensembling

  Model ensembling是將兩種模型合併起來做平均,其的預測能力將能打敗其他單一種預測方式的模型。

- - -

<br><br><br><br><br>

<style>
.caption {
  color: #777;
  margin-top: 10px;
}
p code {
  white-space: inherit;
}
pre {
  word-break: normal;
  word-wrap: normal;
  line-height: 1;
}
pre code {
  white-space: inherit;
}
p,li {
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

.r{
  line-height: 1.2;
}

title{
  color: #cc0000;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

body{
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h1,h2,h3,h4,h5{
  color: #0066ff;
  font-family: "Trebuchet MS", "微軟正黑體", "Microsoft JhengHei";
}

h4,h5{
  background: #ccffff;
}

</style>






