packages = c(
  "dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly", "xgboost",
  "magrittr","caTools","ROCR","corrplot", "rpart", "rpart.plot",
  "doParallel", "caret", "glmnet", "Matrix", "e1071", "randomForest"
  )
existing = as.character(installed.packages()[,1])                # packages already present
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)   # install the missing ones
rm(list=ls(all=T))              # clear the workspace
options(digits=5, scipen=12)    # print 5 significant digits, avoid scientific notation
library(dplyr)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)

1. Supreme Court Decision

1.1 Prepare Data

D = read.csv("data/stevens.csv")
library(caTools)
set.seed(3000)
spl = sample.split(D$Reverse, SplitRatio = 0.7)
TR = subset(D, spl)
TS = subset(D, !spl)
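
sample.split() makes a stratified split on the outcome, so the training and test sets should keep roughly the same proportion of reversed cases. A quick sanity check (a sketch):

prop.table(table(TR$Reverse))   # class balance in the training set
prop.table(table(TS$Reverse))   # should be nearly identical in the test set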

1.2 CART (Classification & Regression Tree) - rpart::rpart()

library(rpart)
rpart1 = rpart(Reverse ~ ., TR[,3:9],          # cols 3:9 only, so "Reverse ~ ." uses just the predictors
               method="class", minbucket=25)

1.3 Plot Decision Tree

library(rpart.plot)
prp(rpart1)

rpart.plot(rpart1,cex=0.6)

1.4 Make Prediction

pred = predict(rpart1, TS, type = "class")  # predict classes
x = table(actual = TS$Reverse, pred); x
      pred
actual  0  1
     0 41 36
     1 22 71
sum(diag(x))/sum(x)                         # .65882
[1] 0.65882
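
For context, the 0.65882 above should be judged against a baseline that always predicts the more frequent outcome; from the row totals of the table above (77 vs. 93), a sketch:

max(table(TS$Reverse)) / nrow(TS)   # majority-class baseline: 93/170 = 0.547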

1.5 ROC & AUC

library(ROCR)
PredictROC = predict(rpart1, TS)              # predict prob.
head(PredictROC)
         0       1
1  0.30357 0.69643
3  0.30357 0.69643
4  0.40000 0.60000
6  0.40000 0.60000
8  0.40000 0.60000
21 0.30357 0.69643
rocr.pred = prediction(PredictROC[,2], TS$Reverse)   # ROCR prediction object
perf = performance(rocr.pred, "tpr", "fpr")          # TPR vs. FPR, i.e. the ROC curve
plot(perf)
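
ROCR can also report the AUC directly from the same prediction object, as a cross-check against the colAUC() call below. A sketch:

performance(rocr.pred, "auc")@y.values[[1]]   # area under the ROC curve (~0.6927)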

pred = predict(rpart1, TS)[,2]              # prob. of Reverse = 1
colAUC(pred, TS$Reverse, plotROC=TRUE)      # AUC = 0.6927
           [,1]
0 vs. 1 0.69271

1.6 DPP of Decision Tree

rpart.plot(rpart1,cex=0.75,varlen=3,faclen=2,box.palette="GnRd")

source("DPP.R")
auc = DPP(predict(rpart1)[,2], TR$Reverse, 1)     # AUC = 0.7884

【QUIZ-1】Compare the plots produced by prp(), rpart.plot(), and DPP() above.

  • What is the relationship between the shape of a decision tree and the distribution of the predicted probabilities it produces? In the DPP plot above, what features of the decision tree do the number, positions, and heights of the distribution points correspond to?


1.7 The effect of minbucket

t5 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=5)
prp(t5)

t100 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=100)
prp(t100)
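
Counting leaves makes the contrast concrete: each leaf must hold at least minbucket training observations, so a larger value forces a smaller tree. A sketch:

sum(t5$frame$var == "<leaf>")     # number of leaves with minbucket = 5
sum(t100$frame$var == "<leaf>")   # number of leaves with minbucket = 100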

【QUIZ-2】Based on the 2 plots above, what is the effect of minbucket?


1.8 Random Forest Method - randomForest::randomForest()

library(randomForest)
TR$Reverse = as.factor(TR$Reverse)  # Convert outcome to factor
TS$Reverse = as.factor(TS$Reverse)
# Build model
rf1 = randomForest(Reverse ~ ., TR[,3:9], ntree=200, nodesize=25)
# Make predictions
pred = predict(rf1, TS)
x = table(TS$Reverse, pred)
sum(diag(x)) / sum(x)          # 0.6824
[1] 0.68235
pred = predict(rf1, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.68824
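
A random forest has no single tree to draw, but it does report how much each predictor contributes. A sketch using randomForest's importance measures:

importance(rf1)    # mean decrease in Gini per predictor
varImpPlot(rf1)    # the same information as a dot chart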

1.9 Cross Validation & Parameter Tuning - caret package

library(caret)
numFolds = trainControl(method="cv", number=10)    # 10-fold CV
cpGrid = expand.grid(cp = seq(0.01, 0.5, 0.01))    # grid of candidate cp values
cv1 = train(
  Reverse ~ ., TR[,3:9], method = "rpart",
  trControl = numFolds,
  tuneGrid = cpGrid
  )
cv1; plot(cv1)
CART 

396 samples
  6 predictor
  2 classes: '0', '1' 

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 357, 356, 356, 356, 357, 356, ... 
Resampling results across tuning parameters:

  cp    Accuracy  Kappa   
  0.01  0.61359   0.207035
  0.02  0.61365   0.206251
  0.03  0.62385   0.231274
  0.04  0.63141   0.256349
  0.05  0.64410   0.284019
  0.06  0.64410   0.284019
  0.07  0.64410   0.284019
  0.08  0.64410   0.284019
  0.09  0.64410   0.284019
  0.10  0.64410   0.284019
  0.11  0.64410   0.284019
  0.12  0.64410   0.284019
  0.13  0.64410   0.284019
  0.14  0.64410   0.284019
  0.15  0.64410   0.284019
  0.16  0.64410   0.284019
  0.17  0.64410   0.284019
  0.18  0.64410   0.284019
  0.19  0.62103   0.230632
  0.20  0.58301   0.137651
  0.21  0.58301   0.137651
  0.22  0.55301   0.048387
  0.23  0.54788   0.032258
  0.24  0.54538   0.017685
  0.25  0.54538   0.000000
  0.26  0.54538   0.000000
  0.27  0.54538   0.000000
  0.28  0.54538   0.000000
  0.29  0.54538   0.000000
  0.30  0.54538   0.000000
  0.31  0.54538   0.000000
  0.32  0.54538   0.000000
  0.33  0.54538   0.000000
  0.34  0.54538   0.000000
  0.35  0.54538   0.000000
  0.36  0.54538   0.000000
  0.37  0.54538   0.000000
  0.38  0.54538   0.000000
  0.39  0.54538   0.000000
  0.40  0.54538   0.000000
  0.41  0.54538   0.000000
  0.42  0.54538   0.000000
  0.43  0.54538   0.000000
  0.44  0.54538   0.000000
  0.45  0.54538   0.000000
  0.46  0.54538   0.000000
  0.47  0.54538   0.000000
  0.48  0.54538   0.000000
  0.49  0.54538   0.000000
  0.50  0.54538   0.000000

Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.18.
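
The selected parameter and the tree refit on all of TR at that cp can be pulled straight out of the train object. A sketch:

cv1$bestTune          # cp chosen by 10-fold CV (0.18 above)
prp(cv1$finalModel)   # the corresponding pruned tree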

【QUIZ-3】Which cp should we choose? Why?


1.10 The Final Model

rpart2 = rpart(Reverse ~ ., TR[,3:9], method="class", cp=0.19)
pred = predict(rpart2, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.)) / sum(.)}  # 0.7235
[1] 0.72353
rpart.plot(rpart2)
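
Since pred already holds the predicted probabilities of Reverse = 1, the final model's AUC follows from the same colAUC() call as in section 1.5. A sketch:

colAUC(pred, TS$Reverse)   # AUC of the final, pruned tree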


【Discussion】

  • Classification & Regression Tree - rpart
  • A Bag of Trees - Random Forest - randomForest
  • Overfitting: the Curse of Complexity
  • Model Parameter: the Cost of Complexity (cp)
  • Parameter Tuning by CV (caret package)



2 D2HawkEye

2.1 Prepare & Examine Data

rm(list=ls(all=TRUE))
D = read.csv("data/ClaimsData.csv")
# Percentage of patients in each cost bucket
table(D$bucket2009)/nrow(D)

        1         2         3         4         5 
0.6712678 0.1901704 0.0894663 0.0433249 0.0057707 
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(D$bucket2009, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
# sapply(): Apply Function & Aggregate Output
sapply(list(D, TR, TS), function(x) 
  table(x$bucket2009) %>% prop.table) %>% t
           1       2        3        4         5
[1,] 0.67127 0.19017 0.089466 0.043325 0.0057707
[2,] 0.67127 0.19017 0.089468 0.043326 0.0057714
[3,] 0.67127 0.19017 0.089464 0.043324 0.0057696
# Data Exploration 
mean(TR$age)                 # 72.64
[1] 72.638
mean(TR$diabetes)            # .3809 
[1] 0.3809

2.2 Smart Baseline

# Baseline Accuracy
acc.base = table(TS$bucket2009, TS$bucket2008) %>% {sum(diag(.)) / sum(.)}
acc.base   # acc.base = .6838
[1] 0.68381
# Penalty Matrix
Penalty = matrix(c(
  0,1,2,3,4,
  2,0,1,2,3,
  4,2,0,1,2,
  6,4,2,0,1,
  8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty
     [,1] [,2] [,3] [,4] [,5]
[1,]    0    1    2    3    4
[2,]    2    0    1    2    3
[3,]    4    2    0    1    2
[4,]    6    4    2    0    1
[5,]    8    6    4    2    0
# Penalty Error of Baseline Method
cm = as.matrix(table(TS$bucket2009, TS$bucket2008))
pen.base = sum(cm*Penalty) / nrow(TS)
pen.base  # pen.base: .7386
[1] 0.73861
# Dumb baseline - Using the most frequent class
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1]                 
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS)  
c(acc.dumb, pen.dumb)   # 0.6713 1.0443
      1         
0.67127 1.04430 
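
The same penalty computation recurs for every model below, so it can be wrapped once. A small helper (a sketch; padding both factors with levels 1:5 keeps the matrix 5x5 even if a model never predicts some bucket):

avg.penalty = function(actual, predicted) {
  cm = table(factor(actual, levels=1:5), factor(predicted, levels=1:5))
  sum(cm * Penalty) / length(actual)
}
avg.penalty(TS$bucket2009, TS$bucket2008)   # reproduces pen.base = 0.73861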

2.3 CART Model

library(rpart)
library(rpart.plot)
rpart1 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)], 
               method="class", cp=0.00005)
prp(rpart1)

# Make predictions
pred = predict(rpart1, newdata = TS, type = "class")
acc.rpart1 = table(TS$bucket2009, pred) %>% { sum(diag(.)) /nrow(TS)}
acc.rpart1    # acc.rpart1 = .71267 <- .68381
[1] 0.71267
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart1 = sum(cm*Penalty)/nrow(TS)
pen.rpart1    # pen.rpart1: .75789 <- .7386
[1] 0.75789

2.4 CART Model with Customized Loss Matrix

rpart2 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)], 
               method="class", cp=0.00005, 
               parms=list(loss=Penalty)     # customized loss function 
               )
pred = predict(rpart2, TS, type = "class")
acc.rpart2 = table(TS$bucket2009, pred) %>% {sum(diag(.))/sum(.)}
acc.rpart2    # acc.rpart2 = .64727   
[1] 0.64727
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart2 = sum(cm*Penalty)/nrow(TS)    
pen.rpart2    # pen.rpart2: .64182 
[1] 0.64182
# Summary
data.frame(
  accuracy = c(acc.dumb, acc.base, acc.rpart1, acc.rpart2),
  penalty = c(pen.dumb, pen.base, pen.rpart1, pen.rpart2),
  row.names = c("dumb","base","CART","CART w/LossMx"))
              accuracy penalty
dumb           0.67127 1.04430
base           0.68381 0.73861
CART           0.71267 0.75789
CART w/LossMx  0.64727 0.64182


【Discussion】

  • 5 classes: 5x5 confusion / penalty (payoff) matrices
  • dumb vs. smart baseline
  • model 1: accuracy up, penalty up
  • model 2: accuracy down, penalty down
  • accuracy != profitability
  • "customized" optimization criteria in classification methods (not always available)
  • this CV/PT process is oversimplified
  • we will cover a better process next time



3 Boston House Price

rm(list=ls(all=TRUE))
D = read.csv("data/boston.csv")

3.1 Examine Data

# Plot observations
plot(D$LON, D$LAT)
# Tracts alongside the Charles River
with(D, points(LON[CHAS==1], LAT[CHAS==1], col="blue", pch=19))
# Plot MIT
with(D, points(LON[TRACT==3531], LAT[TRACT==3531], col="red", pch=19) )
# Plot pollution (NOX)
summary(D$NOX)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.385   0.449   0.538   0.555   0.624   0.871 
points(D$LON[D$NOX>=0.55], D$LAT[D$NOX>=0.55], col="green", pch=20)

# Plot prices
plot(D$LON, D$LAT)
summary(D$MEDV)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    5.0    17.0    21.2    22.5    25.0    50.0 
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], col="red", pch=20)

3.2 Linear Regression using LAT and LON

latlonlm = lm(MEDV ~ LAT + LON, data=D)
summary(latlonlm)

Call:
lm(formula = MEDV ~ LAT + LON, data = D)

Residuals:
   Min     1Q Median     3Q    Max 
-16.46  -5.59  -1.30   3.69  28.13 

Coefficients:
            Estimate Std. Error t value          Pr(>|t|)    
(Intercept) -3178.47     484.94   -6.55 0.000000000138619 ***
LAT             8.05       6.33    1.27               0.2    
LON           -40.27       5.18   -7.77 0.000000000000045 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.69 on 503 degrees of freedom
Multiple R-squared:  0.107, Adjusted R-squared:  0.104 
F-statistic: 30.2 on 2 and 503 DF,  p-value: 0.000000000000416

3.3 Visualize regression output

plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], col="red", pch=20)
# latlonlm$fitted.values
points(D$LON[latlonlm$fitted.values >= 21.2], 
       D$LAT[latlonlm$fitted.values >= 21.2], 
       col="blue", pch="x")

library(rpart)
library(rpart.plot)
# CART model
latlontree = rpart(MEDV ~ LAT + LON, data=D)
prp(latlontree)

# Visualize output
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], 
       col="red", pch=20)
fittedvalues = predict(latlontree)
points(D$LON[fittedvalues>=21.2], 
       D$LAT[fittedvalues>=21.2], col="blue", pch="x")

3.4 Simplify tree by increasing minbucket

latlontree = rpart(MEDV ~ LAT + LON, data=D, minbucket=50)
rpart.plot(latlontree)

# Visualize Output
plot(D$LON, D$LAT)
abline(v=-71.07)   # the three lines mark the split points of the tree above
abline(h=42.21)
abline(h=42.17)
points(D$LON[D$MEDV>=21.2], 
       D$LAT[D$MEDV>=21.2], col="red", pch=20)
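
Those abline() thresholds are read off the fitted tree; printing the rpart object lists each split condition, which is where the values come from. A sketch:

print(latlontree)   # node labels show the LON/LAT split points drawn above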

3.5 Use all the variables

# Split the data
library(caTools)
set.seed(123)
split = sample.split(D$MEDV, SplitRatio = 0.7)
train = subset(D, split==TRUE)
test = subset(D, split==FALSE)
# Create linear regression
linreg = lm(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
              AGE + DIS + RAD + TAX + PTRATIO, data=train)
summary(linreg)

Call:
lm(formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + 
    RM + AGE + DIS + RAD + TAX + PTRATIO, data = train)

Residuals:
   Min     1Q Median     3Q    Max 
-14.51  -2.71  -0.68   1.79  36.88 

Coefficients:
              Estimate Std. Error t value    Pr(>|t|)    
(Intercept) -252.29051  436.71008   -0.58       0.564    
LAT            1.54396    5.19196    0.30       0.766    
LON           -2.98711    4.78583   -0.62       0.533    
CRIM          -0.18080    0.04391   -4.12 0.000047725 ***
ZN             0.03250    0.01877    1.73       0.084 .  
INDUS         -0.04297    0.08473   -0.51       0.612    
CHAS           2.90418    1.22005    2.38       0.018 *  
NOX          -21.61279    5.41371   -3.99 0.000079778 ***
RM             6.28440    0.48270   13.02     < 2e-16 ***
AGE           -0.04430    0.01785   -2.48       0.014 *  
DIS           -1.57736    0.28418   -5.55 0.000000056 ***
RAD            0.24509    0.09728    2.52       0.012 *  
TAX           -0.01112    0.00545   -2.04       0.042 *  
PTRATIO       -0.98349    0.19389   -5.07 0.000000638 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.6 on 350 degrees of freedom
Multiple R-squared:  0.665, Adjusted R-squared:  0.653 
F-statistic: 53.4 on 13 and 350 DF,  p-value: <2e-16
# Make predictions
linreg.pred = predict(linreg, newdata=test)
linreg.sse = sum((linreg.pred - test$MEDV)^2)
linreg.sse
[1] 3037.1
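
SSE grows with the number of test cases; dividing it out gives an RMSE in the same units as MEDV (thousands of dollars), which is easier to interpret. A sketch:

sqrt(linreg.sse / nrow(test))   # test-set RMSE of the linear model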

3.6 Create a CART model

tree = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
               AGE + DIS + RAD + TAX + PTRATIO, data=train)
prp(tree)

# Make predictions
tree.pred = predict(tree, newdata=test)
tree.sse = sum((tree.pred - test$MEDV)^2)
tree.sse
[1] 4329

3.7 Cross-Validation & Parameter Tuning - caret package

library(caret)
# Number of folds
tr.control = trainControl(method = "cv", number = 10)
# cp values
cp.grid = expand.grid(cp = (0:10)*0.001)
# Cross-validation
tr = train(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM + 
             AGE + DIS + RAD + TAX + PTRATIO, 
           data = train, method = "rpart", 
           trControl = tr.control, tuneGrid = cp.grid)
tr
CART 

364 samples
 13 predictor

No pre-processing
Resampling: Cross-Validated (10 fold) 
Summary of sample sizes: 327, 326, 328, 329, 328, 327, ... 
Resampling results across tuning parameters:

  cp     RMSE    Rsquared  MAE   
  0.000  4.7290  0.75979   3.0684
  0.001  4.7446  0.75837   3.1046
  0.002  4.7563  0.75688   3.0909
  0.003  4.8067  0.74908   3.1510
  0.004  4.8756  0.74179   3.2299
  0.005  4.9174  0.73682   3.2811
  0.006  4.9539  0.73155   3.3217
  0.007  4.8985  0.73604   3.2903
  0.008  4.9334  0.72974   3.3231
  0.009  4.9056  0.73195   3.3081
  0.010  4.9056  0.73195   3.3081

RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.
# Extract tree
best.tree = tr$finalModel
prp(best.tree)

# Make predictions
best.tree.pred = predict(best.tree, newdata=test)
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
[1] 3660.1

3.8 Ensembling

en.pred = (best.tree.pred + linreg.pred)/2
sum((en.pred - test$MEDV)^2)
[1] 2598.6
#        METHOD   SSE
#            lm  3037
#         rpart  4329
#      rpart.cv  3660
#   lm+rpart.cv  2599
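
The 50/50 average is the simplest possible ensemble; varying the weight shows how sensitive the gain is. A sketch (note that choosing the weight to minimize test-set SSE would itself overfit, so this is for illustration only):

w = seq(0, 1, 0.1)
sse = sapply(w, function(a) sum((a*linreg.pred + (1-a)*best.tree.pred - test$MEDV)^2))
cbind(w, sse)   # w = 1: linear model only; w = 0: CV-tuned tree only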


【Discussion】

  • drawing maps with spatial data
  • trees on spatial data
  • trees vs. linear regression
  • the concept of ensembling







