packages = c(
"dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly", "xgboost",
"magrittr","caTools","ROCR","corrplot", "rpart", "rpart.plot",
"doParallel", "caret", "glmnet", "Matrix", "e1071", "randomForest"
)
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Sys.setlocale("LC_ALL",'C') #清除國家偵測設定
[1] "C"
rm(list=ls(all=T))
options(digits=4, scipen=12) #控制小數點, scipen=12讓科學記號不那麼常出現
library(dplyr)
D = read.csv("data/stevens.csv")
library(caTools)
set.seed(3000)
spl = sample.split(D$Reverse, SplitRatio = 0.7)
TR = subset(D, spl)
TS = subset(D, !spl)
rpart::rpart()library(rpart)
rpart1 = rpart(Reverse ~ ., TR[,3:9], # simplify the formula
method="class", minbucket=25) # class是讓他結果輸出類別, prob則是機率, 都不選也是機率
library(rpart.plot)
prp(rpart1)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart.plot(rpart1,cex=0.6)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
pred = predict(rpart1, TS, type = "class") # predict classes
x = table(actual = TS$Reverse, pred); x
pred
actual 0 1
0 41 36
1 22 71
sum(diag(x))/sum(x) # .65882
[1] 0.6588
library(ROCR)
PredictROC = predict(rpart1, TS) # predict prob.
head(PredictROC)
0 1
1 0.3036 0.6964
3 0.3036 0.6964
4 0.4000 0.6000
6 0.4000 0.6000
8 0.4000 0.6000
21 0.3036 0.6964
perf = prediction(PredictROC[,2], TS$Reverse)
perf = performance(perf, "tpr", "fpr")
plot(perf)
pred = predict(rpart1, TS)[,2] # prob. of Reverse = 1
colAUC(pred, TS$Reverse, T) # AUC = 0.6927
[,1]
0 vs. 1 0.6927
source("DPP.R")
auc = DPP(pred, TS$Reverse, 1) # AUC = 0.6927
prp()、rpart.plot()和DPP()這幾張圖形DPP的分佈點數(0/1)是決策樹紅/綠之分別;DPP分佈位置代表決策樹的預測機率(第二個數字);DPP高度為決策樹中的各水桶佔總資料的比例(第三個數字)。
minbuckett5 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=5)
prp(t5)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
t100 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=100)
prp(t100)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
minbucket?randomForest::randomForest()library(randomForest)
TR$Reverse = as.factor(TR$Reverse) # Convert outcome to factor
TS$Reverse = as.factor(TS$Reverse)
# Build model
rf1 = randomForest(Reverse ~ ., TR[,3:9], ntree=200, nodesize=25)
# Make predictions
pred = predict(rf1, TS)
x = table(TS$Reverse, pred)
sum(diag(x)) / sum(x) # 0.6824 #diag是對角線
[1] 0.6824
pred = predict(rf1, TS, type='prob')[,2] #Y=的機率
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.6882
caret packagelibrary(caret)
numFolds = trainControl(method="cv", number=10) # 10 fold CV
cpGrid = expand.grid(cp = seq(0.01,0.5,0.01)) # parameter combination
cv1 = train(Reverse ~ ., TR[,3:9], method = "rpart",
trControl=numFolds, tuneGrid=cpGrid)
cv1; plot(cv1)
CART
396 samples
6 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 357, 356, 356, 356, 357, 356, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01 0.6136 0.20703
0.02 0.6137 0.20625
0.03 0.6238 0.23127
0.04 0.6314 0.25635
0.05 0.6441 0.28402
0.06 0.6441 0.28402
0.07 0.6441 0.28402
0.08 0.6441 0.28402
0.09 0.6441 0.28402
0.10 0.6441 0.28402
0.11 0.6441 0.28402
0.12 0.6441 0.28402
0.13 0.6441 0.28402
0.14 0.6441 0.28402
0.15 0.6441 0.28402
0.16 0.6441 0.28402
0.17 0.6441 0.28402
0.18 0.6441 0.28402
0.19 0.6210 0.23063
0.20 0.5830 0.13765
0.21 0.5830 0.13765
0.22 0.5530 0.04839
0.23 0.5479 0.03226
0.24 0.5454 0.01768
0.25 0.5454 0.00000
0.26 0.5454 0.00000
0.27 0.5454 0.00000
0.28 0.5454 0.00000
0.29 0.5454 0.00000
0.30 0.5454 0.00000
0.31 0.5454 0.00000
0.32 0.5454 0.00000
0.33 0.5454 0.00000
0.34 0.5454 0.00000
0.35 0.5454 0.00000
0.36 0.5454 0.00000
0.37 0.5454 0.00000
0.38 0.5454 0.00000
0.39 0.5454 0.00000
0.40 0.5454 0.00000
0.41 0.5454 0.00000
0.42 0.5454 0.00000
0.43 0.5454 0.00000
0.44 0.5454 0.00000
0.45 0.5454 0.00000
0.46 0.5454 0.00000
0.47 0.5454 0.00000
0.48 0.5454 0.00000
0.49 0.5454 0.00000
0.50 0.5454 0.00000
Accuracy was used to select the optimal model using
the largest value.
The final value used for the model was cp = 0.18.
cp which we should we choose? Why?rpart2 = rpart(Reverse ~ ., TR[,3:9], method="class", cp=0.19)
pred = predict(rpart2, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.)) / sum(.)} # 0.7235
[1] 0.7235
rpart.plot(rpart2)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart決策樹可以做到類別與連續變數的分析。當要用的是類別時,方法則要選擇“class”;而當我們需要連續變化的機率,就使用“prob”。
randomForest在做Tree Bagging時,我們使用ntree來調整隨機森林中樹的大小,由此來告訴R做到多少就不要再往下長了。
當模型做得太複雜會導致過度適配Trianing的模型,但這樣會遠離Testing的模型,且我們真正想推估的是未來可能發生的狀況,若一心追求模型的準確度,將達不到我們真正的目的。
那要如何對抗因過度適配產生的複雜度成本,就要靠方法參數(alpha,lamda,…),而非直接選取變數(x)來選擇模型。
caret package)所謂方法參數,就是用cp來控制,但我們不知道到底哪個cp值最好,這時就是不斷地用caret來做cross validation,值到做出我們滿意的模型。
因為需要不斷重複嘗試不同的參數值,就體現了機器計算高效的重要性;但是最終在作決策時,還是要靠人們對議題的敏感性及背景知識才能做判斷,機器終究只是幫忙建模型,而模型建完才是真正的開始。
rm(list=ls(all=TRUE))
D = read.csv("data/ClaimsData.csv")
# Percentage of patients in each cost bucket
table(D$bucket2009)/nrow(D)
1 2 3 4 5
0.671268 0.190170 0.089466 0.043325 0.005771
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(D$bucket2009, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
# Check Percentages Among Subsets
library(dplyr)
table(TR$bucket2009) %>% prop.table
1 2 3 4 5
0.671266 0.190169 0.089468 0.043326 0.005771
table(TS$bucket2009) %>% prop.table
1 2 3 4 5
0.67127 0.19017 0.08946 0.04332 0.00577
table(D$bucket2009) %>% prop.table
1 2 3 4 5
0.671268 0.190170 0.089466 0.043325 0.005771
# sapply(): Apply Function & Aggregate Output
sapply(list(D, TR, TS), function(x)
table(x$bucket2009) %>% prop.table) %>% t #x隨著前面的變數而變(D,TR,TS) #sapply不只是做出來 還會彙整成表
1 2 3 4 5
[1,] 0.6713 0.1902 0.08947 0.04332 0.005771
[2,] 0.6713 0.1902 0.08947 0.04333 0.005771
[3,] 0.6713 0.1902 0.08946 0.04332 0.005770
# Data Exploration
mean(TR$age) # 72.64
[1] 72.64
mean(TR$diabetes) # .3809
[1] 0.3809
# Baseline Accuracy
acc.base = table(TS$bucket2009, TS$bucket2008) %>%
{sum(diag(.)) / sum(.)} # .6838
# Penalty Matrix
Penalty = matrix(c(
0,1,2,3,4,
2,0,1,2,3,
4,2,0,1,2,
6,4,2,0,1,
8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 2 3 4
[2,] 2 0 1 2 3
[3,] 4 2 0 1 2
[4,] 6 4 2 0 1
[5,] 8 6 4 2 0
# Penalty Error of Baseline Method
cm = as.matrix(table(TS$bucket2009, TS$bucket2008))
pen.base = sum(cm*Penalty) / nrow(TS) # pen: .7386
# if we use the most popular class as baseline
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1] # acc: .6713
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS) # pen: 1.044
library(rpart)
library(rpart.plot)
rpart1 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005)
prp(rpart1)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
# Make predictions
pred = predict(rpart1, newdata = TS, type = "class")
acc.rpart1 = table(TS$bucket2009, pred) %>%
{ sum(diag(.)) /nrow(TS)} # acc: .71267 <- .68381
# Penalty Error
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart1 = sum(cm*Penalty)/nrow(TS) # pen: .75789 <- .7386
rpart2 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005,
parms=list(loss=Penalty))
pred = predict(rpart2, TS, type = "class")
acc.rpart2 = table(TS$bucket2009, pred) %>%
{sum(diag(.))/sum(.)} # acc.rpart2: .64727
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart2 = sum(cm*Penalty)/nrow(TS) # pen.rpart2: .64182
# Summary
data.frame(
accuracy = c(acc.dumb, acc.base, acc.rpart1, acc.rpart2),
penalty = c(pen.dumb, pen.base, pen.rpart1, pen.rpart2),
row.names = c("dumb","base","CART","CART w/LossMx"))
accuracy penalty
dumb 0.6713 1.0443
base 0.6838 0.7386
CART 0.7127 0.7579
CART w/LossMx 0.6473 0.6418
cm
pred
1 2 3 4 5
1 94310 25295 3087 286 0
2 7176 18942 8079 643 0
3 3590 7706 4692 401 1
4 1304 3193 2803 636 1
5 135 356 408 156 2
Penalty
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 2 3 4
[2,] 2 0 1 2 3
[3,] 4 2 0 1 2
[4,] 6 4 2 0 1
[5,] 8 6 4 2 0
cm*Penalty
pred
1 2 3 4 5
1 0 25295 6174 858 0
2 14352 0 8079 1286 0
3 14360 15412 0 401 2
4 7824 12772 5606 0 1
5 1080 2136 1632 312 0
dumb baseline是將Y都猜1,準確率也不會與一般的baseline差太多,方便快速,然而penalty會大幅提升(acc=0.6731,penalty=1.044);而smart baseline是一種進階版的baseline,也就是想辦法拉高底線,此例是用2008年的類別直接預測2009年的類別(acc=0.6838,penalty=0.7386),相較之下,acc差不多,但penalty明顯下降,故應選smart baseline。
全部都猜1的模型,完全沒有在區分類別,即使正確性很高,沒有辨識率,就不是一個好的模型。
第二種模型是以今年的資料去預測明年,至少還有在分辨不同類別,因此penalty下降了。
綜合上述,我們比較在意降低penalty而非增加acc。
客製化模型在做模型的當下,就已告訴決策樹預設好的penalty(parms=list(loss=Penalty)),給模型做一個加權數,模型預測只要沒猜中,類別就算錯,沒有猜多猜少(高估低估)的差別,最終做出來的樹不是acc最高的樹,而是penalty最低的樹(acc=0.6472,penalty=0.6418)。但不是每種模型都可以透過matrix加入penalty做這種最佳化。
這例題中並沒有做到cross validation,簡化了他如何選擇cp的過程。
rm(list=ls(all=TRUE))
D = read.csv("data/boston.csv")
# Plot observations
plot(D$LON, D$LAT)
# Tracts alongside the Charles River
with(D,
points(LON[CHAS==1], LAT[CHAS==1], col="blue", pch=19))
# Plot MIT
with(D,
points(LON[TRACT==3531], LAT[TRACT==3531], col="red", pch=19) )
# Plot polution
summary(D$NOX)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.385 0.449 0.538 0.555 0.624 0.871
points(D$LON[D$NOX>=0.55], D$LAT[D$NOX>=0.55],
col="green", pch=20)
# Plot prices
plot(D$LON, D$LAT)
summary(D$MEDV)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 17.0 21.2 22.5 25.0 50.0
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
latlonlm = lm(MEDV ~ LAT + LON, data=D)
summary(latlonlm)
Call:
lm(formula = MEDV ~ LAT + LON, data = D)
Residuals:
Min 1Q Median 3Q Max
-16.46 -5.59 -1.30 3.69 28.13
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3178.47 484.94 -6.55 0.000000000138619
LAT 8.05 6.33 1.27 0.2
LON -40.27 5.18 -7.77 0.000000000000045
(Intercept) ***
LAT
LON ***
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.69 on 503 degrees of freedom
Multiple R-squared: 0.107, Adjusted R-squared: 0.104
F-statistic: 30.2 on 2 and 503 DF, p-value: 0.000000000000416
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
# latlonlm$fitted.values
points(D$LON[latlonlm$fitted.values >= 21.2],
D$LAT[latlonlm$fitted.values >= 21.2],
col="blue", pch="x")
library(rpart)
library(rpart.plot)
# CART model
latlontree = rpart(MEDV ~ LAT + LON, data=D)
prp(latlontree)
# Visualize output
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
fittedvalues = predict(latlontree)
points(D$LON[fittedvalues>21.2],
D$LAT[fittedvalues>=21.2], col="blue", pch="x")
latlontree = rpart(MEDV ~ LAT + LON, data=D, minbucket=50)
rpart.plot(latlontree)
# Visualize Output
plot(D$LON,D$LAT)
abline(v=-71.07)
abline(h=42.21)
abline(h=42.17)
points(D$LON[D$MEDV>=21.2],
D$LAT[D$MEDV>=21.2], col="red", pch=20)
# Split the data
library(caTools)
set.seed(123)
split = sample.split(D$MEDV, SplitRatio = 0.7)
train = subset(D, split==TRUE)
test = subset(D, split==FALSE)
# Create linear regression
linreg = lm(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
summary(linreg)
Call:
lm(formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX +
RM + AGE + DIS + RAD + TAX + PTRATIO, data = train)
Residuals:
Min 1Q Median 3Q Max
-14.51 -2.71 -0.68 1.79 36.88
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -252.29051 436.71008 -0.58 0.564
LAT 1.54396 5.19196 0.30 0.766
LON -2.98711 4.78583 -0.62 0.533
CRIM -0.18080 0.04391 -4.12 0.000047725 ***
ZN 0.03250 0.01877 1.73 0.084 .
INDUS -0.04297 0.08473 -0.51 0.612
CHAS 2.90418 1.22005 2.38 0.018 *
NOX -21.61279 5.41371 -3.99 0.000079778 ***
RM 6.28440 0.48270 13.02 < 2e-16 ***
AGE -0.04430 0.01785 -2.48 0.014 *
DIS -1.57736 0.28418 -5.55 0.000000056 ***
RAD 0.24509 0.09728 2.52 0.012 *
TAX -0.01112 0.00545 -2.04 0.042 *
PTRATIO -0.98349 0.19389 -5.07 0.000000638 ***
---
Signif. codes:
0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.6 on 350 degrees of freedom
Multiple R-squared: 0.665, Adjusted R-squared: 0.653
F-statistic: 53.4 on 13 and 350 DF, p-value: <2e-16
# Make predictions
linreg.pred = predict(linreg, newdata=test)
linreg.sse = sum((linreg.pred - test$MEDV)^2)
linreg.sse
[1] 3037
tree = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
prp(tree)
# Make predictions
tree.pred = predict(tree, newdata=test)
tree.sse = sum((tree.pred - test$MEDV)^2)
tree.sse
[1] 4329
library(caret)
# Number of folds
tr.control = trainControl(method = "cv", number = 10)
# cp values
cp.grid = expand.grid( .cp = (0:10)*0.001)
# Cross-validation
tr = train(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO,
data = train, method = "rpart",
trControl = tr.control, tuneGrid = cp.grid)
tr
CART
364 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 327, 326, 328, 329, 328, 327, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.000 4.729 0.7598 3.068
0.001 4.745 0.7584 3.105
0.002 4.756 0.7569 3.091
0.003 4.807 0.7491 3.151
0.004 4.876 0.7418 3.230
0.005 4.917 0.7368 3.281
0.006 4.954 0.7315 3.322
0.007 4.899 0.7360 3.290
0.008 4.933 0.7297 3.323
0.009 4.906 0.7320 3.308
0.010 4.906 0.7320 3.308
RMSE was used to select the optimal model using the
smallest value.
The final value used for the model was cp = 0.
# Extract tree
best.tree = tr$finalModel
prp(best.tree)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
# Make predictions
best.tree.pred = predict(best.tree, newdata=test)
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
[1] 3660
en.pred = (best.tree.pred + linreg.pred)/2
sum((en.pred - test$MEDV)^2)
[1] 2599
# METHOD SSE
# lm 3037
# rpart 4329
# rpart.cv 3660
# lm+rpart.cv 2599
drawing map with spatial data
tree in spatial data
分析在空間資料的決策樹,經緯度會是一個很重要的變數。
最後結果得出,決策樹的預測比線性回歸模型的預測還要差,因此我們知道,並非所有議題都適合用決策樹來分,有時候回歸模型的表現會更好(SSE較小)。
Model ensembling是將兩種模型合併起來做平均,其的預測能力將能打敗其他單一種預測方式的模型。