packages = c(
"dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly", "xgboost",
"magrittr","caTools","ROCR","corrplot", "rpart", "rpart.plot",
"doParallel", "caret", "glmnet", "Matrix", "e1071", "randomForest"
)
existing = as.character(installed.packages()[,1])   # packages already installed
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)  # install only the missing ones
rm(list=ls(all=TRUE))
options(digits=5, scipen=12)   # 5 significant digits; avoid scientific notation
library(dplyr)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
D = read.csv("data/stevens.csv")
set.seed(3000)
spl = sample.split(D$Reverse, SplitRatio = 0.7)
TR = subset(D, spl)
TS = subset(D, !spl)
# rpart::rpart()
library(rpart)
rpart1 = rpart(Reverse ~ ., TR[,3:9], # simplify the formula
method="class", minbucket=25)
library(rpart.plot)
prp(rpart1)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart.plot(rpart1, cex=0.6)
pred = predict(rpart1, TS, type = "class") # predict classes
x = table(actual = TS$Reverse, pred); x
      pred
actual  0  1
     0 41 36
     1 22 71
sum(diag(x))/sum(x) # .65882
[1] 0.65882
library(ROCR)
package ‘ROCR’ was built under R version 3.4.4
Loading required package: gplots
package ‘gplots’ was built under R version 3.4.4
Attaching package: ‘gplots’
The following object is masked from ‘package:stats’:
    lowess
PredictROC = predict(rpart1, TS) # predict prob.
head(PredictROC)
         0       1
1  0.30357 0.69643
3  0.30357 0.69643
4  0.40000 0.60000
6  0.40000 0.60000
8  0.40000 0.60000
21 0.30357 0.69643
predobj = prediction(PredictROC[,2], TS$Reverse)  # ROCR prediction object
perf = performance(predobj, "tpr", "fpr")         # TPR vs FPR for the ROC curve
plot(perf)
pred = predict(rpart1, TS)[,2] # prob. of Reverse = 1
colAUC(pred, TS$Reverse, plotROC=TRUE) # AUC = 0.6927
[,1]
0 vs. 1 0.69271
rpart.plot(rpart1, cex=0.75, varlen=3, faclen=2, box.palette="GnRd")
source("DPP.R")
auc = DPP(predict(rpart1)[,2], TR$Reverse, 1) # AUC = 0.7884
Q: How is the tree shape drawn by prp() and rpart.plot() related to the distribution of predicted probabilities shown by DPP()? What features of the decision tree do the number, position, and height of the distribution points in the DPP plot correspond to?
A: Each point in the DPP probability distribution corresponds to one terminal node of the tree: the x-axis is that node's predicted probability and the y-axis is the number of observations in it. Taking the last terminal node as an example, the bar's total length is the number of observations in the node; the red portion is the count correctly predicted as Y = 1 and the brown portion the count incorrectly predicted as Y = 1.
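This correspondence can be checked in code (a sketch; it uses the training-set probabilities, as in the DPP() call above): each distinct predicted probability marks one terminal node, and its frequency is that node's size.
p = predict(rpart1)[,2]  # training-set prob. of Reverse = 1
table(p)                 # distinct values = terminal-node probabilities (x-axis);
                         # counts = node sizes (y-axis); note that two nodes which
                         # happen to share a probability would merge into one count here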
# minbucket
t5 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=5)
prp(t5)
t100 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=100)
prp(t100)
What does minbucket control? It sets the minimum number of observations allowed in a terminal node: the larger minbucket is, the simpler the tree; the smaller, the more complex.
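One way to see this directly (a sketch using rpart's frame slot, in which terminal nodes are marked "<leaf>"; rpart1 above used minbucket=25):
sapply(list(t5=t5, t25=rpart1, t100=t100), function(m)
  sum(m$frame$var == "<leaf>"))  # leaf count shrinks as minbucket grows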
# randomForest::randomForest()
library(randomForest)
TR$Reverse = as.factor(TR$Reverse) # Convert outcome to factor
TS$Reverse = as.factor(TS$Reverse)
# Build model
rf1 = randomForest(Reverse ~ ., TR[,3:9], ntree=200, nodesize=25)
# Make predictions
pred = predict(rf1, TS)
x = table(TS$Reverse, pred)
sum(diag(x)) / sum(x) # 0.6824
[1] 0.68235
pred = predict(rf1, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.68824
# caret package
library(caret)
package ‘caret’ was built under R version 3.4.4
Loading required package: lattice
Loading required package: ggplot2
package ‘ggplot2’ was built under R version 3.4.3
Attaching package: ‘ggplot2’
The following object is masked from ‘package:randomForest’:
    margin
cv1 = train(
Reverse ~ ., TR[,3:9], method = "rpart",
trControl = trainControl(method="cv", number=10), # 10 fold CV
tuneGrid=expand.grid(cp = seq(0.01,0.5,0.01)) # parameter combination
)
cv1; plot(cv1)
CART
396 samples
6 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 356, 356, 356, 357, 356, 356, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01 0.61647 0.2107007
0.02 0.62647 0.2342834
0.03 0.62141 0.2312568
0.04 0.62897 0.2496793
0.05 0.63397 0.2605018
0.06 0.64423 0.2829554
0.07 0.64423 0.2829554
0.08 0.64423 0.2829554
0.09 0.64423 0.2829554
0.10 0.64423 0.2829554
0.11 0.64423 0.2829554
0.12 0.64423 0.2829554
0.13 0.64423 0.2829554
0.14 0.64423 0.2829554
0.15 0.64423 0.2829554
0.16 0.64423 0.2829554
0.17 0.64423 0.2829554
0.18 0.64423 0.2829554
0.19 0.64423 0.2829554
0.20 0.62372 0.2349554
0.21 0.55551 0.0537805
0.22 0.54551 0.0244875
0.23 0.54038 0.0051077
0.24 0.54038 0.0051077
0.25 0.54538 0.0000000
0.26 0.54538 0.0000000
0.27 0.54538 0.0000000
0.28 0.54538 0.0000000
0.29 0.54538 0.0000000
0.30 0.54538 0.0000000
0.31 0.54538 0.0000000
0.32 0.54538 0.0000000
0.33 0.54538 0.0000000
0.34 0.54538 0.0000000
0.35 0.54538 0.0000000
0.36 0.54538 0.0000000
0.37 0.54538 0.0000000
0.38 0.54538 0.0000000
0.39 0.54538 0.0000000
0.40 0.54538 0.0000000
0.41 0.54538 0.0000000
0.42 0.54538 0.0000000
0.43 0.54538 0.0000000
0.44 0.54538 0.0000000
0.45 0.54538 0.0000000
0.46 0.54538 0.0000000
0.47 0.54538 0.0000000
0.48 0.54538 0.0000000
0.49 0.54538 0.0000000
0.50 0.54538 0.0000000
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.19.
Which cp should we choose, and why? cp = 0.19: accuracy is identical for every cp from 0.06 through 0.19, and with accuracy tied, a larger cp gives a simpler (more heavily pruned) model, so we pick the largest of the tied values, 0.19.
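The same choice can be read off the train object programmatically (a sketch; cv1$results holds the resampling table and cv1$bestTune the value caret itself selected):
best.acc = max(cv1$results$Accuracy)
max(cv1$results$cp[cv1$results$Accuracy == best.acc])  # largest cp among the ties: 0.19
cv1$bestTune                                           # caret's own selection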
rpart2 = rpart(Reverse ~ ., TR[,3:9], method="class", cp=0.19)
pred = predict(rpart2, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.)) / sum(.)} # 0.7235
[1] 0.72353
rpart.plot(rpart2)
# Recap: three classification approaches so far -- rpart, randomForest, and caret (CV-tuned rpart)
rm(list=ls(all=TRUE))
D = read.csv("data/ClaimsData.csv")
# Percentage of patients in each cost bucket
table(D$bucket2009)/nrow(D)
1 2 3 4 5
0.6712678 0.1901704 0.0894663 0.0433249 0.0057707
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(D$bucket2009, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
# sapply(): Apply Function & Aggregate Output
sapply(list(D, TR, TS), function(x)
table(x$bucket2009) %>% prop.table) %>% t
1 2 3 4 5
[1,] 0.67127 0.19017 0.089466 0.043325 0.0057707
[2,] 0.67127 0.19017 0.089468 0.043326 0.0057714
[3,] 0.67127 0.19017 0.089464 0.043324 0.0057696
# Data Exploration
mean(TR$age) # 72.64
[1] 72.638
mean(TR$diabetes) # .3809
[1] 0.3809
# Baseline Accuracy
acc.base = table(TS$bucket2009, TS$bucket2008) %>% {sum(diag(.)) / sum(.)}
acc.base # acc.base = .6838
[1] 0.68381
# Penalty Matrix (rows = actual bucket, columns = predicted bucket)
Penalty = matrix(c(
  0,1,2,3,4,
  2,0,1,2,3,
  4,2,0,1,2,
  6,4,2,0,1,
  8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 2 3 4
[2,] 2 0 1 2 3
[3,] 4 2 0 1 2
[4,] 6 4 2 0 1
[5,] 8 6 4 2 0
# Penalty Error of Baseline Method
cm = as.matrix(table(TS$bucket2009, TS$bucket2008))
pen.base = sum(cm*Penalty) / nrow(TS)
pen.base # pen.base: .7386
[1] 0.73861
# Dumb baseline - Using the most frequent class
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1]
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS)
c(acc.dumb, pen.dumb) # 0.6713 1.0443
1
0.67127 1.04430
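Note that the hand-coded vector c(0,2,4,6,8) is just Penalty[,1], the cost of predicting bucket 1 for each actual bucket, so pen.dumb can also be computed straight from the penalty matrix (a quick equivalence check):
sum(table(TS$bucket2009) * Penalty[,1]) / nrow(TS)  # 1.0443, matches pen.dumb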
library(rpart)
library(rpart.plot)
rpart1 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005)
prp(rpart1)
# Make predictions
pred = predict(rpart1, newdata = TS, type = "class")
acc.rpart1 = table(TS$bucket2009, pred) %>% { sum(diag(.)) /nrow(TS)}
acc.rpart1 # acc.rpart1 = .71267 <- .68381
[1] 0.71267
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart1 = sum(cm*Penalty)/nrow(TS)
pen.rpart1 # pen.rpart1: .75789 <- .7386
[1] 0.75789
rpart2 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005,
parms=list(loss=Penalty) # customized loss function
)
pred = predict(rpart2, TS, type = "class")
acc.rpart2 = table(TS$bucket2009, pred) %>% {sum(diag(.))/sum(.)}
acc.rpart2 # acc.rpart2 = .64727
[1] 0.64727
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart2 = sum(cm*Penalty)/nrow(TS)
pen.rpart2 # pen.rpart2: .64182
[1] 0.64182
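The loss matrix makes under-predicting the expensive buckets costlier, so we would expect rpart2 to shift predictions toward buckets 2-5, trading a little raw accuracy for a lower penalty. A quick way to inspect that shift (a sketch comparing predicted-class counts):
rbind(no.loss = table(predict(rpart1, TS, type="class")),
      w.loss  = table(predict(rpart2, TS, type="class")))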
# Summary
data.frame(
accuracy = c(acc.dumb, acc.base, acc.rpart1, acc.rpart2),
penalty = c(pen.dumb, pen.base, pen.rpart1, pen.rpart2),
row.names = c("dumb","base","CART","CART w/LossMx"))
rm(list=ls(all=TRUE))
D = read.csv("data/boston.csv")
# Plot observations
plot(D$LON, D$LAT)
# Tracts alongside the Charles River
with(D, points(LON[CHAS==1], LAT[CHAS==1], col="blue", pch=19))
# Plot MIT
with(D, points(LON[TRACT==3531], LAT[TRACT==3531], col="red", pch=19) )
# Plot pollution
summary(D$NOX)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.385 0.449 0.538 0.555 0.624 0.871
points(D$LON[D$NOX>=0.55], D$LAT[D$NOX>=0.55], col="green", pch=20)
# Plot prices
plot(D$LON, D$LAT)
summary(D$MEDV)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 17.0 21.2 22.5 25.0 50.0
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], col="red", pch=20)
latlonlm = lm(MEDV ~ LAT + LON, data=D)
summary(latlonlm)
Call:
lm(formula = MEDV ~ LAT + LON, data = D)
Residuals:
Min 1Q Median 3Q Max
-16.46 -5.59 -1.30 3.69 28.13
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3178.47 484.94 -6.55 0.000000000138619 ***
LAT 8.05 6.33 1.27 0.2
LON -40.27 5.18 -7.77 0.000000000000045 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8.69 on 503 degrees of freedom
Multiple R-squared: 0.107, Adjusted R-squared: 0.104
F-statistic: 30.2 on 2 and 503 DF, p-value: 0.000000000000416
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], col="red", pch=20)
# latlonlm$fitted.values
points(D$LON[latlonlm$fitted.values >= 21.2],
D$LAT[latlonlm$fitted.values >= 21.2],
col="blue", pch="x")
library(rpart)
library(rpart.plot)
# CART model
latlontree = rpart(MEDV ~ LAT + LON, data=D)
prp(latlontree)
# Visualize output
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
fittedvalues = predict(latlontree)
points(D$LON[fittedvalues>=21.2],
       D$LAT[fittedvalues>=21.2], col="blue", pch="x")
latlontree = rpart(MEDV ~ LAT + LON, data=D, minbucket=50)
rpart.plot(latlontree)
# Visualize Output
plot(D$LON, D$LAT)
abline(v=-71.07)  # the tree's split values, drawn as reference lines
abline(h=42.21)
abline(h=42.17)
points(D$LON[D$MEDV>=21.2],
D$LAT[D$MEDV>=21.2], col="red", pch=20)
# Split the data
library(caTools)
set.seed(123)
split = sample.split(D$MEDV, SplitRatio = 0.7)
train = subset(D, split==TRUE)
test = subset(D, split==FALSE)
# Create linear regression
linreg = lm(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
summary(linreg)
Call:
lm(formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX +
RM + AGE + DIS + RAD + TAX + PTRATIO, data = train)
Residuals:
Min 1Q Median 3Q Max
-14.51 -2.71 -0.68 1.79 36.88
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -252.29051 436.71008 -0.58 0.564
LAT 1.54396 5.19196 0.30 0.766
LON -2.98711 4.78583 -0.62 0.533
CRIM -0.18080 0.04391 -4.12 0.000047725 ***
ZN 0.03250 0.01877 1.73 0.084 .
INDUS -0.04297 0.08473 -0.51 0.612
CHAS 2.90418 1.22005 2.38 0.018 *
NOX -21.61279 5.41371 -3.99 0.000079778 ***
RM 6.28440 0.48270 13.02 < 2e-16 ***
AGE -0.04430 0.01785 -2.48 0.014 *
DIS -1.57736 0.28418 -5.55 0.000000056 ***
RAD 0.24509 0.09728 2.52 0.012 *
TAX -0.01112 0.00545 -2.04 0.042 *
PTRATIO -0.98349 0.19389 -5.07 0.000000638 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.6 on 350 degrees of freedom
Multiple R-squared: 0.665, Adjusted R-squared: 0.653
F-statistic: 53.4 on 13 and 350 DF, p-value: <2e-16
# Make predictions
linreg.pred = predict(linreg, newdata=test)
linreg.sse = sum((linreg.pred - test$MEDV)^2)
linreg.sse
[1] 3037.1
tree = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
prp(tree)
# Make predictions
tree.pred = predict(tree, newdata=test)
tree.sse = sum((tree.pred - test$MEDV)^2)
tree.sse
[1] 4329
library(caret)
# Number of folds
tr.control = trainControl(method = "cv", number = 10)
# cp values
cp.grid = expand.grid( .cp = (0:10)*0.001)
# Cross-validation
tr = train(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO,
data = train, method = "rpart",
trControl = tr.control, tuneGrid = cp.grid)
tr
CART
364 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 327, 326, 328, 329, 328, 327, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.000 4.7290 0.75979 3.0684
0.001 4.7446 0.75837 3.1046
0.002 4.7563 0.75688 3.0909
0.003 4.8067 0.74908 3.1510
0.004 4.8756 0.74179 3.2299
0.005 4.9174 0.73682 3.2811
0.006 4.9539 0.73155 3.3217
0.007 4.8985 0.73604 3.2903
0.008 4.9334 0.72974 3.3231
0.009 4.9056 0.73195 3.3081
0.010 4.9056 0.73195 3.3081
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.
# Extract tree
best.tree = tr$finalModel
prp(best.tree)
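caret's finalModel drops the reference to the original data frame, which is what triggers the roundint warning seen earlier; if that matters, the selected tree can be rebuilt directly (a sketch; tr$bestTune$cp holds the chosen cp):
best.cp = tr$bestTune$cp  # cp = 0 here
best.tree2 = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
                   AGE + DIS + RAD + TAX + PTRATIO,
                   data=train, cp=best.cp, model=TRUE)  # model=TRUE keeps the data
prp(best.tree2)           # plots without the warning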
# Make predictions
best.tree.pred = predict(best.tree, newdata=test)
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
[1] 3660.1
en.pred = (best.tree.pred + linreg.pred)/2
sum((en.pred - test$MEDV)^2)
[1] 2598.6
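The simple average above weights the two models equally; a weighted blend is a natural variation (the weight here is illustrative and should be tuned on a validation split, not the test set):
w = 0.5  # w = 0.5 reproduces en.pred above
blend = w*linreg.pred + (1-w)*best.tree.pred
sum((blend - test$MEDV)^2)  # 2598.6 at w = 0.5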
# METHOD         SSE
# lm            3037
# rpart         4329
# rpart.cv      3660
# lm+rpart.cv   2599