# packages = c(
# "dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly", "xgboost",
# "magrittr","caTools","ROCR","corrplot", "rpart", "rpart.plot",
# "doParallel", "caret", "glmnet", "Matrix", "e1071", "randomForest"
# )
# existing = as.character(installed.packages()[,1])
# for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Sys.setlocale("LC_ALL", "English")
[1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
rm(list=ls(all=T))
options(digits=4, scipen=12)
library(dplyr)
package 'dplyr' was built under R version 3.4.3
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
    filter, lag
The following objects are masked from 'package:base':
    intersect, setdiff, setequal, union
Supreme Court decisions
Goal: predict the decisions of one of the justices, Justice Stevens.
The US federal judiciary has three levels of courts: district court, circuit court, and supreme court; a party dissatisfied with a ruling appeals upward.
DV: 1 = reverse (lower-court decision overturned), 0 = affirm (decision upheld)
IVs:
* circuit court of origin (which court the case came up from)
* issue area
* type of petitioner (who is appealing)
* ideology of the lower-court ruling (conservative, liberal)
* unconstitutional (whether the ruling declared something unconstitutional)
Advantages of decision trees:
* tolerant of missing values (see the short sketch after the train/test split below)
* tolerant of collinearity
* easy to interpret
* when the distribution of Y is fragmented and piecewise, a tree tends to work better than a linear model
D = read.csv("data/stevens.csv")
library(caTools)
set.seed(3000)
spl = sample.split(D$Reverse, SplitRatio = 0.7)
# splits 70:30 while preserving the proportion of D$Reverse, so both parts have the same D$Reverse ratio
# returns a boolean vector
TR = subset(D, spl)
TS = subset(D, !spl)
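To illustrate the missing-value tolerance claimed above, here is a minimal hedged sketch (the NA positions and the names D.na, m.na are made up for the demo): rpart fits with surrogate splits, so rows with a missing predictor can still be routed to a leaf.
library(rpart)
D.na = TR
D.na$Circuit[1:5] = NA                    # hypothetical: blank out a predictor for five rows
m.na = rpart(Reverse ~ ., D.na[,3:9], method="class", minbucket=25)
predict(m.na, D.na[1:5,], type="class")   # surrogate splits still produce predictions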
rpart::rpart()
library(rpart)
rpart1 = rpart(Reverse ~ ., TR[,3:9],
# passing only columns 3:9 as data keeps the formula "Reverse ~ ." simple
method="class", minbucket=25)
# minbucket: the minimum number of observations allowed in a terminal node; splits that would create smaller nodes are not made
library(rpart.plot)
prp(rpart1)
Bad 'data' field in model 'call' field.
To make this warning go away:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
# a terminal node's data need not be all 0 or all 1; the predicted class is whichever has the larger proportion
rpart.plot(rpart1,cex=0.6) # cex controls text size; rpart.plot() draws nicer trees than prp()
pred = predict(rpart1, TS, type = "class")
# type = "class" returns the predicted classes directly
# type = "prob" returns probabilities, as logistic regression does; to turn them into classes for the confusion matrix, threshold: pred = pred > 0.5
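For reference, a minimal sketch of the probability-then-threshold route (the names prob and cls are illustrative); it should reproduce the type = "class" table below:
prob = predict(rpart1, TS)[,2]        # P(Reverse = 1)
cls = as.integer(prob > 0.5)          # threshold at 0.5
table(actual = TS$Reverse, cls)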
x = table(actual = TS$Reverse, pred); x
pred
actual 0 1
0 41 36
1 22 71
sum(diag(x))/sum(x) # ACC = .65882
[1] 0.6588
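The same confusion matrix x also gives the class-wise rates; a quick sketch (rows of x are actual, columns predicted):
x[2,2] / sum(x[2,])   # TPR (sensitivity): 71/93 = 0.763
x[1,2] / sum(x[1,])   # FPR: 36/77 = 0.468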
library(ROCR)
PredictROC = predict(rpart1, TS) # with type omitted, predict() returns class probabilities by default
head(PredictROC)
0 1
1 0.3036 0.6964
3 0.3036 0.6964
4 0.4000 0.6000
6 0.4000 0.6000
8 0.4000 0.6000
21 0.3036 0.6964
# returns the proportions of the two classes (0 and 1) in each case's terminal node
# a decision tree can split into many classes at once, unlike logistic regression, which handles only two
# the row names (1, 3, 4, 6, ...) are the original row indices of the test cases
# measure the model's performance on the testing data
# prediction() is ROCR's constructor; performance() then computes the chosen metrics
perf = prediction(PredictROC[,2], TS$Reverse)
perf = performance(perf, "tpr", "fpr")
plot(perf) # the TPR-vs-FPR curve, i.e. the ROC curve
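ROCR can also report the AUC from the same objects, a quick cross-check against colAUC below (auc.rocr is an illustrative name):
auc.rocr = performance(prediction(PredictROC[,2], TS$Reverse), "auc")@y.values[[1]]
auc.rocr   # should match colAUC's 0.6927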
# the usual colAUC approach
pred = predict(rpart1, TS)[,2] # prob. of Reverse = 1
colAUC(pred, TS$Reverse, T) # AUC = 0.6927
[,1]
0 vs. 1 0.6927
source("DPP.R")
auc = DPP(pred, TS$Reverse, 1) # AUC = 0.6927
Comparing the plots from prp(), rpart.plot() and DPP(): rpart.plot() shows more than prp() inside each node (the predicted class, its probability, and the share of observations).
What is the relationship between the shape of a decision tree and the distribution of the predicted probabilities it produces? In the DPP plot above, which features of the tree do the number, positions and heights of the distribution's points correspond to?
minbucket
t5 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=5)
rpart.plot(t5,cex=0.5)
t100 = rpart(Reverse ~ ., TR[,3:9], method="class", minbucket=100)
rpart.plot(t100,cex=0.5)
How does minbucket, a method parameter of the decision tree, change the fitted tree?
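A hedged sketch comparing how the two settings generalize, reusing the test-set accuracy recipe from above (acc is an illustrative helper):
acc = function(model) {
  x = table(TS$Reverse, predict(model, TS, type="class"))
  sum(diag(x)) / sum(x)
}
c(minbucket5 = acc(t5), minbucket100 = acc(t100))   # small minbucket: deeper tree, higher overfitting risk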
randomForest::randomForest()
A random forest grows many small trees at once; instead of relying on a single tree, the final decision is based on the trees' average decision.
random => each tree is shown a random subset of the x variables, and each tree also sees a different sample of the data points (drawn with replacement)
each tree is built from a bagged/bootstrapped sample of data
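To make the bagging point concrete, an illustrative sketch of what one tree's training sample looks like (this mimics the idea, not randomForest's internals; idx and boot are made-up names):
idx = sample(nrow(TR), replace=TRUE)   # draw rows with replacement
boot = TR[idx, ]                       # one tree's bootstrap sample
# at each split, only a random subset of predictors is considered
# (randomForest's mtry argument; default floor(sqrt(p)) for classification)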
library(randomForest)
package 'randomForest' was built under R version 3.4.4
randomForest 4.6-14
Type rfNews() to see new features/changes/bug fixes.
Attaching package: 'randomForest'
The following object is masked from 'package:dplyr':
    combine
TR$Reverse = as.factor(TR$Reverse) # Convert outcome to factor
TS$Reverse = as.factor(TS$Reverse)
# Build model
rf1 = randomForest(Reverse ~ ., TR[,3:9], ntree=200, nodesize=25)
# ntree: the number of trees
# nodesize: the minimum number of observations in a terminal node (the analogue of CART's minbucket)
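Two built-in diagnostics worth a look once rf1 is fitted: printing the forest reports the out-of-bag (OOB) error estimate, an internal validation that needs no holdout set, and importance() ranks the predictors.
rf1              # includes "OOB estimate of error rate"
importance(rf1)  # mean decrease in Gini per predictor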
# Make predictions
# Method 1: return the classes directly
pred = predict(rf1, TS)
x = table(TS$Reverse, pred) ; x #confusion matrix
pred
0 1
0 43 34
1 20 73
sum(diag(x)) / sum(x) # ACC = 0.6824
[1] 0.6824
library(magrittr)
package 'magrittr' was built under R version 3.4.3
# Method 2: return the probability of the second class (Reverse = 1)
pred = predict(rf1, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.6882
caret package
library(caret)
package 'caret' was built under R version 3.4.4
Loading required package: lattice
Loading required package: ggplot2
package 'ggplot2' was built under R version 3.4.3
Attaching package: 'ggplot2'
The following object is masked from 'package:randomForest':
    margin
#trainControl
numFolds = trainControl(method="cv", number=10) # 10 fold CV
cpGrid = expand.grid(cp = seq(0.01,0.5,0.01)) # parameter combination
cv1 = train(Reverse ~ ., TR[,3:9], method = "rpart",
trControl=numFolds, tuneGrid=cpGrid)
cv1; plot(cv1)
CART
396 samples
6 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 356, 356, 356, 357, 356, 356, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01 0.6165 0.210701
0.02 0.6265 0.234283
0.03 0.6214 0.231257
0.04 0.6290 0.249679
0.05 0.6340 0.260502
0.06 0.6442 0.282955
0.07 0.6442 0.282955
0.08 0.6442 0.282955
0.09 0.6442 0.282955
0.10 0.6442 0.282955
0.11 0.6442 0.282955
0.12 0.6442 0.282955
0.13 0.6442 0.282955
0.14 0.6442 0.282955
0.15 0.6442 0.282955
0.16 0.6442 0.282955
0.17 0.6442 0.282955
0.18 0.6442 0.282955
0.19 0.6442 0.282955
0.20 0.6237 0.234955
0.21 0.5555 0.053780
0.22 0.5455 0.024488
0.23 0.5404 0.005108
0.24 0.5404 0.005108
0.25 0.5454 0.000000
0.26 0.5454 0.000000
0.27 0.5454 0.000000
0.28 0.5454 0.000000
0.29 0.5454 0.000000
0.30 0.5454 0.000000
0.31 0.5454 0.000000
0.32 0.5454 0.000000
0.33 0.5454 0.000000
0.34 0.5454 0.000000
0.35 0.5454 0.000000
0.36 0.5454 0.000000
0.37 0.5454 0.000000
0.38 0.5454 0.000000
0.39 0.5454 0.000000
0.40 0.5454 0.000000
0.41 0.5454 0.000000
0.42 0.5454 0.000000
0.43 0.5454 0.000000
0.44 0.5454 0.000000
0.45 0.5454 0.000000
0.46 0.5454 0.000000
0.47 0.5454 0.000000
0.48 0.5454 0.000000
0.49 0.5454 0.000000
0.50 0.5454 0.000000
Accuracy was used to select the optimal model using the
largest value.
The final value used for the model was cp = 0.19.
Which cp should we choose, and why?
cp = cost of complexity: a further split is made only if it reduces the overall lack of fit by more than cp; otherwise the branch stops growing.
A higher cp therefore means a simpler (less complex) model.
If the accuracy is the same, the simpler model is the better one.
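Rather than reading cp off the printout, caret stores the winning parameter; a small sketch:
cv1$bestTune$cp   # 0.19 in the run above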
rpart2 = rpart(Reverse ~ ., TR[,3:9], method="class", cp=0.19)
pred = predict(rpart2, TS, type='prob')[,2]
table(TS$Reverse, pred > 0.5) %>% {sum(diag(.)) / sum(.)} # 0.7235
[1] 0.7235
rpart.plot(rpart2)
Claims data (rpart, randomForest, caret package)
rm(list=ls(all=TRUE))
D = read.csv("data/ClaimsData.csv")
# Percentage of patients in each cost bucket
table(D$bucket2009)/nrow(D)
1 2 3 4 5
0.671268 0.190170 0.089466 0.043325 0.005771
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(D$bucket2009, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
# Check Percentages Among Subsets
library(dplyr)
table(TR$bucket2009) %>% prop.table
1 2 3 4 5
0.671266 0.190169 0.089468 0.043326 0.005771
table(TS$bucket2009) %>% prop.table
1 2 3 4 5
0.67127 0.19017 0.08946 0.04332 0.00577
table(D$bucket2009) %>% prop.table
1 2 3 4 5
0.671268 0.190170 0.089466 0.043325 0.005771
# sapply(): Apply Function & Aggregate Output (rows of the result: D, TR, TS)
sapply(list(D, TR, TS), function(x)
table(x$bucket2009) %>% prop.table) %>% t
1 2 3 4 5
[1,] 0.6713 0.1902 0.08947 0.04332 0.005771
[2,] 0.6713 0.1902 0.08947 0.04333 0.005771
[3,] 0.6713 0.1902 0.08946 0.04332 0.005770
# Data Exploration
mean(TR$age) # mean age: 72.64
[1] 72.64
mean(TR$diabetes) # proportion with diabetes: .3809
[1] 0.3809
# Baseline Accuracy
# With multiple classes, what should the baseline prediction be?
# Try predicting each patient's 2009 bucket with their 2008 bucket
acc.base = table(TS$bucket2009, TS$bucket2008) %>%
{sum(diag(.)) / sum(.)} # ACC=.6838
# Penalty Matrix (adjusted to the business context)
Penalty = matrix(c( # here, under-predicting a patient's cost bucket is penalized more heavily than over-predicting it
0,1,2,3,4,
2,0,1,2,3,
4,2,0,1,2,
6,4,2,0,1,
8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 2 3 4
[2,] 2 0 1 2 3
[3,] 4 2 0 1 2
[4,] 6 4 2 0 1
[5,] 8 6 4 2 0
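A worked mini-example of the asymmetry (rows are the actual bucket, columns the predicted one):
Penalty[5,1]   # 8: actual bucket 5 predicted as 1 (under-prediction, heavily penalized)
Penalty[1,5]   # 4: actual bucket 1 predicted as 5 (over-prediction, cheaper)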
# Penalty Error of Baseline Method
# predicting 2009 with 2008
cm = as.matrix(table(TS$bucket2009, TS$bucket2008))
pen.base = sum(cm*Penalty) / nrow(TS)
# average penalty per patient
# pen: .7386
# if we use the most frequent class as the baseline
# i.e. predict bucket 1 for everyone
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1]
# acc: .6713
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS) # c(0,2,4,6,8) is Penalty's first column: everyone predicted bucket 1
# pen: 1.044
# the 2008-bucket baseline incurs a smaller penalty than the all-bucket-1 baseline
library(rpart)
library(rpart.plot)
rpart1 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005)
# this cp value was found by cross-validation
prp(rpart1)
# Make predictions
pred = predict(rpart1, newdata = TS, type = "class")
acc.rpart1 = table(TS$bucket2009, pred) %>%
{ sum(diag(.)) /nrow(TS)} # acc: .71267 <- .68381
# Penalty Error
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart1 = sum(cm*Penalty)/nrow(TS) # pen: .75789 <- .7386
rpart2 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005,
parms=list(loss=Penalty))
# parms=list(loss=Penalty) makes rpart grow the tree to minimize the expected penalty rather than the misclassification count
pred = predict(rpart2, TS, type = "class")
acc.rpart2 = table(TS$bucket2009, pred) %>%
{sum(diag(.))/sum(.)} # acc.rpart2: .64727
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart2 = sum(cm*Penalty)/nrow(TS) # pen.rpart2: .64182
# Summary
data.frame(
accuracy = c(acc.dumb, acc.base, acc.rpart1, acc.rpart2),
penalty = c(pen.dumb, pen.base, pen.rpart1, pen.rpart2),
row.names = c("dumb","base","CART","CART w/LossMx"))
Boston housing (regression trees)
rm(list=ls(all=TRUE))
D = read.csv("data/boston.csv")
# Plot observations
plot(D$LON, D$LAT)
# Tracts alongside the Charles River
with(D,
points(LON[CHAS==1], LAT[CHAS==1], col="blue", pch=19))
# Plot MIT
with(D,
points(LON[TRACT==3531], LAT[TRACT==3531], col="red", pch=19) )
# Plot pollution
summary(D$NOX)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.385 0.449 0.538 0.555 0.624 0.871
points(D$LON[D$NOX>=0.55], D$LAT[D$NOX>=0.55],
col="green", pch=20)
# Plot prices
plot(D$LON, D$LAT)
summary(D$MEDV)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 17.0 21.2 22.5 25.0 50.0
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
latlonlm = lm(MEDV ~ LAT + LON, data=D)
summary(latlonlm)
Call:
lm(formula = MEDV ~ LAT + LON, data = D)
Residuals:
Min 1Q Median 3Q Max
-16.46 -5.59 -1.30 3.69 28.13
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3178.47 484.94 -6.55 0.000000000138619 ***
LAT 8.05 6.33 1.27 0.2
LON -40.27 5.18 -7.77 0.000000000000045 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 8.69 on 503 degrees of freedom
Multiple R-squared: 0.107, Adjusted R-squared: 0.104
F-statistic: 30.2 on 2 and 503 DF, p-value: 0.000000000000416
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
# latlonlm$fitted.values
points(D$LON[latlonlm$fitted.values >= 21.2],
D$LAT[latlonlm$fitted.values >= 21.2],
col="blue", pch="x")
library(rpart)
library(rpart.plot)
# CART model
latlontree = rpart(MEDV ~ LAT + LON, data=D)
prp(latlontree)
# Visualize output
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
fittedvalues = predict(latlontree)
points(D$LON[fittedvalues>=21.2],
D$LAT[fittedvalues>=21.2], col="blue", pch="x")
latlontree = rpart(MEDV ~ LAT + LON, data=D, minbucket=50)
rpart.plot(latlontree)
# Visualize Output
plot(D$LON,D$LAT)
abline(v=-71.07)
abline(h=42.21)
abline(h=42.17)
points(D$LON[D$MEDV>=21.2],
D$LAT[D$MEDV>=21.2], col="red", pch=20)
# Split the data
library(caTools)
set.seed(123)
split = sample.split(D$MEDV, SplitRatio = 0.7)
train = subset(D, split==TRUE)
test = subset(D, split==FALSE)
# Create linear regression
linreg = lm(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
summary(linreg)
Call:
lm(formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX +
RM + AGE + DIS + RAD + TAX + PTRATIO, data = train)
Residuals:
Min 1Q Median 3Q Max
-14.51 -2.71 -0.68 1.79 36.88
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -252.29051 436.71008 -0.58 0.564
LAT 1.54396 5.19196 0.30 0.766
LON -2.98711 4.78583 -0.62 0.533
CRIM -0.18080 0.04391 -4.12 0.000047725 ***
ZN 0.03250 0.01877 1.73 0.084 .
INDUS -0.04297 0.08473 -0.51 0.612
CHAS 2.90418 1.22005 2.38 0.018 *
NOX -21.61279 5.41371 -3.99 0.000079778 ***
RM 6.28440 0.48270 13.02 < 2e-16 ***
AGE -0.04430 0.01785 -2.48 0.014 *
DIS -1.57736 0.28418 -5.55 0.000000056 ***
RAD 0.24509 0.09728 2.52 0.012 *
TAX -0.01112 0.00545 -2.04 0.042 *
PTRATIO -0.98349 0.19389 -5.07 0.000000638 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.6 on 350 degrees of freedom
Multiple R-squared: 0.665, Adjusted R-squared: 0.653
F-statistic: 53.4 on 13 and 350 DF, p-value: <2e-16
# Make predictions
linreg.pred = predict(linreg, newdata=test)
linreg.sse = sum((linreg.pred - test$MEDV)^2)
linreg.sse
[1] 3037
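SSE grows with the test-set size; for a scale-free read, convert it to test-set RMSE (in $1000s of median home value), a quick sketch:
sqrt(linreg.sse / nrow(test))   # RMSE of the linear model on the test set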
tree = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
prp(tree)
# Make predictions
tree.pred = predict(tree, newdata=test)
tree.sse = sum((tree.pred - test$MEDV)^2)
tree.sse
[1] 4329
library(caret)
# Number of folds
tr.control = trainControl(method = "cv", number = 10)
# cp values
cp.grid = expand.grid( .cp = (0:10)*0.001)
# Cross-validation
tr = train(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO,
data = train, method = "rpart",
trControl = tr.control, tuneGrid = cp.grid)
tr
CART
364 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 327, 326, 328, 329, 328, 327, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.000 4.729 0.7598 3.068
0.001 4.745 0.7584 3.105
0.002 4.756 0.7569 3.091
0.003 4.807 0.7491 3.151
0.004 4.876 0.7418 3.230
0.005 4.917 0.7368 3.281
0.006 4.954 0.7315 3.322
0.007 4.899 0.7360 3.290
0.008 4.933 0.7297 3.323
0.009 4.906 0.7320 3.308
0.010 4.906 0.7320 3.308
RMSE was used to select the optimal model using the
smallest value.
The final value used for the model was cp = 0.
# Extract tree
best.tree = tr$finalModel
prp(best.tree)
# Make predictions
best.tree.pred = predict(best.tree, newdata=test)
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
[1] 3660
# simple two-model ensemble: averaging the regression's and the tuned tree's predictions helps because their errors are only partly correlated
en.pred = (best.tree.pred + linreg.pred)/2
sum((en.pred - test$MEDV)^2)
[1] 2599
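The comment table below can also be produced programmatically, mirroring the summary data.frame used in the claims example:
data.frame(
  SSE = c(linreg.sse, tree.sse, best.tree.sse, sum((en.pred - test$MEDV)^2)),
  row.names = c("lm", "rpart", "rpart.cv", "lm+rpart.cv"))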
# METHOD SSE
# lm 3037
# rpart 4329
# rpart.cv 3660
# lm+rpart.cv 2599