packages = c(
"dplyr","ggplot2","d3heatmap","googleVis","devtools","plotly", "xgboost",
"magrittr","caTools","ROCR","corrplot", "rpart", "rpart.plot",
"doParallel", "caret", "glmnet", "Matrix", "e1071", "randomForest"
)
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
setwd("~/R/MIT/Unit4/")
Sys.setlocale("LC_ALL","C")
[1] "C"
rm(list=ls(all=T)) # 清除工作區
Error in names(frame) <- `*vtmp*` : names() applied to a non-vector
Error in names(frame) <- `*vtmp*` : names() applied to a non-vector
options(digits=5, scipen=12)# 科學記號小於12不要顯示
library(dplyr)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
D = read.csv("data/stevens.csv")
library(caTools)
set.seed(3000)
#Split不只是分7:3,還會讓Reverse比例也維持7:3
split = sample.split(D$Reverse, SplitRatio = 0.7)
Train = subset(D, split)
Test = subset(D, !split)
rpart::rpart()#rpart套件用於CART分類
library(rpart)
rpart1 = rpart(Reverse ~ ., Train[,3:9], # simplify the formula
method="class", minbucket=25)
library(rpart.plot)
prp(rpart1)
Bad 'data' field in model 'call'.
To silence this warning:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart.plot(rpart1,cex=0.6)
Bad 'data' field in model 'call'.
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
pred = predict(rpart1, Test, type = "class") # predict classes
x = table(actual = Test$Reverse, pred); x
pred
actual 0 1
0 41 36
1 22 71
sum(diag(x))/sum(x) # .65882
[1] 0.65882
library(ROCR)
PredictROC = predict(rpart1, Test) # predict prob.
head(PredictROC)
0 1
1 0.30357 0.69643
3 0.30357 0.69643
4 0.40000 0.60000
6 0.40000 0.60000
8 0.40000 0.60000
21 0.30357 0.69643
perf = prediction(PredictROC[,2], Test$Reverse)
perf = performance(perf, "tpr", "fpr")
plot(perf)
pred = predict(rpart1, Test)[,2] # prob. of Reverse = 1
colAUC(pred, Test$Reverse, T) # AUC = 0.6927
[,1]
0 vs. 1 0.69271
rpart.plot(rpart1,cex=0.75,varlen=3,faclen=2,box.palette="GnRd")
Bad 'data' field in model 'call'.
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
source("DPP.R")
auc = DPP(predict(rpart1)[,2], Train$Reverse, 1) # AUC = 0.7884
prp()、rpart.plot()和DPP()這幾張圖形minbuckett5 = rpart(Reverse ~ ., Train[,3:9], method="class", minbucket=5)
prp(t5)
Bad 'data' field in model 'call'.
To silence this warning:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
t100 = rpart(Reverse ~ ., Train[,3:9], method="class", minbucket=100)
prp(t100)
Bad 'data' field in model 'call'.
To silence this warning:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
minbucket?randomForest::randomForest()library(randomForest)
Train$Reverse = as.factor(Train$Reverse) # Convert outcome to factor
Test$Reverse = as.factor(Test$Reverse)
# Build model
rf1 = randomForest(Reverse ~ ., Train[,3:9], ntree=200, nodesize=25)
# Make predictions 預測類別
pred = predict(rf1, Test)
x = table(Test$Reverse, pred)
# 正確率 = 對角線(diagonal)相加/全部相加
sum(diag(x)) / sum(x) # 0.6824
[1] 0.68235
# 為了預測門閥值,設定type='prob'預測機率,答案會宇上題目一樣。
pred = predict(rf1, Test, type='prob')[,2]
table(Test$Reverse, pred > 0.5) %>% {sum(diag(.))/sum(.)}
[1] 0.68824
caret packagelibrary(caret)
numFolds = trainControl(method="cv", number=10) # 10 fold CV
# Kappa為另一種量測指標
# cp每次最佳會不同,Accuracy一樣選擇複雜度低(cp較大的值)。
cv1 = train(
Reverse ~ ., Train[,3:9], method = "rpart",
trControl = trainControl(method="cv", number=10), # 10 fold CV
tuneGrid=expand.grid(cp = seq(0.01,0.5,0.01)) # parameter combination
)
cv1; plot(cv1)
CART
396 samples
6 predictor
2 classes: '0', '1'
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 357, 356, 356, 356, 357, 356, ...
Resampling results across tuning parameters:
cp Accuracy Kappa
0.01 0.61359 0.207035
0.02 0.61365 0.206251
0.03 0.62385 0.231274
0.04 0.63141 0.256349
0.05 0.64410 0.284019
0.06 0.64410 0.284019
0.07 0.64410 0.284019
0.08 0.64410 0.284019
0.09 0.64410 0.284019
0.10 0.64410 0.284019
0.11 0.64410 0.284019
0.12 0.64410 0.284019
0.13 0.64410 0.284019
0.14 0.64410 0.284019
0.15 0.64410 0.284019
0.16 0.64410 0.284019
0.17 0.64410 0.284019
0.18 0.64410 0.284019
0.19 0.62103 0.230632
0.20 0.58301 0.137651
0.21 0.58301 0.137651
0.22 0.55301 0.048387
0.23 0.54788 0.032258
0.24 0.54538 0.017685
0.25 0.54538 0.000000
0.26 0.54538 0.000000
0.27 0.54538 0.000000
0.28 0.54538 0.000000
0.29 0.54538 0.000000
0.30 0.54538 0.000000
0.31 0.54538 0.000000
0.32 0.54538 0.000000
0.33 0.54538 0.000000
0.34 0.54538 0.000000
0.35 0.54538 0.000000
0.36 0.54538 0.000000
0.37 0.54538 0.000000
0.38 0.54538 0.000000
0.39 0.54538 0.000000
0.40 0.54538 0.000000
0.41 0.54538 0.000000
0.42 0.54538 0.000000
0.43 0.54538 0.000000
0.44 0.54538 0.000000
0.45 0.54538 0.000000
0.46 0.54538 0.000000
0.47 0.54538 0.000000
0.48 0.54538 0.000000
0.49 0.54538 0.000000
0.50 0.54538 0.000000
Accuracy was used to select the optimal model using the largest value.
The final value used for the model was cp = 0.18.
cp which we should we choose? Why?cp (complex parameter) => 當Accuracy一樣選擇複雜度低(cp較大的值)。 最終(佳)的CP顯示在最後一行。 +
rpart2 = rpart(Reverse ~ ., Train[,3:9], method="class", cp=0.17)
pred = predict(rpart2, Test, type='prob')[,2]
table(Test$Reverse, pred > 0.5) %>% {sum(diag(.)) / sum(.)} # 0.7235
[1] 0.72353
rpart.plot(rpart2)
Bad 'data' field in model 'call'.
To silence this warning:
Call rpart.plot with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
rpart
randomForest
caret package)
rm(list=ls(all=TRUE))
Error in names(frame) <- `*vtmp*` : names() applied to a non-vector
Error in names(frame) <- `*vtmp*` : names() applied to a non-vector
D = read.csv("data/ClaimsData.csv")
# Percentage of patients in each cost bucket
table(D$bucket2009)/nrow(D)
1 2 3 4 5
0.6712678 0.1901704 0.0894663 0.0433249 0.0057707
# Split the data
library(caTools)
set.seed(88)
spl = sample.split(D$bucket2009, SplitRatio = 0.6)
TR = subset(D, spl)
TS = subset(D, !spl)
# sapply(): Apply Function & Aggregate Output
# 1.sapply能夠製成表格 2.function(x)會隨著list的順序分別帶入3次
# 3.%>% t 將直行表格轉至橫的較容易比較
sapply(list(D, TR, TS), function(x)
table(x$bucket2009)/nrow(x) ) %>% t
1 2 3 4 5
[1,] 0.67127 0.19017 0.089466 0.043325 0.0057707
[2,] 0.67127 0.19017 0.089468 0.043326 0.0057714
[3,] 0.67127 0.19017 0.089464 0.043324 0.0057696
# Data Exploration
mean(TR$age) # 72.64
[1] 72.638
# mean(TR$diabetes) # .3809
table(D$diabetes)/nrow(D)
0 1
0.61954 0.38046
# Baseline Accuracy 2008&2009在同一個bucket為底線模型
# 用2008當成計算後的預測,2009年當成實際的值
table(actual = TS$bucket2009, pred = TS$bucket2008)
pred
actual 1 2 3 4 5
1 110138 7787 3427 1452 174
2 16000 10721 4629 2931 559
3 7006 4629 2774 1621 360
4 2688 1943 1415 1539 352
5 293 191 160 309 104
acc.base = table(TS$bucket2009, TS$bucket2008) %>% {sum(diag(.)) / sum(.)}
acc.base # acc.base = .6838
[1] 0.68381
# Penalty Matrix 懲罰矩陣
Penalty = matrix(c(
0,1,2,3,4,
2,0,1,2,3,
4,2,0,1,2,
6,4,2,0,1,
8,6,4,2,0), byrow=TRUE, nrow=5)
Penalty
[,1] [,2] [,3] [,4] [,5]
[1,] 0 1 2 3 4
[2,] 2 0 1 2 3
[3,] 4 2 0 1 2
[4,] 6 4 2 0 1
[5,] 8 6 4 2 0
# Penalty Error of Baseline Method
cm = as.matrix(table(TS$bucket2009, TS$bucket2008))
cm*Penalty
1 2 3 4 5
1 0 7787 6854 4356 696
2 32000 0 4629 5862 1677
3 28024 9258 0 1621 720
4 16128 7772 2830 0 352
5 2344 1146 640 618 0
pen.base = sum(cm*Penalty) / nrow(TS)
pen.base # pen.base: .7386
[1] 0.73861
# Dumb(較愚笨) baseline - Using the most frequent class
# bucket1最多所一全猜1,雖然smart底線的準確率差不多,但是懲罰矩陣會變大很多
acc.dumb = (table(TS$bucket2009)/nrow(TS))[1]
pen.dumb = sum(table(TS$bucket2009) * c(0,2,4,6,8))/nrow(TS)
c(acc.dumb, pen.dumb) # 0.6713 1.0443
1
0.67127 1.04430
# cp=0.00005應該是作者事前計算所得到的超參數
library(rpart)
library(rpart.plot)
rpart1 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005)
prp(rpart1)
Bad 'data' field in model 'call'.
To silence this warning:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
# Make predictions
pred = predict(rpart1, newdata = TS, type = "class")
acc.rpart1 = table(TS$bucket2009, pred) %>% { sum(diag(.)) /nrow(TS)}
acc.rpart1 # acc.rpart1 = .71267 <- .68381
[1] 0.71267
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart1 = sum(cm*Penalty)/nrow(TS)
pen.rpart1 # pen.rpart1: .75789 <- .7386
[1] 0.75789
# parms=list(loss=Penalty) => 加入模型,原先的算法只會考慮是否預測正確
# (1 => 1)或是錯誤(1 => 5)error就是1和0,加入後當愈測錯誤
# (1 => 5)error必須用懲罰矩陣(8)的值,不再考慮準確性高,而是懲罰的值最低。
rpart2 = rpart(bucket2009 ~ ., data=TR[c(1:14, 16)],
method="class", cp=0.00005,
parms=list(loss=Penalty) # customized loss function
)
pred = predict(rpart2, TS, type = "class")
acc.rpart2 = table(TS$bucket2009, pred) %>% {sum(diag(.))/sum(.)}
acc.rpart2 # acc.rpart2 = .64727
[1] 0.64727
cm = as.matrix(table(TS$bucket2009, pred))
pen.rpart2 = sum(cm*Penalty)/nrow(TS)
pen.rpart2 # pen.rpart2: .64182
[1] 0.64182
# Summary
data.frame(
accuracy = c(acc.dumb, acc.base, acc.rpart1, acc.rpart2),
penalty = c(pen.dumb, pen.base, pen.rpart1, pen.rpart2),
row.names = c("dumb","Smart Base","CART","CART w/LossMx"))
rm(list=ls(all=TRUE))
Error in names(frame) <- `*vtmp*` : names() applied to a non-vector
Error in names(frame) <- `*vtmp*` : names() applied to a non-vector
D = read.csv("data/boston.csv")
# Plot observations
plot(D$LON, D$LAT)
# Tracts alongside the Charles River
with(D, points(LON[CHAS==1], LAT[CHAS==1], col="blue", pch=19))
points(D$LON[D$CHAS==1], D$LAT[D$CHAS==1], col="blue", pch=19)
# Plot MIT
with(D, points(LON[TRACT==3531], LAT[TRACT==3531], col="red", pch=19) )
# Plot polution
summary(D$NOX)
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.385 0.449 0.538 0.555 0.624 0.871
points(D$LON[D$NOX>=0.55], D$LAT[D$NOX>=0.55], col="green", pch=20)
# Plot prices
plot(D$LON, D$LAT)
summary(D$MEDV)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.0 17.0 21.2 22.5 25.0 50.0
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], col="red", pch=20)
latlonlm = lm(MEDV ~ LAT + LON, data=D)
summary(latlonlm)
Call:
lm(formula = MEDV ~ LAT + LON, data = D)
Residuals:
Min 1Q Median 3Q Max
-16.46 -5.59 -1.30 3.69 28.13
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3178.47 484.94 -6.55 0.000000000138619 ***
LAT 8.05 6.33 1.27 0.2
LON -40.27 5.18 -7.77 0.000000000000045 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 8.69 on 503 degrees of freedom
Multiple R-squared: 0.107, Adjusted R-squared: 0.104
F-statistic: 30.2 on 2 and 503 DF, p-value: 0.000000000000416
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2], col="red", pch=20)
# latlonlm$fitted.values
points(D$LON[latlonlm$fitted.values >= 21.2],
D$LAT[latlonlm$fitted.values >= 21.2],
col="blue", pch="x")
library(rpart)
library(rpart.plot)
# CART model
latlontree = rpart(MEDV ~ LAT + LON, data=D)
prp(latlontree)
# Visualize output
plot(D$LON, D$LAT)
points(D$LON[D$MEDV>=21.2], D$LAT[D$MEDV>=21.2],
col="red", pch=20)
fittedvalues = predict(latlontree)
points(D$LON[fittedvalues>21.2],
D$LAT[fittedvalues>=21.2], col="blue", pch="x")
latlontree = rpart(MEDV ~ LAT + LON, data=D, minbucket=50)
rpart.plot(latlontree)
# Visualize Output
plot(D$LON,D$LAT)
abline(v=-71.07)
abline(h=42.21)
abline(h=42.17)
points(D$LON[D$MEDV>=21.2],
D$LAT[D$MEDV>=21.2], col="red", pch=20)
# Split the data
library(caTools)
set.seed(123)
split = sample.split(D$MEDV, SplitRatio = 0.7)
train = subset(D, split==TRUE)
test = subset(D, split==FALSE)
# Create linear regression
linreg = lm(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
summary(linreg)
Call:
lm(formula = MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX +
RM + AGE + DIS + RAD + TAX + PTRATIO, data = train)
Residuals:
Min 1Q Median 3Q Max
-14.51 -2.71 -0.68 1.79 36.88
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -252.29051 436.71008 -0.58 0.564
LAT 1.54396 5.19196 0.30 0.766
LON -2.98711 4.78583 -0.62 0.533
CRIM -0.18080 0.04391 -4.12 0.000047725 ***
ZN 0.03250 0.01877 1.73 0.084 .
INDUS -0.04297 0.08473 -0.51 0.612
CHAS 2.90418 1.22005 2.38 0.018 *
NOX -21.61279 5.41371 -3.99 0.000079778 ***
RM 6.28440 0.48270 13.02 < 2e-16 ***
AGE -0.04430 0.01785 -2.48 0.014 *
DIS -1.57736 0.28418 -5.55 0.000000056 ***
RAD 0.24509 0.09728 2.52 0.012 *
TAX -0.01112 0.00545 -2.04 0.042 *
PTRATIO -0.98349 0.19389 -5.07 0.000000638 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 5.6 on 350 degrees of freedom
Multiple R-squared: 0.665, Adjusted R-squared: 0.653
F-statistic: 53.4 on 13 and 350 DF, p-value: <2e-16
# Make predictions
linreg.pred = predict(linreg, newdata=test)
linreg.sse = sum((linreg.pred - test$MEDV)^2)
linreg.sse
[1] 3037.1
tree = rpart(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO, data=train)
prp(tree)
# Make predictions
tree.pred = predict(tree, newdata=test)
tree.sse = sum((tree.pred - test$MEDV)^2)
tree.sse
[1] 4329
library(caret)
# Number of folds
tr.control = trainControl(method = "cv", number = 10)
# cp values
cp.grid = expand.grid( .cp = (0:10)*0.001)
# Cross-validation
tr = train(MEDV ~ LAT + LON + CRIM + ZN + INDUS + CHAS + NOX + RM +
AGE + DIS + RAD + TAX + PTRATIO,
data = train, method = "rpart",
trControl = tr.control, tuneGrid = cp.grid)
tr
CART
364 samples
13 predictor
No pre-processing
Resampling: Cross-Validated (10 fold)
Summary of sample sizes: 327, 326, 328, 329, 328, 327, ...
Resampling results across tuning parameters:
cp RMSE Rsquared MAE
0.000 4.7290 0.75979 3.0684
0.001 4.7446 0.75837 3.1046
0.002 4.7563 0.75688 3.0909
0.003 4.8067 0.74908 3.1510
0.004 4.8756 0.74179 3.2299
0.005 4.9174 0.73682 3.2811
0.006 4.9539 0.73155 3.3217
0.007 4.8985 0.73604 3.2903
0.008 4.9334 0.72974 3.3231
0.009 4.9056 0.73195 3.3081
0.010 4.9056 0.73195 3.3081
RMSE was used to select the optimal model using the smallest value.
The final value used for the model was cp = 0.
# Extract tree
best.tree = tr$finalModel
prp(best.tree)
Bad 'data' field in model 'call'.
To silence this warning:
Call prp with roundint=FALSE,
or rebuild the rpart model with model=TRUE.
# Make predictions
best.tree.pred = predict(best.tree, newdata=test)
best.tree.sse = sum((best.tree.pred - test$MEDV)^2)
best.tree.sse
[1] 3660.1
en.pred = (best.tree.pred + linreg.pred)/2
sum((en.pred - test$MEDV)^2)
[1] 2598.6
# METHOD SSE
# lm 3037
# rpart 4329
# rpart.cv 3660
# lm+rpart.cv 2599