Compare classifiers using:
AUC and cost are different metrics and should not be directly compared.
Use training data only to build models and choose tuning parameters.
Use testing data only for final out-of-sample evaluation.
For regression models:
For classification models:
1/(5+1) = 1/6rm(list=ls())
library(MASS)
library(boot)
library(rpart)
library(rpart.plot)
library(FNN)
data(Boston)
attach(Boston)
n <- nrow(Boston)
p <- ncol(Boston) - 1
set.seed(14696133)
sample_index <- sample(nrow(Boston), nrow(Boston) * 0.80)
Boston_train <- Boston[sample_index, ]
Boston_test <- Boston[-sample_index, ]
Boston.train.lm <- lm(medv ~ . - indus - age, data = Boston_train)
summary(Boston.train.lm)
##
## Call:
## lm(formula = medv ~ . - indus - age, data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.0967 -2.7533 -0.6326 1.4059 25.5257
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.910017 5.644205 6.539 1.93e-10 ***
## crim -0.108492 0.034066 -3.185 0.001564 **
## zn 0.061968 0.014951 4.145 4.17e-05 ***
## chas 2.332534 0.911222 2.560 0.010848 *
## nox -17.018507 3.980998 -4.275 2.40e-05 ***
## rm 3.335427 0.467919 7.128 4.92e-12 ***
## dis -1.579465 0.211544 -7.466 5.39e-13 ***
## rad 0.268754 0.068595 3.918 0.000105 ***
## tax -0.010210 0.003634 -2.809 0.005213 **
## ptratio -0.869892 0.143438 -6.065 3.12e-09 ***
## black 0.010636 0.002929 3.631 0.000319 ***
## lstat -0.524712 0.053947 -9.726 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.741 on 392 degrees of freedom
## Multiple R-squared: 0.7318, Adjusted R-squared: 0.7243
## F-statistic: 97.24 on 11 and 392 DF, p-value: < 2.2e-16
# In-sample ASE
yhat.train <- predict(Boston.train.lm, Boston_train)
ase.train <- mean((Boston_train$medv - yhat.train)^2)
ase.train
## [1] 21.80507
# Out-of-sample MSPE
yhat.test <- predict(Boston.train.lm, Boston_test)
mspe.lm <- mean((Boston_test$medv - yhat.test)^2)
mspe.lm
## [1] 23.43028
# RMSPE
sqrt(mspe.lm)
## [1] 4.840483
Boston.lm <- glm(medv ~ . - indus - age, data = Boston)
# 5-fold CV
cv5 <- cv.glm(Boston, Boston.lm, K = 5)$delta[2]
cv5
## [1] 23.50911
# Leave-one-out CV
cvloo <- cv.glm(Boston, Boston.lm, K = n)$delta[2]
cvloo
## [1] 23.51161
Boston_tree <- rpart(medv ~ ., data = Boston_train)
prp(Boston_tree, digits = 3, extra = 1)
Boston_largetree <- rpart(
medv ~ .,
data = Boston_train,
cp = 0.001
)
plotcp(Boston_largetree)
# choose the leftmost cp below the dotted line in plotcp()
Boston_pruned <- prune(Boston_largetree, cp = 0.0058)
prp(Boston_pruned, digits = 3, extra = 1)
pred_train_tree <- predict(Boston_pruned, Boston_train)
pred_test_tree <- predict(Boston_pruned, Boston_test)
# In-sample ASE
mean((Boston_train$medv - pred_train_tree)^2)
## [1] 13.03044
# Out-of-sample MSPE
mean((Boston_test$medv - pred_test_tree)^2)
## [1] 16.08579
train.norm <- Boston_train
test.norm <- Boston_test
cols <- colnames(train.norm[, 1:p])
for(j in cols){
train.norm[[j]] <- (train.norm[[j]] - min(Boston_train[[j]])) /
(max(Boston_train[[j]]) - min(Boston_train[[j]]))
test.norm[[j]] <- (test.norm[[j]] - min(Boston_train[[j]])) /
(max(Boston_train[[j]]) - min(Boston_train[[j]]))
}
Boston.knn.reg <- knn.reg(
train = train.norm[, 1:p],
test = test.norm[, 1:p],
y = train.norm$medv,
k = 5
)
Boston.results <- data.frame(
pred = Boston.knn.reg$pred,
actual = Boston_test$medv
)
head(Boston.results, 20)
## pred actual
## 1 24.86 21.6
## 2 27.22 33.4
## 3 28.52 36.2
## 4 20.08 27.1
## 5 17.32 16.5
## 6 19.02 15.0
## 7 18.90 18.2
## 8 15.98 13.9
## 9 15.18 14.8
## 10 17.64 21.0
## 11 14.32 13.1
## 12 30.42 30.8
## 13 22.74 19.6
## 14 20.18 16.0
## 15 24.18 33.0
## 16 23.30 20.3
## 17 24.80 22.2
## 18 25.58 25.0
## 19 21.64 20.6
## 20 35.36 38.7
MSPE.knn5 <- mean((Boston_test$medv - Boston.knn.reg$pred)^2)
MSPE.knn5
## [1] 24.3702
sqrt(MSPE.knn5)
## [1] 4.936618
set.seed(14696133)
sample_index2 <- sample(nrow(train.norm), nrow(train.norm) * 0.80)
train2.norm <- train.norm[sample_index2, ]
valid.norm <- train.norm[-sample_index2, ]
RMSE.df <- data.frame(k = 1:30, RMSE.k = 0)
for(i in 1:30){
knn.reg.pred <- knn.reg(
train = train2.norm[, 1:p],
test = valid.norm[, 1:p],
y = train2.norm$medv,
k = i
)
RMSE.df[i,2] <- sqrt(mean((valid.norm$medv - knn.reg.pred$pred)^2))
}
RMSE.df
## k RMSE.k
## 1 1 4.151364
## 2 2 4.238157
## 3 3 3.355517
## 4 4 2.993295
## 5 5 3.207203
## 6 6 3.285525
## 7 7 3.316633
## 8 8 3.443048
## 9 9 3.521635
## 10 10 3.564697
## 11 11 3.679559
## 12 12 3.795558
## 13 13 3.915780
## 14 14 4.119022
## 15 15 4.143529
## 16 16 4.195813
## 17 17 4.141273
## 18 18 4.147766
## 19 19 4.124188
## 20 20 4.165848
## 21 21 4.188284
## 22 22 4.173539
## 23 23 4.224419
## 24 24 4.249256
## 25 25 4.233384
## 26 26 4.217778
## 27 27 4.252812
## 28 28 4.245642
## 29 29 4.261809
## 30 30 4.264113
best_k <- which.min(RMSE.df$RMSE.k)
best_k
## [1] 4
Boston.knn.final <- knn.reg(
train = train.norm[, 1:p],
test = test.norm[, 1:p],
y = train.norm$medv,
k = best_k
)
MSPE.knn.final <- mean((Boston_test$medv - Boston.knn.final$pred)^2)
MSPE.knn.final
## [1] 20.69513
sqrt(MSPE.knn.final)
## [1] 4.549191
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.2.1 ✔ readr 2.2.0
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.3 ✔ tibble 3.3.1
## ✔ lubridate 1.9.5 ✔ tidyr 1.3.2
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ROCR)
credit <- read.csv("credit_default.csv")
credit <- rename(
credit,
default = default.payment.next.month
)
credit$SEX <- as.factor(credit$SEX)
credit$EDUCATION <- as.factor(credit$EDUCATION)
credit$MARRIAGE <- as.factor(credit$MARRIAGE)
mean(credit$default)
## [1] 0.2212
set.seed(1234)
sample_index <- sample(nrow(credit), nrow(credit) * 0.80)
credit_train <- credit[sample_index, ]
credit_test <- credit[-sample_index, ]
credit_glm <- glm(
default ~ .,
family = binomial,
data = credit_train
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(credit_glm)
##
## Call:
## glm(formula = default ~ ., family = binomial, data = credit_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.749e-01 9.664e-02 -10.088 < 2e-16 ***
## LIMIT_BAL -7.023e-07 1.764e-07 -3.981 6.87e-05 ***
## SEX2 -1.020e-01 3.436e-02 -2.969 0.002991 **
## EDUCATION2 -8.002e-02 3.971e-02 -2.015 0.043892 *
## EDUCATION3 -8.876e-02 5.312e-02 -1.671 0.094705 .
## EDUCATION4 -1.378e+00 2.285e-01 -6.033 1.61e-09 ***
## MARRIAGE2 -2.091e-01 3.866e-02 -5.410 6.32e-08 ***
## MARRIAGE3 -1.937e-01 1.458e-01 -1.328 0.184223
## AGE 5.227e-03 2.081e-03 2.512 0.011996 *
## PAY_0 5.711e-01 1.978e-02 28.869 < 2e-16 ***
## PAY_2 7.985e-02 2.268e-02 3.521 0.000430 ***
## PAY_3 6.264e-02 2.552e-02 2.454 0.014116 *
## PAY_4 2.113e-02 2.804e-02 0.754 0.451082
## PAY_5 3.734e-02 2.995e-02 1.247 0.212506
## PAY_6 2.273e-02 2.465e-02 0.922 0.356422
## BILL_AMT1 -6.787e-06 1.300e-06 -5.220 1.79e-07 ***
## BILL_AMT2 3.593e-06 1.666e-06 2.157 0.031013 *
## BILL_AMT3 1.875e-06 1.471e-06 1.275 0.202428
## BILL_AMT4 -5.134e-07 1.503e-06 -0.342 0.732644
## BILL_AMT5 2.208e-07 1.702e-06 0.130 0.896761
## BILL_AMT6 6.880e-07 1.334e-06 0.516 0.606118
## PAY_AMT1 -1.648e-05 2.732e-06 -6.030 1.64e-09 ***
## PAY_AMT2 -7.902e-06 2.200e-06 -3.591 0.000329 ***
## PAY_AMT3 -2.790e-06 1.981e-06 -1.408 0.159037
## PAY_AMT4 -4.752e-06 2.037e-06 -2.333 0.019633 *
## PAY_AMT5 -1.362e-06 1.842e-06 -0.739 0.459694
## PAY_AMT6 -1.519e-06 1.448e-06 -1.049 0.294072
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 25433 on 23999 degrees of freedom
## Residual deviance: 22306 on 23973 degrees of freedom
## AIC: 22360
##
## Number of Fisher Scoring iterations: 6
pred_glm_train <- predict(
credit_glm,
credit_train,
type = "response"
)
pred_train <- prediction(pred_glm_train, credit_train$default)
performance(pred_train, "auc")@y.values[[1]]
## [1] 0.7263534
pred_glm_test <- predict(
credit_glm,
credit_test,
type = "response"
)
pred_test <- prediction(pred_glm_test, credit_test$default)
performance(pred_test, "auc")@y.values[[1]]
## [1] 0.7216786
costfunc <- function(r, pi){
weight1 <- 5
weight0 <- 1
pcut <- 1 / (weight1 + weight0)
c1 <- (r == 1) & (pi < pcut)
c0 <- (r == 0) & (pi > pcut)
mean(weight1*c1 + weight0*c0)
}
costfunc(credit_train$default, pred_glm_train)
## [1] 0.683
costfunc(credit_test$default, pred_glm_test)
## [1] 0.6775
costfunc_auc <- function(obs, pred.p){
pred <- prediction(pred.p, obs)
auc <- performance(pred, "auc")@y.values[[1]]
return(auc)
}
costfunc_auc(credit_test$default, pred_glm_test)
## [1] 0.7216786
credit0_glm <- glm(
default ~ .,
family = binomial,
data = credit
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv_result_cost <- cv.glm(
data = credit,
glmfit = credit0_glm,
cost = costfunc,
K = 5
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv_result_cost$delta[2]
## [1] 0.6835867
cv_result_auc <- cv.glm(
data = credit,
glmfit = credit0_glm,
cost = costfunc_auc,
K = 5
)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
cv_result_auc$delta[2]
## [1] 0.7243057
credit_tree <- rpart(
default ~ .,
data = credit_train,
method = "class",
parms = list(loss = matrix(c(0,5,1,0), nrow = 2))
)
prp(credit_tree, extra = 1)
prob_tree_train <- predict(
credit_tree,
credit_train,
type = "prob"
)
pred_tree_train <- prediction(prob_tree_train[,2], credit_train$default)
performance(pred_tree_train, "auc")@y.values[[1]]
## [1] 0.735413
costfunc(credit_train$default, prob_tree_train[,2])
## [1] 0.5664583
prob_tree_test <- predict(
credit_tree,
credit_test,
type = "prob"
)
pred_tree_test <- prediction(prob_tree_test[,2], credit_test$default)
performance(pred_tree_test, "auc")@y.values[[1]]
## [1] 0.7367827
costfunc(credit_test$default, prob_tree_test[,2])
## [1] 0.5643333
mtry = p/3mtry = sqrt(p)%IncMSE
Often top variables: - lstat - rm
n.treesshrinkageinteraction.depth| Method | Main Strength | Main Weakness |
|---|---|---|
| Tree | Most interpretable | Weakest accuracy |
| Bagging | Reduces variance | Less interpretable |
| Random Forest | Strong default performer | Black box |
| Boosting | Often highest accuracy | Sensitive tuning |
library(MASS)
library(ipred)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(gbm)
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
data(Boston)
set.seed(14696133)
index <- sample(nrow(Boston), nrow(Boston) * 0.80)
boston_train <- Boston[index, ]
boston_test <- Boston[-index, ]
boston_bag <- bagging(
medv ~ .,
data = boston_train,
nbagg = 100
)
boston_bag
##
## Bagging regression trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = medv ~ ., data = boston_train, nbagg = 100)
boston_bag_pred <- predict(
boston_bag,
newdata = boston_test
)
mean((boston_test$medv - boston_bag_pred)^2)
## [1] 15.15432
boston_tree <- rpart(
medv ~ .,
data = boston_train
)
boston_tree_pred <- predict(
boston_tree,
newdata = boston_test
)
mean((boston_test$medv - boston_tree_pred)^2)
## [1] 17.18802
ntree <- c(1, 3, 5, seq(10, 200, 10))
MSE_test <- rep(0, length(ntree))
for(i in 1:length(ntree)){
boston_bag1 <- bagging(
medv ~ .,
data = boston_train,
nbagg = ntree[i]
)
boston_bag_pred1 <- predict(
boston_bag1,
newdata = boston_test
)
MSE_test[i] <- mean((boston_test$medv - boston_bag_pred1)^2)
}
plot(
ntree,
MSE_test,
type = "l",
col = 2,
lwd = 2,
xaxt = "n",
xlab = "Number of Trees",
ylab = "Test MSE"
)
axis(1, at = ntree, las = 1)
boston_bag_oob <- bagging(
medv ~ .,
data = boston_train,
coob = TRUE,
nbagg = 100
)
boston_bag_oob
##
## Bagging regression trees with 100 bootstrap replications
##
## Call: bagging.data.frame(formula = medv ~ ., data = boston_train, coob = TRUE,
## nbagg = 100)
##
## Out-of-bag estimate of root mean squared error: 4.0942
%IncMSE = more important variable.boston_rf <- randomForest(
medv ~ .,
data = boston_train,
importance = TRUE
)
boston_rf
##
## Call:
## randomForest(formula = medv ~ ., data = boston_train, importance = TRUE)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 11.18541
## % Var explained: 86.24
mod_rf <- randomForest(
medv ~ .,
data = boston_train,
importance = TRUE,
ntree = 500
)
mod_rf
##
## Call:
## randomForest(formula = medv ~ ., data = boston_train, importance = TRUE, ntree = 500)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 11.04376
## % Var explained: 86.42
boston_rf$importance
## %IncMSE IncNodePurity
## crim 9.2093070 2087.8188
## zn 0.9819360 413.4442
## indus 4.8605233 1632.5498
## chas 0.6044036 155.3731
## nox 8.3955809 1793.5264
## rm 31.0016862 9129.3335
## age 3.5009343 818.4138
## dis 6.0384457 1836.9817
## rad 1.4178652 314.4648
## tax 4.1840342 1270.0783
## ptratio 7.8552517 2324.4928
## black 1.0577753 601.3486
## lstat 63.5531032 9678.5180
mod_rf$importance
## %IncMSE IncNodePurity
## crim 8.5087897 2006.1373
## zn 1.0560801 439.7611
## indus 4.7714949 1617.7114
## chas 0.5256723 153.2575
## nox 8.4173936 1811.7160
## rm 31.6212873 9030.8541
## age 3.5098929 880.2611
## dis 5.2977454 1859.1130
## rad 1.5711089 270.8520
## tax 4.8527431 1230.8522
## ptratio 7.6964168 2338.8678
## black 1.1640376 514.4605
## lstat 63.9362490 9921.9339
importance(mod_rf)
## %IncMSE IncNodePurity
## crim 14.968271 2006.1373
## zn 4.896609 439.7611
## indus 11.260089 1617.7114
## chas 4.253487 153.2575
## nox 17.463266 1811.7160
## rm 35.353087 9030.8541
## age 12.143403 880.2611
## dis 14.335982 1859.1130
## rad 6.270706 270.8520
## tax 10.761469 1230.8522
## ptratio 16.119109 2338.8678
## black 6.596339 514.4605
## lstat 30.224024 9921.9339
varImpPlot(mod_rf)
plot(
boston_rf$mse,
type = "l",
col = 2,
lwd = 2,
xlab = "ntree",
ylab = "OOB Error"
)
plot(
mod_rf$mse,
type = "l",
col = 2,
lwd = 2,
xlab = "ntree",
ylab = "OOB Error"
)
boston_rf_pred <- predict(
boston_rf,
boston_test
)
mean((boston_test$medv - boston_rf_pred)^2)
## [1] 10.40444
oob_err <- rep(0, 13)
test_err <- rep(0, 13)
for(i in 1:13){
fit <- randomForest(
medv ~ .,
data = boston_train,
mtry = i,
ntree = 500
)
oob_err[i] <- fit$mse[500]
test.err.pred <- predict(fit, boston_test)
test_err[i] <- mean((boston_test$medv - test.err.pred)^2)
cat(i, " ")
}
## 1 2 3 4 5 6 7 8 9 10 11 12 13
matplot(
cbind(test_err, oob_err),
pch = 15,
col = c("red", "blue"),
type = "b",
ylab = "MSE",
xlab = "mtry"
)
legend(
"topright",
legend = c("Test Error", "OOB Error"),
pch = 15,
col = c("red", "blue")
)
gbm is used for boosted treesboston_boost <- gbm(
formula = medv ~ .,
data = boston_train,
distribution = "gaussian",
n.trees = 10000,
shrinkage = 0.01,
interaction.depth = 8
)
summary(boston_boost)
## var rel.inf
## lstat lstat 40.3128152
## rm rm 28.8942667
## dis dis 8.9284832
## crim crim 4.6552477
## age age 4.2216564
## nox nox 4.0666304
## black black 2.5109363
## ptratio ptratio 2.4274624
## tax tax 1.8477892
## rad rad 0.8357928
## indus indus 0.6807902
## chas chas 0.4381755
## zn zn 0.1799540
par(mfrow = c(1,2))
plot(boston_boost, i = "lstat")
plot(boston_boost, i = "rm")
boston_boost_pred_test <- predict(
boston_boost,
boston_test,
n.trees = 10000
)
mean((boston_test$medv - boston_boost_pred_test)^2)
## [1] 7.341079
ntree <- seq(100, 10000, 100)
predmat <- predict(
boston_boost,
newdata = boston_test,
n.trees = ntree
)
err <- apply(
(predmat - boston_test$medv)^2,
2,
mean
)
plot(
ntree,
err,
type = "l",
col = 2,
lwd = 2,
xlab = "n.trees",
ylab = "Test MSE"
)
abline(h = min(err), lty = 2)
library(ROCR)
credit_data <- read.csv(
file = "https://yanyudm.github.io/Data-Mining-R/lecture/data/credit_default.csv",
header = TRUE
)
credit_data$SEX <- as.factor(credit_data$SEX)
credit_data$EDUCATION <- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE <- as.factor(credit_data$MARRIAGE)
set.seed(14696133)
index <- sample(nrow(credit_data), nrow(credit_data) * 0.80)
credit_train <- credit_data[index, ]
credit_test <- credit_data[-index, ]
credit_rf <- randomForest(
as.factor(default.payment.next.month) ~ .,
data = credit_train,
importance = TRUE,
ntree = 500
)
credit_rf
##
## Call:
## randomForest(formula = as.factor(default.payment.next.month) ~ ., data = credit_train, importance = TRUE, ntree = 500)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 18.24%
## Confusion matrix:
## 0 1 class.error
## 0 7075 426 0.05679243
## 1 1325 774 0.63125298
plot(credit_rf, lwd = rep(2, 3))
legend(
"right",
legend = c("OOB Error", "FPR", "FNR"),
lwd = rep(2, 3),
lty = c(1, 2, 3),
col = c("black", "red", "green")
)
credit_rf_pred_train <- predict(
credit_rf,
type = "prob"
)[,2]
pred_train_rf <- prediction(
credit_rf_pred_train,
credit_train$default.payment.next.month
)
performance(pred_train_rf, "auc")@y.values[[1]]
## [1] 0.7611617
credit_rf_pred_test <- predict(
credit_rf,
newdata = credit_test,
type = "prob"
)[,2]
pred_test_rf <- prediction(
credit_rf_pred_test,
credit_test$default.payment.next.month
)
performance(pred_test_rf, "auc")@y.values[[1]]
## [1] 0.7730796
pcut <- 1/6
credit_rf_class_test <- (credit_rf_pred_test > pcut) * 1
table(
credit_test$default.payment.next.month,
credit_rf_class_test,
dnn = c("True", "Pred")
)
## Pred
## True 0 1
## 0 1151 716
## 1 126 407
library(adabag)
## Loading required package: caret
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
## Loading required package: foreach
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
##
## Attaching package: 'adabag'
## The following object is masked from 'package:ipred':
##
## bagging
credit_train$default.payment.next.month <- as.factor(
credit_train$default.payment.next.month
)
credit_boost <- boosting(
default.payment.next.month ~ .,
data = credit_train,
boos = TRUE
)
pred_credit_boost_train <- predict(
credit_boost,
newdata = credit_train
)
pred_train_boost <- prediction(
pred_credit_boost_train$prob[,2],
credit_train$default.payment.next.month
)
performance(pred_train_boost, "auc")@y.values[[1]]
## [1] 0.8150754
pred_credit_boost_test <- predict(
credit_boost,
newdata = credit_test
)
pred_test_boost <- prediction(
pred_credit_boost_test$prob[,2],
credit_test$default.payment.next.month
)
performance(pred_test_boost, "auc")@y.values[[1]]
## [1] 0.7780233
s(x) means predictor x enters as a smooth
nonlinear term.s(), it enters as a
regular linear term.edf ≈ 1 means the term is approximately linear.edf means more nonlinear/flexible.From the Boston GAM output, predictors with edf close to
1 may be entered linearly: - zn - age -
ptratio - sometimes black
Variables with larger edf, such as rm and
lstat, should usually stay nonlinear.
rm(list=ls())
library(MASS)
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.9-4. For overview type '?mgcv'.
data(Boston)
set.seed(14696133)
sample_index <- sample(nrow(Boston), nrow(Boston) * 0.80)
Boston_train <- Boston[sample_index, ]
Boston_test <- Boston[-sample_index, ]
str(Boston_train)
## 'data.frame': 404 obs. of 14 variables:
## $ crim : num 0.029 0.0871 0.3298 0.0502 0.6298 ...
## $ zn : num 40 0 0 35 0 0 20 0 0 0 ...
## $ indus : num 1.25 12.83 21.89 6.06 8.14 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.429 0.437 0.624 0.438 0.538 ...
## $ rm : num 6.94 6.14 5.82 5.71 5.95 ...
## $ age : num 34.5 45.8 95.4 28.4 61.8 85.4 42.1 63.1 70.6 45.4 ...
## $ dis : num 8.79 4.09 2.47 6.64 4.71 ...
## $ rad : int 1 5 4 1 4 24 3 2 6 5 ...
## $ tax : num 335 398 437 304 307 666 223 270 391 224 ...
## $ ptratio: num 19.7 18.7 21.2 16.9 21 20.2 18.6 17.8 19.2 20.2 ...
## $ black : num 390 387 389 394 397 ...
## $ lstat : num 5.89 10.27 15.03 12.43 8.26 ...
## $ medv : num 26.6 20.8 18.4 17.1 20.4 8.5 21.1 28.7 18.3 19 ...
Boston_gam <- gam(
medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) +
s(rm) + s(age) + s(dis) + rad + s(tax) +
s(ptratio) + s(black) + s(lstat),
data = Boston_train
)
summary(Boston_gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + s(zn) + s(indus) + chas + s(nox) + s(rm) + s(age) +
## s(dis) + rad + s(tax) + s(ptratio) + s(black) + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.1724 1.2122 15.816 < 2e-16 ***
## chas 0.4965 0.6489 0.765 0.44471
## rad 0.3333 0.1282 2.599 0.00975 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 6.964 7.951 7.350 < 2e-16 ***
## s(zn) 1.000 1.000 0.985 0.32165
## s(indus) 4.847 5.807 3.122 0.00404 **
## s(nox) 8.799 8.974 14.931 < 2e-16 ***
## s(rm) 6.984 8.059 18.927 < 2e-16 ***
## s(age) 2.843 3.580 0.770 0.60175
## s(dis) 8.779 8.981 8.059 < 2e-16 ***
## s(tax) 5.568 6.601 5.001 2.29e-05 ***
## s(ptratio) 1.089 1.167 22.404 2.04e-06 ***
## s(black) 1.791 2.213 0.849 0.39954
## s(lstat) 7.359 8.334 23.730 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.886 Deviance explained = 90.2%
## GCV = 10.877 Scale est. = 9.2882 n = 404
plot(Boston_gam, pages = 1)
AIC(Boston_gam)
## [1] 2103.154
BIC(Boston_gam)
## [1] 2343.334
Boston_gam$deviance
## [1] 3204.195
# Model MSE using residual df
Boston_gam_mse_df <- Boston_gam$dev / Boston_gam$df.residual
Boston_gam_mse_df
## [1] 9.288165
# In-sample ASE
Boston_gam_pred_train <- predict(Boston_gam, Boston_train)
Boston_gam_ASE <- mean((Boston_gam_pred_train - Boston_train$medv)^2)
Boston_gam_ASE
## [1] 7.931176
Boston_gam_pred_test <- predict(Boston_gam, Boston_test)
Boston_gam_MSPE <- mean((Boston_gam_pred_test - Boston_test$medv)^2)
Boston_gam_MSPE
## [1] 12.37494
edf values and plots to decide which predictors
should stay nonlinear.edf around 1 can be moved to linear
terms.zn, age,
ptratio, and sometimes black may be considered
linear.rm, lstat, nox,
dis, indus, tax, and
crim usually remain nonlinear.Boston_gam_reduced <- gam(
medv ~ s(crim) + zn + s(indus) + chas + s(nox) +
s(rm) + age + s(dis) + rad + s(tax) +
ptratio + black + s(lstat),
data = Boston_train
)
summary(Boston_gam_reduced)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## medv ~ s(crim) + zn + s(indus) + chas + s(nox) + s(rm) + age +
## s(dis) + rad + s(tax) + ptratio + black + s(lstat)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.0743652 2.9592142 10.501 < 2e-16 ***
## zn 0.0185057 0.0156114 1.185 0.237
## chas 0.4584852 0.6515384 0.704 0.482
## age 0.0007472 0.0119674 0.062 0.950
## rad 0.3044299 0.1281276 2.376 0.018 *
## ptratio -0.6660244 0.1291664 -5.156 4.24e-07 ***
## black 0.0011021 0.0022008 0.501 0.617
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(crim) 6.848 7.863 7.333 < 2e-16 ***
## s(indus) 4.754 5.712 2.942 0.00612 **
## s(nox) 8.857 8.985 14.910 < 2e-16 ***
## s(rm) 6.955 8.041 18.769 < 2e-16 ***
## s(dis) 8.747 8.977 8.051 < 2e-16 ***
## s(tax) 5.597 6.639 4.965 2.42e-05 ***
## s(lstat) 7.348 8.328 23.092 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.884 Deviance explained = 90%
## GCV = 10.946 Scale est. = 9.4258 n = 404
plot(Boston_gam_reduced, pages = 1)
# In-sample ASE
pred_train_reduced <- predict(Boston_gam_reduced, Boston_train)
ASE_gam_reduced <- mean((pred_train_reduced - Boston_train$medv)^2)
ASE_gam_reduced
## [1] 8.116753
# Out-of-sample MSPE
pred_test_reduced <- predict(Boston_gam_reduced, Boston_test)
MSPE_gam_reduced <- mean((pred_test_reduced - Boston_test$medv)^2)
MSPE_gam_reduced
## [1] 12.01165
vis.gam(
Boston_gam_reduced,
view = c("lstat", "black")
)
library(dplyr)
library(ROCR)
credit_data <- read.csv(
file = "https://yanyudm.github.io/Data-Mining-R/lecture/data/credit_default.csv",
header = TRUE
)
credit_data <- rename(
credit_data,
default = default.payment.next.month
)
credit_data$SEX <- as.factor(credit_data$SEX)
credit_data$EDUCATION <- as.factor(credit_data$EDUCATION)
credit_data$MARRIAGE <- as.factor(credit_data$MARRIAGE)
set.seed(14696133)
index <- sample(nrow(credit_data), nrow(credit_data) * 0.80)
credit_train <- credit_data[index, ]
credit_test <- credit_data[-index, ]
gam_formula <- as.formula(
"default ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) +
s(BILL_AMT1) + s(PAY_AMT1) + SEX + EDUCATION + MARRIAGE"
)
credit_gam <- gam(
formula = gam_formula,
family = binomial,
data = credit_train
)
summary(credit_gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## default ~ s(LIMIT_BAL) + s(AGE) + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) +
## SEX + EDUCATION + MARRIAGE
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.25697 0.07210 -17.433 < 2e-16 ***
## SEX2 -0.18636 0.05699 -3.270 0.00107 **
## EDUCATION2 -0.06484 0.06576 -0.986 0.32414
## EDUCATION3 -0.12370 0.08827 -1.401 0.16111
## EDUCATION4 -1.48059 0.34222 -4.326 1.52e-05 ***
## MARRIAGE2 -0.08979 0.06552 -1.370 0.17057
## MARRIAGE3 -0.05574 0.24966 -0.223 0.82333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(LIMIT_BAL) 5.086 6.069 99.093 < 2e-16 ***
## s(AGE) 2.738 3.462 2.842 0.415283
## s(PAY_0) 6.524 7.266 1084.412 < 2e-16 ***
## s(BILL_AMT1) 1.951 2.457 38.654 < 2e-16 ***
## s(PAY_AMT1) 5.439 6.651 27.243 0.000267 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.2 Deviance explained = 17.2%
## UBRE = -0.12379 Scale est. = 1 n = 9600
plot(
credit_gam,
shade = TRUE,
seWithMean = TRUE,
scale = 0,
pages = 1
)
vis.gam(
credit_gam,
view = c("LIMIT_BAL", "AGE"),
theta = 140
)
creditcost <- function(r, pi){
weight1 <- 5
weight0 <- 1
pcut <- weight0 / (weight0 + weight1)
c1 <- (r == 1) & (pi < pcut)
c0 <- (r == 0) & (pi > pcut)
mean(weight1*c1 + weight0*c0)
}
pcut_gam <- 1/6
prob_gam_in <- predict(
credit_gam,
credit_train,
type = "response"
)
pred_gam_in <- (prob_gam_in >= pcut_gam) * 1
table(
credit_train$default,
pred_gam_in,
dnn = c("Observed", "Predicted")
)
## Predicted
## Observed 0 1
## 0 5173 2328
## 1 644 1455
creditcost(credit_train$default, prob_gam_in)
## [1] 0.5779167
pred <- prediction(
predictions = c(prob_gam_in),
labels = credit_train$default
)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE)
performance(pred, "auc")@y.values[[1]]
## [1] 0.7647059
AIC(credit_gam)
## [1] 8411.643
BIC(credit_gam)
## [1] 8617.676
prob_gam_out <- predict(
credit_gam,
credit_test,
type = "response"
)
pred_gam_out <- (prob_gam_out >= pcut_gam) * 1
table(
credit_test$default,
pred_gam_out,
dnn = c("Observed", "Predicted")
)
## Predicted
## Observed 0 1
## 0 1311 556
## 1 168 365
creditcost(credit_test$default, prob_gam_out)
## [1] 0.5816667
pred <- prediction(
predictions = c(prob_gam_out),
labels = credit_test$default
)
perf <- performance(pred, "tpr", "fpr")
plot(perf, colorize = TRUE)
performance(pred, "auc")@y.values[[1]]
## [1] 0.7552178
AGE has edf near 1 and its plot looks
approximately linear, move AGE to a linear term.gam_formula_reduced <- as.formula(
"default ~ s(LIMIT_BAL) + AGE + s(PAY_0) +
s(BILL_AMT1) + s(PAY_AMT1) + SEX + EDUCATION + MARRIAGE"
)
credit_gam_reduced <- gam(
formula = gam_formula_reduced,
family = binomial,
data = credit_train
)
summary(credit_gam_reduced)
##
## Family: binomial
## Link function: logit
##
## Formula:
## default ~ s(LIMIT_BAL) + AGE + s(PAY_0) + s(BILL_AMT1) + s(PAY_AMT1) +
## SEX + EDUCATION + MARRIAGE
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.413136 0.155666 -9.078 < 2e-16 ***
## AGE 0.004473 0.003458 1.293 0.19586
## SEX2 -0.186018 0.056853 -3.272 0.00107 **
## EDUCATION2 -0.061694 0.065712 -0.939 0.34781
## EDUCATION3 -0.121121 0.088205 -1.373 0.16970
## EDUCATION4 -1.469967 0.341966 -4.299 1.72e-05 ***
## MARRIAGE2 -0.097859 0.064279 -1.522 0.12790
## MARRIAGE3 -0.058406 0.249869 -0.234 0.81518
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(LIMIT_BAL) 5.154 6.137 104.76 < 2e-16 ***
## s(PAY_0) 6.522 7.265 1084.54 < 2e-16 ***
## s(BILL_AMT1) 1.923 2.421 38.38 < 2e-16 ***
## s(PAY_AMT1) 5.405 6.615 27.08 0.000279 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.2 Deviance explained = 17.1%
## UBRE = -0.12381 Scale est. = 1 n = 9600
prob_reduced_out <- predict(
credit_gam_reduced,
credit_test,
type = "response"
)
pred_reduced <- prediction(
predictions = c(prob_reduced_out),
labels = credit_test$default
)
performance(pred_reduced, "auc")@y.values[[1]]
## [1] 0.7555745
s(x) for nonlinear effects.edf ≈ 1.H0: s(x)=0.edf together to decide whether a term
should be linear or nonlinear.A neural network contains: - Input layer = predictors \(X\) - Hidden layer(s) = transformed combinations of predictors - Output layer = prediction
Each connection has: - Weights = coefficients - Bias terms = intercepts
Boston example used: - 13 predictors - hidden = c(5,3) -
2 hidden layers - 1 output node for medv
Network: - Inputs = 13 - Hidden layer 1 = 5 nodes - Hidden layer 2 = 3 nodes - Output = 1 node
Parameters: \[ (13+1)\times5 + (5+1)\times3 + 3 + 1 = 92 \]
Linear model parameters: \[ 13+1 = 14 \]
MNIST:
Parameters: \[ (784+1)\times256 + (256+1)\times128 + (128+1)\times10 = 235146 \]
Common hidden-layer activations: - Sigmoid - ReLU
For multiclass outputs: - Softmax Softmax converts outputs into probabilities that sum to 1.
With enough iterations, neural nets can overfit. Ways to reduce overfitting: - Smaller network - Fewer epochs - Validation monitoring - Dropout - Weight decay
rm(list=ls())
library(MASS)
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:ROCR':
##
## prediction
## The following object is masked from 'package:dplyr':
##
## compute
data(Boston)
set.seed(14696133)
index <- sample(1:nrow(Boston), round(0.8*nrow(Boston)))
train_Boston <- Boston[index, ]
test_Boston <- Boston[-index, ]
f <- medv ~ .
nn_unscaled <- neuralnet(
f,
data = train_Boston,
hidden = c(5,3),
linear.output = TRUE,
stepmax = 1e6
)
plot(nn_unscaled)
pr_train <- compute(nn_unscaled, train_Boston[,1:13])
pred_train <- pr_train$net.result
ASE_unscaled <- mean((train_Boston$medv - pred_train)^2)
ASE_unscaled
## [1] 81.10611
pr_test <- compute(nn_unscaled, test_Boston[,1:13])
pred_test <- pr_test$net.result
MSPE_unscaled <- mean((test_Boston$medv - pred_test)^2)
MSPE_unscaled
## [1] 97.90301
Use training min/max to scale both training and testing data. Never use testing min/max.
train_max <- apply(train_Boston, 2, max)
train_min <- apply(train_Boston, 2, min)
train_scaled <- as.data.frame(
scale(train_Boston,
center = train_min,
scale = train_max - train_min)
)
test_scaled <- as.data.frame(
scale(test_Boston,
center = train_min,
scale = train_max - train_min)
)
nn_scaled <- neuralnet(
f,
data = train_scaled,
hidden = c(5,3),
linear.output = TRUE,
stepmax = 1e6
)
plot(nn_scaled)
pr_train <- compute(nn_scaled, train_scaled[,1:13])
pred_train_scaled <- pr_train$net.result
pred_train <- pred_train_scaled *
(train_max["medv"] - train_min["medv"]) +
train_min["medv"]
ASE_scaled <- mean((train_Boston$medv - pred_train)^2)
ASE_scaled
## [1] 4.847148
pr_test <- compute(nn_scaled, test_scaled[,1:13])
pred_test_scaled <- pr_test$net.result
pred_test <- pred_test_scaled *
(train_max["medv"] - train_min["medv"]) +
train_min["medv"]
MSPE_scaled <- mean((test_Boston$medv - pred_test)^2)
MSPE_scaled
## [1] 13.13242
library(randomForest)
set.seed(14696133)
rf_model <- randomForest(
medv ~ .,
data = train_Boston,
ntree = 500
)
pred_rf <- predict(rf_model, test_Boston)
MSPE_rf <- mean((test_Boston$medv - pred_rf)^2)
MSPE_rf
## [1] 10.47369
library(ROCR)
Bank_data <- read.csv("https://yanyudm.github.io/Data-Mining-R/lecture/data/bankruptcy.csv",
header=TRUE
)
Bank_data$DLRSN <- as.numeric(Bank_data$DLRSN)
library(ROCR)
glm_fit <- glm(
DLRSN ~ R1 + R2 + R3 + R4 + R5 + R6 + R7 + R8 + R9 + R10,
family = binomial,
data = Bank_data
)
glm_prob <- predict(glm_fit, type = "response")
pred_glm <- ROCR::prediction(as.vector(glm_prob), Bank_data$DLRSN)
auc_glm <- ROCR::performance(pred_glm, "auc")@y.values[[1]]
round(auc_glm, 4)
## [1] 0.8786
library(mgcv)
gam_bank <- gam(
DLRSN ~ s(R1)+s(R2)+s(R3)+s(R4)+s(R5)+
s(R6)+s(R7)+s(R8)+s(R9)+s(R10),
family = binomial,
data = Bank_data
)
prob_gam <- as.vector(predict(gam_bank, type = "response"))
pred <- ROCR::prediction(prob_gam, Bank_data$DLRSN)
performance(pred, "auc")@y.values[[1]]
## [1] 0.8995445
Bank_data_scaled <- Bank_data
maxs <- apply(Bank_data[,-c(1:3)],2,max)
mins <- apply(Bank_data[,-c(1:3)],2,min)
Bank_data_scaled[,-c(1:3)] <-
as.data.frame(scale(
Bank_data[,-c(1:3)],
center = mins,
scale = maxs - mins
))
sample_index <- sample(nrow(Bank_data_scaled),nrow(Bank_data_scaled)*0.80)
Bank_train <- Bank_data_scaled[sample_index,]
Bank_test <- Bank_data_scaled[-sample_index,]
f_bank <- as.formula(
"DLRSN ~ R1+R2+R3+R4+R5+R6+R7+R8+R9+R10"
)
Bank_nn <- neuralnet(
f_bank,
data = Bank_data_scaled,
hidden = c(3),
algorithm = "rprop+",
linear.output = FALSE,
likelihood = TRUE
)
plot(Bank_nn)
#Out-of-sample fit performance
pcut_nn <- 1/36
prob_nn_out <- predict(Bank_nn, Bank_test, type="response")
pred_nn_out <- (prob_nn_out>=pcut_nn)*1
table(Bank_test$DLRSN, pred_nn_out, dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 500 450
## 1 7 131
#Out-of-sample ROC Curve
pred <- ROCR::prediction(prob_nn_out, Bank_test$DLRSN)
perf <- ROCR::performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
#Get the AUC
unlist(slot(ROCR::performance(pred, "auc"), "y.values"))
## [1] 0.911106
10 inputs, 3 hidden nodes, 1 output: \[ (10+1)\times3 + (3+1)\times1 = 37 \]
How many hidden layers? hidden = c(5,3) means:
2 hidden layers
If asked: Which statement is FALSE? False statement usually:
| Type | Has Response Y? | Example |
|---|---|---|
| Supervised Learning | Yes | Linear regression, logistic regression, neural networks |
| Unsupervised Learning | No | k-means clustering, hierarchical clustering |
| Method | Requires k Before Starting? | Main Idea |
|---|---|---|
| Hierarchical clustering | No | Start with each record as its own cluster, then merge |
| k-means clustering | Yes | Prespecify k and iteratively assign observations |
| Linkage | Meaning |
|---|---|
| Single linkage | Distance between nearest pair of records |
| Complete linkage | Distance between farthest pair of records |
| Centroid linkage | Distance between cluster mean vectors |
| Average linkage | Average distance across all record pairs |
data("iris")
iris1 <- iris[, 1:4]
head(iris1)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
set.seed(14696133)
kmi <- kmeans(iris1, centers = 2)
kmi
## K-means clustering with 2 clusters of sizes 53, 97
##
## Cluster means:
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.005660 3.369811 1.560377 0.290566
## 2 6.301031 2.886598 4.958763 1.695876
##
## Clustering vector:
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2
##
## Within cluster sum of squares by cluster:
## [1] 28.55208 123.79588
## (between_SS / total_SS = 77.6 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
round(sum(kmi$withinss), 2)
## [1] 152.35
kmi$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 2 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2
round(mean(iris$Sepal.Length[kmi$cluster == 1]), 4)
## [1] 5.0057
set.seed(14696133)
kmi2 <- kmeans(iris1, centers = 3)
round(sum(kmi2$withinss), 2)
## [1] 78.85
kmi2$cluster
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [75] 3 3 3 2 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 2 3 2 2 2 2 3 2 2 2 2
## [112] 2 2 3 3 2 2 2 2 3 2 3 2 3 2 2 3 3 2 2 2 2 2 3 2 2 2 2 3 2 2 2 3 2 2 2 3 2
## [149] 2 3
kmi2$centers
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.006000 3.428000 1.462000 0.246000
## 2 6.850000 3.073684 5.742105 2.071053
## 3 5.901613 2.748387 4.393548 1.433871
kmi2$size
## [1] 50 38 62
set.seed(14696133)
wss <- numeric(10)
for(k in 1:10){
km <- kmeans(iris1, centers = k, nstart = 25)
wss[k] <- sum(km$withinss)
}
plot(
1:10,
wss,
type = "b",
xlab = "Number of Clusters k",
ylab = "Total Within-Cluster Sum of Squares",
main = "Elbow Method for Choosing k"
)
### Visualizing clusters
library(fpc)
fit <- kmeans(iris[,1:4], 3)
plotcluster(
iris[,1:4],
fit$cluster
)
### Hierarchical Clustering on Iris
dist_iris <- dist(iris1, method = "euclidean")
hc_iris <- hclust(
dist_iris,
method = "single"
)
plot(
hc_iris,
hang = -1,
main = "Hierarchical Clustering: Iris"
)
memb_iris <- cutree(hc_iris, k = 3)
memb_iris
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [75] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [112] 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [149] 2 2
table(memb_iris, iris$Species)
##
## memb_iris setosa versicolor virginica
## 1 50 0 0
## 2 0 50 48
## 3 0 0 2
library(fpc)
utilities.df <- read.csv("Utilities.csv")
head(utilities.df)
## Company Fixed_charge RoR Cost Load_factor Demand_growth Sales Nuclear
## 1 Arizona 1.06 9.2 151 54.4 1.6 9077 0.0
## 2 Boston 0.89 10.3 202 57.9 2.2 5088 25.3
## 3 Central 1.43 15.4 113 53.0 3.4 9212 0.0
## 4 Commonwealth 1.02 11.2 168 56.0 0.3 6423 34.3
## 5 NY 1.49 8.8 192 51.2 1.0 3300 15.6
## 6 Florida 1.32 13.5 111 60.0 -2.2 11127 22.5
## Fuel_Cost
## 1 0.628
## 2 1.555
## 3 1.058
## 4 0.700
## 5 2.044
## 6 1.241
plot(
Fuel_Cost ~ Sales,
data = utilities.df,
xlim = c(0, 20000),
ylim = c(0, 2.5),
xlab = "Sales",
ylab = "Fuel Cost"
)
text(
Fuel_Cost ~ Sales,
labels = Company,
data = utilities.df,
pos = 4,
cex = 0.8
)
row.names(utilities.df) <- utilities.df[, 1]
utilities.df <- utilities.df[, -1]
head(utilities.df)
## Fixed_charge RoR Cost Load_factor Demand_growth Sales Nuclear
## Arizona 1.06 9.2 151 54.4 1.6 9077 0.0
## Boston 0.89 10.3 202 57.9 2.2 5088 25.3
## Central 1.43 15.4 113 53.0 3.4 9212 0.0
## Commonwealth 1.02 11.2 168 56.0 0.3 6423 34.3
## NY 1.49 8.8 192 51.2 1.0 3300 15.6
## Florida 1.32 13.5 111 60.0 -2.2 11127 22.5
## Fuel_Cost
## Arizona 0.628
## Boston 1.555
## Central 1.058
## Commonwealth 0.700
## NY 2.044
## Florida 1.241
utilities.df.norm <- utilities.df
cols <- colnames(utilities.df)
for(i in cols){
utilities.df.norm[[i]] <-
(utilities.df.norm[[i]] - mean(utilities.df[[i]])) / sd(utilities.df[[i]])
}
head(utilities.df.norm)
## Fixed_charge RoR Cost Load_factor Demand_growth
## Arizona -0.2931579 -0.6846390 -0.417122002 -0.5777152 -0.52622751
## Boston -1.2145113 -0.1944537 0.821002037 0.2068363 -0.33381191
## Central 1.7121407 2.0782236 -1.339645796 -0.8915357 0.05101929
## Commonwealth -0.5099470 0.2066070 -0.004413989 -0.2190631 -0.94312798
## NY 2.0373243 -0.8628882 0.578232617 -1.2950193 -0.71864311
## Florida 1.1159709 1.2315399 -1.388199680 0.6775672 -1.74485965
## Sales Nuclear Fuel_Cost
## Arizona 0.04590290 -0.7146294 -0.85367545
## Boston -1.07776413 0.7920476 0.81329670
## Central 0.08393124 -0.7146294 -0.08043055
## Commonwealth -0.70170610 1.3280197 -0.72420189
## NY -1.58142837 0.2143888 1.69263800
## Florida 0.62337028 0.6253007 0.24864810
plot(
Fuel_Cost ~ Sales,
data = utilities.df.norm,
xlab = "Sales Standardized",
ylab = "Fuel Cost Standardized"
)
dist.norm.two <- dist(
utilities.df.norm[c("Sales", "Fuel_Cost")],
method = "euclidean"
)
hc_two <- hclust(
dist.norm.two,
method = "single"
)
plot(
hc_two,
hang = -1,
ann = FALSE
)
abline(h = 0.75, col = "red")
memb_two <- cutree(hc_two, k = 3)
memb_two
## Arizona Boston Central Commonwealth NY Florida
## 1 2 1 1 2 1
## Hawaiian Idaho Kentucky Madison Nevada New England
## 2 3 1 1 3 2
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 1 2 3 2 1
## Texas Wisconsin United Virginia
## 3 1 2 1
dist.norm.all <- dist(
utilities.df.norm,
method = "euclidean"
)
hc_all <- hclust(
dist.norm.all,
method = "single"
)
plot(
hc_all,
hang = -1,
ann = FALSE
)
abline(h = 3.5, col = "red")
abline(h = 2.6, col = "green")
memb_all <- cutree(hc_all, k = 6)
memb_all
## Arizona Boston Central Commonwealth NY Florida
## 1 1 2 1 3 1
## Hawaiian Idaho Kentucky Madison Nevada New England
## 1 4 1 1 5 1
## Northern Oklahoma Pacific Puget San Diego Southern
## 1 1 1 4 6 1
## Texas Wisconsin United Virginia
## 1 1 1 1
set.seed(14696133)
km_util <- kmeans(
utilities.df.norm,
centers = 6
)
km_util
## K-means clustering with 6 clusters of sizes 3, 6, 1, 5, 4, 3
##
## Cluster means:
## Fixed_charge RoR Cost Load_factor Demand_growth Sales
## 1 -0.60027572 -0.8331800 1.3389101 -0.4805802 0.9917178 1.8565214
## 2 0.38430785 0.7413546 -1.1494764 -0.5216758 -0.7827816 0.4343553
## 3 2.03732429 -0.8628882 0.5782326 -1.2950193 -0.7186431 -1.5814284
## 4 -0.01133215 0.3313815 0.2189339 -0.3580408 0.1664686 -0.4018738
## 5 -1.09256750 -1.1191214 0.2019400 0.8456853 0.1311925 -0.8264954
## 6 0.62819552 0.5779595 0.1331553 1.4247590 0.3610222 -0.4263058
## Nuclear Fuel_Cost
## 1 -0.7146294 -0.9657660
## 2 -0.4913077 -0.4068118
## 3 0.2143888 1.6926380
## 4 1.5650384 -0.5954476
## 5 -0.2009895 1.1599082
## 6 -0.7146294 0.6610454
##
## Clustering vector:
## Arizona Boston Central Commonwealth NY Florida
## 2 5 2 4 3 2
## Hawaiian Idaho Kentucky Madison Nevada New England
## 6 1 6 4 1 6
## Northern Oklahoma Pacific Puget San Diego Southern
## 4 2 5 1 5 2
## Texas Wisconsin United Virginia
## 2 4 5 4
##
## Within cluster sum of squares by cluster:
## [1] 9.533522 19.353940 0.000000 10.177094 12.574083 6.019991
## (between_SS / total_SS = 65.7 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
km_util$cluster
## Arizona Boston Central Commonwealth NY Florida
## 2 5 2 4 3 2
## Hawaiian Idaho Kentucky Madison Nevada New England
## 6 1 6 4 1 6
## Northern Oklahoma Pacific Puget San Diego Southern
## 4 2 5 1 5 2
## Texas Wisconsin United Virginia
## 2 4 5 4
km_util$centers
## Fixed_charge RoR Cost Load_factor Demand_growth Sales
## 1 -0.60027572 -0.8331800 1.3389101 -0.4805802 0.9917178 1.8565214
## 2 0.38430785 0.7413546 -1.1494764 -0.5216758 -0.7827816 0.4343553
## 3 2.03732429 -0.8628882 0.5782326 -1.2950193 -0.7186431 -1.5814284
## 4 -0.01133215 0.3313815 0.2189339 -0.3580408 0.1664686 -0.4018738
## 5 -1.09256750 -1.1191214 0.2019400 0.8456853 0.1311925 -0.8264954
## 6 0.62819552 0.5779595 0.1331553 1.4247590 0.3610222 -0.4263058
## Nuclear Fuel_Cost
## 1 -0.7146294 -0.9657660
## 2 -0.4913077 -0.4068118
## 3 0.2143888 1.6926380
## 4 1.5650384 -0.5954476
## 5 -0.2009895 1.1599082
## 6 -0.7146294 0.6610454
km_util$withinss
## [1] 9.533522 19.353940 0.000000 10.177094 12.574083 6.019991
sum(km_util$withinss)
## [1] 57.65863
km_util$size
## [1] 3 6 1 5 4 3
k-means requires: - Distance metric - Number of clusters k - Initial cluster centroids So if asked, the answer is all of the above.
fviz_cluster() uses PCA when data has more than two
variables.\[ E(Y|X) \]
set.seed() makes random results reproducible.set.seed() when:
set.seed(), R uses a default random seed based
on the clock.rm(list=ls())
# Random draws
rbinom(5, 1, 0.9)
## [1] 1 1 1 1 0
rnorm(5)
## [1] -0.7863282 0.6716143 1.0121694 -0.5537609 -0.2810061
# Reproducible random numbers
set.seed(1234)
rnorm(5)
## [1] -1.2070657 0.2774292 1.0844412 -2.3456977 0.4291247
# This will continue from the prior random sequence
rnorm(5)
## [1] 0.5060559 -0.5747400 -0.5466319 -0.5644520 -0.8900378
# Random sample
sample(5, 2)
## [1] 4 5
# Reproducible sample
set.seed(1234)
sample(5, 2)
## [1] 4 5
library(MASS)
data(Boston)
# Random 80/20 split
sample_index <- sample(nrow(Boston), nrow(Boston) * 0.80)
Boston_train <- Boston[sample_index, ]
Boston_test <- Boston[-sample_index, ]
# Random 60/40 split
sample_index <- sample(nrow(Boston), nrow(Boston) * 0.60)
Boston_train <- Boston[sample_index, ]
Boston_test <- Boston[-sample_index, ]
# Reproducible 80/20 split
set.seed(14696133)
sample_index <- sample(nrow(Boston), nrow(Boston) * 0.80)
Boston_train <- Boston[sample_index, ]
Boston_test <- Boston[-sample_index, ]
Boston.train.lm <- lm(
medv ~ .,
data = Boston_train
)
summary(Boston.train.lm)
##
## Call:
## lm(formula = medv ~ ., data = Boston_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.1310 -2.7913 -0.6166 1.5033 25.4967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.334893 5.688919 6.563 1.69e-10 ***
## crim -0.107868 0.034101 -3.163 0.001683 **
## zn 0.064312 0.015186 4.235 2.85e-05 ***
## indus 0.079975 0.069694 1.148 0.251868
## chas 2.216388 0.918546 2.413 0.016285 *
## nox -18.193056 4.321909 -4.209 3.18e-05 ***
## rm 3.382181 0.480410 7.040 8.71e-12 ***
## age -0.001973 0.014874 -0.133 0.894561
## dis -1.541009 0.228943 -6.731 6.04e-11 ***
## rad 0.295365 0.073101 4.041 6.43e-05 ***
## tax -0.012460 0.004148 -3.004 0.002837 **
## ptratio -0.890234 0.145250 -6.129 2.17e-09 ***
## black 0.010866 0.002943 3.692 0.000254 ***
## lstat -0.526979 0.057059 -9.236 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 390 degrees of freedom
## Multiple R-squared: 0.7327, Adjusted R-squared: 0.7238
## F-statistic: 82.25 on 13 and 390 DF, p-value: < 2.2e-16
yhat.train <- predict(
object = Boston.train.lm,
newdata = Boston_train
)
mse.train <- sum((Boston_train$medv - yhat.train)^2) /
(dim(Boston_train)[1] - 2 - 1)
sigmahat.train <- sqrt(mse.train)
mse.train
## [1] 21.89311
sigmahat.train
## [1] 4.679007
yhat.test <- predict(
object = Boston.train.lm,
newdata = Boston_test
)
mspe <- sum((Boston_test$medv - yhat.test)^2) /
dim(Boston_test)[1]
mspe
## [1] 23.95934
mean((Boston_test$medv - yhat.test)^2)
## [1] 23.95934
library(boot)
model_full <- glm(
medv ~ .,
data = Boston
)
# 2-fold CV may vary due to random fold split
cv.glm(
data = Boston,
glmfit = model_full,
K = 2
)$delta[2]
## [1] 22.90188
# LOOCV does not vary
cv.glm(
data = Boston,
glmfit = model_full,
K = nrow(Boston)
)$delta[2]
## [1] 23.72388
set.seed() to reproduce the result.library(fpc)
data(iris)
fit <- kmeans(
iris[, 1:4],
centers = 5
)
plotcluster(
iris[, 1:4],
fit$cluster
)
Association rules are written as: \[ A \Rightarrow B \] Where: - \(A\) = antecedent set - \(B\) = consequent set Example:
{ToppingFood} => {Ice.Cream.ConeFood}
Support measures how often the full rule appears. \[ Support(A \Rightarrow B) = P(A \cap B) \] In words: - Percent of transactions containing both antecedent and consequent. - Low support means the rule may be rare or may have occurred by chance.
Confidence measures how often the consequent appears when the antecedent appears. \[ Confidence(A \Rightarrow B) = P(B|A) \] In words: - Among transactions with \(A\), what percent also include \(B\)? - Higher confidence means the rule is more reliable.
Lift compares the rule to independence. \[ Lift = \frac{Confidence}{Benchmark \ Confidence} \]
Equivalent: \[ Lift = \frac{P(A \cap B)}{P(A)P(B)} \]
| Lift | Meaning |
|---|---|
| Greater than 1 | Items appear together more often than expected |
| Equal to 1 | Items are approximately independent |
| Less than 1 | Items appear together less often than expected |
Suppose: - 1000 transactions - 1/4 include forks - 1/2 include spoon - 100 include both forks and spoon For rule:
forks => spoon
Support: \[ 100/1000 = 0.1 \]
Confidence: \[ 100/(1000 \times 1/4) = 100/250 = 0.4 \]
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
TransFood <- read.csv(
"https://yanyudm.github.io/Data-Mining-R/data/food_4_association.csv"
)
TransFood <- TransFood[, -1]
# Find entries not equal to 0 or 1 and change them to 1
Others <- which(
!(as.matrix(TransFood) == 1 | as.matrix(TransFood) == 0),
arr.ind = TRUE
)
TransFood[Others] <- 1
TransFood <- as(
as.matrix(TransFood),
"transactions"
)
dim(TransFood)
## [1] 19076 118
Expected number of rows:
19076
summary(TransFood)
## transactions as itemMatrix in sparse format with
## 19076 rows (elements/itemsets/transactions) and
## 118 columns (items) and a density of 0.02230729
##
## most frequent items:
## Bottled.WaterFood Slice.of.CheeseFood Medium.DrinkFood Small.DrinkFood
## 3166 3072 2871 2769
## Slice.of.PeppFood (Other)
## 2354 35981
##
## element (itemset/transaction) length distribution:
## sizes
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 15
## 197 5675 5178 3253 2129 1293 655 351 178 95 42 14 8 7 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 2.000 2.632 4.000 15.000
##
## includes extended item information - examples:
## labels
## 1 Add.CheeseFood
## 2 BeerFood
## 3 Bottled.WaterFood
Most frequent item:
Bottled.WaterFood
Expected count:
3166
The Apriori algorithm: 1. Finds frequent itemsets using minimum support. 2. Generates rules using minimum confidence.
Important property: - Any subset of a frequent itemset must also be frequent. - If an itemset is not frequent, none of its supersets can be frequent.
basket_rules <- apriori(
TransFood,
parameter = list(
sup = 0.005,
conf = 0.95,
target = "rules"
)
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.95 0.1 1 none FALSE TRUE 5 0.005 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 95
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[115 item(s), 19076 transaction(s)] done [0.00s].
## sorting and recoding items ... [58 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [2 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(basket_rules)
## set of 2 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3
## 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 2.25 2.50 2.50 2.75 3.00
##
## summary of quality measures:
## support confidence coverage lift
## Min. :0.005662 Min. :0.9558 Min. :0.005924 Min. :8.948
## 1st Qu.:0.011389 1st Qu.:0.9664 1st Qu.:0.011598 1st Qu.:9.159
## Median :0.017116 Median :0.9770 Median :0.017273 Median :9.370
## Mean :0.017116 Mean :0.9770 Mean :0.017273 Mean :9.370
## 3rd Qu.:0.022843 3rd Qu.:0.9876 3rd Qu.:0.022948 3rd Qu.:9.581
## Max. :0.028570 Max. :0.9982 Max. :0.028622 Max. :9.792
## count
## Min. :108.0
## 1st Qu.:217.2
## Median :326.5
## Mean :326.5
## 3rd Qu.:435.8
## Max. :545.0
##
## mining info:
## data ntransactions support confidence
## TransFood 19076 0.005 0.95
## call
## apriori(data = TransFood, parameter = list(sup = 0.005, conf = 0.95, target = "rules"))
inspect(head(basket_rules))
## lhs rhs support confidence coverage lift count
## [1] {ToppingFood} => {Ice.Cream.ConeFood} 0.028569931 0.9981685 0.028622353 8.947868 545
## [2] {Chicken.TendersFood,
## Krazy.KritterFood} => {French.Fries.BasketFood} 0.005661564 0.9557522 0.005923674 9.791584 108
Expected number of rules:
2
inspect(basket_rules)
## lhs rhs support confidence coverage lift count
## [1] {ToppingFood} => {Ice.Cream.ConeFood} 0.028569931 0.9981685 0.028622353 8.947868 545
## [2] {Chicken.TendersFood,
## Krazy.KritterFood} => {French.Fries.BasketFood} 0.005661564 0.9557522 0.005923674 9.791584 108
Expected important rule:
{ToppingFood} => {Ice.Cream.ConeFood}
Expected lift:
8.95
Since lift is greater than 1:
The antecedent set and consequent set appear more often together than expected.
data(iris)
set.seed(14696133)
sample_index <- sample(
nrow(iris),
nrow(iris) * 0.80
)
iris1 <- iris[sample_index, ]
summary(iris1)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## Min. :4.30 Min. :2.000 Min. :1.000 Min. :0.100 setosa :38
## 1st Qu.:5.10 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.375 versicolor:44
## Median :5.80 Median :3.000 Median :4.350 Median :1.300 virginica :38
## Mean :5.84 Mean :3.046 Mean :3.766 Mean :1.202
## 3rd Qu.:6.40 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.90 Max. :4.400 Max. :6.900 Max. :2.400
set.seed() makes random processes reproducible.Antecedent => Consequent