library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.1.8
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(ISLR)
library(pls)   # needed for caret's 'pcr' and 'pls' methods used below
set.seed(1)
X <- rnorm(100)
noise <- rnorm(100)
Y <- 3 + 1*X + 4*X^2 - 1*X^3 + noise
require(leaps)
## Loading required package: leaps
df <- data.frame(Y, X)
fit <- regsubsets(Y ~ poly(X, 10), data = df, nvmax = 10)
fit_summary <- summary(fit)
require(tidyverse);require(ggplot2);require(ggthemes);
tibble(Cp = fit_summary$cp,
       BIC = fit_summary$bic,
       AdjR2 = fit_summary$adjr2) %>%
  mutate(id = row_number()) %>%
  gather(value_type, value, -id) %>%
  ggplot(aes(id, value, col = value_type)) +
  geom_line() + geom_point() + ylab('') + xlab('Number of Variables Used') +
  facet_wrap(~ value_type, scales = 'free') +
  theme_tufte() + scale_x_continuous(breaks = 1:10)
require(caret)
model_back <- train(Y ~ poly(X, 10), data = df,
                    method = 'glmStepAIC', direction = 'backward',
                    trace = 0,
                    trControl = trainControl(method = 'none', verboseIter = FALSE))
postResample(predict(model_back, df), df$Y)
## RMSE Rsquared MAE
## 0.9314956 0.9569843 0.7488821
summary(model_back$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8914 -0.5860 -0.1516 0.5892 2.1794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.10265 0.09557 63.856 < 2e-16 ***
## `poly(X, 10)1` -7.19295 0.95569 -7.526 2.96e-11 ***
## `poly(X, 10)2` 40.74405 0.95569 42.633 < 2e-16 ***
## `poly(X, 10)3` -14.70908 0.95569 -15.391 < 2e-16 ***
## `poly(X, 10)5` 1.48019 0.95569 1.549 0.125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.9133516)
##
## Null deviance: 2017.132 on 99 degrees of freedom
## Residual deviance: 86.768 on 95 degrees of freedom
## AIC: 281.59
##
## Number of Fisher Scoring iterations: 2
x_poly <- poly(df$X, 10)
colnames(x_poly) <- paste0('poly', 1:10)
model_forw <- train(y = Y, x = x_poly,
                    method = 'glmStepAIC', direction = 'forward',
                    trace = 0,
                    trControl = trainControl(method = 'none', verboseIter = FALSE))
postResample(predict(model_forw, data.frame(x_poly)), df$Y)
## RMSE Rsquared MAE
## 0.9314956 0.9569843 0.7488821
summary(model_forw$finalModel)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8914 -0.5860 -0.1516 0.5892 2.1794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.10265 0.09557 63.856 < 2e-16 ***
## poly2 40.74405 0.95569 42.633 < 2e-16 ***
## poly3 -14.70908 0.95569 -15.391 < 2e-16 ***
## poly1 -7.19295 0.95569 -7.526 2.96e-11 ***
## poly5 1.48019 0.95569 1.549 0.125
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.9133516)
##
## Null deviance: 2017.132 on 99 degrees of freedom
## Residual deviance: 86.768 on 95 degrees of freedom
## AIC: 281.59
##
## Number of Fisher Scoring iterations: 2
lasso_model <- train(Y ~ poly(X, 10), data = df,
                     method = 'glmnet',
                     trControl = trainControl(method = 'cv', number = 10),
                     tuneGrid = expand.grid(alpha = 1,
                                            lambda = seq(0.001, 0.2, by = 0.005)))
plot(lasso_model)
plot(varImp(lasso_model))
coef(lasso_model$finalModel, lasso_model$bestTune$lambda)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 6.1026472
## poly(X, 10)1 -6.6829512
## poly(X, 10)2 40.2340466
## poly(X, 10)3 -14.1990830
## poly(X, 10)4 0.7470950
## poly(X, 10)5 0.9701884
## poly(X, 10)6 .
## poly(X, 10)7 .
## poly(X, 10)8 .
## poly(X, 10)9 .
## poly(X, 10)10 -0.4412295
postResample(predict(lasso_model, df), df$Y)
## RMSE Rsquared MAE
## 0.9265197 0.9577192 0.7566424
Y_7 <- 3 + 8*X^7 + noise
df_2 <- tibble(Y_7 = Y_7, X = df$X)
fit <- regsubsets(Y_7 ~ poly(X, 10), data = df_2, nvmax = 10)
fit_summary <- summary(fit)
tibble(Cp = fit_summary$cp,
       BIC = fit_summary$bic,
       AdjR2 = fit_summary$adjr2) %>%
  mutate(id = row_number()) %>%
  gather(value_type, value, -id) %>%
  ggplot(aes(id, value, col = value_type)) +
  geom_line() + geom_point() + ylab('') + xlab('Number of Variables Used') +
  facet_wrap(~ value_type, scales = 'free') +
  theme_tufte() + scale_x_continuous(breaks = 1:10)
lasso_y7_model <- train(Y_7 ~ poly(X, 10), data = df_2,
                        method = 'glmnet',
                        trControl = trainControl(method = 'cv', number = 5),
                        tuneGrid = expand.grid(alpha = 1,
                                               lambda = seq(0.001, 0.2, by = 0.005)))
plot(lasso_y7_model)
plot(varImp(lasso_y7_model))
coef(lasso_y7_model$finalModel, lasso_y7_model$bestTune$lambda)
## 11 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 36.72505
## poly(X, 10)1 2630.24239
## poly(X, 10)2 1059.42846
## poly(X, 10)3 3491.14380
## poly(X, 10)4 597.14287
## poly(X, 10)5 1363.96802
## poly(X, 10)6 70.93052
## poly(X, 10)7 177.79560
## poly(X, 10)8 .
## poly(X, 10)9 .
## poly(X, 10)10 .
postResample(predict(lasso_y7_model, df_2), df_2$Y_7)
## RMSE Rsquared MAE
## 14.2854375 0.9996164 4.9531386
require(ISLR)
require(caret)
require(tidyverse)
data('College')
set.seed(1)
inTrain <- createDataPartition(College$Apps, p = 0.75, list = FALSE)
training <- College[inTrain,]
testing <- College[-inTrain,]
preObj <- preProcess(training, method = c('center', 'scale'))
training <- predict(preObj, training)
testing <- predict(preObj, testing)
y_train <- training$Apps
y_test <- testing$Apps
one_hot_encoding <- dummyVars(Apps ~ ., data = training)
x_train <- predict(one_hot_encoding, training)
x_test <- predict(one_hot_encoding, testing)
lin_model <- lm(Apps ~ ., data = training)
pred <- predict(lin_model, testing)
(lin_info <- postResample(pred, testing$Apps))
## RMSE Rsquared MAE
## 0.2799768 0.9201765 0.1568743
ridge_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 10e2, length.out = 20)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))
## RMSE Rsquared MAE
## 0.2853247 0.9211286 0.1645806
coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) 0.034871314
## Private.No 0.075423210
## Private.Yes -0.076037580
## Accept 0.665628735
## Enroll 0.090243371
## Top10perc 0.107160248
## Top25perc 0.011628030
## F.Undergrad 0.063308800
## P.Undergrad 0.017427317
## Outstate -0.028995432
## Room.Board 0.048720533
## Books 0.012799145
## Personal -0.002894430
## PhD -0.017989250
## Terminal -0.010434665
## S.F.Ratio 0.006920126
## perc.alumni -0.031683867
## Expend 0.083525070
## Grad.Rate 0.058131023
plot(ridge_fit)
plot(varImp(ridge_fit))
lasso_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.0001, 1, length.out = 50)))
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo, :
## There were missing values in resampled performance measures.
(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))
## RMSE Rsquared MAE
## 0.2914812 0.9141364 0.1511801
coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## s1
## (Intercept) -1.470609e-02
## Private.No 5.410732e-02
## Private.Yes -1.041607e-14
## Accept 8.883839e-01
## Enroll .
## Top10perc 1.092201e-01
## Top25perc .
## F.Undergrad .
## P.Undergrad .
## Outstate .
## Room.Board 1.483337e-04
## Books .
## Personal .
## PhD .
## Terminal .
## S.F.Ratio .
## perc.alumni .
## Expend 4.067011e-02
## Grad.Rate 4.062729e-06
plot(lasso_fit)
plot(varImp(lasso_fit))
pcr_model <- train(x = x_train, y = y_train,
                   method = 'pcr',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pcr_info <- postResample(predict(pcr_model, x_test), y_test))
## RMSE Rsquared MAE
## 0.3231292 0.8916531 0.1986075
coef(pcr_model$finalModel)
## , , 10 comps
##
## .outcome
## Private.No 0.031985972
## Private.Yes -0.031985972
## Accept 0.343576750
## Enroll 0.305359773
## Top10perc 0.042630417
## Top25perc 0.027790893
## F.Undergrad 0.273818439
## P.Undergrad -0.049487667
## Outstate 0.038573119
## Room.Board 0.070607615
## Books 0.016433593
## Personal -0.023529455
## PhD -0.023992433
## Terminal -0.024182230
## S.F.Ratio 0.003741623
## perc.alumni -0.070567887
## Expend 0.090126298
## Grad.Rate 0.071302714
plot(pcr_model)
plot(varImp(pcr_model))
pls_model <- train(x = x_train, y = y_train,
                   method = 'pls',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(ncomp = 1:10))
(pls_info <- postResample(predict(pls_model, x_test), y_test))
## RMSE Rsquared MAE
## 0.2792580 0.9209302 0.1572165
coef(pls_model$finalModel)
## , , 10 comps
##
## .outcome
## Private.No 0.071314109
## Private.Yes -0.071314109
## Accept 1.039263053
## Enroll -0.169855531
## Top10perc 0.235572152
## Top25perc -0.070270182
## F.Undergrad -0.022580399
## P.Undergrad 0.032522451
## Outstate -0.085919942
## Room.Board 0.036344367
## Books 0.004835496
## Personal 0.007546695
## PhD -0.044709439
## Terminal 0.002332025
## S.F.Ratio 0.010868271
## perc.alumni -0.009046689
## Expend 0.072963738
## Grad.Rate 0.037425850
plot(pls_model)
as_tibble(rbind(lin_info,
                ridge_info,
                lasso_info,
                pcr_info,
                pls_info)) %>%
  mutate(model = c('Linear', 'Ridge', 'Lasso', 'PCR', 'PLS')) %>%
  select(model, RMSE, Rsquared)
testing %>%
  summarize(sd = sd(Apps))
All of the models perform very similarly, with only the PCR model lagging slightly behind. Every model reaches an R² of roughly 0.89 or higher and a test RMSE below about 0.33 on the standardized scale. Comparing these RMSE values against the standard deviation of the response variable shows that every model predicts the number of applications very accurately.
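To make that comparison concrete, here is a minimal sketch (reusing the `*_info` objects collected above; the `rmse_vs_sd` name is just for illustration) that expresses each model's test RMSE as a fraction of the standard deviation of the standardized test response:
rmse_vs_sd <- tibble(model = c('Linear', 'Ridge', 'Lasso', 'PCR', 'PLS'),
                     RMSE = c(lin_info['RMSE'], ridge_info['RMSE'], lasso_info['RMSE'],
                              pcr_info['RMSE'], pls_info['RMSE'])) %>%
  mutate(RMSE_to_sd = RMSE / sd(testing$Apps))  # a small ratio means the model captures most of the spread in Apps
rmse_vs_sd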
require(ggthemes)
# Test-set residuals: model predictions minus the observed (standardized) Apps
residfunc <- function(fit, data) {
  predict(fit, data) - testing$Apps
}
tibble(Observed = testing$Apps,
       LM = residfunc(lin_model, testing),
       Ridge = residfunc(ridge_fit, x_test),
       Lasso = residfunc(lasso_fit, x_test),
       PCR = residfunc(pcr_model, x_test),
       PLS = residfunc(pls_model, x_test)) %>%
  gather(Model, Residuals, -Observed) %>%
  ggplot(aes(Observed, Residuals, col = Model)) +
  geom_hline(yintercept = 0, lty = 2) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = 'loess', alpha = 0.01, col = 'lightsalmon3') +
  facet_wrap(~ Model, ncol = 5) +
  theme_tufte() +
  theme(legend.position = 'top') +
  coord_flip()
## Warning: attributes are not identical across measure variables; they will be
## dropped
## `geom_smooth()` using formula = 'y ~ x'