final Khulan Bayarkhuu

require(MASS); require(tidyverse); require(ggthemes)

## Loading required package: MASS

## Loading required package: tidyverse

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: ggthemes

require(knitr); require(kableExtra); require(corrplot)

## Loading required package: knitr
## Loading required package: kableExtra
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
## 
## Loading required package: corrplot
## corrplot 0.92 loaded

require(broom)

## Loading required package: broom

set.seed(1)
theme_set(theme_tufte(base_size = 14))
data('Boston')

Boston <- Boston %>%
    mutate(chas = factor(chas),
           crime_factor = factor(ifelse(crim > median(crim), 
                                              'High', 'Low'), 
                                       levels = c('High', 'Low')))

cor_test <- Boston %>%
    select(-chas, -crime_factor) %>%
    cor.mtest(conf.level = .95)

Boston %>%
    select(-chas, -crime_factor) %>%
    cor %>%
    corrplot(method = 'color', 
         order = 'hclust', addrect = 2,
         tl.col = 'black', addCoef.col = 'black', number.cex = 0.65,
         p.mat = cor_test$p, sig.level = .05)

scale_this <- function(x) {
    (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

table_1 <- Boston %>%
          dplyr::select(zn:crime_factor) %>%
          gather(Variable, value, -crime_factor, -chas) %>%
          group_by(Variable) %>%
          mutate(value = scale_this(value)) %>%
          group_by(Variable, crime_factor) %>%
          summarize(Q10 = quantile(value, .10),
                    Q25 = quantile(value, .25),
                    median = median(value), 
                    mean = mean(value), 
                    Q75 = quantile(value, .75),
                    Q90 = quantile(value, .90))

## `summarise()` has grouped output by 'Variable'. You can override using the
## `.groups` argument.

kable(table_1,
      digits = 3, format = 'html') %>%
    kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
    column_spec(1:2, bold = T) %>%
    scroll_box(height = '400px')

Variable	crime_factor	Q10	Q25	median	mean	Q75	Q90
age	High	-0.234	0.470	0.846	0.613	1.042	1.116
age	Low	-1.782	-1.317	-0.713	-0.613	0.118	0.750
black	High	-2.942	-0.298	0.292	-0.351	0.421	0.441
black	Low	0.222	0.356	0.405	0.351	0.441	0.441
dis	High	-1.108	-0.970	-0.786	-0.616	-0.355	0.098
dis	Low	-0.631	-0.202	0.628	0.616	1.275	1.915
indus	High	-0.720	-0.180	1.015	0.603	1.015	1.231
indus	Low	-1.306	-1.132	-0.801	-0.603	-0.376	0.247
lstat	High	-0.914	-0.290	0.352	0.453	1.076	1.931
lstat	Low	-1.149	-0.964	-0.580	-0.453	-0.045	0.457
medv	High	-1.319	-0.971	-0.504	-0.263	0.051	0.995
medv	Low	-0.436	-0.243	0.062	0.263	0.649	1.345
nox	High	-0.412	-0.092	0.598	0.723	1.254	1.599
nox	Low	-1.256	-1.067	-0.912	-0.723	-0.343	-0.046
ptratio	High	-1.735	-0.026	0.806	0.253	0.806	1.175
ptratio	Low	-1.504	-0.857	-0.210	-0.253	0.298	0.806
rad	High	-0.637	-0.522	1.660	0.619	1.660	1.660
rad	Low	-0.867	-0.752	-0.637	-0.619	-0.522	-0.408
rm	High	-1.337	-0.751	-0.222	-0.156	0.265	1.221
rm	Low	-0.672	-0.429	0.036	0.156	0.630	1.229
tax	High	-0.618	-0.601	1.529	0.608	1.529	1.529
tax	Low	-1.105	-0.957	-0.719	-0.608	-0.375	-0.061
zn	High	-0.487	-0.487	-0.487	-0.436	-0.487	-0.487
zn	Low	-0.487	-0.487	-0.487	0.436	0.971	2.943

difference <- function(x) x[1] - x[2]

table_2 <- table_1 %>%
    group_by(Variable) %>%
    summarize(diff_Q10 = difference(Q10),
              diff_Q25 = difference(Q25),
              diff_med = difference(median),
              diff_mean = difference(mean),
              diff_Q75 = difference(Q75),
              diff_Q90 = difference(Q90))


kable(table_2,
      digits = 3, format = 'html') %>%
    kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
    column_spec(1, bold = T) %>%
    add_header_above(c(' ' = 1, 'Between-Groups Differences' = 6)) %>%
    scroll_box(height = '400px')

	Between-Groups Differences
Variable	diff_Q10	diff_Q25	diff_med	diff_mean	diff_Q75	diff_Q90
age	1.548	1.787	1.560	1.227	0.924	0.366
black	-3.165	-0.654	-0.113	-0.702	-0.020	0.000
dis	-0.477	-0.768	-1.414	-1.231	-1.630	-1.817
indus	0.586	0.952	1.816	1.205	1.391	0.984
lstat	0.235	0.674	0.933	0.906	1.122	1.474
medv	-0.883	-0.728	-0.565	-0.526	-0.598	-0.350
nox	0.844	0.975	1.510	1.445	1.597	1.645
ptratio	-0.231	0.831	1.016	0.507	0.508	0.370
rad	0.230	0.230	2.297	1.238	2.182	2.067
rm	-0.666	-0.322	-0.258	-0.312	-0.364	-0.007
tax	0.487	0.356	2.249	1.216	1.905	1.590
zn	0.000	0.000	0.000	-0.871	-1.458	-3.430

table_3 <- table_2 %>%
    gather(Measure, value, -Variable) %>%
    group_by(Variable) %>%
    summarize(`Absolute Mean Differences` = abs(mean(value))) %>%
    arrange(desc(`Absolute Mean Differences`))

## Warning: attributes are not identical across measure variables; they will be
## dropped

kable(table_3,
      digits = 3, format = 'html') %>%
    kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
    column_spec(1, bold = T) %>%
    scroll_box(height = '400px')

Variable	Absolute Mean Differences
rad	1.374
nox	1.336
tax	1.300
age	1.235
dis	1.223
indus	1.156
zn	0.960
lstat	0.890
black	0.776
medv	0.608
ptratio	0.500
rm	0.322

Boston %>%
    dplyr::select(zn:crime_factor) %>%
    gather(value_type, value, -crime_factor, -chas) %>%
    ggplot(aes(value_type, value, fill = crime_factor)) +
    geom_boxplot(alpha = 0.5) +
    facet_wrap(~value_type, scales = 'free') +
    scale_fill_discrete(name = 'Crime Rate') +
    theme(legend.position = 'top')

Boston %>%
    dplyr::select(crim, crime_factor, rad, nox, tax, age, dis, indus) %>%
    gather(Variable, value, -crim, -crime_factor) %>%
    mutate(Variable = str_to_title(Variable)) %>%
    ggplot(aes(value, crim)) +
    geom_point(aes(col = crime_factor)) +
    facet_wrap(~ Variable, scales = 'free') +
    geom_smooth(method = 'lm', formula = y ~ poly(x, 3), se = FALSE) +
    guides(col = FALSE) +
    labs(title = 'Scatterplots for each strong predictor')

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

set.seed(1)
train_size <- nrow(Boston) * 0.75
inTrain <- sample(1:nrow(Boston), size = train_size)
train <- Boston[inTrain,]
test <- Boston[-inTrain,]

log_reg <- glm(crime_factor ~ rad + nox + tax + 
                   age + dis + indus, data = train, family = binomial)

log_reg %>%
    tidy %>%
    kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	31.312	4.984	6.282	0.000
rad	-0.602	0.132	-4.543	0.000
nox	-53.440	9.090	-5.879	0.000
tax	0.008	0.003	2.903	0.004
age	-0.018	0.011	-1.697	0.090
dis	-0.544	0.179	-3.034	0.002
indus	0.095	0.049	1.950	0.051

pred <- predict(log_reg, test, type = 'response')
pred_value <- ifelse(pred >= 0.5, 'Low', 'High')
acc <- mean(pred_value == test$crime_factor)
    
table(pred_value, test$crime_factor) %>%
    kable(format = 'html') %>%
    kable_styling() %>%
    add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
    column_spec(1, bold = T) %>%
    add_footnote(label = acc)

Predicted	Observed
	High	Low
High	58	14
Low	6	49
^a 0.84251968503937

log_reg <- glm(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3), 
               data = train, family = binomial)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

tidy(log_reg) %>%
    kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	-336.446	156.083	-2.156	0.031
poly(rad, 3)1	-13319.953	5336.905	-2.496	0.013
poly(rad, 3)2	-2181.427	867.504	-2.515	0.012
poly(rad, 3)3	-297.994	126.652	-2.353	0.019
poly(nox, 3)1	-1674.159	657.049	-2.548	0.011
poly(nox, 3)2	-1384.744	580.787	-2.384	0.017
poly(nox, 3)3	-512.519	215.952	-2.373	0.018
poly(tax, 3)1	3380.046	1285.347	2.630	0.009
poly(tax, 3)2	1213.841	456.842	2.657	0.008
poly(tax, 3)3	233.440	94.770	2.463	0.014
poly(age, 3)1	-3.601	8.031	-0.448	0.654
poly(age, 3)2	-12.508	5.997	-2.086	0.037
poly(age, 3)3	0.976	6.002	0.163	0.871
poly(dis, 3)1	-13.737	15.247	-0.901	0.368
poly(dis, 3)2	2.369	10.966	0.216	0.829
poly(dis, 3)3	-16.354	9.667	-1.692	0.091

pred <- predict(log_reg, test, type = 'response')
pred_value <- ifelse(pred >= 0.5, 'Low', 'High')
acc <- mean(pred_value == test$crime_factor)

table(pred_value, test$crime_factor) %>%
    kable(format = 'html') %>%
    kable_styling() %>%
    add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
    column_spec(1, bold = T) %>%
    add_footnote(label = acc)

Predicted	Observed
	High	Low
High	63	7
Low	1	56
^a 0.937007874015748

lda_model <- lda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3), data = train)
pred <- predict(lda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
    kable(format = 'html') %>%
    kable_styling() %>%
    add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
    column_spec(1, bold = T) %>%
    add_footnote(label = acc)

Predicted	Observed
	High	Low
High	62	11
Low	2	52
^a 0.897637795275591

lda_model <- lda(crime_factor ~ rad + nox + tax + age + dis, data = train)
pred <- predict(lda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
    kable(format = 'html') %>%
    kable_styling() %>%
    add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
    column_spec(1, bold = T) %>%
    add_footnote(label = acc)

Predicted	Observed
	High	Low
High	49	1
Low	15	62
^a 0.874015748031496

qda_model <- qda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3), data = train)
pred <- predict(qda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
    kable(format = 'html') %>%
    kable_styling() %>%
    add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
    column_spec(1, bold = T) %>%
    add_footnote(label = acc)

Predicted	Observed
	High	Low
High	59	2
Low	5	61
^a 0.94488188976378

qda_model <- qda(crime_factor ~ rad + nox + tax + age + dis, data = train)
pred <- predict(qda_model, test)
acc <- mean(pred$class == test$crime_factor) 

table(pred$class, test$crime_factor) %>%
    kable(format = 'html') %>%
    kable_styling() %>%
    add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
    column_spec(1, bold = T) %>%
    add_footnote(label = acc)

Predicted	Observed
	High	Low
High	52	3
Low	12	60
^a 0.881889763779528

qda_model <- qda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3) + poly(indus, 3),
                 data = train)
pred <- predict(qda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
    kable(format = 'html') %>%
    kable_styling() %>%
    add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
    column_spec(1, bold = T) %>%
    add_footnote(label = acc)

Predicted	Observed
	High	Low
High	57	1
Low	7	62
^a 0.937007874015748

require(class)

## Loading required package: class

variables <- c('rad', 'nox', 'tax', 'age', 'dis', 'zn', 'indus')

x_train <- train[, variables]
y_train <- train$crime_factor
x_test <- test[, variables]
acc <- list()

for (i in 1:20) {
    knn_pred <- knn(train = x_train, test = x_test, cl = y_train, k = i)
    acc[as.character(i)] = mean(knn_pred == test$crime_factor)
}

acc <- unlist(acc)

data_frame(acc = acc) %>%
    mutate(k = row_number()) %>%
    ggplot(aes(k, acc)) +
    geom_col(aes(fill = k == which.max(acc))) +
    labs(x = 'K', y = 'Accuracy', title = 'KNN Accuracy for different values of K') +
    scale_x_continuous(breaks = 1:20) +
    scale_y_continuous(breaks = round(c(seq(0.90, 0.94, 0.01), max(acc)),
                                      digits = 3)) +
    geom_hline(yintercept = max(acc), lty = 2) +
    coord_cartesian(ylim = c(min(acc), max(acc))) +
    guides(fill = FALSE)

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

lambda = 3
y= 4
betas = seq(-10, 10, 0.2)
ridge = function(beta, y, lambda) (y-beta)^2 + lambda*beta^2
plot(betas, ridge(betas, y, lambda), pch = 20, xlab = "beta", ylab = "Ridge optimization")
est.beta= y/(1+ lambda)
points(est.beta, ridge(est.beta, y, lambda), col = "red", pch = 20, lwd=5)

lambda = 3
y= 4
betas = seq(-10, 10, 0.2)
lasso = function(beta, y, lambda) (y-beta)^2 + lambda*abs(beta)
plot(betas, lasso(betas, y, lambda), xlab="beta", main="Lasso Regression Optimization", pch=20)
est.beta= y- (lambda/2)
points(est.beta, lasso(est.beta, y, lambda), col = "red", pch = 20, lwd=5)

set.seed(1)
X = rnorm(100)
eps = rnorm(100)

b0 = 4
b1 = 2
b2 = 3
b3 = 2
Y = b0 + b1 * X + b2 * X^2 + b3 * X^3 + eps

library(leaps)
data.full = data.frame(Y, X)
regfit.full = regsubsets(Y~poly(X,10,raw=T), data=data.frame(Y,X), nvmax=10)
summary = summary(regfit.full)
par(mfrow=c(2,2))
plot(summary$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col = "red", cex = 2, pch = 20)
plot(summary$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col = "red", cex = 2, pch = 20)
plot(summary$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col = "red", cex = 2, pch = 20)

library(leaps)
data.full = data.frame(Y, X)
regfit.fwd = regsubsets(Y~poly(X,10,raw=T), data=data.frame(Y,X), nvmax=10, method="forward")
summary = summary(regfit.fwd)
par(mfrow=c(2,2))
plot(summary$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col = "red", cex = 2, pch = 20)
plot(summary$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col = "red", cex = 2, pch = 20)
plot(summary$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col = "red", cex = 2, pch = 20)

library(leaps)
data.full = data.frame(Y, X)
regfit.back = regsubsets(Y~poly(X,10,raw=T), data=data.frame(Y,X), nvmax=10, method="backward")
summary = summary(regfit.back)
par(mfrow=c(2,2))
plot(summary$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary$cp), summary$cp[which.min(summary$cp)], col = "red", cex = 2, pch = 20)
plot(summary$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary$bic), summary$bic[which.min(summary$bic)], col = "red", cex = 2, pch = 20)
plot(summary$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary$adjr2), summary$adjr2[which.max(summary$adjr2)], col = "red", cex = 2, pch = 20)

coef(regfit.full, which.max(summary(regfit.full)$adjr2))

##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##            4.07200775            2.38745596            2.84575641 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5 
##            1.55797426            0.08072292

coef(regfit.fwd, which.max(summary(regfit.fwd)$adjr2))

##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##            4.07200775            2.38745596            2.84575641 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5 
##            1.55797426            0.08072292

coef(regfit.back, which.max(summary(regfit.back)$adjr2))

##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##           4.079236362           2.231905828           2.833494180 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)9 
##           1.819555807           0.001290827

library(glmnet)

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loaded glmnet 4.1-7

xmat = model.matrix(Y~poly(X,10,raw=T))[,-1]
lasso.mod = cv.glmnet(xmat, Y, alpha=1)
(lambda = lasso.mod$lambda.min)

## [1] 0.04924644

par(mfrow=c(1,1))
plot(lasso.mod)

predict(lasso.mod, s=lambda, type="coefficients")

## 11 x 1 sparse Matrix of class "dgCMatrix"
##                                 s1
## (Intercept)            4.160843693
## poly(X, 10, raw = T)1  2.203516697
## poly(X, 10, raw = T)2  2.650263654
## poly(X, 10, raw = T)3  1.765931597
## poly(X, 10, raw = T)4  0.040178155
## poly(X, 10, raw = T)5  0.021903165
## poly(X, 10, raw = T)6  .          
## poly(X, 10, raw = T)7  0.003685793
## poly(X, 10, raw = T)8  .          
## poly(X, 10, raw = T)9  .          
## poly(X, 10, raw = T)10 .

Y2 = 4 + 7*X^7 + eps
data.full2 = data.frame(y = Y2,x = X)
regfit.full2 = regsubsets(Y~poly(X,10,raw=T), data=data.full2, nvmax=10)
summary2 = summary(regfit.full2)
par(mfrow=c(2,2))
plot(summary2$cp, xlab ="Number of variables", ylab="C_p", type="l")
points(which.min(summary2$cp), summary2$cp[which.min(summary2$cp)], col = "red", cex = 2, pch = 20)
plot(summary2$bic, xlab ="Number of variables", ylab="BIC", type="l")
points(which.min(summary2$bic), summary2$bic[which.min(summary2$bic)], col = "red", cex = 2, pch = 20)
plot(summary2$adjr2, xlab ="Number of variables", ylab="Adjusted R^2", type="l")
points(which.max(summary2$adjr2), summary2$adjr2[which.max(summary2$adjr2)], col = "red", cex = 2, pch = 20)

coef(regfit.full2, which.min(summary2$cp))

##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##            4.07200775            2.38745596            2.84575641 
## poly(X, 10, raw = T)3 poly(X, 10, raw = T)5 
##            1.55797426            0.08072292

coef(regfit.full2, which.min(summary2$bic))

##           (Intercept) poly(X, 10, raw = T)1 poly(X, 10, raw = T)2 
##              4.061507              1.975280              2.876209 
## poly(X, 10, raw = T)3 
##              2.017639

coef(regfit.full2, which.min(summary2$adjr2))

##           (Intercept) poly(X, 10, raw = T)3 
##              6.437156              2.828270

xmat = model.matrix(Y2~poly(X,10,raw=T))[,-1]
lasso = cv.glmnet(xmat, Y2, alpha=1)
(lambda = lasso$lambda.min)

## [1] 12.36884

par(mfrow=c(1,1))
plot(lasso)

predict(lasso, s=lambda, type="coefficients")

## 11 x 1 sparse Matrix of class "dgCMatrix"
##                              s1
## (Intercept)            4.820215
## poly(X, 10, raw = T)1  .       
## poly(X, 10, raw = T)2  .       
## poly(X, 10, raw = T)3  .       
## poly(X, 10, raw = T)4  .       
## poly(X, 10, raw = T)5  .       
## poly(X, 10, raw = T)6  .       
## poly(X, 10, raw = T)7  6.796694
## poly(X, 10, raw = T)8  .       
## poly(X, 10, raw = T)9  .       
## poly(X, 10, raw = T)10 .

library(ISLR)
data(College)
set.seed(1)
trainid = sample(1:nrow(College), nrow(College)/2)
train = College[trainid,]
test = College[-trainid,]

lm.fit = lm(Apps~., data=train)
lm.pred = predict(lm.fit, test)
lm.err = mean((test$Apps - lm.pred)^2)
lm.err

## [1] 1135758

library(glmnet)
train.X = model.matrix(Apps ~ ., data = train)
train.Y = train[, "Apps"]
test.X = model.matrix(Apps ~ ., data = test)
test.Y = test[, "Apps"]
grid = 10 ^ seq(4, -2, length=100)
ridge.mod = glmnet(train.X, train.Y, alpha =0, lambda=grid, thresh = 1e-12)
lambda.best = ridge.mod$lambda.min
ridge.pred = predict(ridge.mod, newx= test.X, s=lambda.best)
(ridge.err = mean((test.Y - ridge.pred)^2))

## [1] 1164319

lasso.mod = glmnet(train.X, train.Y, alpha =1, lambda=grid, thresh = 1e-12)
lasso.cv = cv.glmnet(train.X, train.Y, alpha =1, lambda=grid, thresh = 1e-12)
lambda.best = lasso.cv$lambda.min
lasso.pred = predict(lasso.mod, newx= test.X, s=lambda.best)
(lasso.err = mean((test.Y - lasso.pred)^2))

## [1] 1135660

predict(lasso.mod, s = lambda.best, type="coefficients")

## 19 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -7.900363e+02
## (Intercept)  .           
## PrivateYes  -3.070103e+02
## Accept       1.779328e+00
## Enroll      -1.469508e+00
## Top10perc    6.672214e+01
## Top25perc   -2.230442e+01
## F.Undergrad  9.258974e-02
## P.Undergrad  9.408838e-03
## Outstate    -1.083495e-01
## Room.Board   2.115147e-01
## Books        2.912105e-01
## Personal     6.120406e-03
## PhD         -1.547200e+01
## Terminal     6.409503e+00
## S.F.Ratio    2.282638e+01
## perc.alumni  1.130498e+00
## Expend       4.856697e-02
## Grad.Rate    7.488081e+00

library(pls)

## 
## Attaching package: 'pls'

## The following object is masked from 'package:corrplot':
## 
##     corrplot

## The following object is masked from 'package:stats':
## 
##     loadings

pcr.fit = pcr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(pcr.fit, val.type="MSEP")

summary(pcr.fit)

## Data:    X dimension: 388 17 
##  Y dimension: 388 1
## Fit method: svdpc
## Number of components considered: 17
## 
## VALIDATION: RMSEP
## Cross-validated using 10 random segments.
##        (Intercept)  1 comps  2 comps  3 comps  4 comps  5 comps  6 comps
## CV            4288     4027     2351     2355     2046     1965     1906
## adjCV         4288     4031     2347     2353     2014     1955     1899
##        7 comps  8 comps  9 comps  10 comps  11 comps  12 comps  13 comps
## CV        1910     1913     1871      1799      1799      1802      1800
## adjCV     1903     1908     1866      1790      1792      1795      1793
##        14 comps  15 comps  16 comps  17 comps
## CV         1832      1728      1310      1222
## adjCV      1829      1702      1296      1212
## 
## TRAINING: % variance explained
##       1 comps  2 comps  3 comps  4 comps  5 comps  6 comps  7 comps  8 comps
## X       32.20    57.78    65.31    70.99    76.37    81.27     84.8    87.85
## Apps    13.44    70.93    71.07    79.87    81.15    82.25     82.3    82.33
##       9 comps  10 comps  11 comps  12 comps  13 comps  14 comps  15 comps
## X       90.62     92.91     94.98     96.74     97.79     98.72     99.42
## Apps    83.38     84.76     84.80     84.84     85.11     85.14     90.55
##       16 comps  17 comps
## X        99.88    100.00
## Apps     93.42     93.89

pcr.pred = predict(pcr.fit, test, ncomp=10)

(pcr.err = mean((test$Apps - pcr.pred)^2))

## [1] 1723100

pls.fit = plsr(Apps~., data=train, scale=TRUE, validation="CV")
validationplot(pls.fit, val.type="MSEP")

pls.pred = predict(pls.fit, test, ncomp=10)
(pls.err = mean((test$Apps - pls.pred)^2))

## [1] 1131661

test.avg = mean(test$Apps)
lm.r2 =1 - lm.err/mean((test.avg - test$Apps)^2)
ridge.r2 =1 - ridge.err/mean((test.avg - test$Apps)^2)
lasso.r2 =1 - lasso.err/mean((test.avg - test$Apps)^2)
pcr.r2 =1 - pcr.err/mean((test.avg - test$Apps)^2)
pls.r2 =1 - pls.err/mean((test.avg - test$Apps)^2)
all.r2 = c(lm.r2, ridge.r2, lasso.r2, pcr.r2, pls.r2)
names(all.r2) = c("lm", "ridge", "lasso", "pcr", "pls")
barplot(all.r2 )

final Khulan Bayarkhuu

2023-06-15