Final Project Financial Software Emmanuel

#Question 1

library(ISLR2)
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:ISLR2':
## 
##     Boston

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::select() masks MASS::select()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors

#install.packages("kableExtra")
library(kableExtra)

## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows

#install.packages('corrplot')
library(corrplot)

## corrplot 0.92 loaded

#install.packages("ggthemes") 
library(ggthemes)
library(knitr)
library(broom)
data('Boston')

Boston <- Boston %>%
  mutate(chas = factor(chas),
         crime_factor = factor(ifelse(crim > median(crim), 
                                      'High', 'Low'), 
                               levels = c('High', 'Low')))

cor_test <- Boston %>%
  select(-chas, -crime_factor) %>%
  cor.mtest(conf.level = .95)

Boston %>%
  select(-chas, -crime_factor) %>%
  cor %>%
  corrplot(method = 'color', 
           order = 'hclust', addrect = 2,
           tl.col = 'black', addCoef.col = 'black', number.cex = 0.65,
           p.mat = cor_test$p, sig.level = .05)

scale_this <- function(x) {
  (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

table_1 <- Boston %>%
  dplyr::select(zn:crime_factor) %>%
  gather(Variable, value, -crime_factor, -chas) %>%
  group_by(Variable) %>%
  mutate(value = scale_this(value)) %>%
  group_by(Variable, crime_factor) %>%
  summarize(Q10 = quantile(value, .10),
            Q25 = quantile(value, .25),
            median = median(value), 
            mean = mean(value), 
            Q75 = quantile(value, .75),
            Q90 = quantile(value, .90))

## `summarise()` has grouped output by 'Variable'. You can override using the
## `.groups` argument.

kable(table_1,
      digits = 3, format = 'html') %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
  column_spec(1:2, bold = T) %>%
  scroll_box(height = '400px')

Variable	crime_factor	Q10	Q25	median	mean	Q75	Q90
age	High	-0.234	0.470	0.846	0.613	1.042	1.116
age	Low	-1.782	-1.317	-0.713	-0.613	0.118	0.750
black	High	-2.942	-0.298	0.292	-0.351	0.421	0.441
black	Low	0.222	0.356	0.405	0.351	0.441	0.441
dis	High	-1.108	-0.970	-0.786	-0.616	-0.355	0.098
dis	Low	-0.631	-0.202	0.628	0.616	1.275	1.915
indus	High	-0.720	-0.180	1.015	0.603	1.015	1.231
indus	Low	-1.306	-1.132	-0.801	-0.603	-0.376	0.247
lstat	High	-0.914	-0.290	0.352	0.453	1.076	1.931
lstat	Low	-1.149	-0.964	-0.580	-0.453	-0.045	0.457
medv	High	-1.319	-0.971	-0.504	-0.263	0.051	0.995
medv	Low	-0.436	-0.243	0.062	0.263	0.649	1.345
nox	High	-0.412	-0.092	0.598	0.723	1.254	1.599
nox	Low	-1.256	-1.067	-0.912	-0.723	-0.343	-0.046
ptratio	High	-1.735	-0.026	0.806	0.253	0.806	1.175
ptratio	Low	-1.504	-0.857	-0.210	-0.253	0.298	0.806
rad	High	-0.637	-0.522	1.660	0.619	1.660	1.660
rad	Low	-0.867	-0.752	-0.637	-0.619	-0.522	-0.408
rm	High	-1.337	-0.751	-0.222	-0.156	0.265	1.221
rm	Low	-0.672	-0.429	0.036	0.156	0.630	1.229
tax	High	-0.618	-0.601	1.529	0.608	1.529	1.529
tax	Low	-1.105	-0.957	-0.719	-0.608	-0.375	-0.061
zn	High	-0.487	-0.487	-0.487	-0.436	-0.487	-0.487
zn	Low	-0.487	-0.487	-0.487	0.436	0.971	2.943

difference <- function(x) x[1] - x[2]

table_2 <- table_1 %>%
  group_by(Variable) %>%
  summarize(diff_Q10 = difference(Q10),
            diff_Q25 = difference(Q25),
            diff_med = difference(median),
            diff_mean = difference(mean),
            diff_Q75 = difference(Q75),
            diff_Q90 = difference(Q90))


kable(table_2,
      digits = 3, format = 'html') %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
  column_spec(1, bold = T) %>%
  add_header_above(c(' ' = 1, 'Between-Groups Differences' = 6)) %>%
  scroll_box(height = '400px')

	Between-Groups Differences
Variable	diff_Q10	diff_Q25	diff_med	diff_mean	diff_Q75	diff_Q90
age	1.548	1.787	1.560	1.227	0.924	0.366
black	-3.165	-0.654	-0.113	-0.702	-0.020	0.000
dis	-0.477	-0.768	-1.414	-1.231	-1.630	-1.817
indus	0.586	0.952	1.816	1.205	1.391	0.984
lstat	0.235	0.674	0.933	0.906	1.122	1.474
medv	-0.883	-0.728	-0.565	-0.526	-0.598	-0.350
nox	0.844	0.975	1.510	1.445	1.597	1.645
ptratio	-0.231	0.831	1.016	0.507	0.508	0.370
rad	0.230	0.230	2.297	1.238	2.182	2.067
rm	-0.666	-0.322	-0.258	-0.312	-0.364	-0.007
tax	0.487	0.356	2.249	1.216	1.905	1.590
zn	0.000	0.000	0.000	-0.871	-1.458	-3.430

table_3 <- table_2 %>%
  gather(Measure, value, -Variable) %>%
  group_by(Variable) %>%
  summarize(`Absolute Mean Differences` = abs(mean(value))) %>%
  arrange(desc(`Absolute Mean Differences`))

## Warning: attributes are not identical across measure variables; they will be
## dropped

kable(table_3,
      digits = 3, format = 'html') %>%
  kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
  column_spec(1, bold = T) %>%
  scroll_box(height = '400px')

Variable	Absolute Mean Differences
rad	1.374
nox	1.336
tax	1.300
age	1.235
dis	1.223
indus	1.156
zn	0.960
lstat	0.890
black	0.776
medv	0.608
ptratio	0.500
rm	0.322

Boston %>%
  dplyr::select(zn:crime_factor) %>%
  gather(value_type, value, -crime_factor, -chas) %>%
  ggplot(aes(value_type, value, fill = crime_factor)) +
  geom_boxplot(alpha = 0.5) +
  facet_wrap(~value_type, scales = 'free') +
  scale_fill_discrete(name = 'Crime Rate') +
  theme(legend.position = 'top')

#LOGISTIC REGRESSION
set.seed(1)
train_size <- nrow(Boston) * 0.75
inTrain <- sample(1:nrow(Boston), size = train_size)
train <- Boston[inTrain,]
test <- Boston[-inTrain,]

log_reg <- glm(crime_factor ~ rad + nox + tax + 
                 age + dis + indus, data = train, family = binomial)

log_reg %>%
  tidy %>%
  kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	31.312	4.984	6.282	0.000
rad	-0.602	0.132	-4.543	0.000
nox	-53.440	9.090	-5.879	0.000
tax	0.008	0.003	2.903	0.004
age	-0.018	0.011	-1.697	0.090
dis	-0.544	0.179	-3.034	0.002
indus	0.095	0.049	1.950	0.051

pred <- predict(log_reg, test, type = 'response')
pred_value <- ifelse(pred >= 0.5, 'Low', 'High')
acc <- mean(pred_value == test$crime_factor)

table(pred_value, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)

Predicted	Observed
	High	Low
High	58	14
Low	6	49
^a 0.84251968503937

log_reg <- glm(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                 poly(tax, 3) + poly(age, 3) + poly(dis, 3), 
               data = train, family = binomial)

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

tidy(log_reg) %>%
  kable(digits = 3)

term	estimate	std.error	statistic	p.value
(Intercept)	-336.446	156.083	-2.156	0.031
poly(rad, 3)1	-13319.953	5336.905	-2.496	0.013
poly(rad, 3)2	-2181.427	867.504	-2.515	0.012
poly(rad, 3)3	-297.994	126.652	-2.353	0.019
poly(nox, 3)1	-1674.159	657.049	-2.548	0.011
poly(nox, 3)2	-1384.744	580.787	-2.384	0.017
poly(nox, 3)3	-512.519	215.952	-2.373	0.018
poly(tax, 3)1	3380.046	1285.347	2.630	0.009
poly(tax, 3)2	1213.841	456.842	2.657	0.008
poly(tax, 3)3	233.440	94.770	2.463	0.014
poly(age, 3)1	-3.601	8.031	-0.448	0.654
poly(age, 3)2	-12.508	5.997	-2.086	0.037
poly(age, 3)3	0.976	6.002	0.163	0.871
poly(dis, 3)1	-13.737	15.247	-0.901	0.368
poly(dis, 3)2	2.369	10.966	0.216	0.829
poly(dis, 3)3	-16.354	9.667	-1.692	0.091

pred <- predict(log_reg, test, type = 'response')
pred_value <- ifelse(pred >= 0.5, 'Low', 'High')
acc <- mean(pred_value == test$crime_factor)

table(pred_value, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)

Predicted	Observed
	High	Low
High	63	7
Low	1	56
^a 0.937007874015748

#LDA
lda_model <- lda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + 
                   poly(tax, 3) + poly(age, 3) + poly(dis, 3), data = train)
pred <- predict(lda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)

Predicted	Observed
	High	Low
High	62	11
Low	2	52
^a 0.897637795275591

lda_model <- lda(crime_factor ~ rad + nox + tax + age + dis, data = train)
pred <- predict(lda_model, test)
acc <- mean(pred$class == test$crime_factor)

table(pred$class, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)

Predicted	Observed
	High	Low
High	49	1
Low	15	62
^a 0.874015748031496

#Naive Bayes
library(e1071)
nb_model <- naiveBayes(crime_factor ~ rad + nox + tax + age + dis, data = train)
pred <- predict(nb_model, test)
acc <- mean(pred == test$crime_factor)

table(pred, test$crime_factor) %>%
  kable(format = 'html') %>%
  kable_styling() %>%
  add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
  column_spec(1, bold = T) %>%
  add_footnote(label = acc)

Predicted	Observed
	High	Low
High	49	4
Low	15	59
^a 0.850393700787402

#KNN
library(class)
variables <- c('rad', 'nox', 'tax', 'age', 'dis', 'zn', 'indus')

x_train <- train[, variables]
y_train <- train$crime_factor
x_test <- test[, variables]
acc <- list()

for (i in 1:20) {
  knn_pred <- knn(train = x_train, test = x_test, cl = y_train, k = i)
  acc[as.character(i)] = mean(knn_pred == test$crime_factor)
}

acc <- unlist(acc)

data_frame(acc = acc) %>%
  mutate(k = row_number()) %>%
  ggplot(aes(k, acc)) +
  geom_col(aes(fill = k == which.max(acc))) +
  labs(x = 'K', y = 'Accuracy', title = 'KNN Accuracy for different values of K') +
  scale_x_continuous(breaks = 1:20) +
  scale_y_continuous(breaks = round(c(seq(0.90, 0.94, 0.01), max(acc)),
                                    digits = 3)) +
  geom_hline(yintercept = max(acc), lty = 2) +
  coord_cartesian(ylim = c(min(acc), max(acc))) +
  guides(fill = FALSE)

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#We can conclude that Logistic regression with polynomial model is the best classifier to use with 93.7% accuracy.

#Question 2 We perform best subset, forward stepwise, and backward stepwise selection on a single data set. For each approach, we obtain p+1 models, containing 0,1,2,…,p predictors. Explain your answers:

Which of the three models with k predictors has the smallest training RSS?

The model with best subset selection has the smallest training RSS since it considers every possible model with k predictrors.

Which of the three models with k predictors has the smallest test RSS?

Best subset selection may have the smallest test RSS because it takes into account more models than the other methods. However, the other methods might also pick a model with smaller test RSS by sheer luck. So it is hard to say which model has the smallest test RSS.

C.True or False: i. The predictors in the k-variable model identified by forwardstep wise are a subset of the predictors in the (k+1)-variable model identified by forward stepwise selection. True. The model with (k+1) predictors is obtained by augmenting the predictors in the model with k predictors with one additional predictor.

The predictors in the k-variable model identified by backward stepwise are a subset of the predictors in the (k+1)-variable model identified by backward stepwise selection. True. The model with k predictors is obtained by removing one predictor from the model with (k+1) predictors.
The predictors in the k-variable model identified by backward stepwise are a subset of the predictors in the (k+1)-variable model identified by forward stepwise selection. False. There is no direct link between the models obtained from forward and backward selection.

iv.The predictors in the k-variable model identified by forward stepwise are a subset of the predictors in the (k+1)-variable model identified by backward stepwise selection. False. There is no direct link between the models obtained from forward and backward selection.

The predictors in the k-variable model identified by best subset are a subset of the predictors in the (k+1)-variable model identified by best subset selection. False. The model with (k+1) predictors is obtained by selecting among all possible models with (k+1) predictors, and so does not necessarily contain all the predictors selected for the k-variable model.

#Question 3 a) Split the data set into a training set and a test set.

library(ISLR2)
library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(tidyverse)
data('College')
set.seed(1)

inTrain <- createDataPartition(College$Apps, p = 0.75, list = FALSE)

training <- College[inTrain,]
testing <- College[-inTrain,]

preObj <- preProcess(training, method = c('center', 'scale'))

training <- predict(preObj, training)
testing <- predict(preObj, testing)

y_train <- training$Apps
y_test <- testing$Apps

one_hot_encoding <- dummyVars(Apps ~ ., data = training)
x_train <- predict(one_hot_encoding, training)
x_test <- predict(one_hot_encoding, testing)

Fit a linear model using least squares on the training set, and report the test error obtained.

lin_model <- lm(Apps ~ ., data = training)

pred <- predict(lin_model, testing)

(lin_info <- postResample(pred, testing$Apps))

##      RMSE  Rsquared       MAE 
## 0.2799768 0.9201765 0.1568743

Fit a ridge regression model on the training set, with λ chosen by cross-validation. Report the test error obtained.

ridge_fit <- train(x = x_train, y = y_train,
                   method = 'glmnet', 
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 0,
                                          lambda = seq(0, 10e2, length.out = 20)))

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

(ridge_info <- postResample(predict(ridge_fit, x_test), y_test))

##      RMSE  Rsquared       MAE 
## 0.2853247 0.9211286 0.1645806

coef(ridge_fit$finalModel, ridge_fit$bestTune$lambda)

## 19 x 1 sparse Matrix of class "dgCMatrix"
##                       s1
## (Intercept)  0.034871314
## Private.No   0.075423210
## Private.Yes -0.076037580
## Accept       0.665628735
## Enroll       0.090243371
## Top10perc    0.107160248
## Top25perc    0.011628030
## F.Undergrad  0.063308800
## P.Undergrad  0.017427317
## Outstate    -0.028995432
## Room.Board   0.048720533
## Books        0.012799145
## Personal    -0.002894430
## PhD         -0.017989250
## Terminal    -0.010434665
## S.F.Ratio    0.006920126
## perc.alumni -0.031683867
## Expend       0.083525070
## Grad.Rate    0.058131023

plot(ridge_fit)

plot(varImp(ridge_fit))

d) Fit a lasso model on the training set, with λ chosen by crossvalidation. Report the test error obtained, along with the number of non-zero coefficient estimates.

lasso_fit <- train(x = x_train, y = y_train, 
                   method = 'glmnet',
                   trControl = trainControl(method = 'cv', number = 10),
                   tuneGrid = expand.grid(alpha = 1,
                                          lambda = seq(0.0001, 1, length.out = 50)))

## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.

(lasso_info <- postResample(predict(lasso_fit, x_test), y_test))

##      RMSE  Rsquared       MAE 
## 0.2914812 0.9141364 0.1511801

coef(lasso_fit$finalModel, lasso_fit$bestTune$lambda)

## 19 x 1 sparse Matrix of class "dgCMatrix"
##                        s1
## (Intercept) -1.470609e-02
## Private.No   5.410732e-02
## Private.Yes -1.041607e-14
## Accept       8.883839e-01
## Enroll       .           
## Top10perc    1.092201e-01
## Top25perc    .           
## F.Undergrad  .           
## P.Undergrad  .           
## Outstate     .           
## Room.Board   1.483337e-04
## Books        .           
## Personal     .           
## PhD          .           
## Terminal     .           
## S.F.Ratio    .           
## perc.alumni  .           
## Expend       4.067011e-02
## Grad.Rate    4.062729e-06

plot(lasso_fit)

plot(varImp(lasso_fit))

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

summary(cars)

##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

Final Project Financial Software Emmanuel

Emmanuel Ferdinand Anggawirja

2023-06-13

R Markdown

Including Plots