##library(ISLR)
##library(corrupt)
## Year Lag1 Lag2 Lag3
## Min. :1990 Min. :-18.1950 Min. :-18.1950 Min. :-18.1950
## 1st Qu.:1995 1st Qu.: -1.1540 1st Qu.: -1.1540 1st Qu.: -1.1580
## Median :2000 Median : 0.2410 Median : 0.2410 Median : 0.2410
## Mean :2000 Mean : 0.1506 Mean : 0.1511 Mean : 0.1472
## 3rd Qu.:2005 3rd Qu.: 1.4050 3rd Qu.: 1.4090 3rd Qu.: 1.4090
## Max. :2010 Max. : 12.0260 Max. : 12.0260 Max. : 12.0260
## Lag4 Lag5 Volume
## Min. :-18.1950 Min. :-18.1950 Min. :0.08747
## 1st Qu.: -1.1580 1st Qu.: -1.1660 1st Qu.:0.33202
## Median : 0.2380 Median : 0.2340 Median :1.00268
## Mean : 0.1458 Mean : 0.1399 Mean :1.5746
## 3rd Qu.: 1.4090 3rd Qu.: 1.4050 3rd Qu.:2.05373
## Max. : 12.0260 Max. : 12.0260 Max. :9.32821
## Today Direction
## Min. :-18.1950 Down:484
## 1st Qu.: -1.1540 Up :605
## Median : 0.2410
## Mean : 0.149
## 3rd Qu.: 1.405
## Max. : 12.0260
#corrplot(cor(Weekly[,-9]), method="square")
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Weekly)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6949 -1.2565 0.9913 1.0849 1.4579
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.26686 0.08593 3.106 0.0019 **
## Lag1 -0.04127 0.02641 -1.563 0.1181
## Lag2 0.05844 0.02686 2.175 0.0296 *
## Lag3 -0.01606 0.02666 -0.602 0.5469
## Lag4 -0.02779 0.02646 -1.050 0.2937
## Lag5 -0.01447 0.02638 -0.549 0.5833
## Volume -0.02274 0.03690 -0.616 0.5377
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1496.2 on 1088 degrees of freedom
## Residual deviance: 1486.4 on 1082 degrees of freedom
## AIC: 1500.4
##
#ogWeekly.prob= predict(Weekly.fit, type='response')
#logWeekly.pred =rep("Down", length(logWeekly.prob))
#logWeekly.pred[logWeekly.prob > 0.5] = "Up"
#table(logWeekly.pred, Direction)
## Direction
## logWeekly.pred Down Up
## Down 54 48
## Up 430 557
#mean(logWeekly.pred == Direction.0910)
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.2
## Direction.0910
## Down Up
## Down 9 5
## Up 34 56
#mean(Weeklylda.pred$class==Direction.0910)
## [1] 0.625
#Weeklyqda.fit = qda(Direction ~ Lag2, data = Weekly, subset = train)
#Weeklyqda.pred = predict(Weeklyqda.fit, Weekly.0910)$class
#table(Weeklyqda.pred, Direction.0910)
## Direction.0910
## Weeklyqda.pred Down Up
## Down 0 0
## Up 43 61
#mean(Weeklyqda.pred==Direction.0910)
## [1] 0.586538
#library(class)
#Week.train=as.matrix(Lag2[train])
#Week.test=as.matrix(Lag2[!train])
#train.Direction =Direction[train]
#set.seed(1)
#Weekknn.pred=knn(Week.train,Week.test,train.Direction,k=1)
#table(Weekknn.pred,Direction.0910)
## Direction.0910
## Weekknn.pred Down Up
## Down 21 30
## Up 22 31
#mean(Weekknn.pred == Direction.0910)
#mean(logWeekly.pred == Direction.0910)
#Weekly.fit<-glm(Direction~Lag2:Lag4+Lag2, data=Weekly,family=binomial, subset=train)
#logWeekly.prob= predict(Weekly.fit, Weekly.0910, type = "response")
#logWeekly.pred = rep("Down", length(logWeekly.prob))
#logWeekly.pred[logWeekly.prob > 0.5] = "Up"
#Direction.0910 = Direction[!train]
#table(logWeekly.pred, Direction.0910)
## Direction.0910
## logWeekly.pred Down Up
## Down 3 4
## Up 40 57
#mean(logWeekly.pred == Direction.0910)
#Weeklylda.fit<-lda(Direction~Lag2:Lag4+Lag2, data=Weekly,family=binomial,
#subset=train)Weeklylda.pred<-predict(Weeklylda.fit, Weekly.0910)table(Weeklylda.pred$class,
#Direction.0910)
#mean(Weeklyqda.pred==Direction.0910)
## [1] 0.625
#require(ISLR); require(tidyverse); require(ggthemes); require(GGally)
#require(knitr); require(kableExtra); require(broom)
#theme_set(theme_tufte(base_size = 14))
#set.seed(1)
#data('Auto')
#Auto <- Auto %>%
#filter(!cylinders %in% c(3,5)) %>%
#mutate(mpg01 = factor(ifelse(mpg > median(mpg), 1, 0)),
#cylinders = factor(cylinders,
#levels = c(4,6,8),
#ordered = TRUE),
#origin = factor(origin,
#levels = c(1,2,3),
#labels = c('American', 'European', 'Asian')))
#median(Auto$mpg)
## [1] 23
#Auto %>%
#dplyr::select(mpg, mpg01) %>%
#sample_n(5)
#dplyr::select(-name, -mpg) %>%
#ggpairs(aes(col = mpg01, fill = mpg01, alpha = 0.6),
#upper = list(combo = 'box'),
#diag = list(discrete = wrap('barDiag', position = 'fill')),
#lower = list(combo = 'dot_no_facet')) +
#theme(axis.text.x = element_text(angle = 90, hjust = 1))
#num_train <- nrow(Auto) * 0.75
#inTrain <- sample(nrow(Auto), size = num_train)
#training <- Auto[inTrain,]
#testing <- Auto[-inTrain,]
#require(MASS)
#fmla <- as.formula('mpg01 ~ displacement + horsepower + weight + year + cylinders')
#lda_model <- lda(fmla, data = training)
#pred <- predict(lda_model, testing)
#table(pred$class, testing$mpg01)
##
## 0 1
## 0 52 2
## 1 4 39
#mean(pred$class == testing$mpg01)
## [1] 0.9381443
#qda_model <- qda(fmla, data = training)
#pred <- predict(qda_model, testing)
#table(pred$class, testing$mpg01)
##
## 0 1
## 0 52 1
## 1 4 40
#mean(pred$class == testing$mpg01)
## [1] 0.9484536
#log_reg <- glm(fmla, data = training, family = binomial)
#pred <- predict(log_reg, testing, type = 'response')
#pred_values <- round(pred)
#table(pred_values, testing$mpg01)
##
## pred_values 0 1
## 0 53 3
## 1 3 38
#mean(pred_values == testing$mpg01)
## [1] 0.9381443
#require(class)
#set.seed(1)
#acc <- list()
#x_train <- training[,c('cylinders', 'displacement', 'horsepower', 'weight', 'year')]
#y_train <- training$mpg0
#x_test <- testing[,c('cylinders', 'displacement', 'horsepower', 'weight', 'year')]
#for (i in 1:20) {
#knn_pred <- knn(train = x_train, test = x_test, cl = y_train, k = i)
#acc[as.character(i)] = mean(knn_pred == testing$mpg01)
#}
#geom_hline(yintercept = max(acc), lty = 2) +
#coord_cartesian(ylim = c(min(acc), max(acc))) +
#guides(fill = FALSE)
#scale_this <- function(x) {
#(x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
#}
#table_1 <- Boston %>%
#dplyr::select(zn:crime_factor) %>%
#gather(Variable, value, -crime_factor, -chas) %>%
#group_by(Variable) %>%
#mutate(value = scale_this(value)) %>%
#group_by(Variable, crime_factor) %>%
#summarize(Q10 = quantile(value, .10),
#Q25 = quantile(value, .25),
#median = median(value),
#mean = mean(value),
#Q75 = quantile(value, .75),
#Q90 = quantile(value, .90))
#kable(table_1,
#digits = 3, format = 'html') %>%
#kable_styling(bootstrap_options = c('striped', 'hover', 'condensed')) %>%
#column_spec(1:2, bold = T) %>%
##Boston %>%
#dplyr::select(zn:crime_factor) %>%
#gather(value_type, value, -crime_factor, -chas) %>%
#ggplot(aes(value_type, value, fill = crime_factor)) +
#geom_boxplot(alpha = 0.5) +
#facet_wrap(~value_type, scales = 'free') +
#scale_fill_discrete(name = 'Crime Rate') +
#theme(legend.position = 'top')
#Boston %>%
#dplyr::select(crim, crime_factor, rad, nox, tax, age, dis, indus) %>%
#gather(Variable, value, -crim, -crime_factor) %>%
#mutate(Variable = str_to_title(Variable)) %>%
#ggplot(aes(value, crim)) +
#geom_point(aes(col = crime_factor)) +
#facet_wrap(~ Variable, scales = 'free') +
#geom_smooth(method = 'lm', formula = y ~ poly(x, 3), se = FALSE) +
#guides(col = FALSE) +
#labs(title = 'Scatterplots for each strong predictor')
#qda_model <- qda(crime_factor ~ poly(rad, 3) + poly(nox, 3) + poly(tax, 3) + poly(age, 3) + poly(dis, 3) +
#poly(indus, 3),
#data = train)
#pred <- predict(qda_model, test)
#acc <- mean(pred$class == test$crime_factor)
#table(pred$class, test$crime_factor) %>%
#kable(format = 'html') %>%
#kable_styling() %>%
#add_header_above(c('Predicted' = 1, 'Observed' = 2)) %>%
#column_spec(1, bold = T) %>%
#add_footnote(label = acc)
Hint: You will have to create the response variable yourself, using the variables that are contained in the Boston data seT