Reference: UCLA Institution For Digital Research and Education
A study looks at factors that influence the decision of whether to apply to graduate school. College juniors are asked if they are unlikely, somewhat likely, or very likely to apply to graduate school. Hence, our outcome variable has three categories. Data on parental educational status, whether the undergraduate institution is public or private, and current GPA is also collected. The researchers have reason to believe that the “distances” between these three points are not equal. For example, the “distance” between “unlikely” and “somewhat likely” may be shorter than the distance between “somewhat likely” and “very likely”.
## ORDINAL LOGISTIC REGRESSION
#install.packages("foreign")
library(foreign)
#import graduate school data
dat <- read.dta("https://stats.idre.ucla.edu/stat/data/ologit.dta")
head(dat)
## apply pared public gpa
## 1 very likely 0 0 3.26
## 2 somewhat likely 1 0 3.21
## 3 unlikely 1 1 3.94
## 4 somewhat likely 0 0 2.81
## 5 somewhat likely 0 0 2.53
## 6 unlikely 0 1 2.59
# create a viz comparing the apply with gpa, pared, and public
library(tidyverse)
ggplot(dat, aes(x = apply, y = gpa)) +
geom_boxplot(size = .75) +
geom_jitter(alpha = .5) +
facet_grid(pared~ public) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
#flatten table
ftable(xtabs(~ public + apply + pared, data = dat))
## pared 0 1
## public apply
## 0 unlikely 175 14
## somewhat likely 98 26
## very likely 20 10
## 1 unlikely 25 6
## somewhat likely 12 4
## very likely 7 3
# fit the model
library(MASS)
m <- polr(apply ~ pared + public + gpa, data = dat, Hess=TRUE)
summary(m)
## Call:
## polr(formula = apply ~ pared + public + gpa, data = dat, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## pared 1.04769 0.2658 3.9418
## public -0.05879 0.2979 -0.1974
## gpa 0.61594 0.2606 2.3632
##
## Intercepts:
## Value Std. Error t value
## unlikely|somewhat likely 2.2039 0.7795 2.8272
## somewhat likely|very likely 4.2994 0.8043 5.3453
##
## Residual Deviance: 717.0249
## AIC: 727.0249
coef(m)
## pared public gpa
## 1.04769010 -0.05878572 0.61594057
# store the coefficients
ctable <- coef(summary(m))
ctable
## Value Std. Error t value
## pared 1.04769010 0.2657894 3.9418050
## public -0.05878572 0.2978614 -0.1973593
## gpa 0.61594057 0.2606340 2.3632399
## unlikely|somewhat likely 2.20391473 0.7795455 2.8271792
## somewhat likely|very likely 4.29936315 0.8043267 5.3452947
# calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
p
## pared public
## 8.087072e-05 8.435464e-01
## gpa unlikely|somewhat likely
## 1.811594e-02 4.696004e-03
## somewhat likely|very likely
## 9.027008e-08
# combined table
ctable <- cbind(ctable, "p value" = p)
ctable
## Value Std. Error t value p value
## pared 1.04769010 0.2657894 3.9418050 8.087072e-05
## public -0.05878572 0.2978614 -0.1973593 8.435464e-01
## gpa 0.61594057 0.2606340 2.3632399 1.811594e-02
## unlikely|somewhat likely 2.20391473 0.7795455 2.8271792 4.696004e-03
## somewhat likely|very likely 4.29936315 0.8043267 5.3452947 9.027008e-08
# confidence intervals from profiles
ci <- confint(m)
ci
## 2.5 % 97.5 %
## pared 0.5281768 1.5721750
## public -0.6522060 0.5191384
## gpa 0.1076202 1.1309148
# odds ratios
exp(coef(m))
## pared public gpa
## 2.8510579 0.9429088 1.8513972
# OR and CI
exp(cbind(OR = coef(m), ci))
## OR 2.5 % 97.5 %
## pared 2.8510579 1.6958376 4.817114
## public 0.9429088 0.5208954 1.680579
## gpa 1.8513972 1.1136247 3.098490
Smarket data, which is part of the ISLR library. This data set consists of percentage returns for the S&P 500 stock index over 1, 250 days, from the beginning of 2001 until the end of 2005. For each date, we have recorded the percentage returns for each of the five previous trading days, Lag1 through Lag5. We have also recorded Volume (the number of shares traded on the previous day, in billions), Today (the percentage return on the date in question) and Direction (whether the market was Up or Down on this date).
## COMPARE CLASSIFICATION METHODS
library(ISLR)
# stockmarket dataset
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
dim(Smarket)
## [1] 1250 9
summary(Smarket)
## Year Lag1 Lag2
## Min. :2001 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500
## Median :2003 Median : 0.039000 Median : 0.039000
## Mean :2003 Mean : 0.003834 Mean : 0.003919
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000
## Lag3 Lag4 Lag5
## Min. :-4.922000 Min. :-4.922000 Min. :-4.92200
## 1st Qu.:-0.640000 1st Qu.:-0.640000 1st Qu.:-0.64000
## Median : 0.038500 Median : 0.038500 Median : 0.03850
## Mean : 0.001716 Mean : 0.001636 Mean : 0.00561
## 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.59700
## Max. : 5.733000 Max. : 5.733000 Max. : 5.73300
## Volume Today Direction
## Min. :0.3561 Min. :-4.922000 Down:602
## 1st Qu.:1.2574 1st Qu.:-0.639500 Up :648
## Median :1.4229 Median : 0.038500
## Mean :1.4783 Mean : 0.003138
## 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. :3.1525 Max. : 5.733000
pairs(Smarket)
cor(Smarket[,-9])
## Year Lag1 Lag2 Lag3 Lag4
## Year 1.00000000 0.029699649 0.030596422 0.033194581 0.035688718
## Lag1 0.02969965 1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2 0.03059642 -0.026294328 1.000000000 -0.025896670 -0.010853533
## Lag3 0.03319458 -0.010803402 -0.025896670 1.000000000 -0.024051036
## Lag4 0.03568872 -0.002985911 -0.010853533 -0.024051036 1.000000000
## Lag5 0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647 0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today 0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
## Lag5 Volume Today
## Year 0.029787995 0.53900647 0.030095229
## Lag1 -0.005674606 0.04090991 -0.026155045
## Lag2 -0.003557949 -0.04338321 -0.010250033
## Lag3 -0.018808338 -0.04182369 -0.002447647
## Lag4 -0.027083641 -0.04841425 -0.006899527
## Lag5 1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315 1.00000000 0.014591823
## Today -0.034860083 0.01459182 1.000000000
attach(Smarket)
# volume increased over time
# average number of stocks traded daily increased from 2001 to 2005
plot(Volume)
# Logistic Regression
glm.fit<-glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket, family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.446 -1.203 1.065 1.145 1.326
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
# predict
glm.probs<-predict(glm.fit, type="response")
head(glm.probs)
## 1 2 3 4 5 6
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565
# re-code to "up" and "down"
glm.pred<-rep("Down", 1250)
glm.pred[glm.probs>.5]="Up"
# confusion matrix
table(glm.pred, Direction)
## Direction
## glm.pred Down Up
## Down 145 141
## Up 457 507
# correct rate
mean(glm.pred==Direction)
## [1] 0.5216
# testing and training
# train on years 2001 through 2004
train<-Year<2005
Smarket.2005<-Smarket[!train,]
dim(Smarket.2005)
## [1] 252 9
Direction.2005<-Direction[!train]
glm.fit2<-glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket, family=binomial, subset=train)
glm.probsTest<-predict(glm.fit2, Smarket.2005, type="response")
# re-code to "up" and "down"
glm.predTest<-rep("Down", 252)
glm.predTest[glm.probsTest>.5]="Up"
# confusion matrix
table(glm.predTest, Direction.2005)
## Direction.2005
## glm.predTest Down Up
## Down 77 97
## Up 34 44
# correct rate
mean(glm.predTest==Direction.2005)
## [1] 0.4801587
# error rate
mean(glm.predTest!=Direction.2005)
## [1] 0.5198413
# Fit a simpler model
glm.fit3<-glm(Direction~Lag1+Lag2,
data=Smarket, family=binomial, subset=train)
glm.probsTest2<-predict(glm.fit3, Smarket.2005, type="response")
# re-code to "up" and "down"
glm.predTest2<-rep("Down", 252)
glm.predTest2[glm.probsTest2>.5]="Up"
# confusion matrix
table(glm.predTest2, Direction.2005)
## Direction.2005
## glm.predTest2 Down Up
## Down 35 35
## Up 76 106
# correct rate
mean(glm.predTest2==Direction.2005)
## [1] 0.5595238
# error rate
mean(glm.predTest2!=Direction.2005)
## [1] 0.4404762
# LDA
library(MASS)
lda.fit<-lda(Direction~Lag1+Lag2, data=Smarket, subset=train)
lda.fit
## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
##
## Coefficients of linear discriminants:
## LD1
## Lag1 -0.6420190
## Lag2 -0.5135293
# predict
lda.pred<-predict(lda.fit, Smarket.2005)
names(lda.pred)
## [1] "class" "posterior" "x"
lda.class<-lda.pred$class
#confusion matrix
table(lda.class, Direction.2005)
## Direction.2005
## lda.class Down Up
## Down 35 35
## Up 76 106
# correct rate
mean(lda.class==Direction.2005)
## [1] 0.5595238
# QDA
qda.fit<-qda(Direction~Lag1+Lag2, data=Smarket, subset=train)
qda.fit
## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
##
## Prior probabilities of groups:
## Down Up
## 0.491984 0.508016
##
## Group means:
## Lag1 Lag2
## Down 0.04279022 0.03389409
## Up -0.03954635 -0.03132544
# predict
qda.class<-predict(qda.fit, Smarket.2005)$class
#confusion matrix
table(qda.class, Direction.2005)
## Direction.2005
## qda.class Down Up
## Down 30 20
## Up 81 121
# correct rate
mean(qda.class==Direction.2005)
## [1] 0.5992063
#KNN
library(class)
train.X<-cbind(Lag1, Lag2)[train,]
test.X<-cbind(Lag1, Lag2)[!train,]
train.Direction<-Direction[train]
# K=1
set.seed(1)
knn.pred<-knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, Direction.2005)
## Direction.2005
## knn.pred Down Up
## Down 43 58
## Up 68 83
# correct rate
mean(knn.pred==Direction.2005)
## [1] 0.5
# K=3
knn.pred3<-knn(train.X, test.X, train.Direction, k=3)
table(knn.pred3, Direction.2005)
## Direction.2005
## knn.pred3 Down Up
## Down 48 54
## Up 63 87
# correct rate
mean(knn.pred3==Direction.2005)
## [1] 0.5357143