Class 10B

Ordinal Logistic Regression

Reference: UCLA Institution For Digital Research and Education

A study looks at factors that influence the decision of whether to apply to graduate school. College juniors are asked if they are unlikely, somewhat likely, or very likely to apply to graduate school. Hence, our outcome variable has three categories. Data on parental educational status, whether the undergraduate institution is public or private, and current GPA is also collected. The researchers have reason to believe that the “distances” between these three points are not equal. For example, the “distance” between “unlikely” and “somewhat likely” may be shorter than the distance between “somewhat likely” and “very likely”.

## ORDINAL LOGISTIC REGRESSION 
#install.packages("foreign")
library(foreign)

#import graduate school data 
dat <- read.dta("https://stats.idre.ucla.edu/stat/data/ologit.dta")
head(dat)

##             apply pared public  gpa
## 1     very likely     0      0 3.26
## 2 somewhat likely     1      0 3.21
## 3        unlikely     1      1 3.94
## 4 somewhat likely     0      0 2.81
## 5 somewhat likely     0      0 2.53
## 6        unlikely     0      1 2.59

Variables:

apply: levels “unlikely”, “somewhat likely”, and “very likely”, coded 1, 2, and 3, respectively
pared: (stands for parent education) a 0/1 variable indicating whether at least one parent has a graduate degree
public: a 0/1 variable where 1 indicates that the undergraduate institution is public and 0 private
gpa: the student’s grade point average

Visualized the data

# create a viz comparing the apply with gpa, pared, and public 
library(tidyverse)
ggplot(dat, aes(x = apply, y = gpa)) +
  geom_boxplot(size = .75) +
  geom_jitter(alpha = .5) +
  facet_grid(pared~ public) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))

Created a “flattened” table

#flatten table
ftable(xtabs(~ public + apply + pared, data = dat))

##                        pared   0   1
## public apply                        
## 0      unlikely              175  14
##        somewhat likely        98  26
##        very likely            20  10
## 1      unlikely               25   6
##        somewhat likely        12   4
##        very likely             7   3

Fit a ordinal logistic model

# fit the model 
library(MASS)
m <- polr(apply ~ pared + public + gpa, data = dat, Hess=TRUE)
summary(m)

## Call:
## polr(formula = apply ~ pared + public + gpa, data = dat, Hess = TRUE)
## 
## Coefficients:
##           Value Std. Error t value
## pared   1.04769     0.2658  3.9418
## public -0.05879     0.2979 -0.1974
## gpa     0.61594     0.2606  2.3632
## 
## Intercepts:
##                             Value   Std. Error t value
## unlikely|somewhat likely     2.2039  0.7795     2.8272
## somewhat likely|very likely  4.2994  0.8043     5.3453
## 
## Residual Deviance: 717.0249 
## AIC: 727.0249

coef(m)

##       pared      public         gpa 
##  1.04769010 -0.05878572  0.61594057

# store the coefficients
ctable <- coef(summary(m))
ctable

##                                   Value Std. Error    t value
## pared                        1.04769010  0.2657894  3.9418050
## public                      -0.05878572  0.2978614 -0.1973593
## gpa                          0.61594057  0.2606340  2.3632399
## unlikely|somewhat likely     2.20391473  0.7795455  2.8271792
## somewhat likely|very likely  4.29936315  0.8043267  5.3452947

# calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
p

##                       pared                      public 
##                8.087072e-05                8.435464e-01 
##                         gpa    unlikely|somewhat likely 
##                1.811594e-02                4.696004e-03 
## somewhat likely|very likely 
##                9.027008e-08

# combined table
ctable <- cbind(ctable, "p value" = p)
ctable

##                                   Value Std. Error    t value      p value
## pared                        1.04769010  0.2657894  3.9418050 8.087072e-05
## public                      -0.05878572  0.2978614 -0.1973593 8.435464e-01
## gpa                          0.61594057  0.2606340  2.3632399 1.811594e-02
## unlikely|somewhat likely     2.20391473  0.7795455  2.8271792 4.696004e-03
## somewhat likely|very likely  4.29936315  0.8043267  5.3452947 9.027008e-08

# confidence intervals from profiles
ci <- confint(m)
ci

##             2.5 %    97.5 %
## pared   0.5281768 1.5721750
## public -0.6522060 0.5191384
## gpa     0.1076202 1.1309148

# odds ratios
exp(coef(m))

##     pared    public       gpa 
## 2.8510579 0.9429088 1.8513972

# OR and CI
exp(cbind(OR = coef(m), ci))

##               OR     2.5 %   97.5 %
## pared  2.8510579 1.6958376 4.817114
## public 0.9429088 0.5208954 1.680579
## gpa    1.8513972 1.1136247 3.098490

Comparing Classification Methods

Smarket data, which is part of the ISLR library. This data set consists of percentage returns for the S&P 500 stock index over 1, 250 days, from the beginning of 2001 until the end of 2005. For each date, we have recorded the percentage returns for each of the five previous trading days, Lag1 through Lag5. We have also recorded Volume (the number of shares traded on the previous day, in billions), Today (the percentage return on the date in question) and Direction (whether the market was Up or Down on this date).

## COMPARE CLASSIFICATION METHODS
library(ISLR)

# stockmarket dataset
names(Smarket)

## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"

dim(Smarket)

## [1] 1250    9

summary(Smarket)

##       Year           Lag1                Lag2          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500  
##  Median :2003   Median : 0.039000   Median : 0.039000  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000  
##       Lag3                Lag4                Lag5         
##  Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.92200  
##  1st Qu.:-0.640000   1st Qu.:-0.640000   1st Qu.:-0.64000  
##  Median : 0.038500   Median : 0.038500   Median : 0.03850  
##  Mean   : 0.001716   Mean   : 0.001636   Mean   : 0.00561  
##  3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.59700  
##  Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.73300  
##      Volume           Today           Direction 
##  Min.   :0.3561   Min.   :-4.922000   Down:602  
##  1st Qu.:1.2574   1st Qu.:-0.639500   Up  :648  
##  Median :1.4229   Median : 0.038500             
##  Mean   :1.4783   Mean   : 0.003138             
##  3rd Qu.:1.6417   3rd Qu.: 0.596750             
##  Max.   :3.1525   Max.   : 5.733000

pairs(Smarket)

cor(Smarket[,-9])

##              Year         Lag1         Lag2         Lag3         Lag4
## Year   1.00000000  0.029699649  0.030596422  0.033194581  0.035688718
## Lag1   0.02969965  1.000000000 -0.026294328 -0.010803402 -0.002985911
## Lag2   0.03059642 -0.026294328  1.000000000 -0.025896670 -0.010853533
## Lag3   0.03319458 -0.010803402 -0.025896670  1.000000000 -0.024051036
## Lag4   0.03568872 -0.002985911 -0.010853533 -0.024051036  1.000000000
## Lag5   0.02978799 -0.005674606 -0.003557949 -0.018808338 -0.027083641
## Volume 0.53900647  0.040909908 -0.043383215 -0.041823686 -0.048414246
## Today  0.03009523 -0.026155045 -0.010250033 -0.002447647 -0.006899527
##                Lag5      Volume        Today
## Year    0.029787995  0.53900647  0.030095229
## Lag1   -0.005674606  0.04090991 -0.026155045
## Lag2   -0.003557949 -0.04338321 -0.010250033
## Lag3   -0.018808338 -0.04182369 -0.002447647
## Lag4   -0.027083641 -0.04841425 -0.006899527
## Lag5    1.000000000 -0.02200231 -0.034860083
## Volume -0.022002315  1.00000000  0.014591823
## Today  -0.034860083  0.01459182  1.000000000

attach(Smarket)

# volume increased over time 
# average number of stocks traded daily increased from 2001 to 2005
plot(Volume)

Logistic Regression

# Logistic Regression 
glm.fit<-glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
             data=Smarket, family=binomial)

summary(glm.fit)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3

# predict
glm.probs<-predict(glm.fit, type="response")
head(glm.probs)

##         1         2         3         4         5         6 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812 0.5069565

# re-code to "up" and "down"
glm.pred<-rep("Down", 1250)
glm.pred[glm.probs>.5]="Up"

# confusion matrix
table(glm.pred, Direction)

##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507

# correct rate
mean(glm.pred==Direction)

## [1] 0.5216

# testing and training
# train on years 2001 through 2004
train<-Year<2005
Smarket.2005<-Smarket[!train,]
dim(Smarket.2005)

## [1] 252   9

Direction.2005<-Direction[!train]

glm.fit2<-glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
             data=Smarket, family=binomial, subset=train)

glm.probsTest<-predict(glm.fit2, Smarket.2005, type="response")

# re-code to "up" and "down"
glm.predTest<-rep("Down", 252)
glm.predTest[glm.probsTest>.5]="Up"

# confusion matrix
table(glm.predTest, Direction.2005)

##             Direction.2005
## glm.predTest Down Up
##         Down   77 97
##         Up     34 44

# correct rate
mean(glm.predTest==Direction.2005)

## [1] 0.4801587

# error rate
mean(glm.predTest!=Direction.2005)

## [1] 0.5198413

# Fit a simpler model
glm.fit3<-glm(Direction~Lag1+Lag2,
              data=Smarket, family=binomial, subset=train)

glm.probsTest2<-predict(glm.fit3, Smarket.2005, type="response")

# re-code to "up" and "down"
glm.predTest2<-rep("Down", 252)
glm.predTest2[glm.probsTest2>.5]="Up"

# confusion matrix
table(glm.predTest2, Direction.2005)

##              Direction.2005
## glm.predTest2 Down  Up
##          Down   35  35
##          Up     76 106

# correct rate
mean(glm.predTest2==Direction.2005)

## [1] 0.5595238

# error rate
mean(glm.predTest2!=Direction.2005)

## [1] 0.4404762

Linear Discriminant Analysis (LDA)

# LDA
library(MASS)
lda.fit<-lda(Direction~Lag1+Lag2, data=Smarket, subset=train)
lda.fit

## Call:
## lda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544
## 
## Coefficients of linear discriminants:
##             LD1
## Lag1 -0.6420190
## Lag2 -0.5135293

# predict
lda.pred<-predict(lda.fit, Smarket.2005)
names(lda.pred)

## [1] "class"     "posterior" "x"

lda.class<-lda.pred$class

#confusion matrix
table(lda.class, Direction.2005)

##          Direction.2005
## lda.class Down  Up
##      Down   35  35
##      Up     76 106

# correct rate
mean(lda.class==Direction.2005)

## [1] 0.5595238

Quadratic Discriminant Analysis (QDA)

# QDA
qda.fit<-qda(Direction~Lag1+Lag2, data=Smarket, subset=train)
qda.fit

## Call:
## qda(Direction ~ Lag1 + Lag2, data = Smarket, subset = train)
## 
## Prior probabilities of groups:
##     Down       Up 
## 0.491984 0.508016 
## 
## Group means:
##             Lag1        Lag2
## Down  0.04279022  0.03389409
## Up   -0.03954635 -0.03132544

# predict
qda.class<-predict(qda.fit, Smarket.2005)$class

#confusion matrix
table(qda.class, Direction.2005)

##          Direction.2005
## qda.class Down  Up
##      Down   30  20
##      Up     81 121

# correct rate
mean(qda.class==Direction.2005)

## [1] 0.5992063

KNN

#KNN
library(class)
train.X<-cbind(Lag1, Lag2)[train,]
test.X<-cbind(Lag1, Lag2)[!train,]
train.Direction<-Direction[train]

# K=1
set.seed(1)
knn.pred<-knn(train.X, test.X, train.Direction, k=1)
table(knn.pred, Direction.2005)

##         Direction.2005
## knn.pred Down Up
##     Down   43 58
##     Up     68 83

# correct rate
mean(knn.pred==Direction.2005)

## [1] 0.5

# K=3
knn.pred3<-knn(train.X, test.X, train.Direction, k=3)
table(knn.pred3, Direction.2005)

##          Direction.2005
## knn.pred3 Down Up
##      Down   48 54
##      Up     63 87

# correct rate
mean(knn.pred3==Direction.2005)

## [1] 0.5357143