Lab: Logit and Probit Models

Part I: Basic Logit and Probit

install.packages("titanic", repos = "http://cran.us.r-project.org")

## Installing package into 'C:/Users/Alexander/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)

## package 'titanic' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Alexander\AppData\Local\Temp\Rtmp4QcbqE\downloaded_packages

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(tidyverse)

## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr

## Conflicts with tidy packages ----------------------------------------------

## filter(): dplyr, stats
## lag():    dplyr, stats

library(titanic)
logit_t=glm(formula=Survived ~ Pclass+Sex, family=binomial(link="logit"), data=titanic_train)
probit_t=glm(formula=Survived ~ Fare, family=binomial(link="probit"), data=titanic_train)
lm_t=lm(Survived ~ Fare, data=titanic_train)
summary(probit_t)

## 
## Call:
## glm(formula = Survived ~ Fare, family = binomial(link = "probit"), 
##     data = titanic_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4601  -0.8920  -0.8601   1.3527   1.5811  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.563592   0.056440  -9.986  < 2e-16 ***
## Fare         0.008453   0.001242   6.804 1.02e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.7  on 890  degrees of freedom
## Residual deviance: 1119.9  on 889  degrees of freedom
## AIC: 1123.9
## 
## Number of Fisher Scoring iterations: 6

summary(logit_t)

## 
## Call:
## glm(formula = Survived ~ Pclass + Sex, family = binomial(link = "logit"), 
##     data = titanic_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2030  -0.7036  -0.4519   0.6719   2.1599  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   3.2946     0.2974  11.077   <2e-16 ***
## Pclass       -0.9606     0.1061  -9.057   <2e-16 ***
## Sexmale      -2.6434     0.1838 -14.380   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.7  on 890  degrees of freedom
## Residual deviance:  827.2  on 888  degrees of freedom
## AIC: 833.2
## 
## Number of Fisher Scoring iterations: 4

ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() + 
  stat_smooth(method="lm", se=TRUE)

ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() + 
  stat_smooth(method="glm", method.args=list(family="binomial"(link="logit")), se=TRUE)

ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() + 
  stat_smooth(method="glm", method.args=list(family="binomial"(link="probit")), se=TRUE)

Part II: Multinomial Logit and Probit

install.packages("nnet", repos = "http://cran.us.r-project.org")

## Installing package into 'C:/Users/Alexander/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)

## package 'nnet' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Alexander\AppData\Local\Temp\Rtmp4QcbqE\downloaded_packages

install.packages("Ecdat", repos = "http://cran.us.r-project.org")

## Installing package into 'C:/Users/Alexander/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)

## package 'Ecdat' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\Alexander\AppData\Local\Temp\Rtmp4QcbqE\downloaded_packages

library(nnet)
library(Ecdat)

## Loading required package: Ecfun

## 
## Attaching package: 'Ecfun'

## The following object is masked from 'package:base':
## 
##     sign

## 
## Attaching package: 'Ecdat'

## The following object is masked from 'package:datasets':
## 
##     Orange

Y <- Yogurt
M <- multinom(choice ~., data = Y) #the model

## # weights:  44 (30 variable)
## initial  value 3343.741999 
## iter  10 value 2740.831153
## iter  20 value 2593.985140
## iter  30 value 2563.707966
## final  value 2562.990614 
## converged

summary(M)

## Call:
## multinom(formula = choice ~ ., data = Y)
## 
## Coefficients:
##        (Intercept)            id feat.yoplait feat.dannon feat.hiland
## dannon -0.03520968  0.0001402729   -0.3401250  -0.4537247 -0.06022368
## hiland  4.14419228 -0.0005795527   -0.5042782  -0.5157286  1.07464455
## weight  1.18249094  0.0048663666   -0.2611470  -0.7810408 -1.23737832
##        feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon   0.4865494     0.3675842   -0.7044580    0.1846033   0.13864682
## hiland   1.0134997     0.1305399   -0.4733090   -0.7054197  -0.06976759
## weight   1.1985995     0.4751706   -0.5396113   -0.3682489  -0.05816486
## 
## Std. Errors:
##        (Intercept)          id feat.yoplait feat.dannon feat.hiland
## dannon   0.8822363 0.001870004    0.2239925   0.3140816   0.3276537
## hiland   2.0291901 0.004573466    0.7435141   0.8224360   0.4603284
## weight   1.0029899 0.002120281    0.2688138   0.3821215   0.3936001
##        feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon   0.3276952    0.03461712   0.06710173   0.07237536   0.08089368
## hiland   0.6936778    0.06907950   0.15048557   0.15430184   0.16553123
## weight   0.3360577    0.04614420   0.07464468   0.08106812   0.08519369
## 
## Residual Deviance: 5125.981 
## AIC: 5185.981

exp(coef(M)) #log odds relative to baseline

##        (Intercept)        id feat.yoplait feat.dannon feat.hiland
## dannon    0.965403 1.0001403    0.7116813   0.6352576   0.9415539
## hiland   63.066661 0.9994206    0.6039413   0.5970654   2.9289516
## weight    3.262491 1.0048782    0.7701677   0.4579292   0.2901439
##        feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon    1.626694      1.444241    0.4943765    1.2027413    1.1487183
## hiland    2.755227      1.139443    0.6229375    0.4939013    0.9326105
## weight    3.315470      1.608289    0.5829748    0.6919450    0.9434944

head(fitted(M)) #probabilities of picking each category for each observation

##     yoplait    dannon      hiland    weight
## 1 0.3245838 0.5091098 0.014125195 0.1521812
## 2 0.6018923 0.2850152 0.009748865 0.1033436
## 3 0.5813145 0.3033498 0.010775186 0.1045605
## 4 0.5813145 0.3033498 0.010775186 0.1045605
## 5 0.4133581 0.2930154 0.023418517 0.2702081
## 6 0.4915570 0.2899564 0.027613810 0.1908728

Y$choice <- relevel(Y$choice, ref = "dannon") #set new baseline
M <- multinom(choice ~., data = Y)

## # weights:  44 (30 variable)
## initial  value 3343.741999 
## iter  10 value 2723.422944
## iter  20 value 2610.939896
## iter  30 value 2563.450128
## final  value 2562.990625 
## converged

summary(M)

## Call:
## multinom(formula = choice ~ ., data = Y)
## 
## Coefficients:
##         (Intercept)            id feat.yoplait feat.dannon feat.hiland
## yoplait  0.03458607 -0.0001407410   0.34004481  0.45379140  0.06035367
## hiland   4.17701865 -0.0007153663  -0.16441842 -0.06136236  1.13495466
## weight   1.21648919  0.0047271381   0.07897677 -0.32719222 -1.17703765
##         feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait  -0.4864858    -0.3675775    0.7044753   -0.1845759   -0.1386124
## hiland    0.5276242    -0.2369168    0.2316330   -0.8900015   -0.2088435
## weight    0.7120903     0.1076158    0.1648777   -0.5528200   -0.1967649
## 
## Std. Errors:
##         (Intercept)          id feat.yoplait feat.dannon feat.hiland
## yoplait   0.8822345 0.001870002    0.2239905   0.3140810   0.3276568
## hiland    2.0045405 0.004559142    0.7531383   0.8017917   0.4601638
## weight    0.8945313 0.001968840    0.2807492   0.3197073   0.3823698
##         feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait   0.3276950    0.03461661   0.06710192   0.07237505   0.08089485
## hiland    0.6865561    0.07314327   0.14532253   0.15387341   0.16449378
## weight    0.2871956    0.04632864   0.05918325   0.07819600   0.07641287
## 
## Residual Deviance: 5125.981 
## AIC: 5185.981

z <- summary(M)$coefficients/summary(M)$standard.errors #Wald test for p values
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p

##         (Intercept)         id feat.yoplait feat.dannon feat.hiland
## yoplait   0.9687287 0.94000587    0.1289838   0.1485077 0.853858262
## hiland    0.0371803 0.87531728    0.8271868   0.9389962 0.013647324
## weight    0.1738559 0.01635142    0.7784748   0.3061132 0.002082057
##         feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait  0.13765801   0.000000000  0.000000000 1.076396e-02   0.08662261
## hiland   0.44218515   0.001199163  0.110953132 7.295151e-09   0.20422249
## weight   0.01315811   0.020185676  0.005338194 1.552980e-12   0.01002335

library(stargazer) #a package for making the output into a nice table

## 
## Please cite as:

##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer

M1 <- multinom(choice ~ price.yoplait + price.dannon + price.hiland + price.weight, data = Y) #price model

## # weights:  24 (15 variable)
## initial  value 3343.741999 
## iter  10 value 2696.572347
## iter  20 value 2590.714860
## final  value 2590.678971 
## converged

summary(M1)

## Call:
## multinom(formula = choice ~ price.yoplait + price.dannon + price.hiland + 
##     price.weight, data = Y)
## 
## Coefficients:
##         (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait -0.01730093   -0.37181637    0.6872791   -0.2021630  -0.09475388
## hiland   6.36906160   -0.24038641    0.2362219   -1.1644689  -0.29925903
## weight   1.11884075    0.08901697    0.1897998   -0.4480783  -0.22525465
## 
## Std. Errors:
##         (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait   0.8186591    0.03417663   0.06291827   0.06846684   0.07733802
## hiland    1.6863041    0.07344800   0.13278123   0.11799583   0.14801776
## weight    0.8152734    0.04589639   0.05402714   0.07013378   0.07000950
## 
## Residual Deviance: 5181.358 
## AIC: 5211.358

z1 <- summary(M1)$coefficients/summary(M1)$standard.errors 
p1 <- (1 - pnorm(abs(z1), 0, 1)) * 2
p1

##          (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.9831393611   0.000000000 0.0000000000 3.149938e-03  0.220503123
## hiland  0.0001587699   0.001064581 0.0752347040 0.000000e+00  0.043199301
## weight  0.1699543794   0.052438052 0.0004430004 1.670737e-10  0.001293189

M2 <- multinom(choice ~ feat.yoplait + feat.dannon + feat.hiland + feat.weight, data = Y) #feature model

## # weights:  24 (15 variable)
## initial  value 3343.741999 
## iter  10 value 2811.342546
## iter  20 value 2786.117255
## final  value 2786.109276 
## converged

summary(M2)

## Call:
## multinom(formula = choice ~ feat.yoplait + feat.dannon + feat.hiland + 
##     feat.weight, data = Y)
## 
## Coefficients:
##         (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait  -0.1936272  0.846540156  -0.6273882 -0.02145402  -0.1248708
## hiland   -2.9226341 -0.021333436  -0.2961226  2.48442120   0.5246249
## weight   -0.5541230  0.007649006  -0.5247896 -0.31481937   0.4915248
## 
## Std. Errors:
##         (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait  0.05202093    0.2066922   0.2610760   0.2737980   0.2732667
## hiland   0.15475460    0.7416536   0.7374798   0.3258950   0.6225818
## weight   0.05788051    0.2740806   0.2866666   0.3354511   0.2567316
## 
## Residual Deviance: 5572.219 
## AIC: 5602.219

z2 <- summary(M2)$coefficients/summary(M2)$standard.errors
p2 <- (1 - pnorm(abs(z2), 0, 1)) * 2
p2

##          (Intercept) feat.yoplait feat.dannon  feat.hiland feat.weight
## yoplait 0.0001975709 4.209728e-05  0.01625733 9.375440e-01   0.6477029
## hiland  0.0000000000 9.770523e-01  0.68802757 2.464695e-14   0.3994185
## weight  0.0000000000 9.777356e-01  0.06715108 3.479898e-01   0.0555503

M3 <- multinom(choice ~ feat.weight + price.yoplait + price.dannon + price.hiland, data = Y)

## # weights:  24 (15 variable)
## initial  value 3343.741999 
## iter  10 value 2689.814698
## iter  20 value 2586.232922
## final  value 2586.218053 
## converged

summary(M3)

## Call:
## multinom(formula = choice ~ feat.weight + price.yoplait + price.dannon + 
##     price.hiland, data = Y)
## 
## Coefficients:
##         (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait  -0.4868515  -0.3600374   -0.38592053    0.6645636   -0.1906188
## hiland    4.3842502   0.9043384   -0.24106611    0.2005119   -1.1886847
## weight   -0.1938915   0.9480919    0.08262245    0.1532484   -0.4768760
## 
## Std. Errors:
##         (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait   0.6953676   0.3133467    0.03414217   0.06176640   0.06882024
## hiland    1.4367336   0.6610763    0.07445377   0.12879778   0.11938915
## weight    0.7029393   0.2679710    0.04484957   0.05149647   0.07105390
## 
## Residual Deviance: 5172.436 
## AIC: 5202.436

z3 <- summary(M3)$coefficients/summary(M3)$standard.errors
p3 <- (1 - pnorm(abs(z3), 0, 1)) * 2
p3

##         (Intercept)  feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.483842694 0.2505532970    0.00000000  0.000000000 5.608959e-03
## hiland  0.002276706 0.1713186599    0.00120457  0.119518777 0.000000e+00
## weight  0.782678888 0.0004031105    0.06544402  0.002921295 1.926770e-11

stargazer(M3, type = "text", title = "M3")

## 
## M3
## ===============================================
##                        Dependent variable:     
##                   -----------------------------
##                    yoplait   hiland    weight  
##                      (1)       (2)       (3)   
## -----------------------------------------------
## feat.weight        -0.360     0.904   0.948*** 
##                    (0.313)   (0.661)   (0.268) 
##                                                
## price.yoplait     -0.386*** -0.241***  0.083*  
##                    (0.034)   (0.074)   (0.045) 
##                                                
## price.dannon      0.665***    0.201   0.153*** 
##                    (0.062)   (0.129)   (0.051) 
##                                                
## price.hiland      -0.191*** -1.189*** -0.477***
##                    (0.069)   (0.119)   (0.071) 
##                                                
## Constant           -0.487   4.384***   -0.194  
##                    (0.695)   (1.437)   (0.703) 
##                                                
## -----------------------------------------------
## Akaike Inf. Crit. 5,202.436 5,202.436 5,202.436
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

Part III: Ordinal Logit and Probit

# install.packages("MASS")
library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:Ecdat':
## 
##     SP500

## The following object is masked from 'package:dplyr':
## 
##     select

# Load dataset
data("housing")
head(housing)

##      Sat   Infl  Type Cont Freq
## 1    Low    Low Tower  Low   21
## 2 Medium    Low Tower  Low   21
## 3   High    Low Tower  Low   28
## 4    Low Medium Tower  Low   34
## 5 Medium Medium Tower  Low   22
## 6   High Medium Tower  Low   36

# Modify the original dataset to make each row a observation
housing1 <- housing[rep(row.names(housing), housing$Freq), 1:4]
head(housing1)

##     Sat Infl  Type Cont
## 1   Low  Low Tower  Low
## 1.1 Low  Low Tower  Low
## 1.2 Low  Low Tower  Low
## 1.3 Low  Low Tower  Low
## 1.4 Low  Low Tower  Low
## 1.5 Low  Low Tower  Low

# Fit a ordinal logistic model using the modified dataset
m1 <- polr(Sat ~ ., 
           data = housing1, 
           Hess = TRUE, 
           method = "logistic")
summary(m1)

## Call:
## polr(formula = Sat ~ ., data = housing1, Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                 Value Std. Error t value
## InflMedium     0.5664    0.10465   5.412
## InflHigh       1.2888    0.12716  10.136
## TypeApartment -0.5724    0.11924  -4.800
## TypeAtrium    -0.3662    0.15517  -2.360
## TypeTerrace   -1.0910    0.15149  -7.202
## ContHigh       0.3603    0.09554   3.771
## 
## Intercepts:
##             Value   Std. Error t value
## Low|Medium  -0.4961  0.1248    -3.9739
## Medium|High  0.6907  0.1255     5.5049
## 
## Residual Deviance: 3479.149 
## AIC: 3495.149

# An alternative (and easier) way to fit a ordinal logistic model
m <- polr(Sat ~ Infl + Type + Cont, 
          data = housing, 
          weights = Freq, 
          Hess = TRUE, 
          method = "logistic")
summary(m)

## Call:
## polr(formula = Sat ~ Infl + Type + Cont, data = housing, weights = Freq, 
##     Hess = TRUE, method = "logistic")
## 
## Coefficients:
##                 Value Std. Error t value
## InflMedium     0.5664    0.10465   5.412
## InflHigh       1.2888    0.12716  10.136
## TypeApartment -0.5724    0.11924  -4.800
## TypeAtrium    -0.3662    0.15517  -2.360
## TypeTerrace   -1.0910    0.15149  -7.202
## ContHigh       0.3603    0.09554   3.771
## 
## Intercepts:
##             Value   Std. Error t value
## Low|Medium  -0.4961  0.1248    -3.9739
## Medium|High  0.6907  0.1255     5.5049
## 
## Residual Deviance: 3479.149 
## AIC: 3495.149

# Construct a table to look at p-values
ctable <- coef(summary(m))
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = p)
ctable

##                    Value Std. Error   t value      p value
## InflMedium     0.5663937  0.1046528  5.412123 6.228192e-08
## InflHigh       1.2888191  0.1271561 10.135720 3.835394e-24
## TypeApartment -0.5723501  0.1192380 -4.800064 1.586146e-06
## TypeAtrium    -0.3661866  0.1551733 -2.359855 1.828208e-02
## TypeTerrace   -1.0910149  0.1514860 -7.202083 5.929964e-13
## ContHigh       0.3602841  0.0955358  3.771195 1.624675e-04
## Low|Medium    -0.4961353  0.1248472 -3.973939 7.069368e-05
## Medium|High    0.6907083  0.1254719  5.504882 3.694147e-08

# Construct confidence intervals for estimators
confint.default(m)

##                    2.5 %      97.5 %
## InflMedium     0.3612780  0.77150943
## InflHigh       1.0395976  1.53804054
## TypeApartment -0.8060524 -0.33864793
## TypeAtrium    -0.6703207 -0.06205241
## TypeTerrace   -1.3879221 -0.79410775
## ContHigh       0.1730374  0.54753088

# Now we know that all the predictred estimators are statistically significant, we can try to interpret the result
# Let's find the odds ratio to help interpretation
## odds ratios
exp(coef(m))

##    InflMedium      InflHigh TypeApartment    TypeAtrium   TypeTerrace 
##     1.7619017     3.6284990     0.5641979     0.6933734     0.3358754 
##      ContHigh 
##     1.4337368

# Try with probit!

Lab: Logit and Probit Models

Will Daniel, Wenxin Du, Alex Lloyd-Damnjanovic

November 13, 2018

Part I: Basic Logit and Probit

Part II: Multinomial Logit and Probit

Part III: Ordinal Logit and Probit