Part I: Basic Logit and Probit
install.packages("titanic", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Alexander/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'titanic' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Alexander\AppData\Local\Temp\Rtmp4QcbqE\downloaded_packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(titanic)
logit_t=glm(formula=Survived ~ Pclass+Sex, family=binomial(link="logit"), data=titanic_train)
probit_t=glm(formula=Survived ~ Fare, family=binomial(link="probit"), data=titanic_train)
lm_t=lm(Survived ~ Fare, data=titanic_train)
summary(probit_t)
##
## Call:
## glm(formula = Survived ~ Fare, family = binomial(link = "probit"),
## data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4601 -0.8920 -0.8601 1.3527 1.5811
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.563592 0.056440 -9.986 < 2e-16 ***
## Fare 0.008453 0.001242 6.804 1.02e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 1119.9 on 889 degrees of freedom
## AIC: 1123.9
##
## Number of Fisher Scoring iterations: 6
summary(logit_t)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex, family = binomial(link = "logit"),
## data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2030 -0.7036 -0.4519 0.6719 2.1599
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.2946 0.2974 11.077 <2e-16 ***
## Pclass -0.9606 0.1061 -9.057 <2e-16 ***
## Sexmale -2.6434 0.1838 -14.380 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 827.2 on 888 degrees of freedom
## AIC: 833.2
##
## Number of Fisher Scoring iterations: 4
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="lm", se=TRUE)

ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="glm", method.args=list(family="binomial"(link="logit")), se=TRUE)

ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="glm", method.args=list(family="binomial"(link="probit")), se=TRUE)

Part II: Multinomial Logit and Probit
install.packages("nnet", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Alexander/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'nnet' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Alexander\AppData\Local\Temp\Rtmp4QcbqE\downloaded_packages
install.packages("Ecdat", repos = "http://cran.us.r-project.org")
## Installing package into 'C:/Users/Alexander/Documents/R/win-library/3.4'
## (as 'lib' is unspecified)
## package 'Ecdat' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Alexander\AppData\Local\Temp\Rtmp4QcbqE\downloaded_packages
library(nnet)
library(Ecdat)
## Loading required package: Ecfun
##
## Attaching package: 'Ecfun'
## The following object is masked from 'package:base':
##
## sign
##
## Attaching package: 'Ecdat'
## The following object is masked from 'package:datasets':
##
## Orange
Y <- Yogurt
M <- multinom(choice ~., data = Y) #the model
## # weights: 44 (30 variable)
## initial value 3343.741999
## iter 10 value 2740.831153
## iter 20 value 2593.985140
## iter 30 value 2563.707966
## final value 2562.990614
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
##
## Coefficients:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## dannon -0.03520968 0.0001402729 -0.3401250 -0.4537247 -0.06022368
## hiland 4.14419228 -0.0005795527 -0.5042782 -0.5157286 1.07464455
## weight 1.18249094 0.0048663666 -0.2611470 -0.7810408 -1.23737832
## feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon 0.4865494 0.3675842 -0.7044580 0.1846033 0.13864682
## hiland 1.0134997 0.1305399 -0.4733090 -0.7054197 -0.06976759
## weight 1.1985995 0.4751706 -0.5396113 -0.3682489 -0.05816486
##
## Std. Errors:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## dannon 0.8822363 0.001870004 0.2239925 0.3140816 0.3276537
## hiland 2.0291901 0.004573466 0.7435141 0.8224360 0.4603284
## weight 1.0029899 0.002120281 0.2688138 0.3821215 0.3936001
## feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon 0.3276952 0.03461712 0.06710173 0.07237536 0.08089368
## hiland 0.6936778 0.06907950 0.15048557 0.15430184 0.16553123
## weight 0.3360577 0.04614420 0.07464468 0.08106812 0.08519369
##
## Residual Deviance: 5125.981
## AIC: 5185.981
exp(coef(M)) #log odds relative to baseline
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## dannon 0.965403 1.0001403 0.7116813 0.6352576 0.9415539
## hiland 63.066661 0.9994206 0.6039413 0.5970654 2.9289516
## weight 3.262491 1.0048782 0.7701677 0.4579292 0.2901439
## feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon 1.626694 1.444241 0.4943765 1.2027413 1.1487183
## hiland 2.755227 1.139443 0.6229375 0.4939013 0.9326105
## weight 3.315470 1.608289 0.5829748 0.6919450 0.9434944
head(fitted(M)) #probabilities of picking each category for each observation
## yoplait dannon hiland weight
## 1 0.3245838 0.5091098 0.014125195 0.1521812
## 2 0.6018923 0.2850152 0.009748865 0.1033436
## 3 0.5813145 0.3033498 0.010775186 0.1045605
## 4 0.5813145 0.3033498 0.010775186 0.1045605
## 5 0.4133581 0.2930154 0.023418517 0.2702081
## 6 0.4915570 0.2899564 0.027613810 0.1908728
Y$choice <- relevel(Y$choice, ref = "dannon") #set new baseline
M <- multinom(choice ~., data = Y)
## # weights: 44 (30 variable)
## initial value 3343.741999
## iter 10 value 2723.422944
## iter 20 value 2610.939896
## iter 30 value 2563.450128
## final value 2562.990625
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
##
## Coefficients:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.03458607 -0.0001407410 0.34004481 0.45379140 0.06035367
## hiland 4.17701865 -0.0007153663 -0.16441842 -0.06136236 1.13495466
## weight 1.21648919 0.0047271381 0.07897677 -0.32719222 -1.17703765
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait -0.4864858 -0.3675775 0.7044753 -0.1845759 -0.1386124
## hiland 0.5276242 -0.2369168 0.2316330 -0.8900015 -0.2088435
## weight 0.7120903 0.1076158 0.1648777 -0.5528200 -0.1967649
##
## Std. Errors:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.8822345 0.001870002 0.2239905 0.3140810 0.3276568
## hiland 2.0045405 0.004559142 0.7531383 0.8017917 0.4601638
## weight 0.8945313 0.001968840 0.2807492 0.3197073 0.3823698
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait 0.3276950 0.03461661 0.06710192 0.07237505 0.08089485
## hiland 0.6865561 0.07314327 0.14532253 0.15387341 0.16449378
## weight 0.2871956 0.04632864 0.05918325 0.07819600 0.07641287
##
## Residual Deviance: 5125.981
## AIC: 5185.981
z <- summary(M)$coefficients/summary(M)$standard.errors #Wald test for p values
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.9687287 0.94000587 0.1289838 0.1485077 0.853858262
## hiland 0.0371803 0.87531728 0.8271868 0.9389962 0.013647324
## weight 0.1738559 0.01635142 0.7784748 0.3061132 0.002082057
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait 0.13765801 0.000000000 0.000000000 1.076396e-02 0.08662261
## hiland 0.44218515 0.001199163 0.110953132 7.295151e-09 0.20422249
## weight 0.01315811 0.020185676 0.005338194 1.552980e-12 0.01002335
library(stargazer) #a package for making the output into a nice table
##
## Please cite as:
## Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
M1 <- multinom(choice ~ price.yoplait + price.dannon + price.hiland + price.weight, data = Y) #price model
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2696.572347
## iter 20 value 2590.714860
## final value 2590.678971
## converged
summary(M1)
## Call:
## multinom(formula = choice ~ price.yoplait + price.dannon + price.hiland +
## price.weight, data = Y)
##
## Coefficients:
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait -0.01730093 -0.37181637 0.6872791 -0.2021630 -0.09475388
## hiland 6.36906160 -0.24038641 0.2362219 -1.1644689 -0.29925903
## weight 1.11884075 0.08901697 0.1897998 -0.4480783 -0.22525465
##
## Std. Errors:
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.8186591 0.03417663 0.06291827 0.06846684 0.07733802
## hiland 1.6863041 0.07344800 0.13278123 0.11799583 0.14801776
## weight 0.8152734 0.04589639 0.05402714 0.07013378 0.07000950
##
## Residual Deviance: 5181.358
## AIC: 5211.358
z1 <- summary(M1)$coefficients/summary(M1)$standard.errors
p1 <- (1 - pnorm(abs(z1), 0, 1)) * 2
p1
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.9831393611 0.000000000 0.0000000000 3.149938e-03 0.220503123
## hiland 0.0001587699 0.001064581 0.0752347040 0.000000e+00 0.043199301
## weight 0.1699543794 0.052438052 0.0004430004 1.670737e-10 0.001293189
M2 <- multinom(choice ~ feat.yoplait + feat.dannon + feat.hiland + feat.weight, data = Y) #feature model
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2811.342546
## iter 20 value 2786.117255
## final value 2786.109276
## converged
summary(M2)
## Call:
## multinom(formula = choice ~ feat.yoplait + feat.dannon + feat.hiland +
## feat.weight, data = Y)
##
## Coefficients:
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait -0.1936272 0.846540156 -0.6273882 -0.02145402 -0.1248708
## hiland -2.9226341 -0.021333436 -0.2961226 2.48442120 0.5246249
## weight -0.5541230 0.007649006 -0.5247896 -0.31481937 0.4915248
##
## Std. Errors:
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait 0.05202093 0.2066922 0.2610760 0.2737980 0.2732667
## hiland 0.15475460 0.7416536 0.7374798 0.3258950 0.6225818
## weight 0.05788051 0.2740806 0.2866666 0.3354511 0.2567316
##
## Residual Deviance: 5572.219
## AIC: 5602.219
z2 <- summary(M2)$coefficients/summary(M2)$standard.errors
p2 <- (1 - pnorm(abs(z2), 0, 1)) * 2
p2
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait 0.0001975709 4.209728e-05 0.01625733 9.375440e-01 0.6477029
## hiland 0.0000000000 9.770523e-01 0.68802757 2.464695e-14 0.3994185
## weight 0.0000000000 9.777356e-01 0.06715108 3.479898e-01 0.0555503
M3 <- multinom(choice ~ feat.weight + price.yoplait + price.dannon + price.hiland, data = Y)
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2689.814698
## iter 20 value 2586.232922
## final value 2586.218053
## converged
summary(M3)
## Call:
## multinom(formula = choice ~ feat.weight + price.yoplait + price.dannon +
## price.hiland, data = Y)
##
## Coefficients:
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait -0.4868515 -0.3600374 -0.38592053 0.6645636 -0.1906188
## hiland 4.3842502 0.9043384 -0.24106611 0.2005119 -1.1886847
## weight -0.1938915 0.9480919 0.08262245 0.1532484 -0.4768760
##
## Std. Errors:
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.6953676 0.3133467 0.03414217 0.06176640 0.06882024
## hiland 1.4367336 0.6610763 0.07445377 0.12879778 0.11938915
## weight 0.7029393 0.2679710 0.04484957 0.05149647 0.07105390
##
## Residual Deviance: 5172.436
## AIC: 5202.436
z3 <- summary(M3)$coefficients/summary(M3)$standard.errors
p3 <- (1 - pnorm(abs(z3), 0, 1)) * 2
p3
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.483842694 0.2505532970 0.00000000 0.000000000 5.608959e-03
## hiland 0.002276706 0.1713186599 0.00120457 0.119518777 0.000000e+00
## weight 0.782678888 0.0004031105 0.06544402 0.002921295 1.926770e-11
stargazer(M3, type = "text", title = "M3")
##
## M3
## ===============================================
## Dependent variable:
## -----------------------------
## yoplait hiland weight
## (1) (2) (3)
## -----------------------------------------------
## feat.weight -0.360 0.904 0.948***
## (0.313) (0.661) (0.268)
##
## price.yoplait -0.386*** -0.241*** 0.083*
## (0.034) (0.074) (0.045)
##
## price.dannon 0.665*** 0.201 0.153***
## (0.062) (0.129) (0.051)
##
## price.hiland -0.191*** -1.189*** -0.477***
## (0.069) (0.119) (0.071)
##
## Constant -0.487 4.384*** -0.194
## (0.695) (1.437) (0.703)
##
## -----------------------------------------------
## Akaike Inf. Crit. 5,202.436 5,202.436 5,202.436
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Part III: Ordinal Logit and Probit
# install.packages("MASS")
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:Ecdat':
##
## SP500
## The following object is masked from 'package:dplyr':
##
## select
# Load dataset
data("housing")
head(housing)
## Sat Infl Type Cont Freq
## 1 Low Low Tower Low 21
## 2 Medium Low Tower Low 21
## 3 High Low Tower Low 28
## 4 Low Medium Tower Low 34
## 5 Medium Medium Tower Low 22
## 6 High Medium Tower Low 36
# Modify the original dataset to make each row a observation
housing1 <- housing[rep(row.names(housing), housing$Freq), 1:4]
head(housing1)
## Sat Infl Type Cont
## 1 Low Low Tower Low
## 1.1 Low Low Tower Low
## 1.2 Low Low Tower Low
## 1.3 Low Low Tower Low
## 1.4 Low Low Tower Low
## 1.5 Low Low Tower Low
# Fit a ordinal logistic model using the modified dataset
m1 <- polr(Sat ~ .,
data = housing1,
Hess = TRUE,
method = "logistic")
summary(m1)
## Call:
## polr(formula = Sat ~ ., data = housing1, Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## InflMedium 0.5664 0.10465 5.412
## InflHigh 1.2888 0.12716 10.136
## TypeApartment -0.5724 0.11924 -4.800
## TypeAtrium -0.3662 0.15517 -2.360
## TypeTerrace -1.0910 0.15149 -7.202
## ContHigh 0.3603 0.09554 3.771
##
## Intercepts:
## Value Std. Error t value
## Low|Medium -0.4961 0.1248 -3.9739
## Medium|High 0.6907 0.1255 5.5049
##
## Residual Deviance: 3479.149
## AIC: 3495.149
# An alternative (and easier) way to fit a ordinal logistic model
m <- polr(Sat ~ Infl + Type + Cont,
data = housing,
weights = Freq,
Hess = TRUE,
method = "logistic")
summary(m)
## Call:
## polr(formula = Sat ~ Infl + Type + Cont, data = housing, weights = Freq,
## Hess = TRUE, method = "logistic")
##
## Coefficients:
## Value Std. Error t value
## InflMedium 0.5664 0.10465 5.412
## InflHigh 1.2888 0.12716 10.136
## TypeApartment -0.5724 0.11924 -4.800
## TypeAtrium -0.3662 0.15517 -2.360
## TypeTerrace -1.0910 0.15149 -7.202
## ContHigh 0.3603 0.09554 3.771
##
## Intercepts:
## Value Std. Error t value
## Low|Medium -0.4961 0.1248 -3.9739
## Medium|High 0.6907 0.1255 5.5049
##
## Residual Deviance: 3479.149
## AIC: 3495.149
# Construct a table to look at p-values
ctable <- coef(summary(m))
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
ctable <- cbind(ctable, "p value" = p)
ctable
## Value Std. Error t value p value
## InflMedium 0.5663937 0.1046528 5.412123 6.228192e-08
## InflHigh 1.2888191 0.1271561 10.135720 3.835394e-24
## TypeApartment -0.5723501 0.1192380 -4.800064 1.586146e-06
## TypeAtrium -0.3661866 0.1551733 -2.359855 1.828208e-02
## TypeTerrace -1.0910149 0.1514860 -7.202083 5.929964e-13
## ContHigh 0.3602841 0.0955358 3.771195 1.624675e-04
## Low|Medium -0.4961353 0.1248472 -3.973939 7.069368e-05
## Medium|High 0.6907083 0.1254719 5.504882 3.694147e-08
# Construct confidence intervals for estimators
confint.default(m)
## 2.5 % 97.5 %
## InflMedium 0.3612780 0.77150943
## InflHigh 1.0395976 1.53804054
## TypeApartment -0.8060524 -0.33864793
## TypeAtrium -0.6703207 -0.06205241
## TypeTerrace -1.3879221 -0.79410775
## ContHigh 0.1730374 0.54753088
# Now we know that all the predictred estimators are statistically significant, we can try to interpret the result
# Let's find the odds ratio to help interpretation
## odds ratios
exp(coef(m))
## InflMedium InflHigh TypeApartment TypeAtrium TypeTerrace
## 1.7619017 3.6284990 0.5641979 0.6933734 0.3358754
## ContHigh
## 1.4337368
# Try with probit!