install.packages("titanic", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/cc/1pvgy4s11875jgp6h1fml9wr0000gn/T//Rtmp1ZBJsG/downloaded_packages
library(tidyverse)
library(titanic)
# LINEAR MODEL
lm_t=lm(Survived ~ Fare, data=titanic_train)
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="lm", se=TRUE)
# LOGIT MODEL
logit_t=glm(formula=Survived ~ Pclass+Sex, family=binomial(link="logit"), data=titanic_train)
summary(logit_t)
##
## Call:
## glm(formula = Survived ~ Pclass + Sex, family = binomial(link = "logit"),
## data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2030 -0.7036 -0.4519 0.6719 2.1599
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.2946 0.2974 11.077 <2e-16 ***
## Pclass -0.9606 0.1061 -9.057 <2e-16 ***
## Sexmale -2.6434 0.1838 -14.380 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 827.2 on 888 degrees of freedom
## AIC: 833.2
##
## Number of Fisher Scoring iterations: 4
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="glm", method.args=list(family="binomial"(link="logit")), se=TRUE)
# PROBIT MODEL
probit_t=glm(formula=Survived ~ Fare, family=binomial(link="probit"), data=titanic_train)
summary(probit_t)
##
## Call:
## glm(formula = Survived ~ Fare, family = binomial(link = "probit"),
## data = titanic_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4601 -0.8920 -0.8601 1.3527 1.5811
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.563592 0.056440 -9.986 < 2e-16 ***
## Fare 0.008453 0.001242 6.804 1.02e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.7 on 890 degrees of freedom
## Residual deviance: 1119.9 on 889 degrees of freedom
## AIC: 1123.9
##
## Number of Fisher Scoring iterations: 6
ggplot(titanic_train, aes(x=Fare, y=Survived)) + geom_point() +
stat_smooth(method="glm", method.args=list(family="binomial"(link="probit")), se=TRUE)
install.packages("nnet", repos = "http://cran.us.r-project.org")
##
## There is a binary version available but the source version is
## later:
## binary source needs_compilation
## nnet 7.3-12 7.3-13 TRUE
install.packages("Ecdat", repos = "http://cran.us.r-project.org")
library(nnet)
library(Ecdat)
Y <- Yogurt
M <- multinom(choice ~., data = Y) #the model
## # weights: 44 (30 variable)
## initial value 3343.741999
## iter 10 value 2740.831153
## iter 20 value 2593.985140
## iter 30 value 2563.707966
## final value 2562.990614
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
##
## Coefficients:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## dannon -0.03520968 0.0001402729 -0.3401250 -0.4537247 -0.06022368
## hiland 4.14419228 -0.0005795527 -0.5042782 -0.5157286 1.07464455
## weight 1.18249094 0.0048663666 -0.2611470 -0.7810408 -1.23737832
## feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon 0.4865494 0.3675842 -0.7044580 0.1846033 0.13864682
## hiland 1.0134997 0.1305399 -0.4733090 -0.7054197 -0.06976759
## weight 1.1985995 0.4751706 -0.5396113 -0.3682489 -0.05816486
##
## Std. Errors:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## dannon 0.8822363 0.001870004 0.2239925 0.3140816 0.3276537
## hiland 2.0291901 0.004573466 0.7435141 0.8224360 0.4603284
## weight 1.0029899 0.002120281 0.2688138 0.3821215 0.3936001
## feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon 0.3276952 0.03461712 0.06710173 0.07237536 0.08089368
## hiland 0.6936778 0.06907950 0.15048557 0.15430184 0.16553123
## weight 0.3360577 0.04614420 0.07464468 0.08106812 0.08519369
##
## Residual Deviance: 5125.981
## AIC: 5185.981
exp(coef(M)) #log odds relative to baseline
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## dannon 0.965403 1.0001403 0.7116813 0.6352576 0.9415539
## hiland 63.066661 0.9994206 0.6039413 0.5970654 2.9289516
## weight 3.262491 1.0048782 0.7701677 0.4579292 0.2901439
## feat.weight price.yoplait price.dannon price.hiland price.weight
## dannon 1.626694 1.444241 0.4943765 1.2027413 1.1487183
## hiland 2.755227 1.139443 0.6229375 0.4939013 0.9326105
## weight 3.315470 1.608289 0.5829748 0.6919450 0.9434944
head(fitted(M)) #probabilities of picking each category for each observation
## yoplait dannon hiland weight
## 1 0.3245838 0.5091098 0.014125195 0.1521812
## 2 0.6018923 0.2850152 0.009748865 0.1033436
## 3 0.5813145 0.3033498 0.010775186 0.1045605
## 4 0.5813145 0.3033498 0.010775186 0.1045605
## 5 0.4133581 0.2930154 0.023418517 0.2702081
## 6 0.4915570 0.2899564 0.027613810 0.1908728
Y$choice <- relevel(Y$choice, ref = "dannon") #set new baseline
# FULL MODEL
M <- multinom(choice ~., data = Y)
## # weights: 44 (30 variable)
## initial value 3343.741999
## iter 10 value 2723.422944
## iter 20 value 2610.939896
## iter 30 value 2563.450128
## final value 2562.990625
## converged
summary(M)
## Call:
## multinom(formula = choice ~ ., data = Y)
##
## Coefficients:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.03458607 -0.0001407410 0.34004481 0.45379140 0.06035367
## hiland 4.17701865 -0.0007153663 -0.16441842 -0.06136236 1.13495466
## weight 1.21648919 0.0047271381 0.07897677 -0.32719222 -1.17703765
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait -0.4864858 -0.3675775 0.7044753 -0.1845759 -0.1386124
## hiland 0.5276242 -0.2369168 0.2316330 -0.8900015 -0.2088435
## weight 0.7120903 0.1076158 0.1648777 -0.5528200 -0.1967649
##
## Std. Errors:
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.8822345 0.001870002 0.2239905 0.3140810 0.3276568
## hiland 2.0045405 0.004559142 0.7531383 0.8017917 0.4601638
## weight 0.8945313 0.001968840 0.2807492 0.3197073 0.3823698
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait 0.3276950 0.03461661 0.06710192 0.07237505 0.08089485
## hiland 0.6865561 0.07314327 0.14532253 0.15387341 0.16449378
## weight 0.2871956 0.04632864 0.05918325 0.07819600 0.07641287
##
## Residual Deviance: 5125.981
## AIC: 5185.981
z <- summary(M)$coefficients/summary(M)$standard.errors #Wald test for p values
p <- (1 - pnorm(abs(z), 0, 1)) * 2
p
## (Intercept) id feat.yoplait feat.dannon feat.hiland
## yoplait 0.9687287 0.94000587 0.1289838 0.1485077 0.853858262
## hiland 0.0371803 0.87531728 0.8271868 0.9389962 0.013647324
## weight 0.1738559 0.01635142 0.7784748 0.3061132 0.002082057
## feat.weight price.yoplait price.dannon price.hiland price.weight
## yoplait 0.13765801 0.000000000 0.000000000 1.076396e-02 0.08662261
## hiland 0.44218515 0.001199163 0.110953132 7.295151e-09 0.20422249
## weight 0.01315811 0.020185676 0.005338194 1.552980e-12 0.01002335
library(stargazer) #a package for making the output into a nice table
# MODEL 1
M1 <- multinom(choice ~ price.yoplait + price.dannon + price.hiland + price.weight, data = Y) #price model
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2696.572347
## iter 20 value 2590.714860
## final value 2590.678971
## converged
summary(M1)
## Call:
## multinom(formula = choice ~ price.yoplait + price.dannon + price.hiland +
## price.weight, data = Y)
##
## Coefficients:
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait -0.01730093 -0.37181637 0.6872791 -0.2021630 -0.09475388
## hiland 6.36906160 -0.24038641 0.2362219 -1.1644689 -0.29925903
## weight 1.11884075 0.08901697 0.1897998 -0.4480783 -0.22525465
##
## Std. Errors:
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.8186591 0.03417663 0.06291827 0.06846684 0.07733802
## hiland 1.6863041 0.07344800 0.13278123 0.11799583 0.14801776
## weight 0.8152734 0.04589639 0.05402714 0.07013378 0.07000950
##
## Residual Deviance: 5181.358
## AIC: 5211.358
z1 <- summary(M1)$coefficients/summary(M1)$standard.errors
p1 <- (1 - pnorm(abs(z1), 0, 1)) * 2
p1
## (Intercept) price.yoplait price.dannon price.hiland price.weight
## yoplait 0.9831393611 0.000000000 0.0000000000 3.149938e-03 0.220503123
## hiland 0.0001587699 0.001064581 0.0752347040 0.000000e+00 0.043199301
## weight 0.1699543794 0.052438052 0.0004430004 1.670737e-10 0.001293189
# MODEL 2
M2 <- multinom(choice ~ feat.yoplait + feat.dannon + feat.hiland + feat.weight, data = Y) #feature model
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2811.342546
## iter 20 value 2786.117255
## final value 2786.109276
## converged
summary(M2)
## Call:
## multinom(formula = choice ~ feat.yoplait + feat.dannon + feat.hiland +
## feat.weight, data = Y)
##
## Coefficients:
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait -0.1936272 0.846540156 -0.6273882 -0.02145402 -0.1248708
## hiland -2.9226341 -0.021333436 -0.2961226 2.48442120 0.5246249
## weight -0.5541230 0.007649006 -0.5247896 -0.31481937 0.4915248
##
## Std. Errors:
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait 0.05202093 0.2066922 0.2610760 0.2737980 0.2732667
## hiland 0.15475460 0.7416536 0.7374798 0.3258950 0.6225818
## weight 0.05788051 0.2740806 0.2866666 0.3354511 0.2567316
##
## Residual Deviance: 5572.219
## AIC: 5602.219
z2 <- summary(M2)$coefficients/summary(M2)$standard.errors
p2 <- (1 - pnorm(abs(z2), 0, 1)) * 2
p2
## (Intercept) feat.yoplait feat.dannon feat.hiland feat.weight
## yoplait 0.0001975709 4.209728e-05 0.01625733 9.375440e-01 0.6477029
## hiland 0.0000000000 9.770523e-01 0.68802757 2.464695e-14 0.3994185
## weight 0.0000000000 9.777356e-01 0.06715108 3.479898e-01 0.0555503
# MODEL 3
M3 <- multinom(choice ~ feat.weight + price.yoplait + price.dannon + price.hiland, data = Y)
## # weights: 24 (15 variable)
## initial value 3343.741999
## iter 10 value 2689.814698
## iter 20 value 2586.232922
## final value 2586.218053
## converged
summary(M3)
## Call:
## multinom(formula = choice ~ feat.weight + price.yoplait + price.dannon +
## price.hiland, data = Y)
##
## Coefficients:
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait -0.4868515 -0.3600374 -0.38592053 0.6645636 -0.1906188
## hiland 4.3842502 0.9043384 -0.24106611 0.2005119 -1.1886847
## weight -0.1938915 0.9480919 0.08262245 0.1532484 -0.4768760
##
## Std. Errors:
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.6953676 0.3133467 0.03414217 0.06176640 0.06882024
## hiland 1.4367336 0.6610763 0.07445377 0.12879778 0.11938915
## weight 0.7029393 0.2679710 0.04484957 0.05149647 0.07105390
##
## Residual Deviance: 5172.436
## AIC: 5202.436
z3 <- summary(M3)$coefficients/summary(M3)$standard.errors
p3 <- (1 - pnorm(abs(z3), 0, 1)) * 2
p3
## (Intercept) feat.weight price.yoplait price.dannon price.hiland
## yoplait 0.483842694 0.2505532970 0.00000000 0.000000000 5.608959e-03
## hiland 0.002276706 0.1713186599 0.00120457 0.119518777 0.000000e+00
## weight 0.782678888 0.0004031105 0.06544402 0.002921295 1.926770e-11
stargazer(M3, type = "text", title = "M3")
##
## M3
## ===============================================
## Dependent variable:
## -----------------------------
## yoplait hiland weight
## (1) (2) (3)
## -----------------------------------------------
## feat.weight -0.360 0.904 0.948***
## (0.313) (0.661) (0.268)
##
## price.yoplait -0.386*** -0.241*** 0.083*
## (0.034) (0.074) (0.045)
##
## price.dannon 0.665*** 0.201 0.153***
## (0.062) (0.129) (0.051)
##
## price.hiland -0.191*** -1.189*** -0.477***
## (0.069) (0.119) (0.071)
##
## Constant -0.487 4.384*** -0.194
## (0.695) (1.437) (0.703)
##
## -----------------------------------------------
## Akaike Inf. Crit. 5,202.436 5,202.436 5,202.436
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Reference: UCLA Institution For Digital Research and Education
A study looks at factors that influence the decision of whether to apply to graduate school. College juniors are asked if they are unlikely, somewhat likely, or very likely to apply to graduate school. Hence, our outcome variable has three categories. Data on parental educational status, whether the undergraduate institution is public or private, and current GPA is also collected. The researchers have reason to believe that the “distances” between these three points are not equal. For example, the “distance” between “unlikely” and “somewhat likely” may be shorter than the distance between “somewhat likely” and “very likely”.
## ORDINAL LOGISTIC REGRESSION
#install.packages("foreign")
library(foreign)
#import graduate school data
dat <- read.dta("https://stats.idre.ucla.edu/stat/data/ologit.dta")
head(dat)
## apply pared public gpa
## 1 very likely 0 0 3.26
## 2 somewhat likely 1 0 3.21
## 3 unlikely 1 1 3.94
## 4 somewhat likely 0 0 2.81
## 5 somewhat likely 0 0 2.53
## 6 unlikely 0 1 2.59
# create a viz comparing the apply with gpa, pared, and public
library(tidyverse)
ggplot(dat, aes(x = apply, y = gpa)) +
geom_boxplot(size = .75) +
geom_jitter(alpha = .5) +
facet_grid(pared~ public) +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1))
#flatten table
ftable(xtabs(~ public + apply + pared, data = dat))
## pared 0 1
## public apply
## 0 unlikely 175 14
## somewhat likely 98 26
## very likely 20 10
## 1 unlikely 25 6
## somewhat likely 12 4
## very likely 7 3
# fit the model
library(MASS)
m <- polr(apply ~ pared + public + gpa, data = dat, Hess=TRUE)
summary(m)
## Call:
## polr(formula = apply ~ pared + public + gpa, data = dat, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## pared 1.04769 0.2658 3.9418
## public -0.05879 0.2979 -0.1974
## gpa 0.61594 0.2606 2.3632
##
## Intercepts:
## Value Std. Error t value
## unlikely|somewhat likely 2.2039 0.7795 2.8272
## somewhat likely|very likely 4.2994 0.8043 5.3453
##
## Residual Deviance: 717.0249
## AIC: 727.0249
coef(m)
## pared public gpa
## 1.04769010 -0.05878572 0.61594057
# store the coefficients
ctable <- coef(summary(m))
ctable
## Value Std. Error t value
## pared 1.04769010 0.2657894 3.9418050
## public -0.05878572 0.2978614 -0.1973593
## gpa 0.61594057 0.2606340 2.3632399
## unlikely|somewhat likely 2.20391473 0.7795455 2.8271792
## somewhat likely|very likely 4.29936315 0.8043267 5.3452947
# calculate and store p values
p <- pnorm(abs(ctable[, "t value"]), lower.tail = FALSE) * 2
p
## pared public
## 8.087072e-05 8.435464e-01
## gpa unlikely|somewhat likely
## 1.811594e-02 4.696004e-03
## somewhat likely|very likely
## 9.027008e-08
# combined table
ctable <- cbind(ctable, "p value" = p)
ctable
## Value Std. Error t value p value
## pared 1.04769010 0.2657894 3.9418050 8.087072e-05
## public -0.05878572 0.2978614 -0.1973593 8.435464e-01
## gpa 0.61594057 0.2606340 2.3632399 1.811594e-02
## unlikely|somewhat likely 2.20391473 0.7795455 2.8271792 4.696004e-03
## somewhat likely|very likely 4.29936315 0.8043267 5.3452947 9.027008e-08
# confidence intervals from profiles
ci <- confint(m)
ci
## 2.5 % 97.5 %
## pared 0.5281768 1.5721750
## public -0.6522060 0.5191384
## gpa 0.1076202 1.1309148
# odds ratios
exp(coef(m))
## pared public gpa
## 2.8510579 0.9429088 1.8513972
# OR and CI
exp(cbind(OR = coef(m), ci))
## OR 2.5 % 97.5 %
## pared 2.8510579 1.6958376 4.817114
## public 0.9429088 0.5208954 1.680579
## gpa 1.8513972 1.1136247 3.098490