Libraries

rm(list=ls()) 
library(openxlsx)               # load package; to read xlsx file
library(sandwich)               # load package; to compute heteroskedasticity robust standard errors
library(lmtest)                 # load package; to conduct hypothesis test using robust SE
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(car)              # load package; to conduct hypothesis for multiple coefficients
## Loading required package: carData
library(stargazer)        # load package; to put regression results into a single stargazer table
## 
## Please cite as:
##  Hlavac, Marek (2018). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.2. https://CRAN.R-project.org/package=stargazer
library(plm) 


library(openxlsx)
id <- "1eVyfH-AhYRm4oMtZUuQos8zfS9YLvN8t"
smoke <- read.xlsx(sprintf("https://docs.google.com/uc?id=%s&export=download",id),
                 sheet=1,startRow=1,colNames=TRUE,rowNames=FALSE)
attach(smoke)

Description of the dataset: https://drive.google.com/file/d/1_xaHYpIKO7A0CPhFyU_OzMnMkI7FP8uz/view?usp=sharing
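A note on attach(smoke): it lets the variables be referenced directly in the chunks below, but it can silently mask other objects with the same names. An equivalent, more defensive pattern (a sketch only; fit.check is a hypothetical name not used elsewhere) passes the data frame explicitly:

# Same simple regression without attach(): variables stay scoped to the data frame
fit.check <- lm(smoker ~ smkban, data = smoke)
summary(fit.check)$coefficients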

Question 11.2 Estimate the probability of smoking for (i) all workers, (ii) workers affected by workplace smoking bans, and (iii) workers not affected by workplace smoking bans.

Question A

  (i) Probability of smoking for all workers
n <- dim(smoke)[1]
prob.smoke <- sum(smoker)/n
prob.smoke
## [1] 0.2423
  (ii) Probability of smoking for workers affected by a workplace smoking ban
sub.ii <- subset(smoker, smkban==1)        # smoking status of workers subject to a ban

prob.asmoke <- sum(sub.ii)/sum(smkban)     # share of those workers who smoke
prob.asmoke
## [1] 0.2120367
  (iii) Probability of smoking for workers not affected by a workplace smoking ban
sub.iii <- subset(smoker, smkban==0)       # smoking status of workers not subject to a ban
prob.nsmoke <- sum(sub.iii)/(n - sum(smkban))
prob.nsmoke
## [1] 0.2895951
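The same three probabilities can be cross-checked in one step with mean(), since smoker is a 0/1 variable:

# Unconditional and conditional smoking rates (should match the values above)
mean(smoker)                  # (i) all workers
mean(smoker[smkban == 1])     # (ii) workers affected by a ban
mean(smoker[smkban == 0])     # (iii) workers not affected by a ban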

Question B

What is the difference in the probability of smoking between workers affected by a work place smoking ban and workers not affected by a workplace smoking ban? Use a linear probability model to determine whether this difference is statistically significant.

dif.prob.smoke <- prob.asmoke - prob.nsmoke #difference
dif.prob.smoke # Difference in Prob
## [1] -0.07755835
model1 <- smoker~smkban #regression

fit.smoker <- lm(model1) #OLS 
coeftest(fit.smoker, vcov=vcovHC, type="HC1") #Heteroskedasticity Robust Error
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept)  0.2895951  0.0072619 39.8789 < 2.2e-16 ***
## smkban      -0.0775583  0.0089520 -8.6638 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Answer: The coefficient on smkban is statistically significant at the 1% level, and the estimated difference is about -7.8 percentage points: workers subject to a workplace smoking ban are less likely to smoke.
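By construction, the OLS estimates reproduce the conditional means: the intercept is the smoking rate without a ban and the slope on smkban is exactly the difference computed above. A quick check:

# The regression slope should equal the raw difference in smoking rates
coef(fit.smoker)["smkban"]
dif.prob.smoke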


Question C

Estimate a linear probability model with smoker as the dependent variable and the following regressors: smkban, female, age, age2, hsdrop, hsgrad, colsome, colgrad, black, and Hispanic. Compare the estimated effect of a smoking ban from this regression with your answer from (b). Suggest an explanation, based on the substance of this regression, for the change in the estimated effect of a smoking ban between (b) and (c).

model2 <- (smoker~smkban + female+ age+ I(age^2)+ hsdrop+ hsgrad+ colsome+ colgrad+ black+ hispanic) #regression

fit.smoker2 <- lm(model2) #OLS 
coeftest(fit.smoker2, vcov=vcovHC, type="HC1") #Heteroskedasticity Robust Error
## 
## t test of coefficients:
## 
##                Estimate  Std. Error t value  Pr(>|t|)    
## (Intercept) -1.4110e-02  4.1423e-02 -0.3406 0.7333890    
## smkban      -4.7240e-02  8.9661e-03 -5.2687 1.402e-07 ***
## female      -3.3257e-02  8.5683e-03 -3.8814 0.0001045 ***
## age          9.6744e-03  1.8954e-03  5.1040 3.386e-07 ***
## I(age^2)    -1.3180e-04  2.1905e-05 -6.0169 1.841e-09 ***
## hsdrop       3.2271e-01  1.9489e-02 16.5592 < 2.2e-16 ***
## hsgrad       2.3270e-01  1.2590e-02 18.4826 < 2.2e-16 ***
## colsome      1.6430e-01  1.2625e-02 13.0138 < 2.2e-16 ***
## colgrad      4.4798e-02  1.2044e-02  3.7196 0.0002006 ***
## black       -2.7566e-02  1.6078e-02 -1.7144 0.0864772 .  
## hispanic    -1.0482e-01  1.3975e-02 -7.5004 6.905e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Answer:

Answer: The short regression (model1) implies a difference of about 7.8 percentage points, while the long regression (model2) implies about 4.7 percentage points. A likely explanation is omitted variable bias in (b): workers subject to a smoking ban differ systematically from workers who are not, for example in education and age, and those characteristics also predict smoking. Controlling for them reduces the estimated effect of the ban.
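Since stargazer is already loaded, the two linear probability models can also be placed side by side to make the drop in the smkban coefficient visible. A minimal sketch (plain-text output; heteroskedasticity-robust standard errors would have to be supplied separately via the se argument):

# Short vs. long linear probability model in one text table
stargazer(fit.smoker, fit.smoker2, type = "text",
          keep.stat = c("n", "rsq"))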


Question D

Test the hypothesis that the coefficient on smkban is 0 in the population version of the regression in (c) against the alternative that it is nonzero, at the 5% significance level.

Answer: The t-statistic on smkban is already reported in the table above: t = -5.27. Since |t| = 5.27 > 1.96, we reject the null hypothesis that the coefficient is zero at the 5% significance level.
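The same conclusion can be read off a robust 95% confidence interval for the smkban coefficient, which should not contain zero. A sketch using coefci() from lmtest:

# Heteroskedasticity-robust 95% CI for the smkban coefficient in the long regression
coefci(fit.smoker2, parm = "smkban", vcov. = vcovHC(fit.smoker2, type = "HC1"))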


Question E

Test the hypothesis that the probability of smoking does not depend on the level of education in the regression in (c). Does the probability of smoking increase or decrease with the level of education?

linearHypothesis(fit.smoker2, c("hsdrop=0","hsgrad=0","colsome=0","colgrad=0"),white.adjust=c("hc1"))
## Linear hypothesis test
## 
## Hypothesis:
## hsdrop = 0
## hsgrad = 0
## colsome = 0
## colgrad = 0
## 
## Model 1: restricted model
## Model 2: smoker ~ smkban + female + age + I(age^2) + hsdrop + hsgrad + 
##     colsome + colgrad + black + hispanic
## 
## Note: Coefficient covariance matrix supplied.
## 
##   Res.Df Df      F    Pr(>F)    
## 1   9993                        
## 2   9989  4 140.09 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Answer: The omitted education category is a master's degree or higher. All of the included education dummies, which represent lower levels of education, have positive coefficients, and the coefficients shrink as education rises, so the probability of smoking decreases with the level of education. The F-test above strongly rejects the hypothesis that the education coefficients are jointly zero, so smoking does depend on education.
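A complementary check is whether the education categories differ from one another, rather than merely jointly from zero. A sketch of that test with the same robust adjustment:

# Test equality of the education coefficients (chain of pairwise restrictions)
linearHypothesis(fit.smoker2,
                 c("hsdrop = hsgrad", "hsgrad = colsome", "colsome = colgrad"),
                 white.adjust = "hc1")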


Question F

Repeat (c)-(e) using a probit model.

  (c) The probit coefficient on smkban is -0.159. Unlike the linear probability model coefficient (-0.047), it is not itself a change in probability: it shifts the latent index inside the normal CDF, so the implied effect on the smoking probability has to be computed from predicted probabilities and varies with the other regressors (see the sketch after the regression output below). The sign and strong statistical significance, however, match the linear model.
model.prob1 <- smoker~smkban + female+ age+ I(age^2)+ hsdrop+ hsgrad+ colsome+ colgrad+ black+ hispanic #regression

fit.smoker.p1 <- glm(model.prob1, data = smoke, family= binomial(link="probit")) #Probit
coeftest(fit.smoker.p1, vcov=vcovHC, type="HC1") #Heteroskedasticity Robust Error
## 
## z test of coefficients:
## 
##                Estimate  Std. Error  z value  Pr(>|z|)    
## (Intercept) -1.7349e+00  1.5168e-01 -11.4384 < 2.2e-16 ***
## smkban      -1.5863e-01  2.9176e-02  -5.4370 5.419e-08 ***
## female      -1.1173e-01  2.8866e-02  -3.8707 0.0001085 ***
## age          3.4511e-02  6.8549e-03   5.0346 4.789e-07 ***
## I(age^2)    -4.6754e-04  8.2487e-05  -5.6681 1.444e-08 ***
## hsdrop       1.1416e+00  7.3449e-02  15.5428 < 2.2e-16 ***
## hsgrad       8.8267e-01  6.0664e-02  14.5502 < 2.2e-16 ***
## colsome      6.7712e-01  6.1681e-02  10.9777 < 2.2e-16 ***
## colgrad      2.3468e-01  6.5578e-02   3.5787 0.0003453 ***
## black       -8.4279e-02  5.3828e-02  -1.5657 0.1174193    
## hispanic    -3.3827e-01  5.0155e-02  -6.7446 1.534e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
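To translate the probit coefficient into a probability change, one can compare average predicted probabilities with the ban switched on and off for every worker. A minimal sketch (smoke.ban and smoke.noban are hypothetical copies of the data used only here):

# Average effect of the ban implied by the probit model
smoke.ban   <- transform(smoke, smkban = 1)
smoke.noban <- transform(smoke, smkban = 0)
mean(predict(fit.smoker.p1, newdata = smoke.ban,   type = "response") -
     predict(fit.smoker.p1, newdata = smoke.noban, type = "response"))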
  (d) From the table, the z-statistic on smkban is -5.44; since |z| > 1.96, we reject the null hypothesis that the coefficient is zero at the 5% significance level.
linearHypothesis(fit.smoker.p1, c("hsdrop=0","hsgrad=0","colsome=0","colgrad=0"),
white.adjust=c("hc1"))
## Linear hypothesis test
## 
## Hypothesis:
## hsdrop = 0
## hsgrad = 0
## colsome = 0
## colgrad = 0
## 
## Model 1: restricted model
## Model 2: smoker ~ smkban + female + age + I(age^2) + hsdrop + hsgrad + 
##     colsome + colgrad + black + hispanic
## 
##   Res.Df Df  Chisq Pr(>Chisq)    
## 1   9993                         
## 2   9989  4 461.12  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  (e) The omitted education category is a master's degree or higher. The included education dummies all have positive coefficients, and the coefficients are larger for lower levels of education, so the probability of smoking falls as education rises; the chi-squared test above rejects the hypothesis that the education coefficients are jointly zero, so education matters for smoking.

Question G

Repeat (c)-(e) using a logit model.

  (c) The logit estimates are reported below. The coefficients are larger in absolute value than the probit coefficients (the logistic distribution has a larger scale), but the signs, statistical significance, and implied probabilities are very similar.
model.logi1 <- smoker~smkban + female+ age+ I(age^2)+ hsdrop+ hsgrad+ colsome+ colgrad+ black+ hispanic #regression

fit.smoker.g1 <- glm(model.logi1, data = smoke, family= binomial(link="logit")) #Logit
coeftest(fit.smoker.g1, vcov=vcovHC, type="HC1") #Heteroskedasticity Robust Error
## 
## z test of coefficients:
## 
##                Estimate  Std. Error  z value  Pr(>|z|)    
## (Intercept) -2.99918220  0.26537659 -11.3016 < 2.2e-16 ***
## smkban      -0.26202867  0.04950196  -5.2933 1.201e-07 ***
## female      -0.19077250  0.04920099  -3.8774 0.0001056 ***
## age          0.05993658  0.01182862   5.0671 4.040e-07 ***
## I(age^2)    -0.00081819  0.00014321  -5.7131 1.109e-08 ***
## hsdrop       2.01685337  0.13409500  15.0405 < 2.2e-16 ***
## hsgrad       1.57850359  0.11582668  13.6282 < 2.2e-16 ***
## colsome      1.22997749  0.11770914  10.4493 < 2.2e-16 ***
## colgrad      0.44658309  0.12643554   3.5321 0.0004123 ***
## black       -0.15603412  0.09129560  -1.7091 0.0874308 .  
## hispanic    -0.59717314  0.08623010  -6.9253 4.349e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  (d) From the table, the z-statistic on smkban is -5.29; since |z| > 1.96, we reject the null hypothesis that the coefficient is zero at the 5% significance level.
linearHypothesis(fit.smoker.g1, c("hsdrop=0","hsgrad=0","colsome=0","colgrad=0"),
white.adjust=c("hc1"))
## Linear hypothesis test
## 
## Hypothesis:
## hsdrop = 0
## hsgrad = 0
## colsome = 0
## colgrad = 0
## 
## Model 1: restricted model
## Model 2: smoker ~ smkban + female + age + I(age^2) + hsdrop + hsgrad + 
##     colsome + colgrad + black + hispanic
## 
##   Res.Df Df  Chisq Pr(>Chisq)    
## 1   9993                         
## 2   9989  4 461.12  < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  (e) As in the probit model, the omitted education category is a master's degree or higher; the included education dummies all have positive coefficients that shrink as education rises, so the probability of smoking decreases with education, and the joint test above rejects the hypothesis that education does not matter.

Question H

  (i) Mr. A is white, non-Hispanic, 20 years old, and a high school dropout. Using the probit regression and assuming that Mr. A is not subject to a workplace smoking ban, calculate the probability that Mr. A smokes. Carry out the calculation again assuming that he is subject to a ban.

Answer: Without a smoking ban, Mr. A's predicted probability of smoking is about 46.4%; with a ban it is about 40.2%, so the ban lowers his probability by roughly 6.2 percentage points (a predict() cross-check follows the calculations below).

# For Mr. A (white, non-Hispanic, 20, high school dropout) only these coefficients enter the probit index
b0 = coefficients(fit.smoker.p1)[1]
b1 = coefficients(fit.smoker.p1)[2]*0      # smkban = 0
b4 = coefficients(fit.smoker.p1)[4]*20     # age
b5 = coefficients(fit.smoker.p1)[5]*(20^2) # age squared
b6 = coefficients(fit.smoker.p1)[6]        # hsdrop

prob.smoke.A = b0+b1+b4+b5+b6
pnorm(prob.smoke.A)                        # probit probability = standard normal CDF of the index
## (Intercept) 
##    0.464102
b0 = coefficients(fit.smoker.p1)[1]
b1 = coefficients(fit.smoker.p1)[2]        # smkban = 1
b4 = coefficients(fit.smoker.p1)[4]*20
b5 = coefficients(fit.smoker.p1)[5]*(20^2)
b6 = coefficients(fit.smoker.p1)[6]

prob.smoke.A = b0+b1+b4+b5+b6
pnorm(prob.smoke.A)
## (Intercept) 
##   0.4017831
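These hand calculations can be cross-checked with predict(). A sketch in which mrA is a hypothetical two-row data frame holding Mr. A's characteristics without and with a ban:

# Mr. A's probit probabilities via predict() (rows: no ban, ban)
mrA <- data.frame(smkban = c(0, 1), female = 0, age = 20,
                  hsdrop = 1, hsgrad = 0, colsome = 0, colgrad = 0,
                  black = 0, hispanic = 0)
predict(fit.smoker.p1, newdata = mrA, type = "response")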
  (ii) Repeat (i) for Ms. B, a female, black, 40-year-old college graduate.

Answer: Ms. B's predicted probability of smoking is about 14.4% without a ban and 11.1% with a ban, so the ban lowers her probability by roughly 3.3 percentage points.

b0 = coefficients(fit.smoker.p1)[1]
b1 = coefficients(fit.smoker.p1)[2]*0      # smkban = 0
b2 = coefficients(fit.smoker.p1)[3]        # female
b4 = coefficients(fit.smoker.p1)[4]*40     # age
b5 = coefficients(fit.smoker.p1)[5]*(40^2) # age squared
b8 = coefficients(fit.smoker.p1)[9]        # colgrad
b9 = coefficients(fit.smoker.p1)[10]       # black

prob.smoke.B = b0+b1+b2+b4+b5+b8+b9
pnorm(prob.smoke.B)
## (Intercept) 
##   0.1436957
b0 = coefficients(fit.smoker.p1)[1]
b1 = coefficients(fit.smoker.p1)[2]        # smkban = 1
b2 = coefficients(fit.smoker.p1)[3]
b4 = coefficients(fit.smoker.p1)[4]*40
b5 = coefficients(fit.smoker.p1)[5]*(40^2)
b8 = coefficients(fit.smoker.p1)[9]
b9 = coefficients(fit.smoker.p1)[10]

prob.smoke.B = b0+b1+b2+b4+b5+b8+b9
pnorm(prob.smoke.B)
## (Intercept) 
##   0.1107609
  (iii) Repeat (i)-(ii) using the linear probability model.
b0 = coefficients(fit.smoker2)[1]
b1 = coefficients(fit.smoker2)[2]*0
b4 = coefficients(fit.smoker2)[4]*20
b5 = coefficients(fit.smoker2)[5]*(20^2)
b6 = coefficients(fit.smoker2)[6]

prob.smoke.A = b0+b1+b4+b5+b6
prob.smoke.A
## (Intercept) 
##   0.4493721
b0 = coefficients(fit.smoker2)[1]
b1 = coefficients(fit.smoker2)[2]
b4 = coefficients(fit.smoker2)[4]*20
b5 = coefficients(fit.smoker2)[5]*(20^2)
b6 = coefficients(fit.smoker2)[6]

prob.smoke.A = b0+b1+b4+b5+b6
prob.smoke.A
## (Intercept) 
##   0.4021323

Answer: For Mr. A the linear probability model gives 44.9% without a ban and 40.2% with a ban, close to the probit predictions.

b0 = coefficients(fit.smoker2)[1]
b1 = coefficients(fit.smoker2)[2]*0
b2 = coefficients(fit.smoker2)[3]
b4 = coefficients(fit.smoker2)[4]*40
b5 = coefficients(fit.smoker2)[5]*(40^2)
b8 = coefficients(fit.smoker2)[9]
b9 = coefficients(fit.smoker2)[10]


prob.smoke.B = b0+b1+b2+b4+b5+b8+b9
prob.smoke.B
## (Intercept) 
##    0.145961
b0 = coefficients(fit.smoker2)[1]
b1 = coefficients(fit.smoker2)[2]
b2 = coefficients(fit.smoker2)[3]
b4 = coefficients(fit.smoker2)[4]*40
b5 = coefficients(fit.smoker2)[5]*(40^2)
b8 = coefficients(fit.smoker2)[9]
b9 = coefficients(fit.smoker2)[10]


prob.smoke.B = b0+b1+b2+b4+b5+b8+b9
prob.smoke.B
## (Intercept) 
##  0.09872116

For Ms. B the linear probability model gives 14.6% without a ban and 9.9% with a ban, again fairly close to the probit predictions.
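One caveat with the linear probability model is that its predictions are not restricted to the unit interval; a quick check of the fitted values:

# Do any fitted LPM probabilities fall outside [0, 1]?
range(fitted(fit.smoker2))
mean(fitted(fit.smoker2) < 0 | fitted(fit.smoker2) > 1)   # share of workers outside [0, 1]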

  (iv) Repeat (i)-(ii) using the logit model.

The logit model was already estimated in Question G as fit.smoker.g1, so its coefficients can be used directly; plogis(), the logistic CDF, converts the linear index into a probability.
# For Mr. A, the same index as in (i) but using the logit coefficients
b0 = coefficients(fit.smoker.g1)[1]
b1 = coefficients(fit.smoker.g1)[2]*0      # smkban = 0
b4 = coefficients(fit.smoker.g1)[4]*20
b5 = coefficients(fit.smoker.g1)[5]*(20^2)
b6 = coefficients(fit.smoker.g1)[6]        # hsdrop

prob.smoke.A = b0+b1+b4+b5+b6
plogis(prob.smoke.A)                       # logit probability = logistic CDF of the index
## (Intercept) 
##   0.4723103
b0 = coefficients(fit.smoker.g1)[1]
b1 = coefficients(fit.smoker.g1)[2]        # smkban = 1
b4 = coefficients(fit.smoker.g1)[4]*20
b5 = coefficients(fit.smoker.g1)[5]*(20^2)
b6 = coefficients(fit.smoker.g1)[6]

prob.smoke.A = b0+b1+b4+b5+b6
plogis(prob.smoke.A)
## (Intercept) 
##   0.4078402

For Mr. A the logit model gives 47.2% without a ban and 40.8% with a ban, very close to the probit results.

b0 = coefficients(fit.smoker.g1)[1]
b1 = coefficients(fit.smoker.g1)[2]*0
b2 = coefficients(fit.smoker.g1)[3]
b4 = coefficients(fit.smoker.g1)[4]*40
b5 = coefficients(fit.smoker.g1)[5]*(40^2)
b8 = coefficients(fit.smoker.g1)[9]
b9 = coefficients(fit.smoker.g1)[10]


prob.smoke.B = b0+b1+b2+b4+b5+b8+b9
plogis(prob.smoke.B)
## (Intercept) 
##   0.1405117
b0 = coefficients(fit.smoker.g1)[1]
b1 = coefficients(fit.smoker.g1)[2]
b2 = coefficients(fit.smoker.g1)[3]
b4 = coefficients(fit.smoker.g1)[4]*40
b5 = coefficients(fit.smoker.g1)[5]*(40^2)
b8 = coefficients(fit.smoker.g1)[9]
b9 = coefficients(fit.smoker.g1)[10]


prob.smoke.B = b0+b1+b2+b4+b5+b8+b9
plogis(prob.smoke.B)
## (Intercept) 
##   0.1117414

For Ms. B the logit model gives about 14.1% without a ban and 11.2% with a ban, again very close to the probit results.

  (v) Answer: The probit and logit models give very similar predicted probabilities. The linear probability model is broadly similar but diverges for individuals whose characteristics push the prediction toward the tails (for example, Ms. B with a ban: 9.9% versus about 11% for probit and logit), because its predictions are unbounded and the effect of the ban is forced to be the same for everyone. For a question like this one, probit or logit is generally preferable, since the predicted probabilities stay between 0 and 1 and the effect of a ban is allowed to vary with the other characteristics.
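The comparison can be collected in one place by predicting Mr. A's probabilities from all three full models. A sketch, reusing a hypothetical two-row data frame mrA with his characteristics (no ban, ban):

# Mr. A under the linear, probit, and logit models
mrA <- data.frame(smkban = c(0, 1), female = 0, age = 20,
                  hsdrop = 1, hsgrad = 0, colsome = 0, colgrad = 0,
                  black = 0, hispanic = 0)
cbind(lpm    = predict(fit.smoker2,   newdata = mrA),
      probit = predict(fit.smoker.p1, newdata = mrA, type = "response"),
      logit  = predict(fit.smoker.g1, newdata = mrA, type = "response"))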

Showing all in one table

library(jtools)
library(broom.mixed)
## Registered S3 methods overwritten by 'broom':
##   method            from  
##   tidy.glht         jtools
##   tidy.summary.glht jtools
export_summs(fit.smoker, fit.smoker.p1, fit.smoker.g1, fit.smoker2, transform.response = TRUE)
                 Model 1        Model 2       Model 3       Model 4
                 (LPM, short)   (probit)      (logit)       (LPM, full)
(Intercept)       0.29 ***      -1.73 ***     -3.00 ***     -0.01
                 (0.01)         (0.15)        (0.27)        (0.04)
smkban           -0.08 ***      -0.16 ***     -0.26 ***     -0.05 ***
                 (0.01)         (0.03)        (0.05)        (0.01)
female                          -0.11 ***     -0.19 ***     -0.03 ***
                                (0.03)        (0.05)        (0.01)
age                              0.03 ***      0.06 ***      0.01 ***
                                (0.01)        (0.01)        (0.00)
I(age^2)                        -0.00 ***     -0.00 ***     -0.00 ***
                                (0.00)        (0.00)        (0.00)
hsdrop                           1.14 ***      2.02 ***      0.32 ***
                                (0.07)        (0.13)        (0.02)
hsgrad                           0.88 ***      1.58 ***      0.23 ***
                                (0.06)        (0.11)        (0.02)
colsome                          0.68 ***      1.23 ***      0.16 ***
                                (0.06)        (0.12)        (0.02)
colgrad                          0.23 ***      0.45 ***      0.04 **
                                (0.07)        (0.13)        (0.02)
black                           -0.08         -0.16         -0.03
                                (0.05)        (0.09)        (0.02)
hispanic                        -0.34 ***     -0.60 ***     -0.10 ***
                                (0.05)        (0.08)        (0.01)
N                10000          10000         10000         10000
R2                0.01                                       0.06
AIC              11356.04       10493.74      10490.00      10865.49
BIC              11377.67       10573.05      10569.31      10952.01
Pseudo R2                        0.09          0.09
*** p < 0.001; ** p < 0.01; * p < 0.05.
summary(fit.smoker)
## 
## Call:
## lm(formula = model1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2896 -0.2896 -0.2120 -0.2120  0.7880 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.289595   0.006833  42.380   <2e-16 ***
## smkban      -0.077558   0.008750  -8.863   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4268 on 9998 degrees of freedom
## Multiple R-squared:  0.007796,   Adjusted R-squared:  0.007697 
## F-statistic: 78.56 on 1 and 9998 DF,  p-value: < 2.2e-16

Now we can show the same coefficients in a graph (how cool is that?)

library("ggstance")
plot_coefs(fit.smoker,fit.smoker.g1,fit.smoker.p1,fit.smoker2, robust = "HC1")

Let's go further and plot each model's coefficient estimates with their sampling distributions

plot_summs(fit.smoker, robust = "HC1", scale = TRUE, plot.distributions = TRUE)

plot_summs(fit.smoker.g1, robust = "HC1", scale = TRUE, plot.distributions = TRUE)

plot_summs(fit.smoker.p1, robust = "HC1", scale = TRUE, plot.distributions = TRUE)

plot_summs(fit.smoker2, robust = "HC1", scale = TRUE, plot.distributions = TRUE)

Maybe we can go further and visualize the data and the model prediction at the same time; let's try age

effect_plot(fit.smoker.g1, pred = age, interval = TRUE, plot.points = TRUE)

What about marginal effects?

library(margins)
logitmargins <- margins(fit.smoker.g1)    # average marginal effects for the logit model
logitmargins
## Average marginal effects
## glm(formula = model.logi1, family = binomial(link = "logit"),     data = smoke)
##    smkban   female        age hsdrop hsgrad colsome colgrad    black hispanic
##  -0.04531 -0.03299 -0.0004218 0.3487 0.2729  0.2127 0.07722 -0.02698  -0.1033
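For comparison, the same average marginal effects can be computed for the probit fit from Question F; a minimal sketch:

# Average marginal effects for the probit model
summary(margins(fit.smoker.p1))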

Finish