load data

Data are downloaded, loaded, and names of fields are read from the file:

setInternet2(TRUE) # solution for https files
download.file("https://sites.google.com/site/econometriks/docs/R_smoking.rda", "R_smoking.rda")
load("R_smoking.rda")

names(smoking)
##  [1] "educ"        "age"         "income"      "pcigs79"     "ageeduc"    
##  [6] "smoker"      "educincome"  "qrt.age"     "inc.10k"     "fcast.reg"  
## [11] "fcast.logit" "fcast.prob"  "qrt.edu"     "qrt.inc"     "qrt.price"

Loading required packages and attaching the data set.

library(xtable)
library(psych)
attach(smoking)
## The following object is masked from package:psych:
## 
##     income
library(htmlTable)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2. http://CRAN.R-project.org/package=stargazer

Descriptive stats on continuous varaiables

Table 8.4

t1<-table(smoker)
tab <- xtable(cbind(t1,'t1(%)'=t1/sum(t1)*100))
htmlTable(txtRound(tab,2))
t1 t1(%)
no 695.00 61.94
yes 427.00 38.06

Table 8.5

smoking$smoker2<-factor(smoking$smoker)

f1 <- function(x) c( mean=mean(x), sd= sd(x), obs=length(x))

tab<-rbind(education=f1(educ),'age in years'=f1(age),
      'household income'=f1(income), 
       'cigarette price in cents'=f1(pcigs79),
      'smoker' =f1(as.numeric(smoking$smoker2)-1))
htmlTable(txtRound(tab,2))
mean sd obs
education 12.19 3.30 1122.00
age in years 41.88 17.08 1122.00
household income 19359.18 9052.52 1122.00
cigarette price in cents 60.97 4.85 1122.00
smoker 0.38 0.49 1122.00

… …

Descriptive statistics by smoker status

Table 8.6

tab <- xtable(describeBy(cbind(educ, age, income, pcigs79), smoker, mat=T, skew=F, ranges=F))
names(tab)[2] <-c( "smoker")
print(tab, type="html")
item smoker vars n mean sd se
educ1 1 no 1.00 695.00 12.43 3.52 0.13
educ2 2 yes 1.00 427.00 11.81 2.86 0.14
age1 3 no 2.00 695.00 43.72 18.15 0.69
age2 4 yes 2.00 427.00 38.89 14.73 0.71
income1 5 no 3.00 695.00 19492.81 9192.07 348.68
income2 6 yes 3.00 427.00 19141.69 8827.00 427.17
pcigs791 7 no 4.00 695.00 61.21 4.80 0.18
pcigs792 8 yes 4.00 427.00 60.58 4.92 0.24

Other versions of Table 8.6

t1<-(aggregate(cbind(educ, age, income, pcigs79) ~ smoker, data=smoking,f1))
htmlTable(txtRound(t(t1),2))
smoker no yes
educ.mean 12.43 11.81
educ.sd 3.52 2.86
educ.obs 695.00 427.00
age.mean 43.72 38.89
age.sd 18.15 14.73
age.obs 695.00 427.00
income.mean 19492.81 19141.69
income.sd 9192.07 8827.00
income.obs 695.00 427.00
pcigs79.mean 61.21 60.58
pcigs79.sd 4.80 4.92
pcigs79.obs 695.00 427.00
t1<-aggregate(cbind(income,age, educ, pcigs79) ~ smoker, data=smoking, f1)
tab<-xtable( as.matrix(t(t1)) )
print(tab, type="html")
1 2
smoker no yes
income.mean 19492.806 19141.686
income.sd 9192.072 8826.999
income.obs 695.000 427.000
age.mean 43.72230 38.89227
age.sd 18.14572 14.72686
age.obs 695.00000 427.00000
educ.mean 12.427338 11.806792
educ.sd 3.517470 2.861969
educ.obs 695.000000 427.000000
pcigs79.mean 61.214532 60.577049
pcigs79.sd 4.800400 4.915248
pcigs79.obs 695.000000 427.000000
t1<-aggregate(cbind(income,age, educ, pcigs79) ~ smoker+1, data=smoking, f1)
t2<-aggregate(cbind(income,age, educ, pcigs79) ~ 1, data=smoking, f1);

t3<-as.matrix(t(t1))
t4<-as.matrix(rbind("total",t(t2)))
t5<-cbind(t3,t4)

tab<-xtable(t5)
print(tab, type="html")
1 2 3
smoker no yes total
income.mean 19492.806 19141.686 19359.1800356506
income.sd 9192.072 8826.999 9052.51555447872
income.obs 695.000 427.000 1122
age.mean 43.72230 38.89227 41.8841354723708
age.sd 18.14572 14.72686 17.0812441184657
age.obs 695.00000 427.00000 1122
educ.mean 12.427338 11.806792 12.1911764705882
educ.sd 3.517470 2.861969 3.29594980916978
educ.obs 695.000000 427.000000 1122
pcigs79.mean 61.214532 60.577049 60.9719250384924
pcigs79.sd 4.800400 4.915248 4.85213378896449
pcigs79.obs 695.000000 427.000000 1122
htmlTable(txtRound(t5,0,excl.rows=c(1:3,5,6,8,9,11,12)))
smoker no yes total
income.mean 19492.806 19141.686 19359.1800356506
income.sd 9192.072 8826.999 9052.51555447872
income.obs 695 427 1122
age.mean 43.72230 38.89227 41.8841354723708
age.sd 18.14572 14.72686 17.0812441184657
age.obs 695 427 1122
educ.mean 12.427338 11.806792 12.1911764705882
educ.sd 3.517470 2.861969 3.29594980916978
educ.obs 695 427 1122
pcigs79.mean 61.214532 60.577049 60.9719250384924
pcigs79.sd 4.800400 4.915248 4.85213378896449
pcigs79.obs 695 427 1122

OLS model

Table 8.7

ols.1<- lm((as.numeric(smoker)-1) ~ age +educ + pcigs79 +income,
           data = smoking)
ols.1a <- xtable(summary(ols.1))
print(ols.1a, type="html") 
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.1811 0.1943 6.08 0.0000
age -0.0046 0.0009 -5.36 0.0000
educ -0.0197 0.0047 -4.14 0.0000
pcigs79 -0.0063 0.0029 -2.14 0.0323
income 0.0000 0.0000 0.47 0.6400
ols.2<- lm((as.numeric(smoker)-1) ~ age +educ + pcigs79 +inc.10k)
ols.2a <- xtable(summary(ols.2))
print(ols.2a, type="html") 
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.1811 0.1943 6.08 0.0000
age -0.0046 0.0009 -5.36 0.0000
educ -0.0197 0.0047 -4.14 0.0000
pcigs79 -0.0063 0.0029 -2.14 0.0323
inc.10k 0.0079 0.0170 0.47 0.6400
## Alternate version

stargazer(ols.1,ols.2, type="html", align=TRUE, no.space=TRUE, 
          digits=4, dep.var.labels=c("smoker"),
          column.labels=c("income", "income/10k"),
          covariate.labels=c("age in years", 
                             "education in years",
                             "price of a pack", "household income",
                             "income in 10k"))
Dependent variable:
smoker
income income/10k
(1) (2)
age in years -0.0046*** -0.0046***
(0.0009) (0.0009)
education in years -0.0197*** -0.0197***
(0.0047) (0.0047)
price of a pack -0.0063** -0.0063**
(0.0029) (0.0029)
household income 0.000001
(0.000002)
income in 10k 0.0079
(0.0170)
Constant 1.1811*** 1.1811***
(0.1943) (0.1943)
Observations 1,122 1,122
R2 0.0380 0.0380
Adjusted R2 0.0346 0.0346
Residual Std. Error (df = 1117) 0.4773 0.4773
F Statistic (df = 4; 1117) 11.0438*** 11.0438***
Note: p<0.1; p<0.05; p<0.01

Logit models

Table 8.9

GLM.1 <- glm(smoker ~ age +educ + pcigs79 + income, family=binomial(logit))
coef.vector1 <-exp(GLM.1$coef)

stargazer(GLM.1, type="html", align=TRUE, no.space=TRUE, 
          digits=4, dep.var.labels=c("smoker"),
          column.labels=c("income", "income/10k"),
         covariate.labels=c("age in years", "education in years",
                           "price of a pack", "household income"))
Dependent variable:
smoker
income
age in years -0.0202***
(0.0038)
education in years -0.0867***
(0.0212)
price of a pack -0.0274**
(0.0129)
household income 0.000004
(0.00001)
Constant 2.9926***
(0.8553)
Observations 1,122
Log Likelihood -723.6548
Akaike Inf. Crit. 1,457.3100
Note: p<0.1; p<0.05; p<0.01

Exponentiated coefficients Table 8.10

GLM.1 <- glm(smoker ~ age +educ + pcigs79 + income, family=binomial(logit))
coef.vector1 <-exp(GLM.1$coef)

GLM.2 <- glm(smoker ~ age +educ + pcigs79 + inc.10k, family=binomial(logit))
coef.vector2 <-exp(GLM.2$coef)

stargazer(GLM.1, GLM.2, coef= list(coef.vector1, coef.vector2), 
           digits=4, dep.var.labels=c("smoker"),
          column.labels=c("income", "income/10k"), 
          type="html",
          covariate.labels=c("age in years", "education in years",
                           "price of a pack", "household income",
                           "income in 10k"))
Dependent variable:
smoker
income income/10k
(1) (2)
age in years 0.9800*** 0.9800***
(0.0038) (0.0038)
education in years 0.9169*** 0.9169***
(0.0212) (0.0212)
price of a pack 0.9730*** 0.9730***
(0.0129) (0.0129)
household income 1.0000***
(0.00001)
income in 10k 1.0379***
(0.0745)
Constant 19.9367*** 19.9367***
(0.8553) (0.8553)
Observations 1,122 1,122
Log Likelihood -723.6548 -723.6548
Akaike Inf. Crit. 1,457.3100 1,457.3100
Note: p<0.1; p<0.05; p<0.01

Probit models

Figure 8.9

GLM.p<-(glm(smoker ~ age +educ + pcigs79 + inc.10k, data=smoking, family=binomial(probit)))
summary(GLM.p)
## 
## Call:
## glm(formula = smoker ~ age + educ + pcigs79 + inc.10k, family = binomial(probit), 
##     data = smoking)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4392  -0.9872  -0.8035   1.2806   1.8318  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.851707   0.525606   3.523 0.000427 ***
## age         -0.012539   0.002341  -5.357 8.47e-08 ***
## educ        -0.053713   0.012957  -4.145 3.39e-05 ***
## pcigs79     -0.016871   0.007924  -2.129 0.033242 *  
## inc.10k      0.021489   0.045853   0.469 0.639312    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1490.8  on 1121  degrees of freedom
## Residual deviance: 1446.9  on 1117  degrees of freedom
## AIC: 1456.9
## 
## Number of Fisher Scoring iterations: 4

Table 8.12

stargazer(ols.2, GLM.2,GLM.p, digits=4, dep.var.labels=c("smoker"),
          type="html",
          covariate.labels=c("age in years", 
                             "education in years",
                             "price of a pack", "income in 10k"))
Dependent variable:
smoker smoker
OLS logistic probit
(1) (2) (3)
age in years -0.0046*** -0.0202*** -0.0125***
(0.0009) (0.0038) (0.0023)
education in years -0.0197*** -0.0867*** -0.0537***
(0.0047) (0.0212) (0.0130)
price of a pack -0.0063** -0.0274** -0.0169**
(0.0029) (0.0129) (0.0079)
income in 10k 0.0079 0.0372 0.0215
(0.0170) (0.0745) (0.0459)
Constant 1.1811*** 2.9926*** 1.8517***
(0.1943) (0.8553) (0.5256)
Observations 1,122 1,122 1,122
R2 0.0380
Adjusted R2 0.0346
Log Likelihood -723.6548 -723.4255
Akaike Inf. Crit. 1,457.3100 1,456.8510
Residual Std. Error 0.4773 (df = 1117)
F Statistic 11.0438*** (df = 4; 1117)
Note: p<0.1; p<0.05; p<0.01

Zelig

Figure 8.15, 8.16 and more

z.out1 <- zelig(smoker ~ age + educ  + pcigs79 + income,
                model = "logit", data = smoking)
## How to cite this model in Zelig:
##   R Core Team. 2007.
##   logit: Logistic Regression for Dichotomous Dependent Variables
##   in Christine Choirat, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
##   "Zelig: Everyone's Statistical Software," http://zeligproject.org/
x.out1 <- setx(z.out1, age = 41.88414,  educ=12.19118, 
               income = 19359.18, pcigs79 = 60.97193)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored
## Warning in Ops.factor(y, z$residuals): '-' not meaningful for factors
s.out1 <- sim(z.out1, x = x.out1, num=100000)
summary(s.out1)
## 
##  sim x :
##  -----
## ev
##           mean         sd       50%      2.5%     97.5%
## [1,] 0.3759722 0.01480765 0.3759038 0.3473249 0.4052036
## pv
##            0       1
## [1,] 0.62702 0.37298
x.out2 <- setx(z.out1, age = 41.88414,  educ=12.19118, 
               income = 19359.18, pcigs79 = 76.2149125)
## Warning in model.response(mf, "numeric"): using type = "numeric" with a
## factor response will be ignored

## Warning in model.response(mf, "numeric"): '-' not meaningful for factors
s.out2 <- sim(z.out1, x = x.out2, x1 = x.out1, num=100000)
summary(s.out2)
## 
##  sim x :
##  -----
## ev
##           mean         sd       50%      2.5%    97.5%
## [1,] 0.2858559 0.04218039 0.2840144 0.2086999 0.373392
## pv
##            0       1
## [1,] 0.71444 0.28556
## 
##  sim x1 :
##  -----
## ev
##           mean         sd       50%      2.5%     97.5%
## [1,] 0.3759802 0.01480383 0.3758579 0.3470313 0.4052447
## pv
##            0       1
## [1,] 0.62507 0.37493
## fd
##            mean         sd        50%        2.5%     97.5%
## [1,] 0.09012424 0.03986612 0.09181075 0.007302585 0.1633069
plot(s.out1)

plot(s.out2)

Figures 8.16, 8.17 and 8.18

library (erer)
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
ma <- glm(smoker ~ age + educ + inc.10k + pcigs79, x = TRUE,
data = smoking, family = binomial(link = "probit"))
ea <- maBina(w = ma, x.mean = TRUE, rev.dum = TRUE)
summary(ma)
## 
## Call:
## glm(formula = smoker ~ age + educ + inc.10k + pcigs79, family = binomial(link = "probit"), 
##     data = smoking, x = TRUE)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4392  -0.9872  -0.8035   1.2806   1.8318  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.851707   0.525606   3.523 0.000427 ***
## age         -0.012539   0.002341  -5.357 8.47e-08 ***
## educ        -0.053713   0.012957  -4.145 3.39e-05 ***
## inc.10k      0.021489   0.045853   0.469 0.639312    
## pcigs79     -0.016871   0.007924  -2.129 0.033242 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1490.8  on 1121  degrees of freedom
## Residual deviance: 1446.9  on 1117  degrees of freedom
## AIC: 1456.9
## 
## Number of Fisher Scoring iterations: 4
ea
##             effect error t.value p.value
## (Intercept)  0.703 0.200   3.514   0.000
## age         -0.005 0.001  -5.363   0.000
## educ        -0.020 0.005  -4.148   0.000
## inc.10k      0.008 0.017   0.469   0.639
## pcigs79     -0.006 0.003  -2.129   0.033
edu.1 <- maTrend(q = ea, nam.c="educ")
plot(edu.1)

Grouped Logit

download.file("https://sites.google.com/site/econometriks/docs/ny_grouped.rda", "ny_grouped.rda")
load("ny_grouped.rda")

attach(ny_grouped)

Table 8.15

f1 <- function(x) c( mean=mean(x), sd= sd(x), obs=length(x))

tab<-rbind("average autos/hhld"=f1(hh_autos),
           'average persons/hhld'=f1(hh_size),
      'distance from CBD'=f1(d_cbd), 
       'total number of trips'=f1(n_trips),
      'total transit trips' =f1(n_transit))
htmlTable(txtRound(tab,2))
mean sd obs
average autos/hhld 0.78 0.44 2707.00
average persons/hhld 2.75 0.51 2707.00
distance from CBD 13.40 5.87 2707.00
total number of trips 1518.03 1073.37 2707.00
total transit trips 698.10 674.33 2707.00

Figure 8.20 and Table 8.17

ny_grouped$non_transit <- with(ny_grouped, n_trips-n_transit)
detach(ny_grouped)
attach(ny_grouped)

mod1 <- glm(cbind( n_transit, non_transit) ~ hh_autos + hh_size + 
            d_cbd, data = ny_grouped,
            family = binomial(link="logit"),trace=T)

Deviance = 249880.9 Iterations - 1 Deviance = 246322.9 Iterations - 2 Deviance = 246320 Iterations - 3 Deviance = 246320 Iterations - 4

stargazer(mod1, type="html",
          covariate.labels=c("avg. autos/hhld", "avg. persons/ hhld",
                             "distance from cbd")) 
Dependent variable:
cbind(n_transit, non_transit)
avg. autos/hhld -1.693***
(0.003)
avg. persons/ hhld 0.090***
(0.002)
distance from cbd -0.006***
(0.0002)
Constant 0.974***
(0.005)
Observations 2,707
Log Likelihood -133,013.300
Akaike Inf. Crit. 266,034.600
Note: p<0.1; p<0.05; p<0.01

Figure 8.19

res<-(data.frame("observed transit trips"=(ny_grouped$n_transit), "forecasted transit trips"=fitted(mod1)*ny_grouped$n_trips))

head(res)
##   observed.transit.trips forecasted.transit.trips
## 1                    182                 194.4994
## 2                   1467                1251.3738
## 3                    718                 674.3818
## 4                    883                 838.6868
## 5                   1803                1705.3699
## 6                    251                 286.3279
plot(res)