“Interpretation by simulation”

Using the Zelig package to run the regression models and visualize the results (Zelig 5 vs Zelig 4)

Loading the packages

library(Zelig)
library(DAAG)
library(tidyverse)
library(dplyr)
library(tidyr)
library(ggplot2)

Using Zelig 5 to analyze the Titanic dataset

Opening the Titanic data

library(radiant.data)
data("titanic")
head(titanic)
##   pclass survived    sex     age sibsp parch     fare                            name   cabin    embarked
## 1    1st      Yes female 29.0000     0     0 211.3375   Allen, Miss. Elisabeth Walton      B5 Southampton
## 2    1st      Yes   male  0.9167     1     2 151.5500  Allison, Master. Hudson Trevor C22 C26 Southampton
## 3    1st       No female  2.0000     1     2 151.5500    Allison, Miss. Helen Loraine C22 C26 Southampton
## 4    1st       No   male 30.0000     1     2 151.5500 Allison, Mr. Hudson Joshua Crei C22 C26 Southampton
## 5    1st       No female 25.0000     1     2 151.5500 Allison, Mrs. Hudson J C (Bessi C22 C26 Southampton
## 6    1st      Yes   male 48.0000     0     0  26.5500             Anderson, Mr. Harry     E12 Southampton

Recoding the data to prepare them for further analysis

newtitanic <- titanic %>% 
  mutate(survived1 = as.integer(survived)) %>%
  mutate (age = as.integer(age)) %>%
  mutate(survival = sjmisc::rec(survived1, rec = "2=0; 1=1")) %>%
  select(survived, survival, age, sex, pclass, fare)
head(newtitanic)
##   survived survival age    sex pclass     fare
## 1      Yes        1  29 female    1st 211.3375
## 2      Yes        1   0   male    1st 151.5500
## 3       No        0   2 female    1st 151.5500
## 4       No        0  30   male    1st 151.5500
## 5       No        0  25 female    1st 151.5500
## 6      Yes        1  48   male    1st  26.5500

Creating logistic regresion model using the Zelig 5 package

z5<- zlogit$new()
z5$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
summary(z5)
## Model: 
## 
## Call:
## z5$zelig(formula = survival ~ age + sex * pclass + fare, data = newtitanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0628  -0.6636  -0.4941   0.4337   2.4940  
## 
## Coefficients:
##                     Estimate Std. Error z value            Pr(>|z|)
## (Intercept)        4.8959649  0.6128145   7.989 0.00000000000000136
## age               -0.0386781  0.0067926  -5.694 0.00000001239977237
## sexmale           -3.9001038  0.5015680  -7.776 0.00000000000000750
## pclass2nd         -1.5922712  0.6024689  -2.643             0.00822
## pclass3rd         -4.1381922  0.5601346  -7.388 0.00000000000014922
## fare              -0.0009074  0.0020510  -0.442             0.65820
## sexmale:pclass2nd -0.0603255  0.6373047  -0.095             0.92459
## sexmale:pclass3rd  2.5016703  0.5479814   4.565 0.00000498908018340
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  931.42  on 1035  degrees of freedom
## AIC: 947.42
## 
## Number of Fisher Scoring iterations: 5
## 
## Next step: Use 'setx' method

Preview of the variables age and fare to see the Min.and Max. values

summary(newtitanic$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   21.00   28.00   29.79   39.00   80.00
summary(newtitanic$fare)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    8.05   15.75   36.60   35.08  512.33

The simulation with the continues variable (age) using Zelig 5 package

Age effect

z5$setrange (age = 0:80)
z5$sim()
z5$graph()

The simulation with the continues variable (fare) using Zelig 5 package

Fare effect

z5$setrange (fare = 0:512.33)
z5$sim()
z5$graph()

The simulation with the categorical variable (sex) using Zelig 5 package

Sex difference

z5.sex <- zlogit$new()
z5.sex$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
  z5.sex$setx(sex = "male") 
  z5.sex$setx1(sex = "female")
  z5.sex$sim()
  summary(z5.sex)

Summary of the simulated first difference (fd) only

fd <- z5.sex$get_qi (xvalue = "x1", qi = "fd")
summary(fd)
##        V1        
##  Min.   :0.1462  
##  1st Qu.:0.2278  
##  Median :0.2599  
##  Mean   :0.2583  
##  3rd Qu.:0.2864  
##  Max.   :0.3891
plot(z5.sex)

Testing the class variation in sex difference using Zelig 5

##FIRST CLASS
z5.c1s <- zlogit$new()
z5.c1s$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
z5.c1s$setx(sex = "male", pclass = "1st") 
z5.c1s$setx1(sex = "female", pclass = "1st")
z5.c1s$sim()
plot(z5.c1s)

##SECOND CLASS
z5.c2s <- zlogit$new()
z5.c2s$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
z5.c2s$setx(sex = "male", pclass = "2nd")
z5.c2s$setx1(sex = "female", pclass = "2nd")
z5.c2s$sim()
plot(z5.c2s)

##THIRD CLASS
z5.c3s <- zlogit$new()
z5.c3s$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
z5.c3s$setx(sex = "male", pclass = "3rd")
z5.c3s$setx1(sex = "female", pclass = "3rd")
z5.c3s$sim()
plot(z5.c3s)

Putting the results of sex difference in all 3 classes together

d1 <- z5.c1s$get_qi(xvalue="x1", qi="fd")
d2 <- z5.c2s$get_qi(xvalue="x1", qi="fd")
d3 <- z5.c3s$get_qi(xvalue="x1", qi="fd")
  
dfd <- as.data.frame(cbind(d1, d2, d3))
head(dfd)
##          V1        V2        V3
## 1 0.5709284 0.7452609 0.2936210
## 2 0.4936304 0.6652919 0.2318504
## 3 0.4707436 0.8511496 0.2331585
## 4 0.4401018 0.7711951 0.2650154
## 5 0.6083386 0.6890202 0.2155337
## 6 0.4927805 0.6877159 0.2173076
#Segregaring the sex difference by class
tidd <- dfd %>% 
  gather(class, simv, 1:3)
head(tidd)
##   class      simv
## 1    V1 0.5709284
## 2    V1 0.4936304
## 3    V1 0.4707436
## 4    V1 0.4401018
## 5    V1 0.6083386
## 6    V1 0.4927805
#Grouping the results. Showing the mean value for sex difference in every class (V1 = first class, V2 = swcond class, V3 = third class)
tidd %>% 
  group_by(class) %>% 
  summarise(mean = mean(simv), sd = sd(simv))
## # A tibble: 3 x 3
##   class      mean         sd
##   <chr>     <dbl>      <dbl>
## 1    V1 0.5207240 0.04658455
## 2    V2 0.7489061 0.04203874
## 3    V3 0.2556884 0.04266622

Plotting the graph using ggplot2 package

The sex difference among 1st, 2nd, and 3rd class in Titanic

ggplot(tidd, aes(simv)) + geom_histogram() + facet_grid(~class)

Using Zelig 4 to analyze the nassCDS dataset

In this assignment I am going to analyze the nassCDS data set which is about police-recorded car crashes in USA between 1997-2002. I have created the logistic regression model to analyze the influence of age of a person, sex and the age of the car on survivor in car crashes.

Opening and preview of the nassCDS data

library(DAAG)
data(nassCDS)
nassCDS2 <- nassCDS
head(nassCDS2)

Manipulating the data to prepare them for further analysis (creating new variables, changing missing values to “0”, recoding for 0/1 values)

nassCDS2 <- nassCDS %>%
  filter(!is.na(yearVeh)) %>%
  mutate(carage = yearacc - yearVeh,
         seatbelt = as.factor(seatbelt),
         sex = as.factor(sex),
         alive = ifelse(dead=="alive",1,0)) %>%
select(alive, ageOFocc, carage, seatbelt, sex, airbag)
head(nassCDS2)
##   alive ageOFocc carage seatbelt sex airbag
## 1     1       26      7   belted   f   none
## 2     1       72      2   belted   f airbag
## 3     1       69      9     none   f   none
## 4     1       53      2   belted   f airbag
## 5     1       32      9   belted   f   none
## 6     1       22     12   belted   f   none

Creating the logistic regresion model using Zelig 4 package

The results of the logistic regression show that the age of the occupant and the age of the car has a negative effect on survival. Males are more likely to suffer in a car accidents than females. We can also see that the seat belts has a positive effect on survivor.

z1 <- zelig (alive ~ ageOFocc + seatbelt + carage + sex, model = "logit", data = nassCDS2, cite = F)
summary(z1)
## Model: 
## 
## Call:
## z5$zelig(formula = alive ~ ageOFocc + seatbelt + carage + sex, 
##     data = nassCDS2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9751   0.1879   0.2336   0.3444   0.9465  
## 
## Coefficients:
##                 Estimate Std. Error z value             Pr(>|z|)
## (Intercept)     3.560648   0.096489  36.902 < 0.0000000000000002
## ageOFocc       -0.025016   0.001516 -16.501 < 0.0000000000000002
## seatbeltbelted  1.303178   0.062991  20.688 < 0.0000000000000002
## carage         -0.018461   0.005224  -3.534             0.000409
## sexm           -0.190541   0.062747  -3.037             0.002392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 9624.1  on 26215  degrees of freedom
## Residual deviance: 8913.0  on 26211  degrees of freedom
## AIC: 8923
## 
## Number of Fisher Scoring iterations: 6
## 
## Next step: Use 'setx' method

Setting values for explanatory variables to create simulations

The simulation with the continues variable (ageOFocc = age), age effect. The graph shows the correlation between the age of occupant and survivor in a car accident. With the increase of age the chances to survive the car crash decreases.

a.range = min(nassCDS2$ageOFocc):max(nassCDS2$ageOFocc)
x <- setx (z1, ageOFocc = a.range)
s <- sim (z1, x = x)
ci.plot (s)

The simulation with the categorical variable (seatbelt) using Zelig 4 package

Difference in using the seat belts

x <- setx(z1, seatbelt = "belted")
x1 <- setx(z1, seatbelt = "none")
s <- sim(z1, x = x, x1 = x1)
summary(s)
## 
##  sim x :
##  -----
## ev
##           mean          sd       50%      2.5%     97.5%
## [1,] 0.9738061 0.001389667 0.9738557 0.9710678 0.9763582
## pv
##          0     1
## [1,] 0.031 0.969
## 
##  sim x1 :
##  -----
## ev
##           mean          sd       50%      2.5%     97.5%
## [1,] 0.9099686 0.003935363 0.9099711 0.9022682 0.9171859
## pv
##          0     1
## [1,] 0.086 0.914
## fd
##            mean          sd         50%        2.5%       97.5%
## [1,] -0.0638375 0.003822854 -0.06392984 -0.07139587 -0.05635914

Summary of the simulated first difference (fd) only

ev = expected value (more reliable) pv = predicted value fd = first difference (simulated difference between ev produced by the 2 contractual values “belted” and “none”) To count the fd we need to subtract the second ev with the first ev (the difference between two sets of ev) 0.9099683 - 0.9737965 = -0.06382824 (fd)

fd is a normal distribution. We are able to summarize it to get the mean, median, min, max values etc.

fd <- s$get_qi (xvalue = "x1", qi = "fd")
summary(fd)
##        V1          
##  Min.   :-0.07646  
##  1st Qu.:-0.06635  
##  Median :-0.06393  
##  Mean   :-0.06384  
##  3rd Qu.:-0.06119  
##  Max.   :-0.05138

Plotting the result of a simulated categorical variables

The graph show a visual representation of the predicted values, expected values and a first difference. In this case we are looking at the differences and comparison of those who had the seat belts belted an non belted.

plot(s)

Seatbelt use difference between gender

##BELTED for male and female
b1x <- setx (z1, sex = "m", seatbelt = "belted")
b1x1 <- setx(z1, sex = "f", seatbelt = "belted")
b1s <- sim(z1, x = b1x, x1 = b1x1)
plot(b1s)

##NONE BELTED for male and female
b0x <- setx(z1, sex = "m", seatbelt = "none")
b0x1 <- setx(z1, sex = "f", seatbelt = "none")
b0s <- sim(z1, x = b0x, x1 = b0x1)
plot(b0s)

Putting the results together (belted V1, not belted V2)

d1<- b1s$get_qi (xvalue = "x1", qi = "fd")
d2 <- b0s$get_qi (xvalue = "x1", qi = "fd")
dfd <- as.data.frame(cbind(d1,d2))
head(dfd)
##            V1         V2
## 1 0.005749095 0.01502411
## 2 0.004370425 0.01926418
## 3 0.004062228 0.01185482
## 4 0.005805127 0.01009222
## 5 0.003734842 0.01594024
## 6 0.005262077 0.01168258

Segregating the gender difference by seatbelts.

simv represent the gender difference

tidd <- dfd%>%
  gather(seatbelt, simv, 1:2)
head(tidd)
##   seatbelt        simv
## 1       V1 0.005749095
## 2       V1 0.004370425
## 3       V1 0.004062228
## 4       V1 0.005805127
## 5       V1 0.003734842
## 6       V1 0.005262077

Grouping the results. Showing the mean value for gender difference in setbelts use = belted(V1) and not belted(V2)

tidd %>%
  group_by(seatbelt) %>%
  summarise(mean = mean(simv), sd = sd(simv))
## # A tibble: 2 x 3
##   seatbelt        mean          sd
##      <chr>       <dbl>       <dbl>
## 1       V1 0.004447168 0.001505479
## 2       V2 0.014363204 0.004453364

Plotting the graph using ggplot2 package

The gender difference between the use of seat belts (V1 = seat belts, V2 = lack of seat belts). The graph show that the variation in sex is greater between people who has not been belted (V2) between those who were belted (V1)

ggplot(tidd, aes(simv)) + geom_histogram() + facet_grid(~seatbelt)

Conclusions

The purpose of this assignment was to explore the possibilities of analysis using Zelig package. The “titanic” data was analyzed using the Zelig 5 package and the “nassCDS” data was analyzed using Zelig 4 package. Zelig allows to do various statistical models for estimation and interpretation. In this homework, I used Zelig to run the logistic regression models and then do the simulations (which is a random number drawing that stacks two variables together to make a multivariate normal distribution) and a group wised analysis of dependent variables from the regression model. I used graphs to visualize the results and differences between the variables.