library(Zelig)
library(DAAG)
library(tidyverse)
library(dplyr)
library(tidyr)
library(ggplot2)
library(radiant.data)
data("titanic")
head(titanic)
## pclass survived sex age sibsp parch fare name cabin embarked
## 1 1st Yes female 29.0000 0 0 211.3375 Allen, Miss. Elisabeth Walton B5 Southampton
## 2 1st Yes male 0.9167 1 2 151.5500 Allison, Master. Hudson Trevor C22 C26 Southampton
## 3 1st No female 2.0000 1 2 151.5500 Allison, Miss. Helen Loraine C22 C26 Southampton
## 4 1st No male 30.0000 1 2 151.5500 Allison, Mr. Hudson Joshua Crei C22 C26 Southampton
## 5 1st No female 25.0000 1 2 151.5500 Allison, Mrs. Hudson J C (Bessi C22 C26 Southampton
## 6 1st Yes male 48.0000 0 0 26.5500 Anderson, Mr. Harry E12 Southampton
newtitanic <- titanic %>%
mutate(survived1 = as.integer(survived)) %>%
mutate (age = as.integer(age)) %>%
mutate(survival = sjmisc::rec(survived1, rec = "2=0; 1=1")) %>%
select(survived, survival, age, sex, pclass, fare)
head(newtitanic)
## survived survival age sex pclass fare
## 1 Yes 1 29 female 1st 211.3375
## 2 Yes 1 0 male 1st 151.5500
## 3 No 0 2 female 1st 151.5500
## 4 No 0 30 male 1st 151.5500
## 5 No 0 25 female 1st 151.5500
## 6 Yes 1 48 male 1st 26.5500
z5<- zlogit$new()
z5$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
summary(z5)
## Model:
##
## Call:
## z5$zelig(formula = survival ~ age + sex * pclass + fare, data = newtitanic)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0628 -0.6636 -0.4941 0.4337 2.4940
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.8959649 0.6128145 7.989 0.00000000000000136
## age -0.0386781 0.0067926 -5.694 0.00000001239977237
## sexmale -3.9001038 0.5015680 -7.776 0.00000000000000750
## pclass2nd -1.5922712 0.6024689 -2.643 0.00822
## pclass3rd -4.1381922 0.5601346 -7.388 0.00000000000014922
## fare -0.0009074 0.0020510 -0.442 0.65820
## sexmale:pclass2nd -0.0603255 0.6373047 -0.095 0.92459
## sexmale:pclass3rd 2.5016703 0.5479814 4.565 0.00000498908018340
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1409.99 on 1042 degrees of freedom
## Residual deviance: 931.42 on 1035 degrees of freedom
## AIC: 947.42
##
## Number of Fisher Scoring iterations: 5
##
## Next step: Use 'setx' method
summary(newtitanic$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 21.00 28.00 29.79 39.00 80.00
summary(newtitanic$fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 8.05 15.75 36.60 35.08 512.33
z5$setrange (age = 0:80)
z5$sim()
z5$graph()
z5$setrange (fare = 0:512.33)
z5$sim()
z5$graph()
z5.sex <- zlogit$new()
z5.sex$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
z5.sex$setx(sex = "male")
z5.sex$setx1(sex = "female")
z5.sex$sim()
summary(z5.sex)
fd <- z5.sex$get_qi (xvalue = "x1", qi = "fd")
summary(fd)
## V1
## Min. :0.1462
## 1st Qu.:0.2278
## Median :0.2599
## Mean :0.2583
## 3rd Qu.:0.2864
## Max. :0.3891
plot(z5.sex)
##FIRST CLASS
z5.c1s <- zlogit$new()
z5.c1s$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
z5.c1s$setx(sex = "male", pclass = "1st")
z5.c1s$setx1(sex = "female", pclass = "1st")
z5.c1s$sim()
plot(z5.c1s)
##SECOND CLASS
z5.c2s <- zlogit$new()
z5.c2s$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
z5.c2s$setx(sex = "male", pclass = "2nd")
z5.c2s$setx1(sex = "female", pclass = "2nd")
z5.c2s$sim()
plot(z5.c2s)
##THIRD CLASS
z5.c3s <- zlogit$new()
z5.c3s$zelig(survival ~ age + sex*pclass + fare, data = newtitanic)
z5.c3s$setx(sex = "male", pclass = "3rd")
z5.c3s$setx1(sex = "female", pclass = "3rd")
z5.c3s$sim()
plot(z5.c3s)
d1 <- z5.c1s$get_qi(xvalue="x1", qi="fd")
d2 <- z5.c2s$get_qi(xvalue="x1", qi="fd")
d3 <- z5.c3s$get_qi(xvalue="x1", qi="fd")
dfd <- as.data.frame(cbind(d1, d2, d3))
head(dfd)
## V1 V2 V3
## 1 0.5709284 0.7452609 0.2936210
## 2 0.4936304 0.6652919 0.2318504
## 3 0.4707436 0.8511496 0.2331585
## 4 0.4401018 0.7711951 0.2650154
## 5 0.6083386 0.6890202 0.2155337
## 6 0.4927805 0.6877159 0.2173076
#Segregaring the sex difference by class
tidd <- dfd %>%
gather(class, simv, 1:3)
head(tidd)
## class simv
## 1 V1 0.5709284
## 2 V1 0.4936304
## 3 V1 0.4707436
## 4 V1 0.4401018
## 5 V1 0.6083386
## 6 V1 0.4927805
#Grouping the results. Showing the mean value for sex difference in every class (V1 = first class, V2 = swcond class, V3 = third class)
tidd %>%
group_by(class) %>%
summarise(mean = mean(simv), sd = sd(simv))
## # A tibble: 3 x 3
## class mean sd
## <chr> <dbl> <dbl>
## 1 V1 0.5207240 0.04658455
## 2 V2 0.7489061 0.04203874
## 3 V3 0.2556884 0.04266622
The sex difference among 1st, 2nd, and 3rd class in Titanic
ggplot(tidd, aes(simv)) + geom_histogram() + facet_grid(~class)
In this assignment I am going to analyze the nassCDS data set which is about police-recorded car crashes in USA between 1997-2002. I have created the logistic regression model to analyze the influence of age of a person, sex and the age of the car on survivor in car crashes.
library(DAAG)
data(nassCDS)
nassCDS2 <- nassCDS
head(nassCDS2)
nassCDS2 <- nassCDS %>%
filter(!is.na(yearVeh)) %>%
mutate(carage = yearacc - yearVeh,
seatbelt = as.factor(seatbelt),
sex = as.factor(sex),
alive = ifelse(dead=="alive",1,0)) %>%
select(alive, ageOFocc, carage, seatbelt, sex, airbag)
head(nassCDS2)
## alive ageOFocc carage seatbelt sex airbag
## 1 1 26 7 belted f none
## 2 1 72 2 belted f airbag
## 3 1 69 9 none f none
## 4 1 53 2 belted f airbag
## 5 1 32 9 belted f none
## 6 1 22 12 belted f none
The results of the logistic regression show that the age of the occupant and the age of the car has a negative effect on survival. Males are more likely to suffer in a car accidents than females. We can also see that the seat belts has a positive effect on survivor.
z1 <- zelig (alive ~ ageOFocc + seatbelt + carage + sex, model = "logit", data = nassCDS2, cite = F)
summary(z1)
## Model:
##
## Call:
## z5$zelig(formula = alive ~ ageOFocc + seatbelt + carage + sex,
## data = nassCDS2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9751 0.1879 0.2336 0.3444 0.9465
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.560648 0.096489 36.902 < 0.0000000000000002
## ageOFocc -0.025016 0.001516 -16.501 < 0.0000000000000002
## seatbeltbelted 1.303178 0.062991 20.688 < 0.0000000000000002
## carage -0.018461 0.005224 -3.534 0.000409
## sexm -0.190541 0.062747 -3.037 0.002392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 9624.1 on 26215 degrees of freedom
## Residual deviance: 8913.0 on 26211 degrees of freedom
## AIC: 8923
##
## Number of Fisher Scoring iterations: 6
##
## Next step: Use 'setx' method
The simulation with the continues variable (ageOFocc = age), age effect. The graph shows the correlation between the age of occupant and survivor in a car accident. With the increase of age the chances to survive the car crash decreases.
a.range = min(nassCDS2$ageOFocc):max(nassCDS2$ageOFocc)
x <- setx (z1, ageOFocc = a.range)
s <- sim (z1, x = x)
ci.plot (s)
Difference in using the seat belts
x <- setx(z1, seatbelt = "belted")
x1 <- setx(z1, seatbelt = "none")
s <- sim(z1, x = x, x1 = x1)
summary(s)
##
## sim x :
## -----
## ev
## mean sd 50% 2.5% 97.5%
## [1,] 0.9738061 0.001389667 0.9738557 0.9710678 0.9763582
## pv
## 0 1
## [1,] 0.031 0.969
##
## sim x1 :
## -----
## ev
## mean sd 50% 2.5% 97.5%
## [1,] 0.9099686 0.003935363 0.9099711 0.9022682 0.9171859
## pv
## 0 1
## [1,] 0.086 0.914
## fd
## mean sd 50% 2.5% 97.5%
## [1,] -0.0638375 0.003822854 -0.06392984 -0.07139587 -0.05635914
ev = expected value (more reliable) pv = predicted value fd = first difference (simulated difference between ev produced by the 2 contractual values “belted” and “none”) To count the fd we need to subtract the second ev with the first ev (the difference between two sets of ev) 0.9099683 - 0.9737965 = -0.06382824 (fd)
fd is a normal distribution. We are able to summarize it to get the mean, median, min, max values etc.
fd <- s$get_qi (xvalue = "x1", qi = "fd")
summary(fd)
## V1
## Min. :-0.07646
## 1st Qu.:-0.06635
## Median :-0.06393
## Mean :-0.06384
## 3rd Qu.:-0.06119
## Max. :-0.05138
The graph show a visual representation of the predicted values, expected values and a first difference. In this case we are looking at the differences and comparison of those who had the seat belts belted an non belted.
plot(s)
##BELTED for male and female
b1x <- setx (z1, sex = "m", seatbelt = "belted")
b1x1 <- setx(z1, sex = "f", seatbelt = "belted")
b1s <- sim(z1, x = b1x, x1 = b1x1)
plot(b1s)
##NONE BELTED for male and female
b0x <- setx(z1, sex = "m", seatbelt = "none")
b0x1 <- setx(z1, sex = "f", seatbelt = "none")
b0s <- sim(z1, x = b0x, x1 = b0x1)
plot(b0s)
d1<- b1s$get_qi (xvalue = "x1", qi = "fd")
d2 <- b0s$get_qi (xvalue = "x1", qi = "fd")
dfd <- as.data.frame(cbind(d1,d2))
head(dfd)
## V1 V2
## 1 0.005749095 0.01502411
## 2 0.004370425 0.01926418
## 3 0.004062228 0.01185482
## 4 0.005805127 0.01009222
## 5 0.003734842 0.01594024
## 6 0.005262077 0.01168258
simv represent the gender difference
tidd <- dfd%>%
gather(seatbelt, simv, 1:2)
head(tidd)
## seatbelt simv
## 1 V1 0.005749095
## 2 V1 0.004370425
## 3 V1 0.004062228
## 4 V1 0.005805127
## 5 V1 0.003734842
## 6 V1 0.005262077
tidd %>%
group_by(seatbelt) %>%
summarise(mean = mean(simv), sd = sd(simv))
## # A tibble: 2 x 3
## seatbelt mean sd
## <chr> <dbl> <dbl>
## 1 V1 0.004447168 0.001505479
## 2 V2 0.014363204 0.004453364
The gender difference between the use of seat belts (V1 = seat belts, V2 = lack of seat belts). The graph show that the variation in sex is greater between people who has not been belted (V2) between those who were belted (V1)
ggplot(tidd, aes(simv)) + geom_histogram() + facet_grid(~seatbelt)
The purpose of this assignment was to explore the possibilities of analysis using Zelig package. The “titanic” data was analyzed using the Zelig 5 package and the “nassCDS” data was analyzed using Zelig 4 package. Zelig allows to do various statistical models for estimation and interpretation. In this homework, I used Zelig to run the logistic regression models and then do the simulations (which is a random number drawing that stacks two variables together to make a multivariate normal distribution) and a group wised analysis of dependent variables from the regression model. I used graphs to visualize the results and differences between the variables.