library(dplyr)
library(Zelig)
library(texreg)
library(visreg)
library(ggplot2)
library(sjmisc)
library(tidyr)
library(radiant.data)
data(titanic)

Manipulating Data for Analysis

titanic <- titanic %>%
  mutate(survival = as.integer(survived)) %>% 
  mutate(survival = sjmisc::rec(survival, rec = "2=0; 1=1")) %>% 
  select(survival,survived, everything())

Reimplementing Slide 5.2 using Zelig 5 Syntax

alpha <- zlogit$new()
alpha$zelig(survival ~ age + sex*pclass + fare, data = titanic)
summary(alpha)

## Model: 
## 
## Call:
## alpha$zelig(formula = survival ~ age + sex * pclass + fare, data = titanic)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0634  -0.6641  -0.4943   0.4336   2.4941  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)        4.8978215  0.6131092   7.988 1.37e-15
## age               -0.0387245  0.0068044  -5.691 1.26e-08
## sexmale           -3.8996177  0.5015659  -7.775 7.55e-15
## pclass2nd         -1.5923247  0.6024844  -2.643  0.00822
## pclass3rd         -4.1382715  0.5601819  -7.387 1.50e-13
## fare              -0.0009058  0.0020509  -0.442  0.65874
## sexmale:pclass2nd -0.0600076  0.6372949  -0.094  0.92498
## sexmale:pclass3rd  2.5019110  0.5479901   4.566 4.98e-06
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1409.99  on 1042  degrees of freedom
## Residual deviance:  931.45  on 1035  degrees of freedom
## AIC: 947.45
## 
## Number of Fisher Scoring iterations: 5
## 
## Next step: Use 'setx' method

Reimplementing the Age Effects of Slide(s) 5.3 - 5.4 Using Zelig 5 Syntax

alpha$setrange(age=min(titanic$age):max(titanic$age))
alpha$sim()
alpha$graph()

Reimplementing the Fare Effect of Slide(s) 5.5 - 5.6 Using Zelig 5 Syntax

alpha$setrange(fare=min(titanic$fare):max(titanic$fare))
alpha$sim()
alpha$graph()

Reimplementing the Sex Difference of Slide(s) 5.7 Using Zelig 5 Syntax

alpha$setx(sex="male")
alpha$setx1(sex="female")
alpha$sim()
fd <- alpha$get_qi(xvalue="x1", qi="fd")
summary(fd)

##        V1        
##  Min.   :0.1292  
##  1st Qu.:0.2264  
##  Median :0.2570  
##  Mean   :0.2574  
##  3rd Qu.:0.2886  
##  Max.   :0.4025

par(“mar”)

par(mar= c(1,1,1,1))
alpha$graph()

Reimplementing the Sex Difference of Slide(s) 5.10 Using Zelig 5 Syntax

First Class Analysis

bravo <- zlogit$new()
bravo$zelig(survival ~ age +sex*pclass + fare, data=titanic)
bravo$setx(sex="male", pclass="1st")
bravo$setx1(sex="female", pclass="1st")
par(mar= c(1,1,1,1))
bravo$sim()
bravo$graph()

Second Class Analysis

charlie <- zlogit$new()
charlie$zelig(survival ~ age +sex*pclass + fare, data=titanic)
charlie$setx(sex="male", pclass="2nd")
charlie$setx1(sex="female", pclass="2nd")
par(mar= c(1,1,1,1))
charlie$sim()
charlie$graph()

Third Class Analysis

delta <- zlogit$new()
delta$zelig(survival ~ age +sex*pclass + fare, data=titanic)
delta$setx(sex="male", pclass="3rd")
delta$setx1(sex="female", pclass="3rd")
par(mar= c(1,1,1,1))
delta$sim()
delta$graph()

Reimplementing Slide(s) 5.15 Using Zelig 5 Syntax - Putting it all Together (using Syntax 4)

d1 <- bravo$get_qi(xvalue="x1", qi="fd")
d2 <- charlie$get_qi(xvalue="x1", qi="fd")
d3 <- delta$get_qi(xvalue="x1", qi="fd")

dfd <- as.data.frame(cbind(d1, d2, d3))
head(dfd)

##          V1        V2        V3
## 1 0.5967198 0.8029676 0.2556934
## 2 0.5167552 0.6980133 0.2694845
## 3 0.5174264 0.7471248 0.3146801
## 4 0.4993967 0.6766970 0.2583324
## 5 0.4736819 0.7334844 0.2473503
## 6 0.4905807 0.7487350 0.2716825

tidd <- dfd %>%
  gather(class, simv, 1:3)
head(tidd)

##   class      simv
## 1    V1 0.5967198
## 2    V1 0.5167552
## 3    V1 0.5174264
## 4    V1 0.4993967
## 5    V1 0.4736819
## 6    V1 0.4905807

Organizing by Group

tidd %>%
  group_by(class) %>%
  summarise(mean=mean(simv), sd=sd(simv))

## # A tibble: 3 x 3
##   class  mean     sd
##   <chr> <dbl>  <dbl>
## 1 V1    0.519 0.0506
## 2 V2    0.747 0.0422
## 3 V3    0.258 0.0441

Plotting & Graphing

par(mar= c(1,1,1,1))
ggplot(tidd, aes(simv)) + geom_histogram(fill = "black", color="blue") + facet_grid(~class)

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Continuing with Game of Thrones Data

The dataset being used for this weeks assignment is a continuation of last assignment Game of Thrones dataset. Just like before, I’m interedted in looking at the popularity of the nobles and commoners of Game of Thrones.

Data Import

got$Noble <- as.character (got$isNoble)
got$Noble[got$isNoble == "1"] <- "Noble"
got$Noble[got$isNoble == "0"] <- "Commoner"

got$Alive <- as.character (got$isAlive)
got$Alive[got$isAlive == "1"] <- "Alive"
got$Alive[got$isAlive == "0"] <- "Dead"

got$Popular <- as.character (got$isPopular)
got$Popular[got$isPopular == "1"] <- "Popular"
got$Popular[got$isPopular == "0"] <- "Non-Popular"

got$Sex <- as.character (got$male)
got$Sex[got$male == "1"] <- "male"
got$Sex[got$male == "0"] <- "female"

got$male <- as.factor(got$male)
got$isNoble <- as.factor(got$isNoble)
got$isPopular <- as.factor(got$isPopular)

z.got <- zelig(isAlive ~  male + isPopular*isNoble, model = "logit", data = got)

## How to cite this model in Zelig:
##   R Core Team. 2007.
##   logit: Logistic Regression for Dichotomous Dependent Variables
##   in Christine Choirat, Christopher Gandrud, James Honaker, Kosuke Imai, Gary King, and Olivia Lau,
##   "Zelig: Everyone's Statistical Software," http://zeligproject.org/

summary(z.got)

## Model: 
## 
## Call:
## z5$zelig(formula = isAlive ~ male + isPopular * isNoble, data = got)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.9081  -0.7435   0.6303   0.7930   1.6858  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)
## (Intercept)           1.6438     0.1066  15.419  < 2e-16
## male1                -0.6482     0.1183  -5.478 4.29e-08
## isPopular1           -2.1401     0.3971  -5.390 7.06e-08
## isNoble1             -0.1286     0.1120  -1.149   0.2507
## isPopular1:isNoble1   1.4917     0.4639   3.215   0.0013
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2207.1  on 1945  degrees of freedom
## Residual deviance: 2121.4  on 1941  degrees of freedom
## AIC: 2131.4
## 
## Number of Fisher Scoring iterations: 4
## 
## Next step: Use 'setx' method

The data so far does not indicate any level of significance. This is unexpected as when performing a similar analysis the previous week, there were a few levels of significant using similar variables. However, based on these results, it can be stated that the log odds of a popular male character being alive is -2.14 less that that of a female character.

Regression Results

Looking at Alive Differences Between Nobility

x <- setx(z.got, isNoble = "0")
x1 <- setx(z.got, isNoble = "1")
a <- sim(z.got, x = x, x1 = x1)
fd <- a$get_qi(xvalue="x1", qi="fd")
summary(fd)

##        V1          
##  Min.   :-0.09625  
##  1st Qu.:-0.04110  
##  Median :-0.02464  
##  Mean   :-0.02578  
##  3rd Qu.:-0.01119  
##  Max.   : 0.03946

Based on the first results, we can state on average Nobles are -.026 less likely of being alive by the end of Game of Thrones

plot(a)

Looking at Alive Differences Between Popularity

z <- setx(z.got, isPopular = "0")
z1 <- setx(z.got, isPopular = "1")
b <- sim(z.got, x = z, x1 = z1)
fd <- b$get_qi(xvalue="x1", qi="fd")
summary(fd)

##        V1         
##  Min.   :-0.6672  
##  1st Qu.:-0.5340  
##  Median :-0.4914  
##  Mean   :-0.4835  
##  3rd Qu.:-0.4412  
##  Max.   :-0.1866

Based on the second results, we can state on average that popular characters are -.48 less likely of being alive by the end of Game of Thrones

plot(b)

Observing Alive Characters

None-Popular

Pop1 <- setx(z.got, isNoble="0", isPopular="0")
Pop1a <- setx(z.got, isNoble="1", isPopular="0")
Pops <- sim(z.got, x=Pop1, x1=Pop1a)

Plotting Non-Popularity

par(mar= c(1,1,1,1))
plot(Pops)

Plotting Popularity

par(mar= c(1,1,1,1))
plot(Popsa)

Putting it All Together

a1 <- Pops$get_qi(xvalue="x1", qi="fd")
a2 <- Popsa$get_qi(xvalue="x1", qi="fd")
afd <- as.data.frame(cbind(a1, a2))
Popular <- afd %>%
  gather(isAlive, simv, 1:2)
head(Popular)

##   isAlive        simv
## 1      V1 -0.02699210
## 2      V1 -0.05142281
## 3      V1 -0.02679997
## 4      V1 -0.03815652
## 5      V1 -0.02111528
## 6      V1 -0.03472899

Differences Among Groups

Popular %>%
    group_by(isAlive) %>%
    summarise(mean=mean(simv), sd=sd(simv))

## # A tibble: 2 x 3
##   isAlive    mean     sd
##   <chr>     <dbl>  <dbl>
## 1 V1      -0.0255 0.0234
## 2 V2       0.301  0.0906

Between nobles and popular characters, it can be states that both nobles and popular characters are less likely to be alive at the end of Game of Thrones.

Plotting the Distributions

ggplot(Popular, aes(simv)) + geom_histogram(fill="red", color="blue") + facet_grid(~isAlive) + geom_vline(xintercept=mean(Popular))

## Warning in mean.default(Popular): argument is not numeric or logical:
## returning NA

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 2 rows containing missing values (geom_vline).

Advanced Analytics Homework #6

Rafael Arevalo

3/24/2019