#Build a GLM that attempts to predict whether a passenger survived (or not)… #Use at least 1 of the provided explanatory variables, and produce at least 1 #graph showing what you’re testing for.

This creates an object dat from the Titanic CSV file

dat <- read.csv("~/Desktop/ANT291WD/titanic.csv")
head(dat)
##   PassengerId Survived Pclass
## 1           1        0      3
## 2           2        1      1
## 3           3        1      3
## 4           4        1      1
## 5           5        0      3
## 6           6        0      3
##                                                  Name    Sex Age SibSp
## 1                             Braund, Mr. Owen Harris   male  22     1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1
## 3                              Heikkinen, Miss. Laina female  26     0
## 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1
## 5                            Allen, Mr. William Henry   male  35     0
## 6                                    Moran, Mr. James   male  NA     0
##   Parch           Ticket    Fare Cabin Embarked
## 1     0        A/5 21171  7.2500              S
## 2     0         PC 17599 71.2833   C85        C
## 3     0 STON/O2. 3101282  7.9250              S
## 4     0           113803 53.1000  C123        S
## 5     0           373450  8.0500              S
## 6     0           330877  8.4583              Q

Create objects for the various factors in the data

surv <- dat$Survived
class <- dat$Pclass
sex <- dat$Sex

Creating barplots for the various factors

barplot(table(surv))

barplot(table(class))

barplot(table(sex))

?tapply
barplot(tapply(surv, class, mean))

barplot(tapply(surv, interaction(sex, class), mean))

The final barplot shows females in first and second class were pretty likely to survive, while females in third class had about an even chance of survival. Males of all classes were less likely to survive than females, but first class males survived at a higher rate than other classes.

This line creates an object for a glm of the data and plots it.

mod <- glm(Survived~Pclass*Sex, data=dat, family=binomial)
summary(mod)
## 
## Call:
## glm(formula = Survived ~ Pclass * Sex, family = binomial, data = dat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8488  -0.6969  -0.5199   0.4946   2.0339  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      6.0416     0.8273   7.303 2.81e-13 ***
## Pclass          -2.0011     0.2950  -6.784 1.17e-11 ***
## Sexmale         -6.0493     0.8756  -6.909 4.88e-12 ***
## Pclass:Sexmale   1.3593     0.3202   4.245 2.19e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1186.66  on 890  degrees of freedom
## Residual deviance:  803.12  on 887  degrees of freedom
## AIC: 811.12
## 
## Number of Fisher Scoring iterations: 6
plot(mod)

observ <- tapply(surv, interaction(sex, class), mean)

This creates an object for the predicted values of the data and then plots the observed values against the predicted values.

pred <- predict(mod, type="response")
predex <- tapply(pred, interaction(sex, class), mean)
barplot(rbind(observ, predex), beside = T, ylim=c(0,1), col = c("orange3","mediumpurple4"))
legend("topright", legend=c("observed", "predicted"), fill=c("orange3", "mediumpurple4"))