GLM for Titanic

library(titanic)
tanic<- as.data.frame(Titanic)
datat<- titanic_train
attach(datat)
##change blank in to NA
datat[datat == ""] <- NA
##check how many NA values together wrong way 
####apply with is.na works differernt with sapply 
datat1<- datat[!apply(is.na(datat), 2, any),]
datat2<- datat[!is.na(datat$Cabin),]
######### sapply shows individual columns
sapply(datat, function(x) sum(is.na(x)))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

##same to colsums
colSums(is.na(datat))

## PassengerId    Survived      Pclass        Name         Sex         Age 
##           0           0           0           0           0         177 
##       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
##           0           0           0           0         687           2

#####since there are too many NA values, we may keep it in otherwise we will tak out too much data
colnames(datat)

##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"

datat<- datat[,-c(1,4,9,11,12)] ##taking out irrelevant columns 
####checking relationships

##starting with correlation analysis
library(sjPlot)

## #refugeeswelcome

library(ggplot2)
ageandfare<-ggplot(datat,aes(x=Age,y=Fare,colour=SibSp))+geom_point() ###the relationship of age and fare with coloured by sibsp
ageandfare+scale_color_gradient(low="blue", high="red")##change the colour by the scale

ageandfare+scale_color_gradientn(colours = rainbow(5))##change the colour by rainbow

survivewithage<-ggplot(datat,aes(x=Age,fill=..count..,))+geom_histogram() ####distribution of age
anotherway<-qplot(x =Age, fill=..count.., geom="histogram") ##another way 
#####look at other types 
####lets look something relative to the response
sjp.frq(Survived,geom.colors = c("green","red")) ####38% of people survived

sjp.xtab(Survived,Sex,show.total = F, show.n =F) ## huge difference in

###interactions
library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

p<-ggplot(datat,aes(y=Fare,x=Age,colour=ifelse(Survived==1,"green","blue")))+geom_point(alpha=0.5)
p<-p+ labs(colour="Survived") ##change the legend name 
agewithsurvive<-p + theme(legend.title = element_text(colour="blue", size=10,face="bold"))  +theme(legend.background = element_rect(fill="lightblue",
                                                                   size=0.5, linetype="solid", colour ="darkblue"))   ######change the background colour and the title colour
ggplotly(agewithsurvive)###make it interactive

## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`

######## starting with the model 
##library(dummies)
datat<- datat[,c(1:7)]
##datat$Sex<- dummy(datat$Sex)
datat$Pclass<-as.factor(datat$Pclass)
is.factor(datat$Pclass)

## [1] TRUE

is.factor(datat$Parch)

## [1] FALSE

datat$Parch<-as.factor(datat$Parch)
datat$Sex<-as.factor(datat$Sex)
is.factor(datat$Sex)

## [1] TRUE

#####modelling 
##the full model
model<-glm(data=datat, formula=Survived~., family = binomial)
summary(model)  ## few p values greater than 0.5, AIC quite high

## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = datat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7755  -0.6213  -0.3892   0.6287   2.4194  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.959e+00  5.195e-01   7.620 2.53e-14 ***
## Pclass2     -1.298e+00  3.245e-01  -4.000 6.32e-05 ***
## Pclass3     -2.422e+00  3.424e-01  -7.074 1.51e-12 ***
## Sexmale     -2.619e+00  2.197e-01 -11.920  < 2e-16 ***
## Age         -4.009e-02  8.650e-03  -4.635 3.58e-06 ***
## SibSp       -4.380e-01  1.334e-01  -3.283  0.00103 ** 
## Parch1       4.576e-01  2.963e-01   1.544  0.12256    
## Parch2       1.178e-01  3.996e-01   0.295  0.76817    
## Parch3       7.162e-01  1.056e+00   0.678  0.49764    
## Parch4      -1.474e+01  6.256e+02  -0.024  0.98120    
## Parch5      -9.363e-01  1.178e+00  -0.795  0.42659    
## Parch6      -1.503e+01  1.455e+03  -0.010  0.99176    
## Fare         1.795e-03  2.643e-03   0.679  0.49708    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 964.52  on 713  degrees of freedom
## Residual deviance: 628.08  on 701  degrees of freedom
##   (177 observations deleted due to missingness)
## AIC: 654.08
## 
## Number of Fisher Scoring iterations: 14

##lets search for potential confounders 
library(car)
vif(model)##all values are smaller than 5

##            GVIF Df GVIF^(1/(2*Df))
## Pclass 2.078214  2        1.200667
## Sex    1.146907  1        1.070938
## Age    1.594915  1        1.262899
## SibSp  1.363756  1        1.167800
## Parch  1.530036  6        1.036076
## Fare   1.584426  1        1.258740

durbinWatsonTest(model) ### the p value is smaller than 0.5, thus We have failed to reject the null hypothesis,  for which the null hypothesis is that the residuals are not correlated.

##  lag Autocorrelation D-W Statistic p-value
##    1       0.0645941      1.870243   0.088
##  Alternative hypothesis: rho != 0

##
model$df.residual

## [1] 701

pchisq(model$deviance, df=model$df.residual)

## [1] 0.02267474

#The null hypothesis is that our model is correctly specified, and we have strong evidence to reject that hypothesis. So we have strong evidence that our model fits badly. 
## we need to consider another model 
##lets try the forward selection
backward<- step(model,direction = "backward")

## Start:  AIC=654.08
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
## 
##          Df Deviance    AIC
## - Parch   6   636.03 650.03
## - Fare    1   628.58 652.58
## <none>        628.08 654.08
## - SibSp   1   639.97 663.97
## - Age     1   651.40 675.40
## - Pclass  2   680.65 702.65
## - Sex     1   803.02 827.02
## 
## Step:  AIC=650.03
## Survived ~ Pclass + Sex + Age + SibSp + Fare
## 
##          Df Deviance    AIC
## - Fare    1   636.56 648.56
## <none>        636.03 650.03
## - SibSp   1   647.23 659.23
## - Age     1   667.61 679.61
## - Pclass  2   699.21 709.21
## - Sex     1   819.08 831.08
## 
## Step:  AIC=648.56
## Survived ~ Pclass + Sex + Age + SibSp
## 
##          Df Deviance    AIC
## <none>        636.56 648.56
## - SibSp   1   647.28 657.28
## - Age     1   669.40 679.40
## - Pclass  2   742.29 750.29
## - Sex     1   823.72 833.72

formula(backward) ###the variables with the least AIC

## Survived ~ Pclass + Sex + Age + SibSp

##lets fit the final model!
model1<- glm(data=datat, formula=Survived ~ Pclass + Sex+Age+SibSp,family = binomial)
summary(model1) ## the Pvalues in the summary for each variable are <0.5, which is all significant!

## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = binomial, 
##     data = datat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7876  -0.6417  -0.3864   0.6261   2.4539  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.334201   0.450700   9.617  < 2e-16 ***
## Pclass2     -1.414360   0.284727  -4.967 6.78e-07 ***
## Pclass3     -2.652618   0.285832  -9.280  < 2e-16 ***
## Sexmale     -2.627679   0.214771 -12.235  < 2e-16 ***
## Age         -0.044760   0.008225  -5.442 5.27e-08 ***
## SibSp       -0.380190   0.121516  -3.129  0.00176 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 964.52  on 713  degrees of freedom
## Residual deviance: 636.56  on 708  degrees of freedom
##   (177 observations deleted due to missingness)
## AIC: 648.56
## 
## Number of Fisher Scoring iterations: 5