library(titanic)
tanic<- as.data.frame(Titanic)
datat<- titanic_train
attach(datat)
##change blank in to NA
datat[datat == ""] <- NA
##check how many NA values together wrong way
####apply with is.na works differernt with sapply
datat1<- datat[!apply(is.na(datat), 2, any),]
datat2<- datat[!is.na(datat$Cabin),]
######### sapply shows individual columns
sapply(datat, function(x) sum(is.na(x)))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
##same to colsums
colSums(is.na(datat))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
#####since there are too many NA values, we may keep it in otherwise we will tak out too much data
colnames(datat)
## [1] "PassengerId" "Survived" "Pclass" "Name" "Sex"
## [6] "Age" "SibSp" "Parch" "Ticket" "Fare"
## [11] "Cabin" "Embarked"
datat<- datat[,-c(1,4,9,11,12)] ##taking out irrelevant columns
####checking relationships
##starting with correlation analysis
library(sjPlot)
## #refugeeswelcome
library(ggplot2)
ageandfare<-ggplot(datat,aes(x=Age,y=Fare,colour=SibSp))+geom_point() ###the relationship of age and fare with coloured by sibsp
ageandfare+scale_color_gradient(low="blue", high="red")##change the colour by the scale

ageandfare+scale_color_gradientn(colours = rainbow(5))##change the colour by rainbow

survivewithage<-ggplot(datat,aes(x=Age,fill=..count..,))+geom_histogram() ####distribution of age
anotherway<-qplot(x =Age, fill=..count.., geom="histogram") ##another way
#####look at other types
####lets look something relative to the response
sjp.frq(Survived,geom.colors = c("green","red")) ####38% of people survived

sjp.xtab(Survived,Sex,show.total = F, show.n =F) ## huge difference in

###interactions
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
p<-ggplot(datat,aes(y=Fare,x=Age,colour=ifelse(Survived==1,"green","blue")))+geom_point(alpha=0.5)
p<-p+ labs(colour="Survived") ##change the legend name
agewithsurvive<-p + theme(legend.title = element_text(colour="blue", size=10,face="bold")) +theme(legend.background = element_rect(fill="lightblue",
size=0.5, linetype="solid", colour ="darkblue")) ######change the background colour and the title colour
ggplotly(agewithsurvive)###make it interactive
## We recommend that you use the dev version of ggplot2 with `ggplotly()`
## Install it with: `devtools::install_github('hadley/ggplot2')`
######## starting with the model
##library(dummies)
datat<- datat[,c(1:7)]
##datat$Sex<- dummy(datat$Sex)
datat$Pclass<-as.factor(datat$Pclass)
is.factor(datat$Pclass)
## [1] TRUE
is.factor(datat$Parch)
## [1] FALSE
datat$Parch<-as.factor(datat$Parch)
datat$Sex<-as.factor(datat$Sex)
is.factor(datat$Sex)
## [1] TRUE
#####modelling
##the full model
model<-glm(data=datat, formula=Survived~., family = binomial)
summary(model) ## few p values greater than 0.5, AIC quite high
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = datat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7755 -0.6213 -0.3892 0.6287 2.4194
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.959e+00 5.195e-01 7.620 2.53e-14 ***
## Pclass2 -1.298e+00 3.245e-01 -4.000 6.32e-05 ***
## Pclass3 -2.422e+00 3.424e-01 -7.074 1.51e-12 ***
## Sexmale -2.619e+00 2.197e-01 -11.920 < 2e-16 ***
## Age -4.009e-02 8.650e-03 -4.635 3.58e-06 ***
## SibSp -4.380e-01 1.334e-01 -3.283 0.00103 **
## Parch1 4.576e-01 2.963e-01 1.544 0.12256
## Parch2 1.178e-01 3.996e-01 0.295 0.76817
## Parch3 7.162e-01 1.056e+00 0.678 0.49764
## Parch4 -1.474e+01 6.256e+02 -0.024 0.98120
## Parch5 -9.363e-01 1.178e+00 -0.795 0.42659
## Parch6 -1.503e+01 1.455e+03 -0.010 0.99176
## Fare 1.795e-03 2.643e-03 0.679 0.49708
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 628.08 on 701 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 654.08
##
## Number of Fisher Scoring iterations: 14
##lets search for potential confounders
library(car)
vif(model)##all values are smaller than 5
## GVIF Df GVIF^(1/(2*Df))
## Pclass 2.078214 2 1.200667
## Sex 1.146907 1 1.070938
## Age 1.594915 1 1.262899
## SibSp 1.363756 1 1.167800
## Parch 1.530036 6 1.036076
## Fare 1.584426 1 1.258740
durbinWatsonTest(model) ### the p value is smaller than 0.5, thus We have failed to reject the null hypothesis, for which the null hypothesis is that the residuals are not correlated.
## lag Autocorrelation D-W Statistic p-value
## 1 0.0645941 1.870243 0.088
## Alternative hypothesis: rho != 0
##
model$df.residual
## [1] 701
pchisq(model$deviance, df=model$df.residual)
## [1] 0.02267474
#The null hypothesis is that our model is correctly specified, and we have strong evidence to reject that hypothesis. So we have strong evidence that our model fits badly.
## we need to consider another model
##lets try the forward selection
backward<- step(model,direction = "backward")
## Start: AIC=654.08
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare
##
## Df Deviance AIC
## - Parch 6 636.03 650.03
## - Fare 1 628.58 652.58
## <none> 628.08 654.08
## - SibSp 1 639.97 663.97
## - Age 1 651.40 675.40
## - Pclass 2 680.65 702.65
## - Sex 1 803.02 827.02
##
## Step: AIC=650.03
## Survived ~ Pclass + Sex + Age + SibSp + Fare
##
## Df Deviance AIC
## - Fare 1 636.56 648.56
## <none> 636.03 650.03
## - SibSp 1 647.23 659.23
## - Age 1 667.61 679.61
## - Pclass 2 699.21 709.21
## - Sex 1 819.08 831.08
##
## Step: AIC=648.56
## Survived ~ Pclass + Sex + Age + SibSp
##
## Df Deviance AIC
## <none> 636.56 648.56
## - SibSp 1 647.28 657.28
## - Age 1 669.40 679.40
## - Pclass 2 742.29 750.29
## - Sex 1 823.72 833.72
formula(backward) ###the variables with the least AIC
## Survived ~ Pclass + Sex + Age + SibSp
##lets fit the final model!
model1<- glm(data=datat, formula=Survived ~ Pclass + Sex+Age+SibSp,family = binomial)
summary(model1) ## the Pvalues in the summary for each variable are <0.5, which is all significant!
##
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = binomial,
## data = datat)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7876 -0.6417 -0.3864 0.6261 2.4539
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.334201 0.450700 9.617 < 2e-16 ***
## Pclass2 -1.414360 0.284727 -4.967 6.78e-07 ***
## Pclass3 -2.652618 0.285832 -9.280 < 2e-16 ***
## Sexmale -2.627679 0.214771 -12.235 < 2e-16 ***
## Age -0.044760 0.008225 -5.442 5.27e-08 ***
## SibSp -0.380190 0.121516 -3.129 0.00176 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 964.52 on 713 degrees of freedom
## Residual deviance: 636.56 on 708 degrees of freedom
## (177 observations deleted due to missingness)
## AIC: 648.56
##
## Number of Fisher Scoring iterations: 5