# Based on http://datascienceplus.com/perform-logistic-regression-in-r/
# REGRESSION MODELS
# How to Perform a Logistic Regression in R
# by Michy Alice, September 13, 2015
getwd()
## [1] "C:/Users/dell/Desktop/test"
training.data.raw <- read.csv('train.csv',header=T,na.strings=c(""))
sapply(training.data.raw, function(x) sum(is.na(x)))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
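# An equivalent quick check (addition, not in the original post): is.na() on a
# data frame returns a logical matrix, so colSums() gives the same per-column counts.
colSums(is.na(training.data.raw))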
sapply(training.data.raw, function(x) length(unique(x)))
## PassengerId Survived Pclass Name Sex Age
## 891 2 3 891 2 89
## SibSp Parch Ticket Fare Cabin Embarked
## 7 7 681 248 148 4
#install.packages("Amelia")
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(training.data.raw, main = "Missing values vs observed")
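# The missingness map confirms what the counts above show: Cabin is mostly missing
# (687 NAs) and Age has 177 NAs. Below, Cabin and the other unused columns are
# dropped, keeping Survived, Pclass, Sex, Age, SibSp, Parch, Fare and Embarked,
# and the missing Age values are imputed with the mean.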
data <- subset(training.data.raw,select=c(2,3,5,6,7,8,10,12))
data$Age[is.na(data$Age)] <- mean(data$Age,na.rm=T)
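# Sanity check (addition, not in the original post): no NAs should remain in Age
sum(is.na(data$Age))   # expected to be 0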
data <- data[!is.na(data$Embarked),]
rownames(data) <- NULL
train <- data[1:800,]
test <- data[801:889,]
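# The split above is purely positional (first 800 rows for training, remaining 89
# for testing). A randomized split is a common alternative; a sketch is shown below,
# kept commented out so the results printed later stay reproducible. The seed value
# is hypothetical.
# set.seed(123)
# idx   <- sample(seq_len(nrow(data)), 800)
# train <- data[idx, ]
# test  <- data[-idx, ]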
model <- glm(Survived ~.,family=binomial(link='logit'),data=train)
summary(model)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6064 -0.5954 -0.4254 0.6220 2.4165
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.137627 0.594998 8.635 < 2e-16 ***
## Pclass -1.087156 0.151168 -7.192 6.40e-13 ***
## Sexmale -2.756819 0.212026 -13.002 < 2e-16 ***
## Age -0.037267 0.008195 -4.547 5.43e-06 ***
## SibSp -0.292920 0.114642 -2.555 0.0106 *
## Parch -0.116576 0.128127 -0.910 0.3629
## Fare 0.001528 0.002353 0.649 0.5160
## EmbarkedQ -0.002656 0.400882 -0.007 0.9947
## EmbarkedS -0.318786 0.252960 -1.260 0.2076
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.39 on 799 degrees of freedom
## Residual deviance: 709.39 on 791 degrees of freedom
## AIC: 727.39
##
## Number of Fisher Scoring iterations: 5
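# Interpreting the coefficients on the odds scale (addition, not in the original
# post): exponentiating gives odds ratios, e.g. exp(-2.76) ~ 0.06 for Sexmale,
# i.e. being male multiplies the odds of survival by roughly 0.06, all else equal.
exp(coef(model))
# Wald confidence intervals on the odds-ratio scale
exp(confint.default(model))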
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 799 1065.39
## Pclass 1 83.607 798 981.79 < 2.2e-16 ***
## Sex 1 240.014 797 741.77 < 2.2e-16 ***
## Age 1 17.495 796 724.28 2.881e-05 ***
## SibSp 1 10.842 795 713.43 0.000992 ***
## Parch 1 0.863 794 712.57 0.352873
## Fare 1 0.994 793 711.58 0.318717
## Embarked 2 2.187 791 709.39 0.334990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
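# Overall significance of the model (addition, not in the original post): the drop
# in deviance from the null model is approximately chi-squared distributed, with df
# equal to the number of estimated slope parameters (here 1065.39 - 709.39 = 356 on 8 df).
with(model, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))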
#install.packages("pscl")
library(pscl)
## Loading required package: MASS
## Loading required package: lattice
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

pR2(model)
## llh llhNull G2 McFadden r2ML
## -354.6950111 -532.6961008 356.0021794 0.3341513 0.3591775
## r2CU
## 0.4880244
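# McFadden's pseudo-R2 computed directly from the deviances; this should match the
# McFadden value reported by pR2() above (about 0.334).
1 - model$deviance / model$null.deviance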
fitted.results <- predict(model,newdata=subset(test,select=c(2,3,4,5,6,7,8)),type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClassificationError <- mean(fitted.results != test$Survived)
print(paste('Accuracy',1-misClassificationError))
## [1] "Accuracy 0.842696629213483"
#install.packages("ROCR")
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
p <- predict(model, newdata=subset(test,select=c(2,3,4,5,6,7,8)), type="response")
pr <- prediction(p, test$Survived)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
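# Add the chance-level diagonal to the ROC plot for reference (addition, not in the original post)
abline(a = 0, b = 1, lty = 2)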

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8647186
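# As a rule of thumb, an AUC of 0.5 corresponds to random guessing and 1 to perfect
# discrimination, so roughly 0.86 indicates good predictive ability on this test set.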