# Based on http://datascienceplus.com/perform-logistic-regression-in-r/
# REGRESSION MODELS
# How to Perform a Logistic Regression in R
# by Michy Alice, September 13, 2015
getwd()
## [1] "C:/Users/dell/Desktop/test"
training.data.raw <- read.csv('train.csv',header=T,na.strings=c(""))
sapply(training.data.raw, function(x) sum(is.na(x)))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 177
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 687 2
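# An equivalent quick check (addition, not in the original post): is.na() on a
# data frame returns a logical matrix, so colSums() gives the same per-column counts.
colSums(is.na(training.data.raw))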
sapply(training.data.raw, function(x) length(unique(x)))
## PassengerId Survived Pclass Name Sex Age
## 891 2 3 891 2 89
## SibSp Parch Ticket Fare Cabin Embarked
## 7 7 681 248 148 4
#install.packages("Amelia")
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.4, built: 2015-12-05)
## ## Copyright (C) 2005-2016 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
missmap(training.data.raw, main = "Missing values vs observed")
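# The missingness map confirms what the counts above show: Cabin is mostly missing
# (687 NAs) and Age has 177 NAs. Below, Cabin and the other unused columns are
# dropped, keeping Survived, Pclass, Sex, Age, SibSp, Parch, Fare and Embarked,
# and the missing Age values are imputed with the mean.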
data <- subset(training.data.raw,select=c(2,3,5,6,7,8,10,12))
data$Age[is.na(data$Age)] <- mean(data$Age,na.rm=T)
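# Sanity check (addition, not in the original post): no NAs should remain in Age
sum(is.na(data$Age))   # expected to be 0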
data <- data[!is.na(data$Embarked),]
rownames(data) <- NULL
train <- data[1:800,]
test <- data[801:889,]
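# The split above is purely positional (first 800 rows for training, remaining 89
# for testing). A randomized split is a common alternative; a sketch is shown below,
# kept commented out so the results printed later stay reproducible. The seed value
# is hypothetical.
# set.seed(123)
# idx   <- sample(seq_len(nrow(data)), 800)
# train <- data[idx, ]
# test  <- data[-idx, ]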
model <- glm(Survived ~.,family=binomial(link='logit'),data=train)
summary(model)
##
## Call:
## glm(formula = Survived ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6064 -0.5954 -0.4254 0.6220 2.4165
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.137627 0.594998 8.635 < 2e-16 ***
## Pclass -1.087156 0.151168 -7.192 6.40e-13 ***
## Sexmale -2.756819 0.212026 -13.002 < 2e-16 ***
## Age -0.037267 0.008195 -4.547 5.43e-06 ***
## SibSp -0.292920 0.114642 -2.555 0.0106 *
## Parch -0.116576 0.128127 -0.910 0.3629
## Fare 0.001528 0.002353 0.649 0.5160
## EmbarkedQ -0.002656 0.400882 -0.007 0.9947
## EmbarkedS -0.318786 0.252960 -1.260 0.2076
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1065.39 on 799 degrees of freedom
## Residual deviance: 709.39 on 791 degrees of freedom
## AIC: 727.39
##
## Number of Fisher Scoring iterations: 5
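# Interpreting the coefficients on the odds scale (addition, not in the original
# post): exponentiating gives odds ratios, e.g. exp(-2.76) ~ 0.06 for Sexmale,
# i.e. being male multiplies the odds of survival by roughly 0.06, all else equal.
exp(coef(model))
# Wald confidence intervals on the odds-ratio scale
exp(confint.default(model))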
anova(model, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Survived
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 799 1065.39
## Pclass 1 83.607 798 981.79 < 2.2e-16 ***
## Sex 1 240.014 797 741.77 < 2.2e-16 ***
## Age 1 17.495 796 724.28 2.881e-05 ***
## SibSp 1 10.842 795 713.43 0.000992 ***
## Parch 1 0.863 794 712.57 0.352873
## Fare 1 0.994 793 711.58 0.318717
## Embarked 2 2.187 791 709.39 0.334990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
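# Overall significance of the model (addition, not in the original post): the drop
# in deviance from the null model is approximately chi-squared distributed, with df
# equal to the number of estimated slope parameters (here 1065.39 - 709.39 = 356 on 8 df).
with(model, pchisq(null.deviance - deviance, df.null - df.residual, lower.tail = FALSE))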
#install.packages("pscl")
library(pscl)
## Loading required package: MASS
## Loading required package: lattice
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

pR2(model)
## llh llhNull G2 McFadden r2ML
## -354.6950111 -532.6961008 356.0021794 0.3341513 0.3591775
## r2CU
## 0.4880244
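# McFadden's pseudo-R2 computed directly from the deviances; this should match the
# McFadden value reported by pR2() above (about 0.334).
1 - model$deviance / model$null.deviance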
fitted.results <- predict(model,newdata=subset(test,select=c(2,3,4,5,6,7,8)),type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClassificationError <- mean(fitted.results != test$Survived)
print(paste('Accuracy',1-misClassificationError))
## [1] "Accuracy 0.842696629213483"
#install.packages("ROCR")
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
p <- predict(model, newdata=subset(test,select=c(2,3,4,5,6,7,8)), type="response")
pr <- prediction(p, test$Survived)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
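# Add the chance-level diagonal to the ROC plot for reference (addition, not in the original post)
abline(a = 0, b = 1, lty = 2)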

auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8647186
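# As a rule of thumb, an AUC of 0.5 corresponds to random guessing and 1 to perfect
# discrimination, so roughly 0.86 indicates good predictive ability on this test set.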