training.data.raw <- read.csv('train.csv',header=T,na.strings=c(""))
sapply(training.data.raw,function(x) sum(is.na(x)))
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
0 0 0 0 0 177 0 0 0 0
Cabin Embarked
687 2
sapply(training.data.raw, function(x)length(unique(x)))
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare
891 2 3 891 2 89 7 7 681 248
Cabin Embarked
148 4
library(Amelia)
Loading required package: Rcpp
unknown timezone 'default/Asia/Kolkata'##
## Amelia II: Multiple Imputation
## (Version 1.7.4, built: 2015-12-05)
## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## Refer to http://gking.harvard.edu/amelia/ for more information
##
missmap(training.data.raw, main = "Missing values vs observed")
data <- subset(training.data.raw,select=c(2,3,5,6,7,8,10,12))
data$Age[is.na(data$Age)] <- mean(data$Age,na.rm=T)
is.factor(data$Sex)
[1] TRUE
is.factor(data$Embarked)
[1] TRUE
contrasts(data$Sex)
male
female 0
male 1
contrasts(data$Embarked)
Q S
C 0 0
Q 1 0
S 0 1
data <- data[!is.na(data$Embarked),]
rownames(data) <- NULL
train <- data[1:800,]
test <- data[801:889,]
model <- glm(Survived ~.,family=binomial(link='logit'),data=train)
summary(model)
Call:
glm(formula = Survived ~ ., family = binomial(link = "logit"),
data = train)
Deviance Residuals:
Min 1Q Median 3Q Max
-2.6064 -0.5954 -0.4254 0.6220 2.4165
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.137627 0.594998 8.635 < 2e-16 ***
Pclass -1.087156 0.151168 -7.192 6.40e-13 ***
Sexmale -2.756819 0.212026 -13.002 < 2e-16 ***
Age -0.037267 0.008195 -4.547 5.43e-06 ***
SibSp -0.292920 0.114642 -2.555 0.0106 *
Parch -0.116576 0.128127 -0.910 0.3629
Fare 0.001528 0.002353 0.649 0.5160
EmbarkedQ -0.002656 0.400882 -0.007 0.9947
EmbarkedS -0.318786 0.252960 -1.260 0.2076
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1065.39 on 799 degrees of freedom
Residual deviance: 709.39 on 791 degrees of freedom
AIC: 727.39
Number of Fisher Scoring iterations: 5
anova(model, test="Chisq")
Analysis of Deviance Table
Model: binomial, link: logit
Response: Survived
Terms added sequentially (first to last)
Df Deviance Resid. Df Resid. Dev Pr(>Chi)
NULL 799 1065.39
Pclass 1 83.607 798 981.79 < 2.2e-16 ***
Sex 1 240.014 797 741.77 < 2.2e-16 ***
Age 1 17.495 796 724.28 2.881e-05 ***
SibSp 1 10.842 795 713.43 0.000992 ***
Parch 1 0.863 794 712.57 0.352873
Fare 1 0.994 793 711.58 0.318717
Embarked 2 2.187 791 709.39 0.334990
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
fitted.results <- predict(model,newdata=subset(test,select=c(2,3,4,5,6,7,8)),type='response')
fitted.results <- ifelse(fitted.results > 0.5,1,0)
misClasificError <- mean(fitted.results != test$Survived)
print(paste('Accuracy',1-misClasificError))
[1] "Accuracy 0.842696629213483"
#install.packages("ROCR")
library(ROCR)
p <- predict(model, newdata=subset(test,select=c(2,3,4,5,6,7,8)), type="response")
pr <- prediction(p, test$Survived)
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
[1] 0.8647186
asdfasdf
Logit and probit differ in the assumption of the underlying distribution. Logit assumes the distribution is logistic (i.e. the outcome either happens or it doesn’t). Probit assumes the underlying distribution is normal which means, essentially, that the observed outcome either happens or doesn’t but this reflects a certain threshold being met for the underlying latent variable which is normally distributed.
mfxboot <- function(modform,dist,data,boot=1000,digits=3){
x <- glm(modform, family=binomial(link=dist),data)
# get marginal effects
pdf <- ifelse(dist=="probit",
mean(dnorm(predict(x, type = "link"))),
mean(dlogis(predict(x, type = "link"))))
marginal.effects <- pdf*coef(x)
# start bootstrap
bootvals <- matrix(rep(NA,boot*length(coef(x))), nrow=boot)
set.seed(1111)
for(i in 1:boot){
samp1 <- data[sample(1:dim(data)[1],replace=T,dim(data)[1]),]
x1 <- glm(modform, family=binomial(link=dist),samp1)
pdf1 <- ifelse(dist=="probit",
mean(dnorm(predict(x, type = "link"))),
mean(dlogis(predict(x, type = "link"))))
bootvals[i,] <- pdf1*coef(x1)
}
res <- cbind(marginal.effects,apply(bootvals,2,sd),marginal.effects/apply(bootvals,2,sd))
if(names(x$coefficients[1])=="(Intercept)"){
res1 <- res[2:nrow(res),]
res2 <- matrix(as.numeric(sprintf(paste("%.",paste(digits,"f",sep=""),sep=""),res1)),nrow=dim(res1)[1])
rownames(res2) <- rownames(res1)
} else {
res2 <- matrix(as.numeric(sprintf(paste("%.",paste(digits,"f",sep=""),sep="")),nrow=dim(res)[1]))
rownames(res2) <- rownames(res)
}
colnames(res2) <- c("marginal.effect","standard.error","z.ratio")
return(res2)
}
library(AER)
data(train)
data set ‘train’ not found
mfx1 <- mfxboot(Survived ~ . + I(Age^2),"probit",data=train)
mfx2 <- mfxboot(Survived ~ . + I(Age^2),"logit",train)
mfx3 <- mfxboot(Survived ~ . + I(Age^2),"probit",train,boot=100,digits=4)
mfxdat <- data.frame(cbind(rownames(mfx1),mfx1))
mfxdat$me <- as.numeric(as.character(mfxdat$marginal.effect))
mfxdat$se <- as.numeric(as.character(mfxdat$standard.error))
# coefplot
library(ggplot2)
ggplot(mfxdat, aes(V1, marginal.effect,ymin = me - 2*se,ymax= me + 2*se)) +
scale_x_discrete('Variable') +
scale_y_continuous('Marginal Effect',limits=c(-0.5,1)) +
theme_bw() +
geom_errorbar(aes(x = V1, y = me),size=.3,width=.2) +
geom_point(aes(x = V1, y = me)) +
geom_hline(yintercept=0) +
coord_flip() +
opts(title="Marginal Effects with 95% Confidence Intervals")
Ignoring unknown aesthetics: yError: could not find function "opts"