require(ggplot2)
require(RColorBrewer)
require(ggthemes)
require(dplyr)
library(xtable)
library(Amelia)
library(reshape2)
require(andrews)
require(tidyr)
require(GGally)
require(scales)
require(ggExtra)
# require(grid) for multiplot function
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
library(grid)
# Make a list from the ... arguments and plotlist
plots <- c(list(...), plotlist)
numPlots = length(plots)
# If layout is NULL, then use 'cols' to determine layout
if (is.null(layout)) {
# Make the panel
# ncol: Number of columns of plots
# nrow: Number of rows needed, calculated from # of cols
layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
ncol = cols, nrow = ceiling(numPlots/cols))
}
if (numPlots==1) {
print(plots[[1]])
} else {
# Set up the page
grid.newpage()
pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
# Make each plot, in the correct location
for (i in 1:numPlots) {
# Get the i,j matrix positions of the regions that contain this subplot
matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
layout.pos.col = matchidx$col))
}
}
}data = read.csv("train.csv")
dim(data) ## rows and variables of the dataset## [1] 891 12
str(data) ## structure of the data## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
dataoptions(xtable.comment = FALSE)
xtable(summary(data))data[data==""]= NAAmelia package (which is for imputation) to visualize the missing valuesmissmap(data,y.cex=0.5,main="Na's in the dataset")ggplot2 and help from neato package’s functionggplot_missing <- function(x){
x %>%
is.na %>%
melt %>%
ggplot(data = .,
aes(x = Var2,
y = Var1)) +
geom_raster(aes(fill = value)) +
scale_fill_grey(name = "",
labels = c("Present","Missing")) +
theme_tufte() +
theme(axis.text.x = element_text(angle=30, vjust=0.5)) +
labs(x = "Variables in Dataset",
y = "Rows / observations")
}
ggplot_missing(data)head(data$Name)## [1] Braund, Mr. Owen Harris
## [2] Cumings, Mrs. John Bradley (Florence Briggs Thayer)
## [3] Heikkinen, Miss. Laina
## [4] Futrelle, Mrs. Jacques Heath (Lily May Peel)
## [5] Allen, Mr. William Henry
## [6] Moran, Mr. James
## 891 Levels: Abbing, Mr. Anthony ... Zimmerman, Mr. Leo
# sort(data$Name) # I sorted alphabetically so I could understand better. You can se
# Now lets make some new variables
data=separate(data,col=Name,into=c('Surname',"temp"),sep =c(', '),remove=F)
data=separate(data,col=temp,into="Title",sep ='. ',remove=TRUE)## Warning: Too many values at 891 locations: 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
## 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...
# After a while, I compared the results with others and i had one mistake. I did not take into account if the title is more than one word. So, I lost 'the Countess' and got just 'th'. So watch out!
#data$Title <- gsub('(.*, )|(\\..*)', '', data$Name)
#data$Surame=sapply(as.character(data$Name),function(x) unlist(strsplit(x, split = '[,.]'))[1])
summary(factor(data$Title))## Capt Col Don Dr Jonkheer Lady Major Master
## 1 2 1 7 1 1 2 40
## Miss Mlle Mme Mr Mrs Ms Rev Sir
## 182 2 1 517 125 1 6 1
## th
## 1
data[,c(5,6)]other_title <- c('Dona', 'Lady', 'th','Capt', 'Col', 'Don',
'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer')
data$Title[data$Title == 'Mlle'] <- 'Miss'
data$Title[data$Title == 'Ms'] <- 'Miss'
data$Title[data$Title == 'Mme'] <- 'Mrs'
data$Title[data$Title %in% other_title] <- 'Other Title'
table(data$Sex,data$Title)##
## Master Miss Mr Mrs Other Title
## female 0 185 0 126 3
## male 40 0 517 0 20
n=data.frame(table(data$Surname))
reg_names=data[rownames(n[n$Freq > 1,]),]
SurnameNum=data.frame(table(sort(n$Freq)))
names(SurnameNum)=c("# of times the Surname appears","# of names")
options(xtable.comment = FALSE)
xtable(SurnameNum)sort(reg_names[,5]) # these are the names that appear more than one time in the dataset## [1] "Abelson" "Allison"
## [3] "Anderson" "Andersson"
## [5] "Andersson" "Andrews"
## [7] "Arnold-Franchi" "Asplund"
## [9] "Asplund" "Asplund"
## [11] "Backstrom" "Baclini"
## [13] "Baclini" "Barber"
## [15] "Baxter" "Bengtsson"
## [17] "Berglund" "Bissette"
## [19] "Bjornstrom-Steffansson" "Bonnell"
## [21] "Bowerman" "Braund"
## [23] "Butt" "Calic"
## [25] "Carbines" "Celotti"
## [27] "Chronopoulos" "Cor"
## [29] "Coutts" "Crease"
## [31] "Cumings" "Cunningham"
## [33] "Denkoff" "Drazenoic"
## [35] "Duff Gordon" "Elias"
## [37] "Elsbury" "Flynn"
## [39] "Ford" "Fortune"
## [41] "Fortune" "Fortune"
## [43] "Glynn" "Goldsmith"
## [45] "Goodwin" "Graham"
## [47] "Green" "Greenfield"
## [49] "Gustafsson" "Hakkarainen"
## [51] "Hamalainen" "Hanna"
## [53] "Hart" "Hays"
## [55] "Heikkinen" "Henry"
## [57] "Hickman" "Hold"
## [59] "Hoyt" "Hoyt"
## [61] "Hunt" "Isham"
## [63] "Jacobsohn" "Jensen"
## [65] "Johanson" "Johansson"
## [67] "Johnson" "Jussila"
## [69] "Kassem" "Keane"
## [71] "Kelly" "Kenyon"
## [73] "Kimball" "Lang"
## [75] "Larsson" "Lewy"
## [77] "Leyson" "Lindell"
## [79] "Louch" "Lundahl"
## [81] "Maenpaa" "Masselmani"
## [83] "Matthews" "McGowan"
## [85] "Meek" "Mernagh"
## [87] "Minahan" "Mionoff"
## [89] "Nasser" "Navratil"
## [91] "Newell" "Nicola-Yarred"
## [93] "Nysten" "O'Driscoll"
## [95] "O'Leary" "Palsson"
## [97] "Panula" "Parrish"
## [99] "Partner" "Patchett"
## [101] "Paulner" "Peter"
## [103] "Petranec" "Plotcharsky"
## [105] "Quick" "Rice"
## [107] "Rice" "Robbins"
## [109] "Rogers" "Rosblom"
## [111] "Sandstrom" "Saundercock"
## [113] "Scanlan" "Silvey"
## [115] "Simonius-Blumer" "Sirayanian"
## [117] "Slabenoff" "Smart"
## [119] "Smiljanic" "Smith"
## [121] "Sobey" "Spencer"
## [123] "Staneff" "Strom"
## [125] "Sunderland" "Sutton"
## [127] "Taussig" "Toomey"
## [129] "Trout" "Turcin"
## [131] "van Billiard" "Webber"
## [133] "Wiklund"
set.seed(123)
m=lm(Age~.,data=data[,-c(1,4,5,11,9,10,13)]) # regression
t=predict(m,data[is.na(data$Age),-c(1,4,5,11,9,10,13,8)]) # predict na's from regression model
# Remove any negative value and replace it with 0.5
for(i in 1:length(t)){
if(t[i]<0) {
t[i]= 0.5
}
}
# Lets test now
test=data
test[names(t),8]=t
par(mfrow=c(1,2))
hist(data$Age,freq=F,ylim = c(0,0.04),main="Histogram of Age without Na's")
hist(test$Age,freq=F,ylim = c(0,0.04),main="Histogram of Age after imputation")# Since it seems ok
data[names(t),8]=t
dim(data)## [1] 891 14
summary(is.na(data$Age)) ## DONE!## Mode FALSE
## logical 891
At this point I got a little bored so I ll just remove those two observations
data=data[!is.na(data$Embarked),]
dim(data)## [1] 889 14
zeroFare=data[data$Fare==0,]
#ggplot(data,
# aes(x=log10(Fare)))+
# geom_density(fill="blue",
# alpha=0.3)+
# theme_excel()
options(xtable.comment = FALSE)
xtable(summary(zeroFare))g1= ggplot(zeroFare,
aes(x=factor(Survived),fill=factor(Survived)))+
geom_bar()+
scale_x_discrete(breaks=c(0,1),
labels=c("Not Survived","Survived"),
name=" ")+
scale_fill_brewer(palette="Set1",breaks=c(0,1),
labels=c("Not Survived","Survived"),
name=" ")+
theme_tufte()+
theme(legend.position = c(0.9,0.9),
plot.title = element_text(hjust=0.5))+
ggtitle("Zero Fare vs Survival")
g2= ggplot(zeroFare,
aes(x=factor(Pclass),fill=factor(Pclass)))+
geom_bar()+
scale_fill_tableau(name="Class",
breaks=c(1,2,3),
labels=c("1st","2nd","3rd"))+
scale_x_discrete(name="",
breaks=c(1,2,3),
labels=c("1st","2nd","3rd"))+
theme_tufte()+
theme(legend.position = c(0.9,0.9),
plot.title = element_text(hjust=0.5))+
ggtitle("Zero Fare vs Pclass")
multiplot(g1,g2,cols = 2) - Only one of them Survived. I consider these zero values as missing and I ll handle them as Na’s.
ggplot(data,
aes(x=factor(Survived)))+
geom_bar(
aes(y=(..count../sum(..count..)),
fill=factor(Survived)))+
scale_x_discrete(breaks=c(0,1),
labels=c("Not Survived","Survived"),name=" ")+
scale_fill_colorblind(breaks=c(0,1),
labels=c("Not Survived","Survived"),name="Survived")+
scale_y_continuous(breaks = c(0,0.2,0.4,0.6),
labels=c("0%","20%","40%","60%"))+
theme_classic()+
theme(plot.title = element_text(hjust=0.5),
legend.position = c(0.9,0.9))+
ggtitle('Survived or not? Percentage of the train dataset')+
ylab(" ")ggplot(data,
aes(Sex)
)+
geom_bar(
aes(y=(..count..)/sum(..count..)), # returns the percentage instead of the count, for just count = ..count..
fill=c('red','blue') # choose your colours
)+
ylab('')+
ggtitle("Gender of Titanic's passengers")+
theme_bw()+
theme(plot.title = element_text(hjust = 0.5)) # centers the main titleggplot(data,
aes(y=Survived,
x=Sex,
fill=as.factor(Sex)
)
)+
geom_bar(stat = 'identity'
)+
scale_fill_brewer(palette='Dark2', # RcolorBrewer package
name='Sex' # title for the legend
)+
theme_gdocs(base_size = 12 # font size for the gdocs theme from ggthemes package
)+
theme(legend.position = c(0.9, .85), # choose the exact position for the legend in your graph
plot.title = element_text(hjust = 0.5) # centers the main title of the plot
)+
ggtitle('How many male and female people survived')ggplot(na.omit(data[,c(2,8)]),
aes(x=factor(Survived),
y=Age,
fill=factor(Survived))
)+
geom_boxplot(outlier.colour = "red", # how to show the outliers in the plot
outlier.shape = 3, #outlier shape
outlier.size = 2 # outlier size
)+
stat_summary(fun.y = mean, # visualize the mean in the boxplot to get an idea about the data distribution
geom = "point", # type of the mean icon
shape = 6, # shape of the mean icon
size = 2 # size
)+
theme_wsj(base_size = 10 # size of the font in wall street journal theme
)+
ggtitle("Boxplot")+
xlab('Survived')+
theme(legend.position = 'bottom', # theme() options overrides the wsj theme options
plot.title = element_text(hjust = 0.5) # centers the title
)+
scale_fill_brewer(palette = 'Set1',
name="Survived",
breaks=c(0,1),
labels=c("No", "Yes")
)+
scale_x_discrete(breaks=c(0,1),
labels=c("No", "Yes")
)ggplot(data,
aes(x=Pclass,
y=Survived,
fill=factor(Pclass))
)+
geom_bar(stat="identity")+
theme_solarized()+ # nice theme from ggthemes
scale_fill_brewer(palette="Set2",
name="Ticket Class",
breaks=c(1,2,3),
labels=c("1st","2nd","3rd")
)+
theme(legend.position = 'bottom',
axis.text.x = element_blank(), # No text on x axis
plot.title = element_text(hjust = 0.5)
)+
xlab("")+
ggtitle('How many survived from each ticket class')g1=ggplot(data,
aes(x=factor(Survived),y=Fare)
)+
geom_boxplot()+
theme_minimal()+
theme(axis.title.x = element_blank()
)
## There are extreme values that destroy the plot..Let's remove:
g2=ggplot(data,
aes(x=factor(Survived),y=Fare)
)+
geom_boxplot()+
scale_y_continuous(limits = c(0, 300)
)+
theme_excel()+
theme(axis.title.x = element_blank()
)+
ylab('')
## The plot shows 1 extreme value over 300 but three values are excluded.
tail(sort(data$Fare))## [1] 263.0000 263.0000 263.0000 512.3292 512.3292 512.3292
## and thats correct because three tickets had the biggest and the same fare.
g3=ggplot(data,
aes(x=factor(Survived),y=Fare)
)+
geom_boxplot()+
scale_y_continuous(limits = c(0, 120))+
theme_dark()+
theme(axis.title.x = element_blank()
)+
ylab('')
multiplot(g1,g2,g3,cols=3)## Warning: Removed 3 rows containing non-finite values (stat_boxplot).
## Warning: Removed 38 rows containing non-finite values (stat_boxplot).
It seems that the survivers had paid significant higher fare by mean, even though we have many extreme values and outliers. It should be pointed out that the three passengers with the much higher fare all survived
ggplot(data,
aes(x=factor(Survived),
y=Fare,
fill=factor(Survived)))+
geom_boxplot()+
facet_grid(~Pclass,
labeller=labeller(Pclass = c('1'="1st Class",'2'="2nd Class",'3'="3rd Class")) # change the facet's labels
)+
theme_solarized_2()+
scale_y_continuous(limits = c(0,160) # manipulate the y-axis limits to omit some extreme values
)+
xlab("Survived")+
scale_fill_brewer(palette = 'Set1',guide=FALSE # guide=F to remove legend
)+
scale_x_discrete(breaks=c(0,1),labels=c("No","Yes")) # change the names of x axis. You have to define the breaks and then give labels## Warning: Removed 22 rows containing non-finite values (stat_boxplot).
This plot gives a slightly different view. In the 2nd and 3rd class there seems to be no significant difference. In the 1st class thought it seems that the people who paid higher fare survived by mean.
ggplot(data[!is.na(data$Embarked),],aes(x=Embarked,
y=(..count..)/sum(..count..),fill=Embarked))+
geom_bar()+
facet_grid(~Survived,
labeller = labeller(Survived=c('0'="Not Survived",'1'="Survived")))+
scale_fill_brewer(palette = "Spectral",guide=F)+
theme_dark()+ylab('')+xlab("")+
scale_x_discrete(breaks=c("C","Q","S"),
labels=c("Cherbourg","Queenstown","Southampton"))ggplot(data[!is.na(data$Embarked),],aes(x=Embarked,
y=(..count..)/sum(..count..),fill=Embarked))+
geom_bar()+
scale_fill_brewer(palette = "Spectral")+
theme_dark()+ylab('')+xlab("")+
scale_x_discrete(breaks=c("C","Q","S"),
labels=c("Cherbourg","Queenstown","Southampton"))ggplot(data[!is.na(data$Embarked),],aes(x=factor(Embarked),
y=Fare,fill=Embarked))+
theme_dark()+
geom_boxplot()+scale_y_continuous(limits = c(0,175))## Warning: Removed 20 rows containing non-finite values (stat_boxplot).
test<-data[,-c(1,4,5,9,10,11)] # I exclude all the none numerical variables, not the Survived though
test<-na.omit(test) # no missing values for the andrews plot..
andrews(test, # data
clr=1, # the variable I want to use to color the strings
type =1, ymax=3,step=80,main="Andrews plot, best to use with multi-dimensional datasets")Andrews plot is useful for multidimensional datasets. We can see if the groups of the Survived variable are seperatable with the (numerical) variables of this dataset.
data$FamSize=data$SibSp+data$Parch+1
ggplot(data,
aes(x=factor(FamSize),
fill=factor(Survived))
)+
geom_bar(stat='count',position='dodge')+
scale_fill_brewer(palette="Pastel1",name="Survival",
breaks=c(0,1),labels=c("No","Yes"))+
theme_dark()+xlab("Family Size")+
ggtitle("Survival by Family Size")+
scale_x_discrete(breaks=c(1:11),labels=c(1:11))+
theme( plot.title = element_text(hjust = 0.5),legend.position = c(0.9,0.8),legend.background = element_rect(fill = 'darkgrey'))We could much more to learn about titanic and its passengers but lets built a simple model.
mdata=select(data,c(Survived,Pclass,Sex,Title,Age,Fare,Embarked,FamSize,SibSp,Parch)) # select variables
mdata$Survived=factor(mdata$Survived,
levels = 0:1,
labels = c("no", "yes"))
test$Pclass=as.numeric(test$Pclass)
accuracy=NULL
for(i in 1:1000){
te=sample(1:889,889,replace = TRUE)
train=mdata[te,]
test=mdata[-te,]
model=glm(Survived~.,train,family = 'binomial')
p=round(predict(model,test[,-1],type="response"))
accuracy[i]=sum(diag(table(p,test[,1])))/sum(table(p,test[,1]))
}
mean(accuracy)## [1] 0.8248693
sd(accuracy)## [1] 0.01734939