#ESSOUNAINI SOUFIANE
Summary:
I did some exploratory data analysis on the Titanic data and tried to predict the survivors of the ship using a neural network.
# Load the sample submission file and check the storage type of its
# Survived column.
gender <- read.csv(file = "gender_submission.csv")
class(gender[["Survived"]])
## [1] "integer"
# Load the training set and keep a numeric copy of the outcome in a
# lower-case `survived` column.
train <- read.csv("train.csv")
train$survived <- as.numeric(train$Survived)
# Share of survivors: the mean of the 0/1 outcome column. Using mean()
# avoids hard-coding the number of rows (previously a literal 891).
mean(train$Survived)
## [1] 0.3838384
Only 38.38% of the passengers in the training set survived the sinking of the Titanic. Quite tragic!
# Load the test set, then preview the first rows of the training set.
test <- read.csv(file = "test.csv")
head(train, n = 6L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked survived
## 1 A/5 21171 7.2500 S 0
## 2 PC 17599 71.2833 C85 C 1
## 3 STON/O2. 3101282 7.9250 S 1
## 4 113803 53.1000 C123 S 1
## 5 373450 8.0500 S 0
## 6 330877 8.4583 Q 0
# Preview the first rows of the test set.
head(test, n = 6L)
## PassengerId Pclass Name Sex Age
## 1 892 3 Kelly, Mr. James male 34.5
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0
## 3 894 2 Myles, Mr. Thomas Francis male 62.0
## 4 895 3 Wirz, Mr. Albert male 27.0
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0
## 6 897 3 Svensson, Mr. Johan Cervin male 14.0
## SibSp Parch Ticket Fare Cabin Embarked
## 1 0 0 330911 7.8292 Q
## 2 1 0 363272 7.0000 S
## 3 0 0 240276 9.6875 Q
## 4 0 0 315154 8.6625 S
## 5 1 1 3101298 12.2875 S
## 6 0 0 7538 9.2250 S
# Drop the two outcome columns (`Survived` and its numeric copy `survived`)
# by name so that the training rows have the same columns as the test set,
# then stack both sets.
predictor_cols <- setdiff(names(train), c("Survived", "survived"))
train1 <- train[predictor_cols]
datas <- rbind(train1, test)
head(datas)
## PassengerId Pclass Name Sex
## 1 1 3 Braund, Mr. Owen Harris male
## 2 2 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
## 3 3 3 Heikkinen, Miss. Laina female
## 4 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
## 5 5 3 Allen, Mr. William Henry male
## 6 6 3 Moran, Mr. James male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 22 1 0 A/5 21171 7.2500 S
## 2 38 1 0 PC 17599 71.2833 C85 C
## 3 26 0 0 STON/O2. 3101282 7.9250 S
## 4 35 1 0 113803 53.1000 C123 S
## 5 35 0 0 373450 8.0500 S
## 6 NA 0 0 330877 8.4583 Q
You can also embed plots, for example:
# Number of survivors per sex in the training data.
# `na.rm = TRUE` must be an argument of sum(); passed to summarise() it only
# created an extra constant column named `na.rm`.
data3 <- train %>%
  group_by(Sex) %>%
  summarise(somme = sum(Survived, na.rm = TRUE)) %>%
  select(Sex, somme)
## `summarise()` ungrouping output (override with `.groups` argument)
# Bar chart of survivor counts by sex.
# labs() takes axis titles through `x` and `y`; `xlab`/`ylab` are not axis
# arguments there, so the intended titles were never applied.
g <- ggplot(data3, aes(x = Sex, y = somme))
g +
  geom_bar(stat = "identity", aes(fill = Sex)) +
  theme_light() +
  labs(
    title = "Survivors of the titanic crash by gender",
    x = "Sex",
    y = "Total"
  )
That’s totally understandable. They rescued women and babies first, as we have all seen in the movie.
# Number of survivors per passenger class in the training data.
# `na.rm = TRUE` must be an argument of sum(); passed to summarise() it only
# created an extra constant column named `na.rm`.
data4 <- train %>%
  group_by(Pclass) %>%
  summarise(Survivors = sum(Survived, na.rm = TRUE)) %>%
  select(Pclass, Survivors)
## `summarise()` ungrouping output (override with `.groups` argument)
# Bar chart of survivor counts by passenger class.
# Pclass is stored as an integer, so it is wrapped in factor() to get one
# discrete bar colour and axis tick per class instead of a continuous scale.
# labs() takes axis titles through `x` and `y` (not `xlab`/`ylab`).
g <- ggplot(data4, aes(x = factor(Pclass), y = Survivors))
g +
  geom_bar(stat = "identity", aes(fill = factor(Pclass))) +
  theme_light() +
  labs(
    title = "Survivors of the titanic crash by economic class",
    x = "Class",
    y = "Survivors",
    fill = "Class"
  )
As we see in the plot, people who were in the first class were more likely to survive the incident.
# NOTE(review): par(mfrow = ...) configures base-graphics layouts; the
# ggplot2 charts drawn from these data are presumably unaffected by it.
# Kept in case later base plots rely on this setting -- confirm.
par(mfrow = c(1, 3))
# Number of survivors per passenger class and sex.
# `na.rm = TRUE` must be an argument of sum(); passed to summarise() it only
# created an extra constant column named `na.rm`.
data5 <- train %>%
  group_by(Pclass, Sex) %>%
  summarise(Survivors = sum(Survived, na.rm = TRUE)) %>%
  select(Sex, Pclass, Survivors)
## `summarise()` regrouping output by 'Pclass' (override with `.groups` argument)
# One subset per passenger class. Rows are selected by class value rather
# than by row position, so the split stays correct even if a class/sex
# combination is absent or the row order changes.
data51 <- data5[data5$Pclass == 1, ]
data52 <- data5[data5$Pclass == 2, ]
data53 <- data5[data5$Pclass == 3, ]
head(data5)
## # A tibble: 6 x 3
## # Groups: Pclass [3]
## Sex Pclass Survivors
## <fct> <int> <int>
## 1 female 1 91
## 2 male 1 45
## 3 female 2 70
## 4 male 2 17
## 5 female 3 72
## 6 male 3 47
# Survivor counts by sex, one bar chart per passenger class.
g <- ggplot(data = data51, mapping = aes(x = Sex, y = Survivors))
g +
  geom_bar(stat = "identity") +
  theme_light() +
  labs(title = "Survivors by gender of the 1st economic class")

g1 <- ggplot(data = data52, mapping = aes(x = Sex, y = Survivors))
g1 +
  geom_bar(stat = "identity") +
  theme_light() +
  labs(title = "Survivors by gender of the 2nd economic class")

g2 <- ggplot(data = data53, mapping = aes(x = Sex, y = Survivors))
g2 +
  geom_bar(stat = "identity") +
  theme_light() +
  labs(title = "Survivors by gender of the 3rd economic class")
As we see in the previous plots, both men and women who were in the 1st class had a better chance of being rescued than the others.
# Look for passengers whose surname is Dawson.
# Names in the data are formatted "Surname, Title. Given names" (with a space
# after the comma), so the previous exact comparison against
# "Dawson,Mr. Jack" could never match; a surname pattern is used instead.
Looking_for_jack <- subset(train, grepl("^Dawson,", Name))
head(Looking_for_jack)
## [1] PassengerId Survived Pclass Name Sex Age
## [7] SibSp Parch Ticket Fare Cabin Embarked
## [13] survived
## <0 rows> (or 0-length row.names)
Just for fun: as you can see, we couldn’t find any information about Jack Dawson (the character played by Leonardo DiCaprio in the movie “Titanic”).
# Data Partition
set.seed(222)
# Encode Sex as a numeric indicator: female = 1, male = 0.
# Relabelling factor levels and calling as.numeric() afterwards returns the
# internal level codes (1/2) instead of the labels, and produces NA when
# read.csv() delivers Sex as a character column. Comparing against the label
# works in both cases.
train$Sex <- as.numeric(train$Sex == "female")
test$Sex <- as.numeric(test$Sex == "female")
head(train)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris 0 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) 1 38 1 0
## 3 Heikkinen, Miss. Laina 1 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 35 1 0
## 5 Allen, Mr. William Henry 0 35 0 0
## 6 Moran, Mr. James 0 NA 0 0
## Ticket Fare Cabin Embarked survived
## 1 A/5 21171 7.2500 S 0
## 2 PC 17599 71.2833 C85 C 1
## 3 STON/O2. 3101282 7.9250 S 1
## 4 113803 53.1000 C123 S 1
## 5 373450 8.0500 S 0
## 6 330877 8.4583 Q 0
# Preview the test set after the Sex column has been recoded.
head(test, n = 6L)
## PassengerId Pclass Name Sex Age
## 1 892 3 Kelly, Mr. James 0 34.5
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) 1 47.0
## 3 894 2 Myles, Mr. Thomas Francis 0 62.0
## 4 895 3 Wirz, Mr. Albert 0 27.0
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 22.0
## 6 897 3 Svensson, Mr. Johan Cervin 0 14.0
## SibSp Parch Ticket Fare Cabin Embarked
## 1 0 0 330911 7.8292 Q
## 2 1 0 363272 7.0000 S
## 3 0 0 240276 9.6875 Q
## 4 0 0 315154 8.6625 S
## 5 1 1 3101298 12.2875 S
## 6 0 0 7538 9.2250 S
# Convert Sex and Age to numeric.
# as.numeric() applied directly to a factor returns its internal level codes
# (1, 2), not the labels; going through as.character() keeps the labelled
# values. The conversion is a no-op for columns that are already numeric.
train$Sex <- as.numeric(as.character(train$Sex))
train$Age <- as.numeric(train$Age)
test$Sex <- as.numeric(as.character(test$Sex))
test$Age <- as.numeric(test$Age)
train$Age
## [1] 22.00 38.00 26.00 35.00 35.00 NA 54.00 2.00 27.00 14.00 4.00 58.00
## [13] 20.00 39.00 14.00 55.00 2.00 NA 31.00 NA 35.00 34.00 15.00 28.00
## [25] 8.00 38.00 NA 19.00 NA NA 40.00 NA NA 66.00 28.00 42.00
## [37] NA 21.00 18.00 14.00 40.00 27.00 NA 3.00 19.00 NA NA NA
## [49] NA 18.00 7.00 21.00 49.00 29.00 65.00 NA 21.00 28.50 5.00 11.00
## [61] 22.00 38.00 45.00 4.00 NA NA 29.00 19.00 17.00 26.00 32.00 16.00
## [73] 21.00 26.00 32.00 25.00 NA NA 0.83 30.00 22.00 29.00 NA 28.00
## [85] 17.00 33.00 16.00 NA 23.00 24.00 29.00 20.00 46.00 26.00 59.00 NA
## [97] 71.00 23.00 34.00 34.00 28.00 NA 21.00 33.00 37.00 28.00 21.00 NA
## [109] 38.00 NA 47.00 14.50 22.00 20.00 17.00 21.00 70.50 29.00 24.00 2.00
## [121] 21.00 NA 32.50 32.50 54.00 12.00 NA 24.00 NA 45.00 33.00 20.00
## [133] 47.00 29.00 25.00 23.00 19.00 37.00 16.00 24.00 NA 22.00 24.00 19.00
## [145] 18.00 19.00 27.00 9.00 36.50 42.00 51.00 22.00 55.50 40.50 NA 51.00
## [157] 16.00 30.00 NA NA 44.00 40.00 26.00 17.00 1.00 9.00 NA 45.00
## [169] NA 28.00 61.00 4.00 1.00 21.00 56.00 18.00 NA 50.00 30.00 36.00
## [181] NA NA 9.00 1.00 4.00 NA NA 45.00 40.00 36.00 32.00 19.00
## [193] 19.00 3.00 44.00 58.00 NA 42.00 NA 24.00 28.00 NA 34.00 45.50
## [205] 18.00 2.00 32.00 26.00 16.00 40.00 24.00 35.00 22.00 30.00 NA 31.00
## [217] 27.00 42.00 32.00 30.00 16.00 27.00 51.00 NA 38.00 22.00 19.00 20.50
## [229] 18.00 NA 35.00 29.00 59.00 5.00 24.00 NA 44.00 8.00 19.00 33.00
## [241] NA NA 29.00 22.00 30.00 44.00 25.00 24.00 37.00 54.00 NA 29.00
## [253] 62.00 30.00 41.00 29.00 NA 30.00 35.00 50.00 NA 3.00 52.00 40.00
## [265] NA 36.00 16.00 25.00 58.00 35.00 NA 25.00 41.00 37.00 NA 63.00
## [277] 45.00 NA 7.00 35.00 65.00 28.00 16.00 19.00 NA 33.00 30.00 22.00
## [289] 42.00 22.00 26.00 19.00 36.00 24.00 24.00 NA 23.50 2.00 NA 50.00
## [301] NA NA 19.00 NA NA 0.92 NA 17.00 30.00 30.00 24.00 18.00
## [313] 26.00 28.00 43.00 26.00 24.00 54.00 31.00 40.00 22.00 27.00 30.00 22.00
## [325] NA 36.00 61.00 36.00 31.00 16.00 NA 45.50 38.00 16.00 NA NA
## [337] 29.00 41.00 45.00 45.00 2.00 24.00 28.00 25.00 36.00 24.00 40.00 NA
## [349] 3.00 42.00 23.00 NA 15.00 25.00 NA 28.00 22.00 38.00 NA NA
## [361] 40.00 29.00 45.00 35.00 NA 30.00 60.00 NA NA 24.00 25.00 18.00
## [373] 19.00 22.00 3.00 NA 22.00 27.00 20.00 19.00 42.00 1.00 32.00 35.00
## [385] NA 18.00 1.00 36.00 NA 17.00 36.00 21.00 28.00 23.00 24.00 22.00
## [397] 31.00 46.00 23.00 28.00 39.00 26.00 21.00 28.00 20.00 34.00 51.00 3.00
## [409] 21.00 NA NA NA 33.00 NA 44.00 NA 34.00 18.00 30.00 10.00
## [421] NA 21.00 29.00 28.00 18.00 NA 28.00 19.00 NA 32.00 28.00 NA
## [433] 42.00 17.00 50.00 14.00 21.00 24.00 64.00 31.00 45.00 20.00 25.00 28.00
## [445] NA 4.00 13.00 34.00 5.00 52.00 36.00 NA 30.00 49.00 NA 29.00
## [457] 65.00 NA 50.00 NA 48.00 34.00 47.00 48.00 NA 38.00 NA 56.00
## [469] NA 0.75 NA 38.00 33.00 23.00 22.00 NA 34.00 29.00 22.00 2.00
## [481] 9.00 NA 50.00 63.00 25.00 NA 35.00 58.00 30.00 9.00 NA 21.00
## [493] 55.00 71.00 21.00 NA 54.00 NA 25.00 24.00 17.00 21.00 NA 37.00
## [505] 16.00 18.00 33.00 NA 28.00 26.00 29.00 NA 36.00 54.00 24.00 47.00
## [517] 34.00 NA 36.00 32.00 30.00 22.00 NA 44.00 NA 40.50 50.00 NA
## [529] 39.00 23.00 2.00 NA 17.00 NA 30.00 7.00 45.00 30.00 NA 22.00
## [541] 36.00 9.00 11.00 32.00 50.00 64.00 19.00 NA 33.00 8.00 17.00 27.00
## [553] NA 22.00 22.00 62.00 48.00 NA 39.00 36.00 NA 40.00 28.00 NA
## [565] NA 24.00 19.00 29.00 NA 32.00 62.00 53.00 36.00 NA 16.00 19.00
## [577] 34.00 39.00 NA 32.00 25.00 39.00 54.00 36.00 NA 18.00 47.00 60.00
## [589] 22.00 NA 35.00 52.00 47.00 NA 37.00 36.00 NA 49.00 NA 49.00
## [601] 24.00 NA NA 44.00 35.00 36.00 30.00 27.00 22.00 40.00 39.00 NA
## [613] NA NA 35.00 24.00 34.00 26.00 4.00 26.00 27.00 42.00 20.00 21.00
## [625] 21.00 61.00 57.00 21.00 26.00 NA 80.00 51.00 32.00 NA 9.00 28.00
## [637] 32.00 31.00 41.00 NA 20.00 24.00 2.00 NA 0.75 48.00 19.00 56.00
## [649] NA 23.00 NA 18.00 21.00 NA 18.00 24.00 NA 32.00 23.00 58.00
## [661] 50.00 40.00 47.00 36.00 20.00 32.00 25.00 NA 43.00 NA 40.00 31.00
## [673] 70.00 31.00 NA 18.00 24.50 18.00 43.00 36.00 NA 27.00 20.00 14.00
## [685] 60.00 25.00 14.00 19.00 18.00 15.00 31.00 4.00 NA 25.00 60.00 52.00
## [697] 44.00 NA 49.00 42.00 18.00 35.00 18.00 25.00 26.00 39.00 45.00 42.00
## [709] 22.00 NA 24.00 NA 48.00 29.00 52.00 19.00 38.00 27.00 NA 33.00
## [721] 6.00 17.00 34.00 50.00 27.00 20.00 30.00 NA 25.00 25.00 29.00 11.00
## [733] NA 23.00 23.00 28.50 48.00 35.00 NA NA NA 36.00 21.00 24.00
## [745] 31.00 70.00 16.00 30.00 19.00 31.00 4.00 6.00 33.00 23.00 48.00 0.67
## [757] 28.00 18.00 34.00 33.00 NA 41.00 20.00 36.00 16.00 51.00 NA 30.50
## [769] NA 32.00 24.00 48.00 57.00 NA 54.00 18.00 NA 5.00 NA 43.00
## [781] 13.00 17.00 29.00 NA 25.00 25.00 18.00 8.00 1.00 46.00 NA 16.00
## [793] NA NA 25.00 39.00 49.00 31.00 30.00 30.00 34.00 31.00 11.00 0.42
## [805] 27.00 31.00 39.00 18.00 39.00 33.00 26.00 39.00 35.00 6.00 30.50 NA
## [817] 23.00 31.00 43.00 10.00 52.00 27.00 38.00 27.00 2.00 NA NA 1.00
## [829] NA 62.00 15.00 0.83 NA 23.00 18.00 39.00 21.00 NA 32.00 NA
## [841] 20.00 16.00 30.00 34.50 17.00 42.00 NA 35.00 28.00 NA 4.00 74.00
## [853] 9.00 16.00 44.00 18.00 45.00 51.00 24.00 NA 41.00 21.00 48.00 NA
## [865] 24.00 42.00 27.00 31.00 NA 4.00 26.00 47.00 33.00 47.00 28.00 15.00
## [877] 20.00 19.00 NA 56.00 25.00 33.00 22.00 28.00 25.00 39.00 27.00 19.00
## [889] NA 26.00 32.00
# Neural Networks
library(neuralnet)
## Warning: package 'neuralnet' was built under R version 3.6.3
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
# Fit a neural network with one hidden layer of 5 units that models
# Survived from Pclass and Sex, using the cross-entropy error function and a
# non-linear output unit. The seed is set first, then the fitted network is
# plotted.
set.seed(333)
n <- neuralnet(
  formula = Survived ~ Pclass + Sex,
  data = train,
  hidden = 5,
  err.fct = "ce",
  linear.output = FALSE
)
plot(n)
# Prediction
# Only the model's covariates are passed, in the order used by the formula,
# instead of every remaining column of the data frame.
# compute() is called with its namespace because dplyr exports a function of
# the same name.
output <- neuralnet::compute(n, train[, c("Pclass", "Sex")])
head(output$net.result)
## [,1]
## [1,] 0.1354676
## [2,] 0.9682994
## [3,] 0.5002184
## [4,] 0.9682994
## [5,] 0.1354676
## [6,] 0.1354676
# Preview the training data without the two outcome columns.
head(train[setdiff(names(train), c("Survived", "survived"))])
## PassengerId Pclass Name Sex
## 1 1 3 Braund, Mr. Owen Harris 2
## 2 2 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) 1
## 3 3 3 Heikkinen, Miss. Laina 1
## 4 4 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1
## 5 5 3 Allen, Mr. William Henry 2
## 6 6 3 Moran, Mr. James 2
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 22 1 0 A/5 21171 7.2500 S
## 2 38 1 0 PC 17599 71.2833 C85 C
## 3 26 0 0 STON/O2. 3101282 7.9250 S
## 4 35 1 0 113803 53.1000 C123 S
## 5 35 0 0 373450 8.0500 S
## 6 NA 0 0 330877 8.4583 Q
# Confusion Matrix & Misclassification Error - training data
# Only the model's covariates are passed to compute(), which is called with
# its namespace because dplyr exports a function of the same name.
output <- neuralnet::compute(n, train[, c("Pclass", "Sex")])
p1 <- output$net.result
# Classify as survivor when the predicted probability exceeds 0.5.
pred1 <- ifelse(p1 > 0.5, 1, 0)
# Rows: predicted class; columns: observed outcome.
tab1 <- table(pred1, train$survived)
tab1
##
## pred1 0 1
## 0 468 109
## 1 81 233
# Training misclassification rate: share of rows whose predicted class
# differs from the observed outcome. Unlike 1 - sum(diag(tab1)) / sum(tab1),
# this stays correct when only one class is predicted and the table is
# therefore not square.
mean(pred1 != train$survived)
## [1] 0.2132435
We get an accuracy of about 0.787 (1 - 0.2132), which is not bad at all.
Here are the predictions for the passengers in the test set:
# Predicted classes - testing data
# The test set has no Survived column, so only the distribution of the
# predicted classes can be tabulated here (no confusion matrix).
# Only the model's covariates are passed to compute(), which is called with
# its namespace because dplyr exports a function of the same name.
output <- neuralnet::compute(n, test[, c("Pclass", "Sex")])
p2 <- output$net.result
# Classify as survivor when the predicted probability exceeds 0.5.
pred2 <- ifelse(p2 > 0.5, 1, 0)
tab2 <- table(pred2)
tab2
## pred2
## 0 1
## 266 152
# `tab2` is a one-dimensional table of predicted classes, so
# 1 - sum(diag(tab2)) / sum(tab2) always evaluates to 0 and is not an error
# rate: without observed outcomes for the test set no misclassification
# error can be computed. Report the predicted class proportions instead.
prop.table(tab2)