This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
# Load the Titanic training set from the working directory, then preview the
# first rows and the type of every column.
r <- utils::read.csv(file = "titanic_train.csv")
utils::head(r)
utils::str(r)
'data.frame': 850 obs. of 15 variables:
$ passenger_id: int 1216 699 1267 449 576 1083 898 560 1079 908 ...
$ pclass : int 3 3 3 2 2 3 3 2 3 3 ...
$ name : chr "Smyth, Miss. Julia" "Cacic, Mr. Luka" "Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)" "Hocking, Mrs. Elizabeth (Eliza Needs)" ...
$ sex : chr "female" "male" "female" "female" ...
$ age : num NA 38 30 54 40 28 19 30 22 21 ...
$ sibsp : int 0 0 1 1 0 0 0 0 0 1 ...
$ parch : int 0 0 1 3 0 0 0 0 0 0 ...
$ ticket : chr "335432" "315089" "345773" "29105" ...
$ fare : num 7.73 8.66 24.15 23 13 ...
$ cabin : chr "" "" "" "" ...
$ embarked : chr "Q" "S" "S" "S" ...
$ boat : chr "13" "" "" "4" ...
$ body : int NA NA NA NA NA 173 NA NA NA NA ...
$ home.dest : chr "" "Croatia" "" "Cornwall / Akron, OH" ...
$ survived : int 1 0 0 1 0 0 0 1 1 0 ...
# Number of missing (NA) values in each column of `r`.
vapply(r, function(column) sum(is.na(column)), numeric(1))
passenger_id pclass name
0 0 0
sex age sibsp
0 174 0
parch ticket fare
0 0 1
cabin embarked boat
0 0 0
body home.dest survived
777 0 0
# Count empty strings in each column. na.rm = TRUE is required because
# comparing NA with "" yields NA, which would otherwise turn the totals for
# columns that contain missing values (age, fare, body) into NA.
colSums(r == "", na.rm = TRUE)
passenger_id pclass name
0 0 0
sex age sibsp
0 0 0
parch ticket fare
0 0 0
cabin embarked boat
659 1 542
body home.dest survived
0 386 0
# Fill blank ports of embarkation with "C".
r$embarked <- replace(r$embarked, r$embarked == "", "C")
# Number of distinct values in each column. vapply() visits the data frame
# one column at a time, so every column is compared in its own type; apply()
# would first coerce the whole data frame to a character matrix.
vapply(r, function(column) length(unique(column)), integer(1))
passenger_id pclass name
850 3 849
sex age sibsp
2 89 7
parch ticket fare
8 660 237
cabin embarked boat
136 3 27
body home.dest survived
74 273 2
# Columns that hold categories rather than quantities.
co <- c("survived", "pclass", "sex", "embarked")
# Convert all of them to factors in one assignment, then re-inspect the
# structure of the data frame.
r[co] <- lapply(r[co], as.factor)
str(r)
'data.frame': 850 obs. of 15 variables:
$ passenger_id: int 1216 699 1267 449 576 1083 898 560 1079 908 ...
$ pclass : Factor w/ 3 levels "1","2","3": 3 3 3 2 2 3 3 2 3 3 ...
$ name : chr "Smyth, Miss. Julia" "Cacic, Mr. Luka" "Van Impe, Mrs. Jean Baptiste (Rosalie Paula Govaert)" "Hocking, Mrs. Elizabeth (Eliza Needs)" ...
$ sex : Factor w/ 2 levels "female","male": 1 2 1 1 2 2 2 1 1 1 ...
$ age : num NA 38 30 54 40 28 19 30 22 21 ...
$ sibsp : int 0 0 1 1 0 0 0 0 0 1 ...
$ parch : int 0 0 1 3 0 0 0 0 0 0 ...
$ ticket : chr "335432" "315089" "345773" "29105" ...
$ fare : num 7.73 8.66 24.15 23 13 ...
$ cabin : chr "" "" "" "" ...
$ embarked : Factor w/ 3 levels "C","Q","S": 2 3 3 3 3 3 3 3 3 3 ...
$ boat : chr "13" "" "" "4" ...
$ body : int NA NA NA NA NA 173 NA NA NA NA ...
$ home.dest : chr "" "Croatia" "" "Cornwall / Akron, OH" ...
$ survived : Factor w/ 2 levels "0","1": 2 1 1 2 1 1 1 2 2 1 ...
library(ggplot2)
package ‘ggplot2’ was built under R version 4.0.5
# Passenger counts by sex, with each bar split by survival.
ggplot(data = r, mapping = aes(x = sex, fill = survived)) +
  geom_bar()
# Share of each survival outcome within every port of embarkation
# (position = "fill" scales each bar to a total height of 1).
ggplot(data = r, mapping = aes(x = embarked, fill = survived)) +
  geom_bar(position = "fill") +
  ylab("Frequency")
# Row-wise percentages: for each port of embarkation, the share of passengers
# in each level of `survived`. prop.table() with margin = 1 divides every row
# by its own total; it replaces a hand-written loop over 1:dim(t)[1], which
# would fail on a table with zero rows.
# NOTE: the variable name `t` coincides with base R's transpose function. It
# is kept so that any later code referring to `t` keeps working.
t <- prop.table(table(r$embarked, r$survived), margin = 1) * 100
print(t)
0 1
C 48.02260 51.97740
Q 65.47619 34.52381
S 67.40238 32.59762
# Share of each survival outcome within every ticket class.
ggplot(r, aes(x = pclass, fill = survived)) +
  geom_bar(position = "fill") +
  ylab("Frequency")
# Passenger counts by value of sibsp, split by survival.
ggplot(data = r, mapping = aes(x = sibsp, fill = survived)) +
  geom_bar()
# Passenger counts by value of parch, split by survival.
ggplot(data = r, mapping = aes(x = parch, fill = survived)) +
  geom_bar()
### 1. Passengers who embarked at ‘C’ had the highest survival rate (52%, compared to 35% for ‘Q’ and 33% for ‘S’).
### 2. It looks like you have a better chance of surviving if you are in a lower-numbered ticket class.
### 3. This shows that families with at least 2 but fewer than 6 members have a more than 50% chance of surviving, in contrast to passengers travelling alone or in families of more than 5 members.