# Load the Titanic data set. Text columns are read as factors, and both
# empty strings and the literal "NA" are treated as missing values.
df <- read.csv(
  "titanic_full.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "NA")
)
# Display the structure of the data frame: each column's name, type and
# first few values.
str(df)
## 'data.frame': 1309 obs. of 21 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 156 287 531 430 23 826 775 922 613 855 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 929 levels "110152","110413",..: 721 817 915 66 650 374 110 542 478 175 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 186 levels "A10","A11","A14",..: NA 107 NA 71 NA NA 164 NA NA NA ...
## $ Embarked : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
## $ WikiId : int 691 90 865 127 627 785 200 1108 902 520 ...
## $ Name_wiki : Factor w/ 1303 levels "Abbing, Mr. Anthony",..: 183 319 545 450 24 371 786 932 628 867 ...
## $ Age_wiki : num 22 35 26 35 35 22 54 2 26 14 ...
## $ Hometown : Factor w/ 566 levels "Abbeyleix, Laois, Ireland[note 1]",..: 71 359 261 439 55 122 143 60 464 562 ...
## $ Boarded : Factor w/ 4 levels "Belfast","Cherbourg",..: 4 2 4 4 4 3 4 4 4 2 ...
## $ Destination: Factor w/ 291 levels "Aberdeen, South Dakota, US",..: 216 186 185 241 185 185 72 56 249 59 ...
## $ Lifeboat : Factor w/ 24 levels "?","1","10","11",..: NA 14 8 24 NA NA NA NA 9 1 ...
## $ Body : Factor w/ 135 levels "?","??","[66][67]",..: NA 6 6 6 6 6 42 2 2 2 ...
## $ Class : int 3 1 3 1 3 3 1 3 3 2 ...
TABLE FUNCTIONS Table function analysis - 1
# Count the number of female and male passengers.
table(df[["Sex"]])
##
## female male
## 466 843
Table function analysis - 2
# Tabulate passengers by sex and convert the counts to a data frame with
# one row per level (level in `Var1`, count in `Freq`).
df1 <- as.data.frame(table(df$Sex))
# Inspect the structure of the new data frame.
str(df1)
## 'data.frame': 2 obs. of 2 variables:
## $ Var1: Factor w/ 2 levels "female","male": 1 2
## $ Freq: int 466 843
Table function analysis - 3
# Count female and male passengers by port of boarding
# (Belfast, Cherbourg, Queenstown, Southampton).
table(df$Sex, df$Boarded)
##
## Belfast Cherbourg Queenstown Southampton
## female 0 114 58 291
## male 10 145 61 625
Table function analysis - 4
# Count passengers older than 25 (TRUE) versus 25 or younger (FALSE) in each
# of the three classes. Rows with a missing Age or Class are not counted.
table(df$Age > 25, df$Class)
##
## 1 2 3
## FALSE 60 101 280
## TRUE 227 156 219
Table function analysis - 5
# Count the missing values (NA) in every column. is.na() on a data frame
# yields a logical matrix, so colSums() totals the TRUE entries per column
# directly, without looping over the columns via apply().
colSums(is.na(df))
## PassengerId Survived Pclass Name Sex Age
## 0 418 0 0 0 263
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 1 1014 2
## WikiId Name_wiki Age_wiki Hometown Boarded Destination
## 5 5 7 5 5 5
## Lifeboat Body Class
## 807 1137 5
# The counts above are the number of missing values (NA) in each column of the dataset
CROSS TABLE FUNCTIONS
Cross Table function analysis - 1
# Cross-tabulate sex (rows) against passenger class 1, 2 and 3 (columns).
xtabs(~ Sex + Class, data = df)
## Class
## Sex 1 2 3
## female 144 105 214
## male 182 167 492
Cross Table function analysis - 2
# Cross-tabulate survival (0 = did not survive, 1 = survived) against the
# port where each passenger boarded.
xtabs(~ Survived + Boarded, data = df)
## Boarded
## Survived Belfast Cherbourg Queenstown Southampton
## 0 9 73 47 419
## 1 0 93 29 219
Cross Table function analysis - 3
# Store the Sex x Class contingency table so it can be reused for the
# proportion tables that follow.
crosstab <- xtabs(~ Sex + Class, data = df)
# Print the counts in flat-table layout.
ftable(crosstab)
## Class 1 2 3
## Sex
## female 144 105 214
## male 182 167 492
# Row percentages: the share of each sex falling in each class
# (every row sums to 100).
prop.table(crosstab, margin = 1) * 100
## Class
## Sex 1 2 3
## female 31.10151 22.67819 46.22030
## male 21.64090 19.85731 58.50178
# Show the Sex x Class counts again as a flat table, for reference next to
# the column percentages.
ftable(crosstab)
## Class 1 2 3
## Sex
## female 144 105 214
## male 182 167 492
# Column percentages: the share of each class made up by each sex
# (every column sums to 100).
prop.table(crosstab, margin = 2) * 100
## Class
## Sex 1 2 3
## female 44.17178 38.60294 30.31161
## male 55.82822 61.39706 69.68839
Cross Table function analysis - 4
# gmodels provides CrossTable().
library(gmodels)
# Sex x Class table reporting, for every cell, the count, its chi-square
# contribution, and its proportion of the row, column and table totals.
CrossTable(df$Sex, df$Class)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 1304
##
##
## | df$Class
## df$Sex | 1 | 2 | 3 | Row Total |
## -------------|-----------|-----------|-----------|-----------|
## female | 144 | 105 | 214 | 463 |
## | 6.895 | 0.735 | 5.365 | |
## | 0.311 | 0.227 | 0.462 | 0.355 |
## | 0.442 | 0.386 | 0.303 | |
## | 0.110 | 0.081 | 0.164 | |
## -------------|-----------|-----------|-----------|-----------|
## male | 182 | 167 | 492 | 841 |
## | 3.796 | 0.404 | 2.954 | |
## | 0.216 | 0.199 | 0.585 | 0.645 |
## | 0.558 | 0.614 | 0.697 | |
## | 0.140 | 0.128 | 0.377 | |
## -------------|-----------|-----------|-----------|-----------|
## Column Total | 326 | 272 | 706 | 1304 |
## | 0.250 | 0.209 | 0.541 | |
## -------------|-----------|-----------|-----------|-----------|
##
##