# Load the Titanic training set.
# Blank fields are mapped to NA at read time via `na.strings`, so factor
# columns are built without a phantom "" level (overwriting "" with NA after
# the fact blanks the values but leaves the unused level behind).
mydata <- read.csv(
  "D:/Titanic/train.csv",
  stringsAsFactors = TRUE,
  na.strings = c("", "NA")
)
Types and Levels, PassengerID & Age: Passenger ID is qualitative, measured at the NOMINAL (name only) level. Age is quantitative, measured at the RATIO level.
# Report the storage type R assigned to the identifier and age columns.
# The object names V1 / V2 are kept as-is: rbind() uses them as row labels.
V1 <- noquote(c("Passenger ID:", typeof(mydata[["PassengerId"]])))
V2 <- noquote(c("Age: ", typeof(mydata[["Age"]])))
myprint(rbind(V1, V2), "Types from Data Frame")
| V1 | Passenger ID: | integer |
| V2 | Age: | double |
Which variable has the most missing values?
# Count missing values per column. colSums(is.na(.)) operates on the data
# frame directly, whereas apply() would first coerce every column to a
# character matrix.
na_counts <- colSums(is.na(mydata))
# Named index of the column with the most NAs (which.max keeps the first on ties).
mymax <- which.max(na_counts)
# unname() stops the count's column name from being printed as a vector label.
print(noquote(c(names(mymax), ": ", unname(na_counts[mymax]))))
## [1] Cabin : 687
# Plot the missing-value pattern across all columns of mydata.
# NOTE(review): missmap is presumably Amelia::missmap; confirm against the
# library() calls, which are not visible in this section.
missmap(mydata)
Impute missing observations for Age, SibSp, and Parch with the median.
# Median-impute missing values in Age, SibSp and Parch.
# Each column is masked by its OWN missingness. Parch was previously masked
# with is.na(mydata$Age), which is all FALSE once Age has been imputed, so
# Parch was never filled in.
mydata$Age[is.na(mydata$Age)] <- median(mydata$Age, na.rm = TRUE)
mydata$SibSp[is.na(mydata$SibSp)] <- median(mydata$SibSp, na.rm = TRUE)
mydata$Parch[is.na(mydata$Parch)] <- median(mydata$Parch, na.rm = TRUE)
Descriptive statistics.
# Descriptive statistics for the three imputed columns, rounded to three
# decimals and rendered through the project's myprint() helper.
mydata[c("Age", "SibSp", "Parch")] %>%
  describe() %>%
  round(3) %>%
  myprint("Descriptives")
| | vars | n | mean | sd | median | trimmed | mad | min | max | range | skew | kurtosis | se |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Age | 1 | 891 | 29.362 | 13.020 | 28 | 28.825 | 8.896 | 0.42 | 80 | 79.58 | 0.509 | 0.973 | 0.436 |
| SibSp | 2 | 891 | 0.523 | 1.103 | 0 | 0.272 | 0.000 | 0.00 | 8 | 8.00 | 3.683 | 17.727 | 0.037 |
| Parch | 3 | 891 | 0.382 | 0.806 | 0 | 0.182 | 0.000 | 0.00 | 6 | 6.00 | 2.740 | 9.688 | 0.027 |
Provide a cross-tabulation of Survived and Sex.
# Survived x Sex contingency table with row/column totals, rendered as a
# styled kable.
table(mydata[["Survived"]], mydata[["Sex"]]) %>%
  addmargins() %>%
  kbl(caption = "F Surv > M ") %>%
  kable_classic(html_font = "Cambria")
| | female | male | Sum |
|---|---|---|---|
| 0 | 81 | 468 | 549 |
| 1 | 233 | 109 | 342 |
| Sum | 314 | 577 | 891 |
Provide notched boxplots for Survived and Age. What do you notice?
# Notched boxplots of Age split by survival outcome. The axis labels are set
# explicitly, so the formula + data form draws the same plot.
boxplot(
  Age ~ Survived,
  data = mydata,
  notch = TRUE,
  col = c("Red", "Blue"),
  main = "Little Discernible Effect",
  xlab = "Survived",
  ylab = "Age"
)