This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(readr)
library(RCurl)
## Loading required package: bitops
# Choosing the Titanic dataset as it has a large number of observations.
# The CSV is hosted on GitHub and is fetched through its raw HTTPS link.
fileURL <- "https://raw.githubusercontent.com/vincentarelbundock/Rdatasets/master/csv/datasets/Titanic.csv"
# stringsAsFactors is spelled out: the summaries below and the later Class1
# recoding treat the text columns as factors, and read.csv() only did that by
# default before R 4.0.0.
titanicDF <- read.csv(text = getURL(fileURL), header = TRUE, sep = ",",
                      stringsAsFactors = TRUE)
# Print the first rows to make sure the data was read correctly
head(titanicDF)
## X Name PClass Age Sex
## 1 1 Allen, Miss Elisabeth Walton 1st 29.00 female
## 2 2 Allison, Miss Helen Loraine 1st 2.00 female
## 3 3 Allison, Mr Hudson Joshua Creighton 1st 30.00 male
## 4 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels) 1st 25.00 female
## 5 5 Allison, Master Hudson Trevor 1st 0.92 male
## 6 6 Anderson, Mr Harry 1st 47.00 male
## Survived SexCode
## 1 1 1
## 2 0 1
## 3 0 0
## 4 0 1
## 5 1 0
## 6 1 0
# Number of rows in the raw data frame
nrow(titanicDF)
## [1] 1313
# Summary of the entire dataset; note that the Age column reports 557 NA's
summary(titanicDF)
## X Name PClass
## Min. : 1 Carlsson, Mr Frans Olof : 2 * : 1
## 1st Qu.: 329 Connolly, Miss Kate : 2 1st:322
## Median : 657 Kelly, Mr James : 2 2nd:279
## Mean : 657 Abbing, Mr Anthony : 1 3rd:711
## 3rd Qu.: 985 Abbott, Master Eugene Joseph: 1
## Max. :1313 Abbott, Mr Rossmore Edward : 1
## (Other) :1304
## Age Sex Survived SexCode
## Min. : 0.17 female:462 Min. :0.0000 Min. :0.0000
## 1st Qu.:21.00 male :851 1st Qu.:0.0000 1st Qu.:0.0000
## Median :28.00 Median :0.0000 Median :0.0000
## Mean :30.40 Mean :0.3427 Mean :0.3519
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :71.00 Max. :1.0000 Max. :1.0000
## NA's :557
# NOTE: the raw data contains NA in many rows, which would get in the way of
# the mean and median calculations below.
# Keep only the rows without any missing cell: a row qualifies when its count
# of NA values across all columns is zero.
titanicCleanDF1 <- titanicDF[rowSums(is.na(titanicDF)) == 0, ]
# Row counts before and after the cleanup
nrow(titanicDF)
## [1] 1313
nrow(titanicCleanDF1)
## [1] 756
# Mean and median of Age for the cleaned data (rows without NA).
# Each statistic is computed once, stored for the comparison table built
# later, and printed by evaluating the variable.
meanAgeOrg <- mean(titanicCleanDF1$Age)
meanAgeOrg
## [1] 30.39799
medianAgeOrg <- median(titanicCleanDF1$Age)
medianAgeOrg
## [1] 28
# Mean and median of the Survived column. Survived appears to be a 0/1 flag,
# so its mean is the fraction of rows equal to 1.
# NOTE: this is measured on complete rows only; summary(titanicDF) reports a
# Survived mean of 0.3427 for the full, uncleaned data.
meanSurvivedOrg <- mean(titanicCleanDF1$Survived)
meanSurvivedOrg
## [1] 0.4140212
medianSurvivedOrg <- median(titanicCleanDF1$Survived)
medianSurvivedOrg
## [1] 0
# Create a subset of the cleaned data containing only the female passengers,
# keeping the Name, PClass, Age, Sex and Survived columns.
# Inside subset() the column can be referenced directly as Sex.
titanicFemaleDF <- subset(titanicCleanDF1, Sex == "female",
                          select = c(Name, PClass, Age, Sex, Survived))
nrow(titanicFemaleDF)
## [1] 288
# Give the selected columns new names (same order as in select above)
names(titanicFemaleDF) <- c("FullName1", "Class1", "Age1", "Sex1", "Survived1")
head(titanicFemaleDF)
## FullName1 Class1 Age1 Sex1
## 1 Allen, Miss Elisabeth Walton 1st 29 female
## 2 Allison, Miss Helen Loraine 1st 2 female
## 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels) 1st 25 female
## 7 Andrews, Miss Kornelia Theodosia 1st 63 female
## 9 Appleton, Mrs Edward Dale (Charlotte Lamson) 1st 58 female
## 12 Astor, Mrs John Jacob (Madeleine Talmadge Force) 1st 19 female
## Survived1
## 1 1
## 2 0
## 4 0
## 7 1
## 9 1
## 12 1
# Summary of the female-only subset; factor levels that no longer have any
# rows ("*" in Class1, "male" in Sex1) are still listed, with a count of 0
summary(titanicFemaleDF)
## FullName1 Class1 Age1
## Connolly, Miss Kate : 2 * : 0 Min. : 0.17
## Abbott, Mrs Stanton (Rosa) : 1 1st:101 1st Qu.:19.00
## Abelseth, Miss Anna Karen : 1 2nd: 85 Median :27.00
## Abelson, Mrs Samuel (Anna) : 1 3rd:102 Mean :29.40
## Abraham, Mrs Joseph (Sophie Easu): 1 3rd Qu.:39.00
## Ahlin, Mrs Johanna Persdotter : 1 Max. :69.00
## (Other) :281
## Sex1 Survived1
## female:288 Min. :0.0000
## male : 0 1st Qu.:1.0000
## Median :1.0000
## Mean :0.7535
## 3rd Qu.:1.0000
## Max. :1.0000
##
# Mean and median of Age1 and Survived1 for the female-only subset.
# Each statistic is computed once, stored for the comparison table built
# later, and printed by evaluating the variable.
meanAgeNew <- mean(titanicFemaleDF$Age1)
meanAgeNew
## [1] 29.39642
medianAgeNew <- median(titanicFemaleDF$Age1)
medianAgeNew
## [1] 27
meanSurvivedNew <- mean(titanicFemaleDF$Survived1)
meanSurvivedNew
## [1] 0.7534722
medianSurvivedNew <- median(titanicFemaleDF$Survived1)
medianSurvivedNew
## [1] 1
# Put the statistics of the cleaned data ("all") next to those of the
# female-only subset ("females"), one labelled row per statistic.
all <- c(meanAgeOrg, medianAgeOrg, meanSurvivedOrg, medianSurvivedOrg)
females <- c(meanAgeNew, medianAgeNew, meanSurvivedNew, medianSurvivedNew)
compare <- data.frame(
  all,
  females,
  row.names = c("Mean Age", "Median Age", "Mean Survived", "Median Survived")
)
compare
## all females
## Mean Age 30.3979894 29.3964236
## Median Age 28.0000000 27.0000000
## Mean Survived 0.4140212 0.7534722
## Median Survived 0.0000000 1.0000000
# titanicFemaleDF$Class1 is a factor, so it is converted to character once,
# the labels are replaced, and the result is converted back to a factor once.
# Rebuilding the factor at the end also drops levels that are no longer used.
titanicFemaleDF$Class1 <- as.character(titanicFemaleDF$Class1)
# change Class1 from "1st" to "First"
titanicFemaleDF$Class1[titanicFemaleDF$Class1 == "1st"] <- "First"
# change Class1 from "2nd" to "Second"
titanicFemaleDF$Class1[titanicFemaleDF$Class1 == "2nd"] <- "Second"
# change Class1 from "3rd" to "Third"
titanicFemaleDF$Class1[titanicFemaleDF$Class1 == "3rd"] <- "Third"
titanicFemaleDF$Class1 <- as.factor(titanicFemaleDF$Class1)
# Print the summary again to confirm that Class1 now uses the labels
# First / Second / Third
summary(titanicFemaleDF)
## FullName1 Class1 Age1
## Connolly, Miss Kate : 2 First :101 Min. : 0.17
## Abbott, Mrs Stanton (Rosa) : 1 Second: 85 1st Qu.:19.00
## Abelseth, Miss Anna Karen : 1 Third :102 Median :27.00
## Abelson, Mrs Samuel (Anna) : 1 Mean :29.40
## Abraham, Mrs Joseph (Sophie Easu): 1 3rd Qu.:39.00
## Ahlin, Mrs Johanna Persdotter : 1 Max. :69.00
## (Other) :281
## Sex1 Survived1
## female:288 Min. :0.0000
## male : 0 1st Qu.:1.0000
## Median :1.0000
## Mean :0.7535
## 3rd Qu.:1.0000
## Max. :1.0000
##
# Print the first 10 rows of the data frame
head(titanicFemaleDF,10)
## FullName1 Class1 Age1 Sex1
## 1 Allen, Miss Elisabeth Walton First 29 female
## 2 Allison, Miss Helen Loraine First 2 female
## 4 Allison, Mrs Hudson JC (Bessie Waldo Daniels) First 25 female
## 7 Andrews, Miss Kornelia Theodosia First 63 female
## 9 Appleton, Mrs Edward Dale (Charlotte Lamson) First 58 female
## 12 Astor, Mrs John Jacob (Madeleine Talmadge Force) First 19 female
## 16 Baxter, Mrs James (Helene DeLaudeniere Chaput) First 50 female
## 20 Beckwith, Mrs Richard Leonard (Sallie Monypeny) First 47 female
## 24 Bishop, Mrs Dickinson H (Helen Walton) First 19 female
## 28 Bonnell, Miss Caroline First 30 female
## Survived1
## 1 1
## 2 0
## 4 0
## 7 1
## 9 1
## 12 1
## 16 1
## 20 1
## 24 1
## 28 1
# Print 10 rows starting at the middle of the data frame (middle to middle+9).
# The end of the range is capped at the last row so that a data frame with
# fewer rows than that does not produce all-NA filler rows.
middle <- round(nrow(titanicFemaleDF) / 2)
titanicFemaleDF[middle:min(middle + 9, nrow(titanicFemaleDF)), ]
## FullName1 Class1 Age1 Sex1 Survived1
## 466 Karnes, Mrs J Frank (Claire Bennett) Second 22 female 0
## 469 Kelly, Mrs Florence (Fannie) Second 45 female 1
## 474 Lahtinen, Mrs William (Anna Sylvan) Second 26 female 0
## 476 Lemore, Mrs Amelia Second 34 female 1
## 478 LaRoche, Mrs Joseph (Juliet) Second 22 female 1
## 479 LaRoche, Miss Louise Second 1 female 1
## 480 LaRoche, Miss Simonne Second 3 female 1
## 488 Mack, Mrs Mary Second 57 female 0
## 495 Marshall, Mrs Kate Louise Phillips Second 19 female 1
## 501 Mellenger, Mrs Elizabeth Anne Second 41 female 1
# Print the last 10 rows of the data frame
tail(titanicFemaleDF,10)
## FullName1 Class1 Age1 Sex1 Survived1
## 1189 Sandstrom, Miss Beatrice Irene Third 1.5 female 0
## 1264 Turja, Miss Anna Sofia Third 18.0 female 1
## 1265 Turkula, Mrs Hedvig Third 63.0 female 1
## 1270 Van der Planke, Miss Augusta Third 18.0 female 0
## 1272 Van der Planke, Mrs Jules Third 31.0 female 0
## 1277 Van Impe, Miss Catharine Third 10.0 female 0
## 1279 Van Impe, Mrs Jean Baptiste Third 30.0 female 0
## 1284 Vestrom, Miss Hulda Amanda Adolfina Third 14.0 female 0
## 1294 Wilkes, Mrs Ellen Third 45.0 female 1
## 1305 Yasbeck, Mrs Antoni Third 15.0 female 1