Overview

For exploring, I decided to take the Titanic manifest down to just those under 18. In doing so, I noticed several females with “Mrs.” in the Name column, and realized at that time, that was probably plenty old enough to be married. Back then, it was likely that children were either referred to by their name, or perhaps Miss or Master. As well, the age of consent was very differnet in the 1910’s.

library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)

#7.  BONUS – place the original .csv in a github file and read from a link.
loc<-"https://raw.githubusercontent.com/tonythor/pyspark-env/main/data/titanic.csv"
titanic_dataset <- loc
titanic <- read.csv(titanic_dataset)

#3. Create new column names for the new data frame.
titanic <- dplyr::rename(titanic, c("Class" = "Pclass"), c("Gender" = "Sex"))
titanic[c('LastName', 'FirstName')]<- str_split_fixed(titanic$Name, ',', 2)

#2. Create a new data frame with a subset of the columns and rows. Make 
#   sure to rename it.
t_children = sqldf("select  Survived, Class, LastName, FirstName,
                         Gender, Age from titanic where age < 18 
                         order by Class, Age desc")

#1 Use the summary function to gain an overview of the data set. Then 
#  display the mean and median for at least two attributes.

summary(t_children)
##     Survived          Class         LastName          FirstName        
##  Min.   :0.0000   Min.   :1.000   Length:113         Length:113        
##  1st Qu.:0.0000   1st Qu.:2.000   Class :character   Class :character  
##  Median :1.0000   Median :3.000   Mode  :character   Mode  :character  
##  Mean   :0.5398   Mean   :2.584                                        
##  3rd Qu.:1.0000   3rd Qu.:3.000                                        
##  Max.   :1.0000   Max.   :3.000                                        
##     Gender               Age        
##  Length:113         Min.   : 0.420  
##  Class :character   1st Qu.: 3.000  
##  Mode  :character   Median : 9.000  
##                     Mean   : 9.041  
##                     3rd Qu.:16.000  
##                     Max.   :17.000
sprintf("P Class : mean:%s median:%s", 
        round(mean(t_children$Class), digits=4), 
        round(median(t_children$Class),digits=4))
## [1] "P Class : mean:2.5841 median:3"
sprintf("Survived: mean:%s median:%s", 
        round(mean(t_children$Survived), digits=4), 
        round(mean(t_children$Survived), digits=4))
## [1] "Survived: mean:0.5398 median:0.5398"
# Add that column in there based on salutations
t_children = t_children %>%
  mutate(PossiblyMarried = case_when(
    grepl("Mrs.", FirstName) ~ "True",
    grepl("Mister.", FirstName ) ~ "True",
    grepl("Mr.", FirstName ) ~ "True",
    .default = ""
  ))

#5 For at least 3 values in a column please rename so that every value in 
#  that column is renamed.
replacements  = c('Miss.'='','Master.'='', 'Mrs.'='', 'Mister.'='', 'Mr.'='')
t_children$FirstName<- str_replace_all(t_children$FirstName, replacements )


# The first name column is pretty long. Let's truncate it so it'll fit on one page.
t_children = t_children %>% mutate(FirstName = str_trunc(FirstName, width = 10))

married = nrow(sqldf("select PossiblyMarried from t_children where length(PossiblyMarried) > 1"))
sprintf('Number of children in this dataset: %s', nrow(t_children))
## [1] "Number of children in this dataset: 113"
sprintf('Number of children who might already be married: %s', married)
## [1] "Number of children who might already be married: 26"
#6. Display enough rows to see examples of all of steps 1-5 above.
head(t_children, 50)