Day2TitanicDataset

Background

The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

Load Data

  • You can use getwd() to check your working directory.

In a file path, .. means go up one level (to the parent directory).

# The workspace is deliberately NOT wiped with remove(list = ls()): a rendered
# document is normally knit in a fresh R session, so there is nothing to clear,
# and running such a line interactively would silently delete the reader's
# own objects.

?getwd # returns an absolute filepath representing the current working directory of the R process

getwd()
[1] "/Users/hannahrobinson/Desktop/BCE Coding/Coding Sheets"
?dir # dir() produces a character vector of the names of files or directories in the named directory
dir()
 [1] "Day2TitanicDataSet_files"            
 [2] "Day2TitanicDataSet.html"             
 [3] "Day2TitanicDataSet.qmd"              
 [4] "Day2TitanicDataSet.rmarkdown"        
 [5] "First_RMarkdownBasics.html"          
 [6] "HannahRobinson_datasetsummarystats.R"
 [7] "HannahRobinson_ImportTitanic.html"   
 [8] "HannahRobinson_ImportTitanic.Rmd"    
 [9] "ImportDataset.html"                  
[10] "ImportDataset.Rmd"                   
[11] "ImportingData.html"                  
[12] "ImportingData.Rmd"                   
[13] "ImportTitanic.R"                     
[14] "rsconnect"                           
[15] "Testing.docx"                        
[16] "Testing.html"                        
# Read train.csv from the Desktop folder under the user's home directory ("~")
# into the data frame `train`; this path does not depend on the working
# directory printed by getwd().
train <- read.csv("~/Desktop/train.csv")

Exploratory Data Analysis

?head
head(train) # prints the first six rows of dataset
  PassengerId Survived Pclass
1           1        0      3
2           2        1      1
3           3        1      3
4           4        1      1
5           5        0      3
6           6        0      3
                                                 Name    Sex Age SibSp Parch
1                             Braund, Mr. Owen Harris   male  22     1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
3                              Heikkinen, Miss. Laina female  26     0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
5                            Allen, Mr. William Henry   male  35     0     0
6                                    Moran, Mr. James   male  NA     0     0
            Ticket    Fare Cabin Embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S
5           373450  8.0500              S
6           330877  8.4583              Q
head(train, n = 4) # first 4 entries
  PassengerId Survived Pclass
1           1        0      3
2           2        1      1
3           3        1      3
4           4        1      1
                                                 Name    Sex Age SibSp Parch
1                             Braund, Mr. Owen Harris   male  22     1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
3                              Heikkinen, Miss. Laina female  26     0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
            Ticket    Fare Cabin Embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S

Variables with missing values

  • vis_dat will only work for small data sets.
# One-time setup; uncomment if visdat is not installed yet.
#install.packages("visdat")

library(visdat)

# Keep the imported data under `train` and run the analysis on `df`.
df <- train
# vis_miss(df)

# Plot an overview of the data frame to spot columns with missing values.
vis_dat(df) 

library(psych)
# Store describe()'s result for the Age column so individual entries can be
# extracted from it afterwards.
age_sum_stats <- describe(df$Age)

# Inspect the underlying storage type to see how elements can be extracted.
typeof(age_sum_stats)
[1] "list"
length(df$Age)
[1] 891
# Extract the count of non-missing ages by name instead of by position, so the
# code states what it reads and does not depend on describe()'s column order.
age_sum_stats[["n"]]
[1] 714
# Total number of rows minus the non-missing count = number of missing ages.
length(df$Age) - age_sum_stats[["n"]]
[1] 177

Age has 177 missing values.

typeof(df$Age)
[1] "double"
class(df$Age)
[1] "numeric"
head(is.na(df$Age)) # logical vector of true or false
[1] FALSE FALSE FALSE FALSE FALSE  TRUE
class(is.na(df$Age))
[1] "logical"
# Build the missing-value flags once, then reuse them: first as logicals,
# then coerced to 0/1 numbers, and finally confirm the class of the coercion.
temp1 <- is.na(df$Age)
temp2 <- as.numeric(temp1)
class(temp2)
[1] "numeric"
sum(is.na(df$Age))
[1] 177

This gives the same answer: Age has 177 missing values.

names(df)
 [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
 [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
[11] "Cabin"       "Embarked"   
# install.packages("dplyr")
library("dplyr")

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
?glimpse # look up help by bare topic name; several installed packages document glimpse, so R lists them
Help on topic 'glimpse' was found in the following packages:

  Package               Library
  dplyr                 /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
  tibble                /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
  pillar                /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library


Using the first match ...
glimpse(df)
Rows: 891
Columns: 12
$ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
$ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
$ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
$ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
$ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
$ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
$ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
$ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
$ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
$ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
$ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
$ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…
table(df$Pclass)

  1   2   3 
216 184 491 
# The first, unlabeled column of the result counts rows whose Embarked value is
# an empty string; is.na() does not flag those, yet they are effectively missing.
table(df$Embarked)

      C   Q   S 
  2 168  77 644 

Type of variable and level of measurement

Variable Name Type of Variable Level of Measurement
Passenger Id Qualitative Nominal
Survived * Qualitative Ordinal
PClass Qualitative Ordinal
Name Qualitative Nominal
Sex Qualitative Nominal
Age Quantitative Ratio
SibSp (# of siblings/spouses on board) Quantitative Ratio
Parch (# of parents/children on board) Quantitative Ratio
Ticket * Qualitative Nominal
Fare Quantitative Ratio
Cabin Qualitative Nominal
Embarked Qualitative Nominal

Professional looking summary statistics

Use the stargazer package to create a basic, professional-looking summary statistics table. Make sure to comment your code, indent it properly, and explicitly name the arguments. In fewer than three sentences, describe any interesting trends you find.

#install.packages("stargazer")

library(stargazer)

Please cite as: 
 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
?`stargazer-package`
 

# Default summary-statistics table for df, printed as plain text.
stargazer(df, type = "text")

==============================================
Statistic    N   Mean   St. Dev.  Min    Max  
----------------------------------------------
PassengerId 891 446.000 257.354    1     891  
Survived    891  0.384   0.487     0      1   
Pclass      891  2.309   0.836     1      3   
Age         714 29.699   14.526  0.420 80.000 
SibSp       891  0.523   1.103     0      8   
Parch       891  0.382   0.806     0      6   
Fare        891 32.204   49.693  0.000 512.329
----------------------------------------------
# Row labels for the summary table, one per row and in the same order as the
# rows stargazer printed above (PassengerId, Survived, Pclass, Age, SibSp,
# Parch, Fare).
variable_labels <- c(
  "Passenger Id",
  "Survived",
  "Passenger Class",
  "Age",
  "# of Siblings/Spouses",
  "# of Children/Parents",
  "Fare"
)

class(variable_labels)
[1] "character"
length(variable_labels)
[1] 7
# Formatted summary table: custom title, readable row labels, two decimals,
# and the N column omitted because the sample size is reported in the notes.
# The notes are computed from df rather than typed in, so they cannot drift
# out of sync with the data.
stargazer(df, 
          type              = "text", 
          title             = "Summary Statistics", 
          covariate.labels  = variable_labels,
          notes             = c(sprintf("N = %d.", nrow(df)),
                                sprintf("Age has %d missing values.",
                                        sum(is.na(df$Age)))), 
          omit.summary.stat = "n", 
          digits            = 2)

Summary Statistics
=================================================
Statistic              Mean  St. Dev. Min   Max  
-------------------------------------------------
Passenger Id          446.00  257.35   1    891  
Survived               0.38    0.49    0     1   
Passenger Class        2.31    0.84    1     3   
Age                   29.70   14.53   0.42 80.00 
# of Siblings/Spouses  0.52    1.10    0     8   
# of Children/Parents  0.38    0.81    0     6   
Fare                  32.20   49.69   0.00 512.33
-------------------------------------------------
N = 891.                                         
Age has 177 missing values.                      
table(df$Sex)

female   male 
   314    577 

Measures of central tendency

data <- c(20, 40, 25, 30, 50, 37, 421, 77, 1, 53, 99, 51, 33)
median(data)
[1] 40
# Base R's mode() reports an object's storage mode ("numeric"), not the
# statistical mode. Tabulate the values and keep the most frequent one(s).
# Every value in `data` occurs exactly once, so all 13 values tie here.
value_counts <- table(data)
as.numeric(names(value_counts)[value_counts == max(value_counts)])
[1]   1  20  25  30  33  37  40  50  51  53  77  99 421
mean(data)
[1] 72.07692
hist(data)

Measures of Dispersion

Standard deviation

?sd

sd(x = data)
[1] 107.7532
var(x = data)
[1] 11610.74
# Compare floating-point results with a tolerance; exact == on doubles can
# report FALSE because of rounding even when the values are mathematically equal.
isTRUE(all.equal(var(data), sd(x = data)^2))
[1] TRUE
max(data)
[1] 421
min(data)
[1] 1
range(data)
[1]   1 421
max(data) - min(data)
[1] 420
round(x = sd(x = data), digits = 2)
[1] 107.75
?boxplot
# Spell out TRUE: T is an ordinary variable that can be reassigned.
boxplot(data, horizontal = TRUE)

?quantile # quartiles are computed with quantile(); there is no help topic named 'quartile'
summary(data)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   30.00   40.00   72.08   53.00  421.00 
IQR(data)
[1] 23
# range = 3 lets the whiskers reach up to 3 x IQR from the box (the default is
# 1.5), so fewer points are drawn as outliers. TRUE is spelled out because T is
# an ordinary variable that can be reassigned.
boxplot(data, horizontal = TRUE, range = 3)