Day2TitanicDataset

Background

The sinking of the Titanic is one of the most infamous shipwrecks in history.

On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.

While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.

Load Data

You can use getwd() to check your working directory.

In a file path, `..` means go up one level, to the parent directory.

# Remove every object that ls() lists in the current environment (clears the workspace).
remove(list = ls())

?getwd # returns an absolute filepath representing the current working directory of the R process

getwd()

[1] "/Users/hannahrobinson/Desktop/BCE Coding/Coding Sheets"

?dir() # produce a character vector of the names of files or directories in the named directory.
dir()

 [1] "Day2TitanicDataSet_files"            
 [2] "Day2TitanicDataSet.html"             
 [3] "Day2TitanicDataSet.qmd"              
 [4] "Day2TitanicDataSet.rmarkdown"        
 [5] "First_RMarkdownBasics.html"          
 [6] "HannahRobinson_datasetsummarystats.R"
 [7] "HannahRobinson_ImportTitanic.html"   
 [8] "HannahRobinson_ImportTitanic.Rmd"    
 [9] "ImportDataset.html"                  
[10] "ImportDataset.Rmd"                   
[11] "ImportingData.html"                  
[12] "ImportingData.Rmd"                   
[13] "ImportTitanic.R"                     
[14] "rsconnect"                           
[15] "Testing.docx"                        
[16] "Testing.html"

# Read train.csv from the user's Desktop folder into a data frame named train.
# NOTE(review): the path is specific to this machine; adjust it if the file lives elsewhere.
train <- read.csv("~/Desktop/train.csv")

Exploratory Data Analysis

?head
head(train) # prints the first six rows of dataset

  PassengerId Survived Pclass
1           1        0      3
2           2        1      1
3           3        1      3
4           4        1      1
5           5        0      3
6           6        0      3
                                                 Name    Sex Age SibSp Parch
1                             Braund, Mr. Owen Harris   male  22     1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
3                              Heikkinen, Miss. Laina female  26     0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
5                            Allen, Mr. William Henry   male  35     0     0
6                                    Moran, Mr. James   male  NA     0     0
            Ticket    Fare Cabin Embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S
5           373450  8.0500              S
6           330877  8.4583              Q

head(train, n = 4) # first 4 entries

  PassengerId Survived Pclass
1           1        0      3
2           2        1      1
3           3        1      3
4           4        1      1
                                                 Name    Sex Age SibSp Parch
1                             Braund, Mr. Owen Harris   male  22     1     0
2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
3                              Heikkinen, Miss. Laina female  26     0     0
4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
            Ticket    Fare Cabin Embarked
1        A/5 21171  7.2500              S
2         PC 17599 71.2833   C85        C
3 STON/O2. 3101282  7.9250              S
4           113803 53.1000  C123        S

Variables with missing values

vis_dat will only work for small data sets.

#install.packages("visdat")

library(visdat)

df <- train
# vis_miss(df)

vis_dat(df)

library(psych)
age_sum_stats <- describe(df$Age)

typeof(age_sum_stats)

[1] "list"

length(df$Age)

[1] 891

# Number of non-missing Age values; select the "n" column of the describe() result
# by name rather than by position so the code does not depend on column order.
age_sum_stats[["n"]]

[1] 714

# Missing Age values = total rows minus the non-missing count ("n") from describe().
# The column is selected by name rather than by position.
length(df$Age) - age_sum_stats[["n"]]

[1] 177

Age has 177 missing values.

typeof(df$Age)

[1] "double"

class(df$Age)

[1] "numeric"

head(is.na(df$Age)) # logical vector of true or false

[1] FALSE FALSE FALSE FALSE FALSE  TRUE

class(is.na(df$Age))

[1] "logical"

# Logical flags for missing ages, then the same flags coerced to numeric.
temp1 <- is.na(df$Age)
temp2 <- as.numeric(temp1)
# Coercing a logical vector with as.numeric() yields class "numeric".
class(temp2)

[1] "numeric"

sum(is.na(df$Age))

[1] 177

This gives the same answer: Age has 177 missing values.

names(df)

 [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
 [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
[11] "Cabin"       "Embarked"

# install.packages("dplyr")
library("dplyr")


Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

?glimpse()

Help on topic 'glimpse' was found in the following packages:

  Package               Library
  dplyr                 /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
  tibble                /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
  pillar                /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library


Using the first match ...

glimpse(df)

Rows: 891
Columns: 12
$ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
$ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
$ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
$ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
$ Sex         <chr> "male", "female", "female", "female", "male", "male", "mal…
$ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
$ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
$ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
$ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
$ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
$ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
$ Embarked    <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"…

table(df$Pclass)


  1   2   3 
216 184 491

table(df$Embarked)


      C   Q   S 
  2 168  77 644

Type of variable and level of measurement

Variable Name	Type of Variable	Level of Measurement
Passenger Id	Qualitative	Nominal
Survived *	Qualitative	Ordinal
PClass	Qualitative	Ordinal
Name	Qualitative	Nominal
Sex	Qualitative	Nominal
Age	Quantitative	Ratio
SibSp (# of siblings/spouses on board)	Quantitative	Ratio
Parch (# of parents/children on board)	Quantitative	Ratio
Ticket *	Qualitative	Nominal
Fare	Quantitative	Ratio
Cabin	Qualitative	Nominal
Embarked	Qualitative	Nominal

Professional looking summary statistics

Use the stargazer package to create a basic, professional-looking summary statistics table. Make sure to comment your code, indent it properly, and explicitly specify the arguments. In fewer than 3 sentences, describe any interesting trends you find.

#install.packages("stargazer")

library(stargazer)


Please cite as:

 Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

 R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

?`stargazer-package`
 

stargazer(df, type = "text" )


==============================================
Statistic    N   Mean   St. Dev.  Min    Max  
----------------------------------------------
PassengerId 891 446.000 257.354    1     891  
Survived    891  0.384   0.487     0      1   
Pclass      891  2.309   0.836     1      3   
Age         714 29.699   14.526  0.420 80.000 
SibSp       891  0.523   1.103     0      8   
Parch       891  0.382   0.806     0      6   
Fare        891 32.204   49.693  0.000 512.329
----------------------------------------------

# Display labels for the summary table, one per numeric column of df,
# listed in the same order as the columns appear in the data.
variable_labels <- c(
  "Passenger Id",
  "Survived",
  "Passenger Class",
  "Age",
  "# of Siblings/Spouses",
  "# of Children/Parents",
  "Fare"
)

class(variable_labels)

[1] "character"

length(variable_labels)

[1] 7

# Plain-text summary statistics table for df with readable row labels.
# The notes are computed from the data (row count and number of missing ages)
# instead of being typed in by hand, so they stay correct if df changes.
# The N column is omitted because the sample size is reported in the notes.
stargazer(df,
          type              = "text",
          title             = "Summary Statistics",
          covariate.labels  = variable_labels,
          notes             = c(paste0("N = ", nrow(df), "."),
                                paste0("Age has ", sum(is.na(df$Age)),
                                       " missing values.")),
          omit.summary.stat = "n",
          digits            = 2)


Summary Statistics
=================================================
Statistic              Mean  St. Dev. Min   Max  
-------------------------------------------------
Passenger Id          446.00  257.35   1    891  
Survived               0.38    0.49    0     1   
Passenger Class        2.31    0.84    1     3   
Age                   29.70   14.53   0.42 80.00 
# of Siblings/Spouses  0.52    1.10    0     8   
# of Children/Parents  0.38    0.81    0     6   
Fare                  32.20   49.69   0.00 512.33
-------------------------------------------------
N = 891.                                         
Age has 177 missing values.

table(df$Sex)


female   male 
   314    577

Measures of central tendency

data <- c(20, 40, 25, 30, 50, 37, 421, 77, 1, 53, 99, 51, 33)

median(data)

[1] 40

# NOTE: base R's mode() reports the storage mode of an object ("numeric" here),
# not the statistical mode (most frequent value) of the data.
mode(data)

[1] "numeric"

mean(data)

[1] 72.07692

hist(data)

Measures of Dispersion

Standard deviation

?sd

sd(x = data)

[1] 107.7532

var(x = data)

[1] 11610.74

# Check that the variance equals the squared standard deviation.
# Compare with a numeric tolerance: exact `==` on floating-point results can
# report FALSE because of rounding error even when the values agree.
isTRUE(all.equal(var(data), sd(x = data)^2))

[1] TRUE

max(data)

[1] 421

min(data)

[1] 1

range(data)

[1]   1 421

max(data) - min(data)

[1] 420

round(x = sd(x = data), digits = 2)

[1] 107.75

?boxplot
# Horizontal box plot of data. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
boxplot(data, horizontal = TRUE)

?quartile

No documentation for 'quartile' in specified packages and libraries:
you could try '??quartile'

summary(data)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   30.00   40.00   72.08   53.00  421.00

IQR(data)

[1] 23

# Horizontal box plot with range = 3: whiskers may extend up to 3 times the
# interquartile range from the box (the default is 1.5).
# TRUE is spelled out because T can be reassigned.
boxplot(data, horizontal = TRUE, range = 3)