Titanic Predictions

Data Source:

https://www.kaggle.com/c/titanic

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(stringr)

LOAD DATA

#Data would be available locally here:
#"titanic\\train.csv"
#"titanic\\test.csv"

titanic_train <- read_csv("https://raw.githubusercontent.com/nono638/607_FinalProject_NCollin/main/titanic/train.csv")

## Rows: 891 Columns: 12

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

titanic_test <- read_csv("https://raw.githubusercontent.com/nono638/607_FinalProject_NCollin/main/titanic/test.csv")

## Rows: 418 Columns: 11

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (6): PassengerId, Pclass, Age, SibSp, Parch, Fare

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

#Check the data:
#str(titanic_test)

Basic EDA before combining

Column Names:

(cat("Column Names: ",(names(titanic_train))))

## Column Names:  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked

## NULL

cat("Training Median:", median(titanic_train$Age, na.rm = T),"\n")

## Training Median: 28

cat("Test Median:\t", median(titanic_test$Age, na.rm = T))

## Test Median:  27

Interestingly, the medians for the test and train set for Age only have a difference of one.

Women and Children First (in the Kaggle Dataset)?

survived.labels <- c(`0`="Perished", `1`="Survied")

ggplot(data = titanic_train[(!(is.na(titanic_train$Sex))),], aes( y = Sex, fill = Sex)) +
  geom_bar() +
  facet_grid(Sex ~Survived , labeller = labeller(Survived = survived.labels)) +
  ggtitle(" Distribution of Men and women who survived (within the Kaggle Training Dataset)")+
     theme(strip.background = element_rect(fill="lightblue", size=1, color="darkblue"),plot.title = element_text(hjust = 0.3))+
   coord_flip()

Many more men proportionately perished in the titanic disaster. This is only a portion of the dataset that Kaggle provided.

Children:

ggplot(data = titanic_train[(!(is.na(titanic_train$Age))),], aes(y = Age)) +
  geom_histogram( binwidth = 7.5) + #bin set to 7.5 for "understandability"
   facet_wrap(~Survived, nrow=1) +
  ggtitle(" Distribution by age of who survived (within the Kaggle Training Dataset)")+
     theme(strip.background = element_rect(fill="lightblue", size=1, color="darkblue"),plot.title = element_text(hjust = 0.5))+
   coord_flip()

ggplot(data = titanic_train[(!(is.na(titanic_train$Age))),], aes(y = Age)) +
  geom_histogram() + #bin set to 7.5 for "understandability"
   facet_grid(.~Survived, labeller = labeller(Survived = survived.labels)) +
  ggtitle(" Distribution by age of who survived (within the Kaggle Training Dataset)")+
     theme(strip.background = element_rect(fill="lightblue", size=1, color="darkblue"),plot.title = element_text(hjust = 0.5)) +
  coord_flip()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Actually the relationship isn’t so clear for children. The same can be said for older people, there isn’t a clear impact of age and survival of the Titanic disaster from these graphs, to my eye.

The fare one is interesting, because you might expect outliers in fare to have increased Survivabilty. I left the in the graph here.

ggplot(data = titanic_train[(!(is.na(titanic_train$Fare))),], aes(y = Fare, color = Fare)) +
  geom_histogram() + #bin set to 7.5 for "understandability"
   facet_grid(. ~Survived, labeller = labeller(Survived = survived.labels)) +
  ggtitle(" Distribution by Fare of who survived (within the Kaggle Training Dataset)")+
     theme(plot.title = element_text(hjust = 0.5))+
   coord_flip()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Combining and then Cleaning

Combine Data for cleaning together:

titanic_train$IsTrainSet <- TRUE
titanic_test$IsTrainSet <- FALSE

names(titanic_train)

##  [1] "PassengerId" "Survived"    "Pclass"      "Name"        "Sex"        
##  [6] "Age"         "SibSp"       "Parch"       "Ticket"      "Fare"       
## [11] "Cabin"       "Embarked"    "IsTrainSet"

#names(titanic_test)
# The names are the same but the test is missing the "Survived Column"

titanic_test$Survived <- NA

titanic_full <- rbind(titanic_train, titanic_test)

#Check above Combining worked:
#table(titanic_full$IsTrainSet)

#str(titanic_full)

NA values

How many NAs are in the Data? We’ll make a function to keep tabs on that as we go along, and call the function once here to test it. The function will return the name of the DataFrame you called it on and how many NAs it has in each column.

#install.packages(stringr)
library(stringr)

printRemainingNAs <- function(df = titanic_full, name = deparse(substitute(df))[1])
  {
print(str_glue("NAs in {name}:"))
for (col in 1:ncol(df) ) {
  print(str_glue("{names(df)[col]} : {sum(is.na(df[,col]))}"))
  }
}

printRemainingNAs()

## NAs in titanic_full:
## PassengerId : 0
## Survived : 418
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 263
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 1
## Cabin : 1014
## Embarked : 2
## IsTrainSet : 0

Most of our NAs are in Age. Cabin has so many NAs as to be essentially worthless for prediction, perhaps aside for a small binning application. Fare is missing one value and Embarked is missing two values. There are 418 NA “Survived” Values because Kaggle wants us to guess those values for the competition.

Cleaning the “Embarked” Column

#There are two NA values for Embarked.  Again, we can see this by running printRemainingNAs() or the following line of code:
table(is.na(titanic_full$Embarked))

## 
## FALSE  TRUE 
##  1307     2

# get the two NA values from embarked.
print("Rows Missing Embarked: ")

## [1] "Rows Missing Embarked: "

for (i in 1:nrow(titanic_full)) {
  if ( is.na(titanic_full$Embarked[i])) {
    print(i)
  }
}

## [1] 62
## [1] 830

#Or find it this way and note the Passenger ID which, for this data, is equal to the Row Number
nullEmbarkedRows <- (titanic_full[is.na(titanic_full$Embarked),])

One possible way to clean embarked is to just set it to the Mode of the Embarked Column, which is “S”. We could alternatively just drop the rows. Or we could make a linear regression.

table(titanic_full$Embarked)

## 
##   C   Q   S 
## 270 123 914

Interestingly, the two null embarked values do have cabin values, which are otherwise mostly absent throughout the dataset. Let’s see if the mode for passengers with cabins is still S for Embarked. We can also see they both survived.

CabinedPassengers <- titanic_full[(!is.na(titanic_full$Cabin)) & (titanic_full$Survived==1),]

table(CabinedPassengers$Embarked)

## 
##  C  Q  S 
## 52  2 80

Of the passengers who survived and had a cabin, S is still the mode.

For now, let’s make the NA values for embarked S. This could possibly be improved.

titanic_full[is.na(titanic_full$Embarked),]$Embarked <- "S"

printRemainingNAs(titanic_full)

## NAs in titanic_full:
## PassengerId : 0
## Survived : 418
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 263
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 1
## Cabin : 1014
## Embarked : 0
## IsTrainSet : 0

Cleaning Age

table(is.na(titanic_full$Age))

## 
## FALSE  TRUE 
##  1046   263

(cat("Median Age from all data:",(age.median <- median(titanic_full$Age, na.rm = T))))

## Median Age from all data: 28

## NULL

The median age without null values is 28. It’s the same as our training set.

Cleaning Age with the median might be the simplest way, but its results will be lackluster. The following is the code to use this approach, but we will try a different one using a linear regression. Again, there are 263 NA values.

#To just put the median age into the NA values, use the following line:
#titanic_full[is.na(titanic_full$Age),"Age"] <- age.median
table(is.na(titanic_full$Age))

## 
## FALSE  TRUE 
##  1046   263

(ggplot(data = titanic_full[(!(is.na(titanic_full$Age))),], aes(y = Age))+
  geom_boxplot()+
   ggtitle("Distribution of All Ages, both known and NA \"Survied\" included " )+
    theme(plot.title = element_text(hjust = 0.5))+
  coord_flip())

upperWhiskerAge <- boxplot.stats(titanic_full$Age)$stats[5]
outlier_filtered_Age <- titanic_full[(!(is.na(titanic_full$Age))),] 
outlier_filtered_Age <-  outlier_filtered_Age %>% filter(Age < upperWhiskerAge)

outlier_filtered_Age

## # A tibble: 1,036 x 13
##    PassengerId Survived Pclass Name   Sex     Age SibSp Parch Ticket  Fare Cabin
##          <dbl>    <dbl>  <dbl> <chr>  <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
##  1           1        0      3 Braun~ male     22     1     0 A/5 2~  7.25 <NA> 
##  2           2        1      1 Cumin~ fema~    38     1     0 PC 17~ 71.3  C85  
##  3           3        1      3 Heikk~ fema~    26     0     0 STON/~  7.92 <NA> 
##  4           4        1      1 Futre~ fema~    35     1     0 113803 53.1  C123 
##  5           5        0      3 Allen~ male     35     0     0 373450  8.05 <NA> 
##  6           7        0      1 McCar~ male     54     0     0 17463  51.9  E46  
##  7           8        0      3 Palss~ male      2     3     1 349909 21.1  <NA> 
##  8           9        1      3 Johns~ fema~    27     0     2 347742 11.1  <NA> 
##  9          10        1      2 Nasse~ fema~    14     1     0 237736 30.1  <NA> 
## 10          11        1      3 Sands~ fema~     4     1     1 PP 95~ 16.7  G6   
## # ... with 1,026 more rows, and 2 more variables: Embarked <chr>,
## #   IsTrainSet <lgl>

Age has 263 NA values. So it might make sense to first do a linear regression including survived, where available. Then we’ll do a second linear regression where we don’t know if they survived, but that model will benefit from the larger subset of data knowing if they did survive. This was initially intuitive to me from the “Women and Children” first outset, but the initial EDA graph of the age distributions of those who lived and died detracted from that. Still, I’m going to proceed in this fashion.

titanic_train$Sex <- as.factor(titanic_train$Sex)
titanic_train$Pclass <- as.factor(titanic_train$Pclass)
titanic_train$Survived <- as.factor(titanic_train$Survived)

SurvivalKnownWithValidAge <- titanic_train[!(is.na(titanic_train$Age)),]
SurvivalKnownWithInvalidAge <- titanic_train[is.na(titanic_train$Age),]
#Make sure no NA Ages in the ValidAge:
table(is.na( SurvivalKnownWithValidAge$Age))

## 
## FALSE 
##   714

printRemainingNAs(SurvivalKnownWithValidAge)

## NAs in SurvivalKnownWithValidAge:
## PassengerId : 0
## Survived : 0
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 0
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 0
## Cabin : 529
## Embarked : 2
## IsTrainSet : 0

printRemainingNAs(SurvivalKnownWithInvalidAge)

## NAs in SurvivalKnownWithInvalidAge:
## PassengerId : 0
## Survived : 0
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 177
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 0
## Cabin : 158
## Embarked : 0
## IsTrainSet : 0

SurvivalUnknownWithValidAge <- titanic_test[!(is.na(titanic_test$Age)),]
SurvivalUnknownWithInvalidAge <- titanic_test[is.na(titanic_test$Age),]
#table(is.na( SurvivalUnknownWithAge$Age))
printRemainingNAs(SurvivalUnknownWithValidAge)

## NAs in SurvivalUnknownWithValidAge:
## PassengerId : 0
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 0
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 1
## Cabin : 245
## Embarked : 0
## IsTrainSet : 0
## Survived : 332

printRemainingNAs(SurvivalUnknownWithInvalidAge)

## NAs in SurvivalUnknownWithInvalidAge:
## PassengerId : 0
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 86
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 0
## Cabin : 82
## Embarked : 0
## IsTrainSet : 0
## Survived : 86

Now that we’ve split up the Ages by Survived, we’ll make a linear regression for age using the survival data for our model’s equation where it’s known. Then we’ll repeat the process where Survival is unknown.

#library(tidyverse)
(ggplot(data = SurvivalKnownWithValidAge, aes(y = Age, ))+
  geom_boxplot()+
   ggtitle("Distribution of Age (With Survival known from Kaggle)")+
    theme(plot.title = element_text(hjust = 0.5))+
  coord_flip())

#Ignore warning, we're going to find the ages
upperWhiskerValidAgeSurvivalKnown <-  boxplot.stats(SurvivalKnownWithValidAge$Age)$stats[5]

outlier_filter_ValidAge_SurvivalKnown <- SurvivalKnownWithValidAge$Age < upperWhiskerValidAgeSurvivalKnown

table(outlier_filter_ValidAge_SurvivalKnown)

## outlier_filter_ValidAge_SurvivalKnown
## FALSE  TRUE 
##    11   703

non_Outliers_ValidAge_SurvivalKnown <- SurvivalKnownWithValidAge[outlier_filter_ValidAge_SurvivalKnown,]

Let’s model the age of those whose survival status is known with that data:

#Make sure none of our independent variables have NAs:
#printRemainingNAs(non_Outliers_ValidAge_SurvivalKnown)

non_Outliers_ValidAge_SurvivalKnown$Survived <- as.factor(non_Outliers_ValidAge_SurvivalKnown$Survived)

fare_equation_ValidAge_SurvialKnown = "Age ~ Pclass + Sex + Fare + SibSp + Parch + Survived"
fare_equation_ValidAge_SurvialKnown.formula <- as.formula(fare_equation_ValidAge_SurvialKnown)
#DO use Survived for this subset, survival is known
Age_model_ValidAge_SurvivalKnown <- lm(
  formula = fare_equation_ValidAge_SurvialKnown.formula,
  data = non_Outliers_ValidAge_SurvivalKnown
)

#summary(Age_model_ValidAge_SurvivalKnown)

#confint(Age_model_ValidAge_SurvivalKnown$fitted.values)


pertinentRowsColumns_Age_SurvivalKnown<- SurvivalKnownWithInvalidAge[,
             c(
               "Pclass",
               "Sex",
               "Fare",
               "SibSp",
               "Parch",
               "Survived"
               )
            ]


PredictedAgesWhereSurvivalStatusIsKnown <- predict(Age_model_ValidAge_SurvivalKnown, newdata = pertinentRowsColumns_Age_SurvivalKnown )

(cat("Predicted Median Age where Survival Status is known:",median(PredictedAgesWhereSurvivalStatusIsKnown)))

## Predicted Median Age where Survival Status is known: 28.73947

## NULL

Our median of our predicted values is sensible.

So we figured out the ages of those who had NA Ages when we knew their survival status. Next will be a) getting this data into the right dataframe in the right place and b) predicting via linear Regression those in the test set whose survival status is unknown from the Kaggle dataset.

library(tidyverse)
(ggplot(data = SurvivalUnknownWithValidAge, aes(y = Age, ))+
  geom_boxplot()+
   ggtitle("Distribution of Age (With Survival Unknown from Kaggle)")+
    theme(plot.title = element_text(hjust = 0.5))+
  coord_flip())

#Ignore warning, we're going to find the ages
upperWhiskerValidAgeSurvivalUnknown <-  boxplot.stats(SurvivalUnknownWithValidAge$Age)$stats[5]

outlier_filter_ValidAge_SurvivalUnknown <- SurvivalUnknownWithValidAge$Age < upperWhiskerValidAgeSurvivalUnknown

table(outlier_filter_ValidAge_SurvivalUnknown)

## outlier_filter_ValidAge_SurvivalUnknown
## FALSE  TRUE 
##     5   327

non_Outliers_ValidAge_SurvivalUnknown <- SurvivalUnknownWithValidAge[outlier_filter_ValidAge_SurvivalUnknown,]

#Make sure none of our independent variables have NAs:
#printRemainingNAs(non_Outliers_ValidAge_SurvivalUnknown)

fare_equation_ValidAge_SurvialUnknown = "Age ~ Pclass + Sex + Fare + SibSp + Parch" #+ Survived"
fare_equation_ValidAge_SurvialUnknown.formula = as.formula(fare_equation_ValidAge_SurvialUnknown)
#Do NOT use Survived for this subset, survival is unknown
Age_model_ValidAge_SurvivalUnknown <- lm(
  formula = fare_equation_ValidAge_SurvialUnknown.formula,
  data = non_Outliers_ValidAge_SurvivalUnknown
)



pertinentRowsColumns_Age_SurvivalUnknown<- SurvivalUnknownWithInvalidAge[,
             c(
               "Pclass",
               "Sex",
               "Fare",
               "SibSp",
               "Parch"
               #,
               #"Survived"
               )
            ]


PredictedAgesWhereSurvivalStatusIsUnknown <- predict(Age_model_ValidAge_SurvivalUnknown, newdata = pertinentRowsColumns_Age_SurvivalUnknown )

cat("Median Predicted Age where Survival status is uknown:",median(PredictedAgesWhereSurvivalStatusIsUnknown))

## Median Predicted Age where Survival status is uknown: 24.95219

So now we can put the ages into the titanic_full dataset.

#printRemainingNAs(titanic_full)

#to get rid of junk above:
AgeIsNAandSurivialKnownFilter <- (!(is.na(titanic_full$Survived))) & (is.na(titanic_full$Age))
titanic_full[AgeIsNAandSurivialKnownFilter,]$Age <- PredictedAgesWhereSurvivalStatusIsKnown

#We'll see some Ages have successfully been put back into Titanic_full with the lind below:
#printRemainingNAs(titanic_full)

AgeisNAAndSurivialUnknownFilter <- (is.na(titanic_full$Survived)) & (is.na(titanic_full$Age))
table(AgeisNAAndSurivialUnknownFilter)

## AgeisNAAndSurivialUnknownFilter
## FALSE  TRUE 
##  1223    86

titanic_full[AgeisNAAndSurivialUnknownFilter,]$Age <- PredictedAgesWhereSurvivalStatusIsUnknown

printRemainingNAs(titanic_full)

## NAs in titanic_full:
## PassengerId : 0
## Survived : 418
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 0
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 1
## Cabin : 1014
## Embarked : 0
## IsTrainSet : 0

All the NAs in Age have been cleaned up using a linear model. Mose were predicted using their Survived Status, where available. ## “Fare” Column

There is one NA value in fare.

Again, we could simply replace that one missing datapoint with the median. The following would be the code fo that approach. The simple approach of using the median might be much more sensible for a very large dataset where regressions could be expensive.

fare.median <- median(titanic_full$Fare, na.rm = T)
(cat("The median fare in the titanic set, without the two missing data points, is:", fare.median))

## The median fare in the titanic set, without the two missing data points, is: 14.4542

## NULL

#To Apply Median to column, use this line:
#titanic_full[is.na(titanic_full$Fare),"Fare"] <- fare.median

The median fare, without the one missing value, is ~14.45

To Calculate a plausible model, we’ll first remove Fare’s outliers.

(ggplot(data = titanic_full[(!(is.na(titanic_full$Fare))),], aes(y = Fare))+
  geom_boxplot()+
   ggtitle("Distribution of Fares")+
    theme(plot.title = element_text(hjust = 0.5))+
  coord_flip())

#color = Sex is interesting

#take away the tailing $stats to see more than the IQR and whisker cutoffs
boxplot.stats(titanic_full$Fare)$stats

## [1]  0.0000  7.8958 14.4542 31.2750 65.0000

#to get upper bound, add [5]

UpperWhiskerFare <- boxplot.stats(titanic_full$Fare)$stats[5]

outlier_filter_Fare <- titanic_full$Fare < UpperWhiskerFare
table(outlier_filter_Fare)

## outlier_filter_Fare
## FALSE  TRUE 
##   176  1132

non_Outliers_Fare <- titanic_full[outlier_filter_Fare,]

Now that outliers have been removed, let’s model Fare using several columns.

#Make sure none of our independent variables have NAs:
#printRemainingNAs()

fare_equation = "Fare ~ Pclass + Sex + Age + SibSp + Parch + Embarked"
fare_equation.formula <- as.formula(fare_equation)
#Don't use Survived because future data won't have it
fare_model <- lm(
  formula = fare_equation.formula,
  data = non_Outliers_Fare
)

pertinentFareRowsColumns<- titanic_full[
  is.na(titanic_full$Fare),
             c(
               "Pclass",
               "Sex",
               "Age",
               "SibSp",
               "Parch",
               "Embarked"
               )
            ]


PredictedFare <- (predict(fare_model, newdata = pertinentFareRowsColumns ))


titanic_full[(is.na( titanic_full$Fare)),"Fare"] <- PredictedFare

printRemainingNAs(titanic_full)

## NAs in titanic_full:
## PassengerId : 0
## Survived : 418
## Pclass : 0
## Name : 0
## Sex : 0
## Age : 0
## SibSp : 0
## Parch : 0
## Ticket : 0
## Fare : 0
## Cabin : 1014
## Embarked : 0
## IsTrainSet : 0

The missing fare value has been replaced with its predicted one.

#19:58
#Categorical Casting
str(titanic_full)

## spec_tbl_df [1,309 x 13] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ PassengerId: num [1:1309] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : num [1:1309] 0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : num [1:1309] 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr [1:1309] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr [1:1309] "male" "female" "female" "female" ...
##  $ Age        : num [1:1309] 22 38 26 35 35 ...
##  $ SibSp      : num [1:1309] 1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num [1:1309] 0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr [1:1309] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num [1:1309] 7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr [1:1309] NA "C85" NA "C123" ...
##  $ Embarked   : chr [1:1309] "S" "C" "S" "S" ...
##  $ IsTrainSet : logi [1:1309] TRUE TRUE TRUE TRUE TRUE TRUE ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PassengerId = col_double(),
##   ..   Survived = col_double(),
##   ..   Pclass = col_double(),
##   ..   Name = col_character(),
##   ..   Sex = col_character(),
##   ..   Age = col_double(),
##   ..   SibSp = col_double(),
##   ..   Parch = col_double(),
##   ..   Ticket = col_character(),
##   ..   Fare = col_double(),
##   ..   Cabin = col_character(),
##   ..   Embarked = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

titanic_full$Pclass <-  as.factor(titanic_full$Pclass)
titanic_full$Sex <- as.factor(titanic_full$Sex)
titanic_full$Embarked <- as.factor(titanic_full$Embarked)

titanic_train <-  titanic_full[titanic_full$IsTrainSet==T,]

titanic_test <-  titanic_full[titanic_full$IsTrainSet==F,]

titanic_train$Survived <-  as.factor(titanic_train$Survived)

str(titanic_train)

## tibble [891 x 13] (S3: tbl_df/tbl/data.frame)
##  $ PassengerId: num [1:891] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
##  $ Pclass     : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr [1:891] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age        : num [1:891] 22 38 26 35 35 ...
##  $ SibSp      : num [1:891] 1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num [1:891] 0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr [1:891] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num [1:891] 7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr [1:891] NA "C85" NA "C123" ...
##  $ Embarked   : Factor w/ 3 levels "C","Q","S": 3 1 3 3 3 2 3 3 3 1 ...
##  $ IsTrainSet : logi [1:891] TRUE TRUE TRUE TRUE TRUE TRUE ...

#TODO Solve "Embarked"

Survived.equation <- "Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare+ Embarked"

Survived.formula <- as.formula(Survived.equation)

#options("install.lock"=F)
#install.packages("randomForest", dependencies = TRUE)

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

#printRemainingNAs(titanic_train)

titanic_model <- randomForest(formula = Survived.formula, data = titanic_train, ntree = 500, mtry = 3, nodesize = 0.1 * nrow(titanic_train))
#maybe last paramater is wrong.  These paramater were taken googling somehwere, but I lost the link

Features.equation <-  "Pclass + Sex + Age + SibSp + Parch + Fare + Embarked"

Survived <-  predict(titanic_model, newdata = titanic_test)

PassengerId <- titanic_test$PassengerId

output.df <- as.data.frame(PassengerId)

output.df$Survived <-  Survived

write.csv(output.df, file = "kaggle_submission.csv", row.names = F)

#((range(titanic_full$Fare, na.rm = T)))

proportionWhoDieInPredictions <- table(output.df$Survived)[1] / sum(table(output.df$Survived)[1] , table(output.df$Survived)[2])

proportionWhoDieInAllData <- table(titanic_full$Survived)[1] / sum(table(titanic_full$Survived)[1] , table(titanic_full$Survived)[2])

61 % of the passengers died in all the data where as 64.5% of the passengers died in the Kaggle test set. I feel like that’s possibly a reasonable guess.

This work got a 76% on Kaggle’s competition

Tableau And API Section - unfinished

Professor, this is where I have to admit I didn’t accomplish all I’d set out to do. I wanted to have a Tableau widget with sliders that could be slid back and forth and you could see where you might live or die on the Titanic. I felt it was a little silly with my model’s ~76% accuracy overall. I also struggled to make a CSV that would work well with Tableau. I figured I’d have to generate a lot of fake data and then predict on that with the models made above. Ultimately I didn’t finish this portion. I’m including it because I succeeded in calling APIs to convert 2021 US Dollars to 1912 Great British Pounds. And I used a library specifically to make the API call. So I do think this work could be made to work but I spent most oy my time trying to improve the model above instead of working on this portion.

#library(tidyverse)

#tableauDF <- data_frame(#Age = 1:100, 
                        #TicketPrice2021Dollars = seq(1000,150000,1500),
                        #)

table(titanic_train$Pclass)

## 
##   1   2   3 
## 216 184 491

tableauDF <- data_frame(Age = 0, TicketPrice2021Dollars =0, Sex = "male", Pclass =0)

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

for (i in 1:100) {
  ages <- rep(i, each=100)
  dollaryDoos <- seq(0,79999,800)
  tempDF <- data_frame(Age =ages, TicketPrice2021Dollars = dollaryDoos, Sex ="male")
  tempDF <- rbind(tempDF,tempDF)
  tempDF <- rbind(tempDF,tempDF[1:100,])
  Pclass <- rep(1:3, each=100)
  tempDF$Pclass <- Pclass
  tableauDF <-  rbind(tableauDF,tempDF)
}


for (i in 1:100) {
  ages <- rep(i, each=100)
  dollaryDoos <- seq(0,79999,800)
  tempDF <- data_frame(Age =ages, TicketPrice2021Dollars = dollaryDoos, Sex ="female")
  tempDF <- rbind(tempDF,tempDF)
  tempDF <- rbind(tempDF,tempDF[1:100,])
  Pclass <- rep(1:3, each=100)
  tempDF$Pclass <- Pclass
  tableauDF <-  rbind(tableauDF,tempDF)
}


# #this worked earlier without Pclass:
# for (i in 1:100) {
#   ages <- rep(i, each=100)
#   dollaryDoos <- seq(0,79999,800)
#   tempDF <- data_frame(Age =ages, TicketPrice2021Dollars = dollaryDoos, Sex ="female")
#   tableauDF <-  rbind(tableauDF,tempDF)
# }

tableauDF <- tableauDF[tableauDF$Age>0,]

I’m going to use Fixer.io for currency conversions. I only get 100 free API Calls / month, so please test sparingly! I’ll also use the fixerapi library to simplify API calls.
https://cran.r-project.org/web/packages/fixerapi/vignettes/introduction.html The Fare column is in British Pounds (https://betterprogramming.pub/titanic-survival-prediction-using-machine-learning-4c5ff1e3fa16) So We’ll Convert Modern Dollars to British Pounds

#install.packages("fixerapi")
library(fixerapi)

FixerAPIKey <- "c9da32e36f7357bf6017d34bd349906b"
#The following line didn't work...:
#fixer_api_key(FixerAPIKey)

Sys.setenv(FIXER_API_KEY = FixerAPIKey)

print(fixer_symbols())

## # A tibble: 168 x 2
##    name  value                        
##    <chr> <chr>                        
##  1 AED   United Arab Emirates Dirham  
##  2 AFN   Afghan Afghani               
##  3 ALL   Albanian Lek                 
##  4 AMD   Armenian Dram                
##  5 ANG   Netherlands Antillean Guilder
##  6 AOA   Angolan Kwanza               
##  7 ARS   Argentine Peso               
##  8 AUD   Australian Dollar            
##  9 AWG   Aruban Florin                
## 10 AZN   Azerbaijani Manat            
## # ... with 158 more rows

#For the free version, you have to use a base of Euro.  Then I'll convert Dollars to Euros to Pounds. If you try to set base to USD, expect error 105

#TO CALL API, uncomment the line below:
#DollarsToPoundsFromAPI <- fixer_latest(base= "EUR",
                                       #symbols = c("GBP", "USD"))

#These Results were received on 12/4/2021:
EuroToDollars <- 1.131344
EuroToPounds <- 0.854570

USDToGBPConversionRate <- ((1 / EuroToDollars) * EuroToPounds)

# https://www.officialdata.org/uk/inflation/1912?amount=1
#I looked for an API call for this data but couldn't find one.
priceDifference1912To2021Pounds <-  119.96

numYears <- 2021 - 1912

#convert 0's to 1's?

ZeroToOneFilter <- tableauDF$TicketPrice2021Dollars == 0

tableauDF[ZeroToOneFilter,]$TicketPrice2021Dollars =1


tableauDF$USDtoGBP2021Rate <- USDToGBPConversionRate

tableauDF <- tableauDF %>% mutate(TicketPrice2021Pounds = TicketPrice2021Dollars * USDtoGBP2021Rate)

tableauDF <-  tableauDF%>%
  mutate(Fare =  (TicketPrice2021Pounds /priceDifference1912To2021Pounds))

There’s admittedly some rounding errors going on here, particularly with the 199.96.

Survived.Tableau.equation <- "Survived ~ Age + Sex + Fare + Pclass"

Survived.Tableau.formula <- as.formula(Survived.Tableau.equation)

#options("install.lock"=F)
#install.packages("randomForest", dependencies = TRUE)

library(randomForest)

#printRemainingNAs(titanic_train)

age.outlier <- boxplot.stats(titanic_full$Age)$stats[5]
fare.outlier <- boxplot.stats(titanic_full$Fare)$stats[5]

titanic_trainOutlierFilter <- (titanic_train$Age < age.outlier & titanic_train$Fare < fare.outlier )

table(titanic_trainOutlierFilter)

## titanic_trainOutlierFilter
## FALSE  TRUE 
##   144   747

titanic_train_Outliers_removed <- titanic_train[titanic_trainOutlierFilter,]

titanic_Tableau_model <- randomForest(formula = Survived.Tableau.formula, data = titanic_train_Outliers_removed, ntree = 500, mtry = 3, nodesize = 0.1 * nrow(titanic_train_Outliers_removed))

(proportionWhoDieInTableau <- sum(tableauDF$Survived) / nrow(tableauDF))

## Warning: Unknown or uninitialised column: `Survived`.

## [1] 0

tableauDF$Sex <- as.factor(tableauDF$Sex)
tableauDF$Pclass <- as.factor(tableauDF$Pclass)

Predictions2 <-  predict(titanic_Tableau_model, newdata = tableauDF[,c("Age","Sex", "Fare", "Pclass")])

table(Predictions2)

## Predictions2
##     0     1 
## 34201 25799

tableauDF$Survived <- Predictions2


#(predict(fare_model, newdata = pertinentFareRowsColumns ))

I tired to convert survived to a numerical and do a linear model. But it gave bizzare results. Ultimatley, linear models aren’t good for categorical predictions like “Survived”.

# 
# titanic_train_Outliers_removed$Sex <- as.factor(titanic_train_Outliers_removed$Sex)
# titanic_train_Outliers_removed$Survived <- as.factor(titanic_train_Outliers_removed$Survived)
# #This is not wha'ts meant to happen, but let's try it out.
# LMTableauModel <- lm(
#   formula = Survived.Tableau.formula,
#   data = titanic_train_Outliers_removed
# )
# 
# (Predictions3 <-  predict(LMTableauModel, newdata = tableauDF[,c("Age","Sex", "Fare")]))

write.csv(tableauDF[,c(1:4,7,8)], file = "TableauData.csv", row.names = F)

FinalProject2_NCollin_607

Noah Collin

11/29/2021