Dealing with missing values in Titanic dataset.
Load the titanic3.xls data into dataframe titanic.
require(xlsx)
## Loading required package: xlsx
## Loading required package: rJava
## Loading required package: xlsxjars
lapply("titanic3.xls", function(f) {
df = read.xlsx(f, sheetName = "titanic3")
write.csv(df, gsub(".xls","_original.csv", f), row.names=FALSE)
})
## [[1]]
## NULL
titanic <- read.csv("titanic3_original.csv", header = TRUE, sep = ",")
Installing packages for tidy data.
# install.packages("dplyr")
# install.packages("tidyr")
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidyr)
## Loading required package: tidyr
df <- tbl_df(titanic)
Port of embarkation variable: find the missing values and replace them with S.
df$embarked[is.na(df$embarked)] <- "S"
Age variable: use mean (or median) value to populate the missing values.
df %>% summarise(Min = min(age, na.rm=TRUE),
Median = median(age, na.rm=TRUE),
Mean = mean(age, na.rm=TRUE),
Max = max(age, na.rm=TRUE))
## # A tibble: 1 x 4
## Min Median Mean Max
## <dbl> <dbl> <dbl> <dbl>
## 1 0.1667 28 29.88113 80
age_mean <- mean(df$age, na.rm=TRUE)
df$age[is.na(df$age)] <- age_mean
Boat variable: fill missing values with a dummy value, e.g. the string ‘None’ or ‘NA’.
class(df$boat)
## [1] "factor"
df$boat <- as.character(df$boat)
df$boat[is.na(df$boat)] <- "NA"
Cabin variable: the cabin number is missing might be a useful indicator of survival, thus create a new column has_cabin_number which has 1 if there is a cabin number, and 0 otherwise.
df <- df %>% mutate(has_cabin_number = ifelse(!is.na(cabin),1,0))
Write the data in cleaned up csv file.
write.csv(df, "titanic_clean.csv", row.names=FALSE)