library(dplyr)## Warning: package 'dplyr' was built under R version 3.2.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library (ggplot2)## Warning: package 'ggplot2' was built under R version 3.2.5
library(titanic)#library(sparklyr)
# sc <- spark_connect(master = "local")

# Bind the test and training data frames used throughout the analysis.
# (titanic_test / titanic_train are presumably supplied by the titanic
# package loaded above.)
# The `test` assignment used to sit at the end of the commented-out
# spark_connect line, which meant it was never executed.
test <- titanic_test
train <- titanic_train
head(test)
## PassengerId Pclass Name Sex
## 1 892 3 Kelly, Mr. James male
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female
## 3 894 2 Myles, Mr. Thomas Francis male
## 4 895 3 Wirz, Mr. Albert male
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6 897 3 Svensson, Mr. Johan Cervin male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 1 34.5 0 0 330911 7.8292 Q
## 2 47.0 1 0 363272 7.0000 S
## 3 62.0 0 0 240276 9.6875 Q
## 4 27.0 0 0 315154 8.6625 S
## 5 22.0 1 1 3101298 12.2875 S
## 6 14.0 0 0 7538 9.2250 S
str(train)## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
dim(test)
## [1] 418 11
dim(train)
## [1] 891 12

# Turn the Survived variable from int to factor.
train$Survived <- factor(train$Survived)

# Create a placeholder Survived column in test before combining, and a
# `dataset` marker column in both frames so the two sets can be told apart
# (and re-sorted) after they have been stacked.
test <- mutate(test, Survived = "none", dataset = "testset")
train <- mutate(train, dataset = "trainset")

# Combine datasets for feature engineering (test rows first, then train).
titanic.combined <- rbind(test, train)
str(titanic.combined)
## 'data.frame': 1309 obs. of 13 variables:
## $ PassengerId: int 892 893 894 895 896 897 898 899 900 901 ...
## $ Pclass : int 3 3 2 3 3 3 3 2 3 3 ...
## $ Name : chr "Kelly, Mr. James" "Wilkes, Mrs. James (Ellen Needs)" "Myles, Mr. Thomas Francis" "Wirz, Mr. Albert" ...
## $ Sex : chr "male" "female" "male" "male" ...
## $ Age : num 34.5 47 62 27 22 14 30 26 18 21 ...
## $ SibSp : int 0 1 0 0 1 0 0 1 0 2 ...
## $ Parch : int 0 0 0 0 1 0 0 1 0 0 ...
## $ Ticket : chr "330911" "363272" "240276" "315154" ...
## $ Fare : num 7.83 7 9.69 8.66 12.29 ...
## $ Cabin : chr "" "" "" "" ...
## $ Embarked : chr "Q" "S" "Q" "S" ...
## $ Survived : chr "none" "none" "none" "none" ...
## $ dataset : chr "testset" "testset" "testset" "testset" ...
# Make a local tibble copy for simplicity.
# as_tibble() replaces the deprecated tbl_df().
data <- as_tibble(titanic.combined)

# Factorize Pclass, dataset and Survived.
data$Pclass <- factor(data$Pclass)
data$dataset <- factor(data$dataset)
data$Survived <- factor(data$Survived)

# Check for duplicates.
# One row per distinct PassengerId; compare the row count with nrow(data).
IDdups <- distinct(data, PassengerId)
dim(IDdups)
## [1] 1309 1
# PassengerId values are all distinct.

# Same check on passenger names.
Namedups <- distinct(data, Name)
dim(Namedups)
## [1] 1307 1
# There are 2 duplicate names, which is possible, but all PassengerId values
# are distinct. Let's investigate:

# duplicated() flags only the second and later occurrences of a name.
filter(data, duplicated(Name))
## Warning: package 'bindrcpp' was built under R version 3.2.5
## # A tibble: 2 x 13
## PassengerId Pclass Name Sex Age SibSp Parch Ticket
## <int> <fctr> <chr> <chr> <dbl> <int> <int> <chr>
## 1 290 3 Connolly, Miss. Kate female 22 0 0 370373
## 2 697 3 Kelly, Mr. James male 44 0 0 363592
## # ... with 5 more variables: Fare <dbl>, Cabin <chr>, Embarked <chr>,
## # Survived <fctr>, dataset <fctr>
# List every passenger whose name contains "Kelly" or "Connolly" so the
# duplicated names can be compared side by side.
# The previous call passed Age as a third positional argument to grepl(),
# where it was consumed as `ignore.case`; it has been removed, so the match
# is now an explicit, case-sensitive one.
filter(data, grepl("Kelly|Connolly", Name))
## # A tibble: 7 x 13
## PassengerId Pclass Name Sex
## <int> <fctr> <chr> <chr>
## 1 892 3 Kelly, Mr. James male
## 2 898 3 Connolly, Miss. Kate female
## 3 290 3 Connolly, Miss. Kate female
## 4 301 3 "Kelly, Miss. Anna Katherine \"Annie Kate\"" female
## 5 574 3 Kelly, Miss. Mary female
## 6 697 3 Kelly, Mr. James male
## 7 707 2 "Kelly, Mrs. Florence \"Fannie\"" female
## # ... with 9 more variables: Age <dbl>, SibSp <int>, Parch <int>,
## # Ticket <chr>, Fare <dbl>, Cabin <chr>, Embarked <chr>,
## # Survived <fctr>, dataset <fctr>
# Are there any missing values?
# Count NA values per column. vapply() pins the result to one integer per
# column, whereas sapply() lets the return type depend on the input.
# NOTE(review): only NA is counted; empty strings (Cabin shows "" values in
# the str() output) are not treated as missing here.
vapply(data, function(x) sum(is.na(x)), integer(1))
## PassengerId Pclass Name Sex Age SibSp
## 0 0 0 0 263 0
## Parch Ticket Fare Cabin Embarked Survived
## 0 0 1 0 0 0
## dataset
## 0
# Age is missing in a lot of these rows. Consider dropping or imputing them.
# Drop NA rows:
# data <- na.omit(data)
# Finally, let's look at some descriptive statistics:
# Per-column descriptive statistics. `data` is already a tibble, so the
# deprecated tbl_df() wrapper is not needed.
summary(data)
## PassengerId Pclass Name Sex
## Min. : 1 1:323 Length:1309 Length:1309
## 1st Qu.: 328 2:277 Class :character Class :character
## Median : 655 3:709 Mode :character Mode :character
## Mean : 655
## 3rd Qu.: 982
## Max. :1309
##
## Age SibSp Parch Ticket
## Min. : 0.17 Min. :0.0000 Min. :0.000 Length:1309
## 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000 Class :character
## Median :28.00 Median :0.0000 Median :0.000 Mode :character
## Mean :29.88 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.00 Max. :8.0000 Max. :9.000
## NA's :263
## Fare Cabin Embarked Survived
## Min. : 0.000 Length:1309 Length:1309 0 :549
## 1st Qu.: 7.896 Class :character Class :character 1 :342
## Median : 14.454 Mode :character Mode :character none:418
## Mean : 33.295
## 3rd Qu.: 31.275
## Max. :512.329
## NA's :1
## dataset
## testset :418
## trainset:891
##
##
##
##
##
head(data)## # A tibble: 6 x 13
## PassengerId Pclass Name Sex
## <int> <fctr> <chr> <chr>
## 1 892 3 Kelly, Mr. James male
## 2 893 3 Wilkes, Mrs. James (Ellen Needs) female
## 3 894 2 Myles, Mr. Thomas Francis male
## 4 895 3 Wirz, Mr. Albert male
## 5 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female
## 6 897 3 Svensson, Mr. Johan Cervin male
## # ... with 9 more variables: Age <dbl>, SibSp <int>, Parch <int>,
## # Ticket <chr>, Fare <dbl>, Cabin <chr>, Embarked <chr>,
## # Survived <fctr>, dataset <fctr>
# Take a slice of the data (the training rows) to investigate distributional properties.
# Keep only complete training rows.
# Rows are selected through the `dataset` marker instead of the fixed
# positions 419:1309: those positions were valid for the full 1309-row table,
# but na.omit() removes rows first, so the positional slice silently dropped
# the first training rows. The recorded output that follows (starting at
# PassengerId 113, 627 observations) reflects that older, truncated slice.
trainset <- data %>%
  na.omit() %>%
  filter(dataset == "trainset")
head(trainset)
## # A tibble: 6 x 13
## PassengerId Pclass Name Sex Age SibSp
## <int> <fctr> <chr> <chr> <dbl> <int>
## 1 113 3 Barton, Mr. David John male 22.0 0
## 2 114 3 Jussila, Miss. Katriina female 20.0 1
## 3 115 3 Attalah, Miss. Malake female 17.0 0
## 4 116 3 Pekoniemi, Mr. Edvard male 21.0 0
## 5 117 3 Connors, Mr. Patrick male 70.5 0
## 6 118 2 Turpin, Mr. William John Robert male 29.0 1
## # ... with 7 more variables: Parch <int>, Ticket <chr>, Fare <dbl>,
## # Cabin <chr>, Embarked <chr>, Survived <fctr>, dataset <fctr>
glimpse(trainset);## Observations: 627
## Variables: 13
## $ PassengerId <int> 113, 114, 115, 116, 117, 118, 119, 120, 121, 123, ...
## $ Pclass <fctr> 3, 3, 3, 3, 3, 2, 1, 3, 2, 2, 2, 1, 3, 3, 3, 3, 3...
## $ Name <chr> "Barton, Mr. David John", "Jussila, Miss. Katriina...
## $ Sex <chr> "male", "female", "female", "male", "male", "male"...
## $ Age <dbl> 22.0, 20.0, 17.0, 21.0, 70.5, 29.0, 24.0, 2.0, 21....
## $ SibSp <int> 0, 1, 0, 0, 0, 1, 0, 4, 2, 1, 0, 0, 1, 0, 0, 0, 0,...
## $ Parch <int> 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0,...
## $ Ticket <chr> "324669", "4136", "2627", "STON/O 2. 3101294", "37...
## $ Fare <dbl> 8.0500, 9.8250, 14.4583, 7.9250, 7.7500, 21.0000, ...
## $ Cabin <chr> "", "", "", "", "", "", "B58 B60", "", "", "", "E1...
## $ Embarked <chr> "S", "S", "C", "S", "Q", "S", "C", "S", "S", "C", ...
## $ Survived <fctr> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0...
## $ dataset <fctr> trainset, trainset, trainset, trainset, trainset,...
# Plotting helper functions:
# MULTIPLOT -------------------------------------------
# Draw several plot objects on one page using a grid layout.
#
# Args:
#   ...:      plot objects to draw.
#   plotlist: optional list of further plot objects, appended after `...`.
#   file:     not used by this function; kept so existing calls still work.
#   cols:     number of columns of the automatically built layout.
#   layout:   optional matrix of plot indices; the cell(s) holding value i
#             mark where plot i is drawn. When NULL, a layout with `cols`
#             columns and enough rows for all plots is built.
#
# Returns:
#   With exactly one plot, the result of print() on that plot; otherwise
#   NULL, invisibly. Called for its drawing side effect.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  plots <- c(list(...), plotlist)
  num_plots <- length(plots)

  # Nothing to draw. Without this guard, 1:0 would iterate over c(1, 0) and
  # fail when indexing the empty plot list.
  if (num_plots == 0) {
    return(invisible(NULL))
  }

  if (is.null(layout)) {
    num_rows <- ceiling(num_plots / cols)
    layout <- matrix(seq_len(cols * num_rows), ncol = cols, nrow = num_rows)
  }

  if (num_plots == 1) {
    print(plots[[1]])
  } else {
    # grid functions are namespace-qualified instead of relying on
    # require(grid), which only returns FALSE when loading fails.
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )
    for (i in seq_len(num_plots)) {
      # Row/column position(s) of plot i inside the layout matrix.
      cell <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(
        plots[[i]],
        vp = grid::viewport(
          layout.pos.row = cell$row,
          layout.pos.col = cell$col
        )
      )
    }
    invisible(NULL)
  }
}
# ---------------------------------------------------
# BAR PLOT -------------------------------------------
# Draw two bar charts of one variable, filled by survival status: stacked
# counts and within-bar proportions (position = "fill"), arranged with
# multiplot().
#
# Args:
#   df_arg:  data frame with a `Survived` column.
#   var_arg: vector holding one value per row of `df_arg`
#            (e.g. `df_arg$Sex`); it is converted to a factor for the x axis.
#   colNum:  number of layout columns handed to multiplot().
plot_ggbar <- function(df_arg, var_arg, colNum) {
  x_label <- paste(deparse(substitute(var_arg)), collapse = "")
  stopifnot(length(var_arg) == nrow(df_arg))

  # Rows with any NA are dropped, as before. The plotted vectors are now
  # subset with the same mask, so they always match the plotting data in
  # length (previously full-length vectors were mapped onto
  # na.omit(df_arg), which fails as soon as a row is removed).
  keep <- complete.cases(df_arg)
  plot_df <- data.frame(
    value = factor(var_arg[keep]),
    survived = factor(df_arg$Survived[keep])
  )

  base_plot <- ggplot(plot_df, aes(value, fill = survived)) +
    labs(x = x_label, fill = "Survived")

  p1 <- base_plot + geom_bar(width = .95) # defaults to stacking
  p2 <- base_plot + geom_bar(position = "fill", width = .95)
  multiplot(p1, p2, cols = colNum)
}
# -------------------------------------------
# Survival bar charts per variable. Each call is on its own line: the first
# one used to be part of the comment rule above and therefore never ran, and
# the remaining calls were fused into a single unparseable line.
plot_ggbar(trainset, trainset$Age, 1)
## Loading required package: grid
plot_ggbar(trainset, trainset$Sex, 2)
plot_ggbar(trainset, trainset$Pclass, 2)
plot_ggbar(trainset, trainset$Fare, 1)
# library(e1071)