library(readr)
## Warning: package 'readr' was built under R version 4.1.1
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v stringr 1.4.0
## v tidyr   1.1.3     v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Load the three CSV inputs from the working directory. The `##` lines under
# each call are the column-specification messages readr printed on read.

# PassengerId plus a Survived value for the 418 test-set passengers.
# NOTE(review): this file is presumably a sample submission (predicted, not
# observed, outcomes) -- confirm before treating Survived as ground truth.
titanic_b2 <- read_csv("gender_submission.csv")
## Rows: 418 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): PassengerId, Survived
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Test-set passenger attributes (11 columns, no Survived column).
titanic_b1 <- read_csv("test.csv")
## Rows: 418 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (6): PassengerId, Pclass, Age, SibSp, Parch, Fare
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Training-set passengers (12 columns, including Survived).
titanic_a <- read_csv("train.csv")
## Rows: 891 Columns: 12
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Attach the passenger attributes from test.csv to the Survived values from
# gender_submission.csv, matching rows on PassengerId.
titanic_b <- titanic_b2 %>%
  dplyr::left_join(titanic_b1, by = "PassengerId")

# Stack the training rows on top of the joined test rows, then inspect the
# structure of the combined table.
titanic <- rbind.data.frame(titanic_a, titanic_b)
str(titanic)
## spec_tbl_df [1,309 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ PassengerId: num [1:1309] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : num [1:1309] 0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass     : num [1:1309] 3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr [1:1309] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr [1:1309] "male" "female" "female" "female" ...
##  $ Age        : num [1:1309] 22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : num [1:1309] 1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : num [1:1309] 0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr [1:1309] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num [1:1309] 7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr [1:1309] NA "C85" NA "C123" ...
##  $ Embarked   : chr [1:1309] "S" "C" "S" "S" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   PassengerId = col_double(),
##   ..   Survived = col_double(),
##   ..   Pclass = col_double(),
##   ..   Name = col_character(),
##   ..   Sex = col_character(),
##   ..   Age = col_double(),
##   ..   SibSp = col_double(),
##   ..   Parch = col_double(),
##   ..   Ticket = col_character(),
##   ..   Fare = col_double(),
##   ..   Cabin = col_character(),
##   ..   Embarked = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
# Survived and Pclass are categorical codes stored as numbers; turn them into
# factors so they are treated as discrete groups downstream.
titanic[["Survived"]] <- factor(titanic[["Survived"]])
titanic[["Pclass"]] <- factor(titanic[["Pclass"]])

# Preview the first ten rows of the combined table.
head(titanic, n = 10)
## # A tibble: 10 x 12
##    PassengerId Survived Pclass Name   Sex     Age SibSp Parch Ticket  Fare Cabin
##          <dbl> <fct>    <fct>  <chr>  <chr> <dbl> <dbl> <dbl> <chr>  <dbl> <chr>
##  1           1 0        3      Braun~ male     22     1     0 A/5 2~  7.25 <NA> 
##  2           2 1        1      Cumin~ fema~    38     1     0 PC 17~ 71.3  C85  
##  3           3 1        3      Heikk~ fema~    26     0     0 STON/~  7.92 <NA> 
##  4           4 1        1      Futre~ fema~    35     1     0 113803 53.1  C123 
##  5           5 0        3      Allen~ male     35     0     0 373450  8.05 <NA> 
##  6           6 0        3      Moran~ male     NA     0     0 330877  8.46 <NA> 
##  7           7 0        1      McCar~ male     54     0     0 17463  51.9  E46  
##  8           8 0        3      Palss~ male      2     3     1 349909 21.1  <NA> 
##  9           9 1        3      Johns~ fema~    27     0     2 347742 11.1  <NA> 
## 10          10 1        2      Nasse~ fema~    14     1     0 237736 30.1  <NA> 
## # ... with 1 more variable: Embarked <chr>

Apa saja yang mungkin bisa berpengaruh?

- Kelas Passenger
- Umur
- Sex
- Lokasi dari passenger
- tempat tinggal
- area rekreasi
- area dining
- Fare(?)

Masih bingung:

- SibSp
- Parch
- Embarked


0.1 Age Distribution of Passengers

# Interactive box plot of passenger age by sex.
# The plot is built from the raw passenger rows so every passenger contributes
# one observation. Collapsing to distinct (Sex, Age) pairs first would make the
# quartiles describe the set of unique ages instead of the passengers.
ggplotly(
  titanic %>%
    ggplot(mapping = aes(x = Sex, y = Age)) +
    geom_boxplot(aes(fill = Sex))
)
# ggplot2 warns that rows with a missing Age are removed before the box plot
# statistics are computed.
# Kernel density of passenger age, scaled to counts rather than to unit area.
titanic %>%
  ggplot(mapping = aes(x = Age, y = after_stat(count))) +
  geom_density()
## Warning: Removed 263 rows containing non-finite values (stat_density).

# Histogram of passenger age with a Gaussian kernel density curve overlaid.
#
# A histogram bar counts passengers per bin, whereas stat_density()'s `count`
# is density * n, i.e. passengers per unit of Age. Multiplying the density
# count by the bin width puts the curve on the same y scale as the bars.
# An explicit bin width (instead of `bins = 30`) is used so the scaling factor
# is known; 2.5 gives roughly the same resolution.
age_binwidth <- 2.5

age_hist <- titanic %>%
  ggplot(mapping = aes(x = Age)) +
  geom_histogram(
    aes(y = after_stat(count)),
    binwidth = age_binwidth,
    fill = "#ADD8E6",
    color = "black"
  ) +
  geom_density(
    aes(y = after_stat(count * age_binwidth)),
    kernel = "gaussian",
    color = "Black"
  )
ggplotly(age_hist)
## Warning: Removed 263 rows containing non-finite values (stat_bin).
## Warning: Removed 263 rows containing non-finite values (stat_density).

kalo hist range nya kan berupa range, untuk label di plotly nya gimana caranya nunjukin range? ekstrak var. rumusnya

trus ini geom_density nya udah bener belom?

0.2 Fare Distribution

# Box plot of ticket fares across all passengers, rendered interactively.
fare_box <- ggplot(data = titanic, mapping = aes(y = Fare)) +
  geom_boxplot(fill = "#3EB489")
ggplotly(p = fare_box)
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# Histogram of ticket fares (40 bins), rendered interactively.
fare_hist <- ggplot(
  data = titanic,
  mapping = aes(x = Fare, y = after_stat(count))
) +
  geom_histogram(bins = 40, color = "black", fill = "#ADD8E6")
ggplotly(p = fare_hist)
## Warning: Removed 1 rows containing non-finite values (stat_bin).

1 Scatterplot of Age and Fare according to how many survived

# Jittered scatter of fare against age, coloured by survival outcome.
ggplot(data = titanic, mapping = aes(x = Age, y = Fare, colour = Survived)) +
  geom_jitter()
## Warning: Removed 264 rows containing missing values (geom_point).

1.1 Chances of Survival by Gender

# Count passengers for every Sex x Survived combination (Freq), then express
# each count as a share of that sex's passengers (Chance).
# The share is computed within each Sex group instead of dividing by sums of
# hard-coded row positions, so it stays correct if the row order or the set of
# levels changes. The result is ungrouped so later steps are not affected by
# leftover grouping.
sex_surv_df <- titanic %>%
  count(Sex, Survived, name = "Freq") %>%
  group_by(Sex) %>%
  mutate(Chance = Freq / sum(Freq)) %>%
  ungroup()

# Bar chart of the survival share per outcome, one panel per sex, rendered
# interactively.
sex_surv_bar <- sex_surv_df %>%
  ggplot(mapping = aes(x = Survived, y = Chance, fill = Survived)) +
  geom_col() +
  facet_wrap(vars(Sex))

ggplotly(sex_surv_bar)

1.2 Class, Sex and Age on Survivability

# Passenger counts by sex, laid out as a grid of survival outcome (rows) by
# passenger class (columns).
#
# Both survival outcomes are kept so the rows can be compared; filtering to
# Survived == 1 left the Survived facet with a single level. The grouping and
# column-selection steps were dropped because geom_bar() does its own counting
# and only reads the mapped columns. The facet formula is written in the
# standard `rows ~ cols` form, and the labeller replaces the raw codes with
# readable strip titles.
titanic %>%
  ggplot(aes(x = Sex)) +
  geom_bar(aes(fill = Sex)) +
  facet_grid(
    Survived ~ Pclass,
    scales = "free_x",
    labeller = labeller(
      Survived = c("0" = "Didn't Survive", "1" = "Survived"),
      Pclass = c("1" = "Class 1", "2" = "Class 2", "3" = "Class 3")
    )
  )

Ini biar lebih presentable gimana ya? Untuk ganti label gimana?

- 0 -> Didn’t Survive
- 1 -> Survive
- 123 -> Passenger Class

s.apply match dplyr::case_when()