library(readr)## Warning: package 'readr' was built under R version 4.1.1
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v stringr 1.4.0
## v tidyr 1.1.3 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Read the three Titanic CSV files from the working directory.
# The `##` lines below each call are the console messages readr printed.
titanic_b2 <- read_csv("gender_submission.csv")
## Rows: 418 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): PassengerId, Survived
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
titanic_b1 <- read_csv("test.csv")
## Rows: 418 Columns: 11
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (6): PassengerId, Pclass, Age, SibSp, Parch, Fare
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
titanic_a <- read_csv("train.csv")
## Rows: 891 Columns: 12
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (5): Name, Sex, Ticket, Cabin, Embarked
## dbl (7): PassengerId, Survived, Pclass, Age, SibSp, Parch, Fare
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# test.csv has no Survived column; take it from gender_submission.csv by
# matching on PassengerId so the result has the same 12 columns as train.csv.
# NOTE(review): confirm that the Survived values in gender_submission.csv are
# observed outcomes (not placeholder predictions) before analysing these rows.
titanic_b <- dplyr::left_join(titanic_b2, titanic_b1, by = "PassengerId")

# Stack the train.csv rows on top of the joined rows.
titanic <- rbind.data.frame(titanic_a, titanic_b)

# Inspect the structure of the combined table.
str(titanic)
## spec_tbl_df [1,309 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ PassengerId: num [1:1309] 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : num [1:1309] 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : num [1:1309] 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr [1:1309] "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr [1:1309] "male" "female" "female" "female" ...
## $ Age : num [1:1309] 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : num [1:1309] 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : num [1:1309] 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr [1:1309] "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num [1:1309] 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr [1:1309] NA "C85" NA "C123" ...
## $ Embarked : chr [1:1309] "S" "C" "S" "S" ...
## - attr(*, "spec")=
## .. cols(
## .. PassengerId = col_double(),
## .. Survived = col_double(),
## .. Pclass = col_double(),
## .. Name = col_character(),
## .. Sex = col_character(),
## .. Age = col_double(),
## .. SibSp = col_double(),
## .. Parch = col_double(),
## .. Ticket = col_character(),
## .. Fare = col_double(),
## .. Cabin = col_character(),
## .. Embarked = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
# Survived and Pclass are read as numbers but are categories, so store them
# as factors; later plots use them for fill colours and facets.
titanic$Survived <- as.factor(titanic$Survived)
titanic$Pclass <- as.factor(titanic$Pclass)

# Preview the first 10 rows.
head(titanic, 10)
## # A tibble: 10 x 12
## PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin
## <dbl> <fct> <fct> <chr> <chr> <dbl> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 0 3 Braun~ male 22 1 0 A/5 2~ 7.25 <NA>
## 2 2 1 1 Cumin~ fema~ 38 1 0 PC 17~ 71.3 C85
## 3 3 1 3 Heikk~ fema~ 26 0 0 STON/~ 7.92 <NA>
## 4 4 1 1 Futre~ fema~ 35 1 0 113803 53.1 C123
## 5 5 0 3 Allen~ male 35 0 0 373450 8.05 <NA>
## 6 6 0 3 Moran~ male NA 0 0 330877 8.46 <NA>
## 7 7 0 1 McCar~ male 54 0 0 17463 51.9 E46
## 8 8 0 3 Palss~ male 2 3 1 349909 21.1 <NA>
## 9 9 1 3 Johns~ fema~ 27 0 2 347742 11.1 <NA>
## 10 10 1 2 Nasse~ fema~ 14 1 0 237736 30.1 <NA>
## # ... with 1 more variable: Embarked <chr>
Apa saja yang mungkin berpengaruh?

- Kelas Passenger
- Umur
- Sex
- Lokasi dari passenger
- tempat tinggal
- area rekreasi
- area dining
- Fare(?)
Masih bingung:

- SibSp
- Parch
- Embarked
# Interactive box plot of passenger age, split by sex.
#
# The rows go to geom_boxplot() unaggregated. Collapsing the data to one row
# per distinct (Sex, Age) pair first would make the box plot summarise the set
# of distinct ages instead of the passengers, so common ages would count no
# more than rare ones. Rows with a missing Age are dropped explicitly because
# they cannot be placed on the y axis.
ggplotly(
  titanic %>%
    dplyr::filter(!is.na(Age)) %>%
    ggplot(mapping = aes(x = Sex, y = Age)) +
    geom_boxplot(mapping = aes(fill = Sex))
)
# Kernel density estimate of passenger age, with the y axis expressed as a
# count (after_stat(count)) rather than a density.
titanic %>%
  ggplot(mapping = aes(x = Age, y = after_stat(count))) +
  geom_density()
## Warning: Removed 263 rows containing non-finite values (stat_density).
# Age histogram with a kernel density curve overlaid on the same scale.
#
# A histogram bar's height is the number of rows in its bin, which is roughly
# n * density * bin width. geom_density()'s after_stat(count) is n * density.
# The curve is therefore multiplied by the bin width so that both layers share
# one y scale; without this the curve is too low by a factor of the bin width.
#
# The bin width is derived from the observed age range so the histogram still
# has about 30 bins.
age_binwidth <- diff(range(titanic$Age, na.rm = TRUE)) / 30

age_hist <- titanic %>%
  ggplot(mapping = aes(x = Age)) +
  geom_histogram(
    mapping = aes(y = after_stat(count)),
    fill = "#ADD8E6",
    color = "black",
    binwidth = age_binwidth
  ) +
  geom_density(
    mapping = aes(y = after_stat(count * age_binwidth)),
    kernel = "gaussian",
    color = "Black"
  )

# Render the layered plot as an interactive plotly widget.
ggplotly(age_hist)
## Warning: Removed 263 rows containing non-finite values (stat_bin).
## Warning: Removed 263 rows containing non-finite values (stat_density).
Pada histogram, setiap batang mewakili suatu rentang nilai. Bagaimana cara menampilkan rentang tersebut pada label di plotly? Ekstrak variabel/rumusnya.
Lalu, apakah geom_density di sini sudah benar?
# Single box plot of ticket fares (Fare on the y axis), then shown as an
# interactive plotly widget.
fare_box <- ggplot(data = titanic, mapping = aes(y = Fare)) +
  geom_boxplot(fill = "#3EB489")

ggplotly(fare_box)
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# Histogram of ticket fares using 40 bins, then shown as an interactive
# plotly widget.
fare_hist <- ggplot(
  data = titanic,
  mapping = aes(x = Fare, y = after_stat(count))
) +
  geom_histogram(bins = 40, color = "black", fill = "#ADD8E6")

ggplotly(fare_hist)
## Warning: Removed 1 rows containing non-finite values (stat_bin).
# Jittered scatter plot of Fare against Age, with points coloured by Survived.
ggplot(data = titanic, mapping = aes(x = Age, y = Fare)) +
  geom_jitter(mapping = aes(colour = Survived))
## Warning: Removed 264 rows containing missing values (geom_point).
# Survival counts and shares per sex.
#
# Columns of the result:
#   Sex, Survived - the combination being counted
#   Freq          - number of rows with that combination
#   Chance        - Freq divided by the total Freq for the same Sex
#
# The share is computed within each Sex group rather than from fixed row
# positions, so it does not depend on the row order or on every Sex having
# exactly two Survived values. The result is ungrouped.
sex_surv_df <- titanic %>%
  dplyr::count(Sex, Survived, name = "Freq") %>%
  group_by(Sex) %>%
  mutate(Chance = Freq / sum(Freq)) %>%
  ungroup()
# Bar chart of the Chance column by Survived value, one panel per Sex.
sex_surv_bar <- sex_surv_df %>%
  ggplot(mapping = aes(x = Survived, y = Chance)) +
  geom_col(mapping = aes(fill = Survived)) +
  facet_wrap(~ Sex)
# Show the per-sex survival bar chart as an interactive plotly widget.
ggplotly(sex_surv_bar)

# Number of survivors (Survived == 1) by sex, one panel per passenger class.
# geom_bar() counts rows itself, so no grouping or column selection is needed
# before plotting.
titanic %>%
  dplyr::filter(Survived == 1) %>%
  ggplot(mapping = aes(x = Sex)) +
  geom_bar(mapping = aes(fill = Sex)) +
  facet_grid(Survived ~ Pclass, scales = "free_x")

# Author's open question: how can this plot be made more presentable, and how
# can the labels be changed?
#   - 0 -> Didn't Survive
#   - 1 -> Survive
#   - 1/2/3 -> Passenger Class
sapply(), match(), dplyr::case_when()