Persiapan Package
library(dplyr)
library(tidyr)
library(ggplot2)
library(arules)
Dataset
# Read the Titanic training data, keep only the passenger id and age columns,
# and discard every row that has a missing value.
data <- read.csv(
  "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
)[, c("PassengerId", "Age")] %>%
  drop_na()

# Draw 100 distinct row positions at random and keep just those rows.
# No seed is set here, so the selected rows differ from run to run.
Sampel <- sample(seq_len(nrow(data)), 100)
data <- data[Sampel, ]
head(data)
## PassengerId Age
## 714 891 32
## 503 636 28
## 358 446 4
## 624 783 29
## 470 593 47
## 516 653 21
Dataset yang digunakan adalah data penumpang Titanic, dengan sampel acak sebanyak 100 orang.
# Five-number summary plus mean of the sampled ages.
summary(data[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.75 19.00 26.00 27.46 34.00 74.00
Visualisasi Data
# Dot plot of the sampled ages, one dot per passenger.
# NOTE(review): the ggplot2 documentation states that the y-axis values of an
# x-binned dot plot are not meaningful, so the "Proportion" label may mislead.
ggplot(data = data, mapping = aes(x = Age)) +
  geom_dotplot() +
  labs(
    x = "Age",
    y = "Proportion",
    title = "Age of The 100 Passenger"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Binning : Equal Interval Width
# Equal-width binning: cut Age into 4 intervals and store the resulting factor
# in a new Age_Cat column, then list the interval labels that were produced.
data[["Age_Cat"]] <- discretize(data[["Age"]], method = "interval", breaks = 4)
levels(data[["Age_Cat"]])
## [1] "[0.75,19.1)" "[19.1,37.4)" "[37.4,55.7)" "[55.7,74]"
# For every age bin, record the largest Age observed in it. The xvalue column
# supplies the x positions of the vertical lines drawn on the next plot.
signif.lines <- summarise(
  group_by(data, Age_Cat),
  xvalue = abs(max(Age))
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Dot plot of Age coloured by equal-width bin, with a vertical line at the
# largest observed Age of each bin (taken from signif.lines).
#
# binpositions = "all" computes the dot bins from all the data together so the
# stacks line up across the Age_Cat groups, and stackgroups = TRUE stacks dots
# from different groups on top of each other. With the defaults each fill
# group is binned and stacked on its own, so dots from neighbouring categories
# can be drawn over one another close to a bin boundary.
#
# NOTE(review): the ggplot2 documentation states that the y-axis values of an
# x-binned dot plot are not meaningful, so the "Proportion" label may mislead;
# it is left unchanged here.
ggplot(data, aes(x = Age, fill = Age_Cat)) +
  geom_dotplot(stackgroups = TRUE, binpositions = "all") +
  geom_vline(data = signif.lines, aes(xintercept = xvalue)) +
  labs(
    title = "Age Binning By Interval",
    y = "Proportion",
    x = "Age"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Binning : Equal Frequency
# Equal-frequency binning: cut Age into 4 bins with method = "frequency" and
# overwrite the Age_Cat column, then list the interval labels produced.
data[["Age_Cat"]] <- discretize(data[["Age"]], method = "frequency", breaks = 4)
levels(data[["Age_Cat"]])
## [1] "[0.75,19)" "[19,26)" "[26,34)" "[34,74]"
# Largest observed Age within each equal-frequency bin. The xvalue column
# supplies the x positions of the vertical lines drawn on the next plot.
signif.lines <- summarise(
  group_by(data, Age_Cat),
  xvalue = abs(max(Age))
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Dot plot of Age coloured by equal-frequency bin, with a vertical line at the
# largest observed Age of each bin (taken from signif.lines).
#
# binpositions = "all" computes the dot bins from all the data together so the
# stacks line up across the Age_Cat groups, and stackgroups = TRUE stacks dots
# from different groups on top of each other. With the defaults each fill
# group is binned and stacked on its own, so dots from neighbouring categories
# can be drawn over one another close to a bin boundary.
#
# NOTE(review): the ggplot2 documentation states that the y-axis values of an
# x-binned dot plot are not meaningful, so the "Proportion" label may mislead;
# it is left unchanged here.
ggplot(data, aes(x = Age, fill = Age_Cat)) +
  geom_dotplot(stackgroups = TRUE, binpositions = "all") +
  geom_vline(data = signif.lines, aes(xintercept = xvalue)) +
  labs(
    title = "Age Binning By Frequency",
    y = "Proportion",
    x = "Age"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Binning : User Specification
# User-specified binning: cut Age at the fixed break points 0, 2, 39, 59 and
# Inf, naming the four resulting bins explicitly.
# NOTE(review): "Midle-Aged Adults" looks like a typo for "Middle-Aged Adults";
# it is kept as-is because the level name may be referenced elsewhere.
data[["Age_Cat"]] <- discretize(
  data[["Age"]],
  method = "fixed",
  breaks = c(0, 2, 39, 59, Inf),
  labels = c("Baby", "Young Adults", "Midle-Aged Adults", "Old Adults")
)
# Largest observed Age within each user-defined bin. The xvalue column
# supplies the x positions of the vertical lines drawn on the next plot.
signif.lines <- summarise(
  group_by(data, Age_Cat),
  xvalue = abs(max(Age))
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Dot plot of Age coloured by user-defined bin, with a vertical line at the
# largest observed Age of each bin (taken from signif.lines).
#
# binpositions = "all" computes the dot bins from all the data together so the
# stacks line up across the Age_Cat groups, and stackgroups = TRUE stacks dots
# from different groups on top of each other. With the defaults each fill
# group is binned and stacked on its own, so dots from neighbouring categories
# can be drawn over one another close to a bin boundary.
#
# NOTE(review): the ggplot2 documentation states that the y-axis values of an
# x-binned dot plot are not meaningful, so the "Proportion" label may mislead;
# it is left unchanged here.
ggplot(data, aes(x = Age, fill = Age_Cat)) +
  geom_dotplot(stackgroups = TRUE, binpositions = "all") +
  geom_vline(data = signif.lines, aes(xintercept = xvalue)) +
  labs(
    title = "Age Binning By Manual Specification",
    y = "Proportion",
    x = "Age"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.
