Persiapan Package

library(dplyr)
library(tidyr)
library(ggplot2)
library(arules)

Dataset

# Load the Titanic training data from the DataCamp-hosted CSV, keep only the
# passenger id and age columns, and discard rows containing missing values.
data <- drop_na(
  read.csv(
    "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
  )[c("PassengerId", "Age")]
)

# Draw 100 distinct row positions (sampling without replacement). No seed is
# set in the visible script, so a different subset is drawn on every run.
Sampel <- sample(seq_len(nrow(data)), 100)
data <- data[Sampel, ]

# Preview the first rows of the sampled data.
head(data)
##     PassengerId Age
## 714         891  32
## 503         636  28
## 358         446   4
## 624         783  29
## 470         593  47
## 516         653  21

Dataset yang digunakan adalah data penumpang Titanic. Setelah baris dengan nilai yang hilang dibuang, diambil sampel acak sebanyak 100 orang.

# Minimum, quartiles, mean and maximum of the sampled ages.
summary(data[["Age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.75   19.00   26.00   27.46   34.00   74.00

Visualisasi Data

# Dot plot of the sampled passengers' ages.
# For a dot plot binned along x, ggplot2 documents the numbers on the y axis
# as not meaningful, so the axis is hidden instead of being labelled
# "Proportion".
ggplot(data, aes(x = Age)) +
  geom_dotplot() +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(
    title = "Age of The 100 Passenger",
    x = "Age"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Binning : Equal Interval Width

# Equal-width binning: cut the observed age range into 4 intervals of equal
# width and store the resulting factor in Age_Cat.
data$Age_Cat <- discretize(data$Age, method = "interval", breaks = 4)
levels(data$Age_Cat)
## [1] "[0.75,19.1)" "[19.1,37.4)" "[37.4,55.7)" "[55.7,74]"
# Largest observed age inside each bin. These values position the vertical
# marker lines below; note they are the per-bin maxima of the data, not the
# cut points reported by levels().
signif.lines <- data %>%
  group_by(Age_Cat) %>%
  summarise(xvalue = max(Age))
## `summarise()` ungrouping output (override with `.groups` argument)
# Dot plot coloured by bin with one vertical line per bin. The y axis of an
# x-binned dot plot carries no meaningful numbers (per the ggplot2
# documentation), so it is hidden instead of being labelled "Proportion".
ggplot(data, aes(x = Age, fill = Age_Cat)) +
  geom_dotplot() +
  geom_vline(data = signif.lines, aes(xintercept = xvalue)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(
    title = "Age Binning By Interval",
    x = "Age"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Binning : Equal Frequency

# Equal-frequency binning: cut the ages into 4 bins with break points chosen
# by arules::discretize(method = "frequency") and store the factor in Age_Cat.
data$Age_Cat <- discretize(data$Age, method = "frequency", breaks = 4)
levels(data$Age_Cat)
## [1] "[0.75,19)" "[19,26)"   "[26,34)"   "[34,74]"
# Largest observed age inside each bin. These values position the vertical
# marker lines below; note they are the per-bin maxima of the data, not the
# cut points reported by levels().
signif.lines <- data %>%
  group_by(Age_Cat) %>%
  summarise(xvalue = max(Age))
## `summarise()` ungrouping output (override with `.groups` argument)
# Dot plot coloured by bin with one vertical line per bin. The y axis of an
# x-binned dot plot carries no meaningful numbers (per the ggplot2
# documentation), so it is hidden instead of being labelled "Proportion".
ggplot(data, aes(x = Age, fill = Age_Cat)) +
  geom_dotplot() +
  geom_vline(data = signif.lines, aes(xintercept = xvalue)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(
    title = "Age Binning By Frequency",
    x = "Age"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.

Binning : User Specification

# User-specified binning: cut the ages at the fixed break points 0, 2, 39, 59
# and Inf, and give the four resulting intervals descriptive labels.
# NOTE(review): "Midle-Aged Adults" looks like a typo for "Middle-Aged Adults";
# the level name is left unchanged in case other code matches on it.
data$Age_Cat <- discretize(
  data$Age,
  method = "fixed",
  breaks = c(0, 2, 39, 59, Inf),
  labels = c("Baby", "Young Adults", "Midle-Aged Adults", "Old Adults")
)

# Largest observed age inside each category. These values position the
# vertical marker lines below; note they are the per-bin maxima of the data,
# not the user-supplied break points.
signif.lines <- data %>%
  group_by(Age_Cat) %>%
  summarise(xvalue = max(Age))
## `summarise()` ungrouping output (override with `.groups` argument)
# Dot plot coloured by category with one vertical line per category. The y
# axis of an x-binned dot plot carries no meaningful numbers (per the ggplot2
# documentation), so it is hidden instead of being labelled "Proportion".
ggplot(data, aes(x = Age, fill = Age_Cat)) +
  geom_dotplot() +
  geom_vline(data = signif.lines, aes(xintercept = xvalue)) +
  scale_y_continuous(NULL, breaks = NULL) +
  labs(
    title = "Age Binning By Manual Specification",
    x = "Age"
  )
## `stat_bindot()` using `bins = 30`. Pick better value with `binwidth`.