library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(hrbrthemes)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(viridis)
## Loading required package: viridisLite
options(knitr.table.format = "html")
library(plotly)
## 
## Attaching package: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggrepel)

Kasus 1


1.1 Dataset

Data bersumber dari https://github.com/mwaskom/seaborn-data/blob/master/tips.csv. Kolom tip diubah menjadi persentase terhadap total_bill.

# Load the seaborn "tips" dataset and re-express `tip` as a percentage of
# `total_bill` (rounded to one decimal), overwriting the raw tip amount.
data <- read.table(
  "https://raw.githubusercontent.com/mwaskom/seaborn-data/master/tips.csv",
  header = TRUE,
  sep = ","
) %>%
  mutate(tip = round(tip / total_bill * 100, 1))
data

1.2 Counts the Number of Value Per Group and Subgroup

Berikut banyaknya pengunjung restoran (n) dan median dari tip yang diterima staf restoran berdasarkan hari (day) dan jenis kelamin (sex) pengunjung yang datang

# Count the rows and compute the median tip (%) for every sex/day combination.
counts <- data %>%
  group_by(sex, day) %>%
  summarize(
    n = n(),
    median = median(tip),
    # Return an ungrouped tibble; being explicit also silences the
    # "`summarise()` has grouped output by 'sex'" message.
    .groups = "drop"
  )
counts

1.3 Boxplot

Plotting data dengan boxplot masing-masing hari (day) dan jenis kelamin (sex).

# Grouped boxplot of tip (%) by sex, dodged by day, with jittered raw points
# and a per-box row count taken from `counts`.
data %>%
  # Fix the x-axis order explicitly: Male first, then Female.
  mutate(sex = factor(sex, levels = c("Male", "Female"))) %>%
  ggplot(aes(fill = day, y = tip, x = sex)) +
  # Total widths cannot be preserved when varwidth = TRUE, so request
  # preserve = "single" directly instead of relying on the fallback.
  geom_boxplot(
    position = position_dodge2(preserve = "single"),
    alpha = 0.5,
    outlier.colour = "transparent",
    varwidth = TRUE
  ) +
  # The jitter amount belongs to the position object; geom_point() has no
  # `width` parameter and ignores it.
  geom_point(
    color = "green",
    size = 1,
    alpha = 0.4,
    position = position_jitterdodge(jitter.width = 0.1)
  ) +
  scale_fill_viridis(discrete = TRUE, name = "") +
  # Label each box with its row count, placed just below the group median.
  geom_text(
    data = counts,
    aes(label = paste0("n: ", n), y = median - 2),
    position = position_dodge(1),
    hjust = 0.5
  ) +
  theme_ipsum() +
  xlab("") +
  ylab("Tip (%)") +
  # Zoom with coord_cartesian() rather than ylim(): ylim() removes rows outside
  # the range before the boxplot statistics are computed, so the boxes would
  # no longer match the `n` labels.
  coord_cartesian(ylim = c(0, 35))
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family
## not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Dari boxplot di atas, terlihat bahwa boxplot sebaran tip yang diterima staf restoran dari pengunjung perempuan maupun laki-laki semakin besar saat weekend. Pengunjung perempuan memberikan tip lebih banyak pada hari Jumat, sedangkan pengunjung laki-laki lebih banyak memberikan tip pada hari Minggu.

Kasus 2


2.1 Dataset

Data bersumber dari : https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/4_ThreeNum.csv.

# Load the three-numeric-variables example dataset.
data_pop <- read.table(
  "https://raw.githubusercontent.com/holtzy/data_to_viz/master/Example_dataset/4_ThreeNum.csv",
  header = TRUE,
  sep = ","
)
# Replace gdpPercap values of -1 with 1 (presumably -1 is a sentinel value --
# confirm against the data). which() drops NA comparisons so the subscripted
# assignment does not fail on missing values.
data_pop[which(data_pop$gdpPercap == -1), "gdpPercap"] <- 1
# Rename columns by position -- assumes the file has exactly these five
# columns in this order (TODO confirm against the CSV header).
colnames(data_pop) <- c("Country", "Continent", "lifeExp", "pop", "gdpPercap")
data_pop

2.2 Lollipop Plot

Lollipop plot pada dasarnya adalah barplot yang batangnya diganti dengan garis dan titik. Berikut lollipop plot variabel gdpPercap (produk domestik bruto per kapita) untuk 50 negara (Country) dengan gdpPercap tertinggi.

# Lollipop chart of the 50 rows with the highest gdpPercap, ordered by
# continent and then by gdpPercap within each continent.
data_pop %>%
  filter(!is.na(gdpPercap)) %>%
  arrange(gdpPercap) %>%
  tail(50) %>%
  arrange(Continent, gdpPercap) %>%
  # Freeze the current row order as the factor level order for the axis.
  # fct_inorder() tolerates repeated country names, unlike
  # factor(Country, Country), which errors on duplicated levels.
  mutate(Country = fct_inorder(Country)) %>%
  ggplot(aes(x = Country, y = gdpPercap, color = Continent)) +
  # Grey stem from zero up to each country's value.
  geom_segment(
    aes(x = Country, xend = Country, y = 0, yend = gdpPercap),
    color = "grey"
  ) +
  geom_point(size = 3) +
  scale_color_viridis(discrete = TRUE) +
  # Flip so that country names are readable on the vertical axis.
  coord_flip() +
  theme_ipsum() +
  theme(
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "none"
  ) +
  xlab("") +
  ylab("gdpPercap (USD)")
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

Plot ini menunjukkan hubungan antara variabel numerik dan variabel kategori. Setiap warna mewakili benua yang berbeda. Negara dengan gdpPercap (USD) tertinggi adalah Norway, diikuti oleh Kuwait dan Singapore. Di antara 50 negara dengan gdpPercap tertinggi yang ditampilkan, Mauritius memiliki nilai terendah.

2.3 Tree Map

Treemap merupakan visualisasi data dalam bentuk persegi panjang (rectangle) berwarna-warni yang mewakili kategori tertentu, dengan ukuran yang menggambarkan proporsi nilai kategori tersebut. Berikut treemap yang mengelompokkan negara menurut benua berdasarkan warna, dengan ukuran setiap persegi panjang ditentukan oleh variabel gdpPercap (USD).

# Treemap of gdpPercap with countries nested inside continents.
library(treemap)
p <- treemap(
  data_pop,
  # Hierarchy runs from the highest aggregation level to the lowest, so
  # Continent comes first; with type = "index" the colours follow it.
  index = c("Continent", "Country"),
  # Rectangle area is proportional to gdpPercap.
  vSize = "gdpPercap",
  type = "index",
  # Main
  title = "",
  palette = "Dark2",
  # Borders: one entry per hierarchy level (Continent, Country).
  border.col = c("black", "grey"),
  border.lwds = c(1, 0.5),
  # Labels: one entry per hierarchy level (Continent, Country).
  fontsize.labels = c(0.7, 0.4),
  fontcolor.labels = c("white", "white"),
  fontface.labels = 1,
  bg.labels = c("transparent"),
  align.labels = list(c("center", "center"), c("left", "top")),
  overlap.labels = 0.5,
  inflate.labels = TRUE
)

Treemap di atas dibuat berdasarkan besarnya gdpPercap (USD) di setiap negara. Luas setiap persegi panjang menunjukkan besarnya gdpPercap (USD) suatu negara, sedangkan warna ditentukan oleh variabel indeks (type = "index"), bukan oleh besarnya gdpPercap. Benua Europe memiliki gdpPercap (USD) yang tinggi, misalnya pada negara Norway, diikuti benua Asia dengan negara Kuwait dan Singapore.