Visualisasi Data di R

Salah satu fungsi terbaik dari R adalah membuat visualisasi data secara cepat dan mudah, yang bisa direplikasi dan dimodifikasi dengan beberapa baris kode sederhana.

Pada sesi ini, kita akan melihat-lihat,

A. R Base Graphics

Materi terkait R Base Graphics ini, bersumber dari Susan E. Johnston, di sini.

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#   # Susan E. Johnston.
#                                                #   # Susan.Johnston@ed.ac.uk
#              R GRAPHICS SEMINAR                #
#                30th August 2012                #
#                                                #
#                                                #
#             Part 1: Base Graphics              #
#                                                #
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

A.0 Pendahuluan

Seluruh data dan file yang digunakan dalam R Base Graphic, tersedia di sini.

Silakan download, extract, dan gunakan setwd untuk mengatur lokasi Working Directory.

# Point the working directory at the folder holding the extracted course files,
# so that the relative file names used by later read.table() calls resolve.
# NOTE(review): this absolute path is specific to one machine - replace it
# with the location of your own extracted folder.
setwd("D:/GDrive/01-Event/17-R/Materi 4 - 5 Juni 2021 - Sesi 2")

Kemudian, berikut adalah beberapa konsep yang perlu diketahui sebelum lebih lanjut.

VECTORS!

# Two numeric vectors of equal length; position i in each belongs together.
height <- c(145, 167, 176, 123, 150)
weight <- c(51, 63, 64, 40, 55)

# Scatterplot with height on the x axis and weight on the y axis.
plot(x = height, y = weight)

Catatan Penting: Plot di bawah ini mengeluarkan hasil yang sama dengan plot(height,weight)

# The same call spread over several lines: a line break inside the
# parentheses does not end the expression.
plot(
  height,
  weight
)

DATA FRAMES!

# Bind the two vectors column-wise into a data frame; the column names are
# taken from the variable names.
data <- data.frame(height = height, weight = weight)
data
##   height weight
## 1    145     51
## 2    167     63
## 3    176     64
## 4    123     40
## 5    150     55
# `$` extracts a single column as a vector.
data[["height"]]
## [1] 145 167 176 123 150
data[["weight"]]
## [1] 51 63 64 40 55
# Columns can be passed to plot() directly.
plot(x = data$height, y = data$weight)

TABLES

# Sixteen age values (usia = age).
usia <- c(
  23, 26, 25, 21, 27, 28, 27, 26,
  26, 25, 26, 24, 25, 25, 23, 21
)
# Count how often each distinct age occurs.
usia.tab <- table(usia)
usia.tab
## usia
## 21 23 24 25 26 27 28 
##  2  2  1  4  4  2  1
# Default plot method for a table object.
plot(usia.tab)

# The same counts drawn as a bar chart.
barplot(height = usia.tab)

A.1 Basic Histogram

# Data
# Read the table from the working directory; the first line holds column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
unicorns <- read.table("unicorns.txt", header = TRUE)
head(unicorns)
##   birthweight  sex longevity
## 1    4.478424 Male         1
## 2    5.753458 Male         0
## 3    3.277265 Male         0
## 4    3.929379 Male         0
## 5    3.972810 Male         0
## 6    4.912954 Male         0
# Basic histogram syntax
hist(unicorns$birthweight)

# Histogram syntax with options
hist(unicorns$birthweight,                        # x values
     breaks = 40,                                 # number of bins
     xlab = "Birth Weight",                       # x axis label
     main = "Histogram of Unicorn Birth Weight",  # plot title
     ylim = c(0, 80))                             # lower and upper limit of the y axis

# Further lines can be added on top of the existing plot
abline(v = mean(unicorns$birthweight),            # vertical line at the mean birth weight
       col = "green",                             # green line colour
       lwd = 3)                                   # line width

A.2 Basic Line Graph with Regression

# Data
# Read the table from the working directory; the first line holds column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
moomins <- read.table("Moomin Density.txt", header = TRUE)
head(moomins)
##   Year PopSize
## 1 1971     500
## 2 1972     562
## 3 1973     544
## 4 1974     532
## 5 1975     580
## 6 1976     590
# Basic syntax
plot(moomins$Year, moomins$PopSize)

# Syntax with options
plot(moomins$Year, moomins$PopSize,                              # x variable, y variable
     type = "l",                                                 # draw a line graph
     col = "red",                                                # red line colour
     lwd = 3,                                                    # line width of 3
     xlab = "Year",                                              # x axis label
     ylab = "Population Size",                                   # y axis label
     main = "Moomin Population Size on Ruissalo 1971 - 2001")    # plot title

fit1 <- lm(PopSize ~ Year, data = moomins)                       # carry out a linear regression
abline(fit1, lty = "dashed")                                     # add the regression line to the plot
# NOTE(review): the label text is hard-coded; presumably it was copied from
# summary(fit1) - verify it still matches if the data file changes.
text(x = 1978, y = 750, labels = "R2 = 0.896\nP = 2.615e-15")    # add a label to the plot at (x,y)

A.3 Scatterplot with Legend

# Data
data(iris)
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Scatterplot matrix of every column against every other column
pairs(iris)

# Same matrix, points coloured by species
pairs(iris, col = iris$Species)

# Single scatterplot with styling options
plot(
  iris$Sepal.Length, iris$Petal.Length,    # x variable, y variable
  col = iris$Species,                      # colour by species
  pch = 16,                                # type of point to use
  cex = 2,                                 # size of point to use
  xlab = "Sepal Length",                   # x axis label
  ylab = "Petal Length",                   # y axis label
  main = "Flower Characteristics in Iris"  # plot title
)

# Legend anchored at (4.5, 7): one entry per species level, colours 1 to 3,
# using the same point type as the plot
legend(
  x = 4.5, y = 7,
  legend = levels(iris$Species),
  col = 1:3,
  pch = 16
)

A.4 Boxplot

# Basic syntax

boxplot(iris$Sepal.Length ~ iris$Species)

# Boxplot syntax with options
boxplot(iris$Sepal.Length ~ iris$Species,              # formula: y values ~ grouping variable
        notch = TRUE,                                  # Draw notch (TRUE, not the reassignable T)
        las = 1,                                       # Orientate the axis tick labels
        xlab = "Species",                              # X-axis label
        ylab = "Sepal Length",                         # Y-axis label
        main = "Sepal Length by Species in Iris",      # Plot title
        col = c("red", "yellow", "green"),             # Fill colour, one per box
        cex.lab = 1.5,                                 # Size of axis labels
        cex.axis = 1.5,                                # Size of the tick mark labels
        cex.main = 2)                                  # Size of the plot title

A.5 Menampilkan lebih dari Satu Plot sekaligus

# Split the plotting device into 1 row x 2 columns. par() returns the previous
# values of the settings it changes, so keep them for restoring afterwards.
old_par <- par(mfrow = c(1, 2))      # number of rows, number of columns

plot(iris$Sepal.Length, iris$Petal.Length,        # x variable, y variable
     col = iris$Species,                          # colour by species
     main = "Sepal vs Petal Length")              # plot title

plot(iris$Sepal.Length, iris$Sepal.Width,         # x variable, y variable
     col = iris$Species,                          # colour by species
     main = "Sepal Length vs Width")              # plot title

# Restore the previous layout so later plots are not drawn in the 1 x 2 grid.
par(old_par)

A.6 Menyimpan graphic

# png
# Open a PNG file device: everything plotted from here on goes to the file
# (in the working directory) instead of the screen.
png("Sepal vs Petal Length in Iris.png", width = 500, height = 500, res = 72)

plot(iris$Sepal.Length, iris$Petal.Length,
     col = iris$Species,
     main = "Sepal vs Petal Length in Iris")

# Close the device. Without this the file is not finalized and every later
# plot keeps being sent to the PNG device.
dev.off()

B. ggplot2

Materi terkait ggplot2 ini, bersumber dari ini dan itu.

B.0 Pendahuluan

ggplot2 adalah library/package visualisasi yang menjadi bagian dari tidyverse. ggplot2 mengikuti konsep Grammar of Graphics.

Komponen dari graphics

Terdapat 7 elemen grafik, yaitu:

Element Deskripsi jargon
Data The Data-set being plotted variable of interest
Aesthetics The scales onto which we map our data x-axis, y-axis, colour, fill, size, labels, shape, alpha, line width, line type
Geometrics The visual elements used for our data point, line, histogram, bar, boxplot
Facets Plotting small multiples (subgroup) columns, rows
Statistics Representation of our data to aid understanding binning, smoothing, descriptive, inferential
Coordinates The space on which the data will be plotted cartesian, fixed, polar, limits
Themes All non-data ink

B.0.1 Package

Anda perlu melakukan instalasi Package terlebih dahulu, untuk menggunakan ggplot2,

# Install ggplot2 only when it is not already available, so that re-running
# the script does not download and reinstall the package every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5

B.1 data

Data yang digunakan adalah diamonds

# Load the diamonds dataset into the workspace and inspect its structure.
data(diamonds)
str(diamonds)
## tibble[,10] [53,940 x 10] (S3: tbl_df/tbl/data.frame)
##  $ carat  : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
##  $ cut    : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
##  $ color  : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
##  $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
##  $ depth  : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
##  $ table  : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
##  $ price  : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
##  $ x      : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
##  $ y      : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
##  $ z      : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...

Keterangan

price price in US dollars ($326–$18,823)

carat weight of the diamond (0.2–5.01)

cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)

color diamond colour, from D (best) to J (worst)

clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

x length in mm (0–10.74)

y width in mm (0–58.9)

z depth in mm (0–31.8)

depth total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43–79)

table width of top of diamond relative to widest point (43–95)

# Start a plot from the data alone: no aesthetics and no geoms are added yet.
ggplot(data = diamonds)

B.2 geom_

Di sini akan Kita coba geom_point, dan beberapa geom lainnya.

# Deliberately incomplete example: a point layer is added but no aesthetics
# are mapped yet, so this plot cannot be drawn as written.
ggplot(data=diamonds)+
  geom_point()

Fungsi di atas, sayangnya tidak bisa dijalankan. Untuk itu perlu ditambahkan aesthetics

B.3 aesthetics

# Map carat to the x axis and price to the y axis for the point layer.
ggplot(data = diamonds) +
  geom_point() +
  aes(x = carat, y = price)

B.4 facet

# Same scatterplot, split into one panel per level of cut.
ggplot(data = diamonds) +
  geom_point() +
  aes(x = carat, y = price) +
  facet_wrap(~cut)

B.5 stat

# Add a linear-model smoother to every panel. The smoother layer inherits the
# plot-level x/y mapping, so it does not need its own aes().
ggplot(data = diamonds) +
  geom_point() +
  aes(x = carat, y = price) +
  facet_wrap(~cut) +
  stat_smooth(method = "lm", formula = y ~ x)

B.6 coord

# Same faceted plot drawn in polar instead of cartesian coordinates. The
# smoother layer inherits the plot-level x/y mapping.
ggplot(data = diamonds) +
  geom_point() +
  aes(x = carat, y = price) +
  facet_wrap(~cut) +
  stat_smooth(method = "lm", formula = y ~ x) +
  coord_polar()

B.7 theme

  • theme_gray() is the default.
  • theme_bw() is useful when you use transparency.
  • theme_classic() is more traditional.
  • theme_void() removes everything but the data, etc
# Same plot with the light theme applied to all non-data elements. The
# smoother layer inherits the plot-level x/y mapping.
ggplot(data = diamonds) +
  geom_point() +
  aes(x = carat, y = price) +
  facet_wrap(~cut) +
  stat_smooth(method = "lm", formula = y ~ x) +
  coord_polar() +
  theme_light()

Extra

Warna

# Colour the points by cut and set the title and axis labels.
ggplot(data = diamonds) +
  geom_point() +
  aes(x = carat, y = price, color = cut) +
  theme_minimal() +
  labs(
    title = "Harga Berlian berdasarkan Ukuran",
    x = "Carat",
    y = "Harga"
  )

Bekerja dengan library ‘dplyr’

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the Premium and Ideal cuts, then plot price against carat.
# `%in%` tests every row against the whole set. `==` against a length-2 vector
# would recycle that vector down the rows and silently drop matching rows that
# do not line up with the recycled position.
diamonds %>%
  filter(cut %in% c("Premium", "Ideal")) %>%
  ggplot() +
  geom_point() +
  aes(x = carat, y = price, color = cut) +
  theme_minimal() +
  labs(title = "Harga Berlian berdasarkan Ukuran",
       x = "Carat",
       y = "Harga")

C. Basic Line Chart for Time Series dengan ggplot2

Materi ini berasal dari sini dan situ

C.1 Membangkitkan Data

# Dummy data: 365 consecutive days counting back from 2017-06-14, each with a
# uniform random value added to a quadratic trend.
data <- data.frame(
  day = as.Date("2017-06-14") - seq(0, 364),
  value = runif(365) + (-140:224)^2 / 10000
)

# Line chart of value over time.
ggplot(data) +
  geom_line() +
  aes(x = day, y = value)

Jika ingin mengubah format tanggal pada x-axis, gunakan format berikut:

# Same line chart with the x axis tick labels formatted as
# day, abbreviated month name and two-digit year.
ggplot(data) +
  geom_line() +
  aes(x = day, y = value) +
  scale_x_date(date_labels = "%d %b %y")

C.2 Menggunakan Dataset lainnya

# Data

# Load the economics dataset into the workspace and inspect its structure.
data(economics)
str(economics)
## spec_tbl_df[,6] [574 x 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ date    : Date[1:574], format: "1967-07-01" "1967-08-01" ...
##  $ pce     : num [1:574] 507 510 516 512 517 ...
##  $ pop     : num [1:574] 198712 198911 199113 199311 199498 ...
##  $ psavert : num [1:574] 12.6 12.6 11.9 12.9 12.8 11.8 11.7 12.3 11.7 12.3 ...
##  $ uempmed : num [1:574] 4.5 4.7 4.6 4.9 4.7 4.8 5.1 4.5 4.1 4.6 ...
##  $ unemploy: num [1:574] 2944 2945 2958 3143 3066 ...

This dataset was produced from US economic time series data available from https://fred.stlouisfed.org/. economics is in “wide” format, economics_long is in “long” format.

A data frame with 574 rows and 6 variables:

date Month of data collection

pce personal consumption expenditures, in billions of dollars, https://fred.stlouisfed.org/series/PCE

pop total population, in thousands, https://fred.stlouisfed.org/series/POP

psavert personal savings rate, https://fred.stlouisfed.org/series/PSAVERT/

uempmed median duration of unemployment, in weeks, https://fred.stlouisfed.org/series/UEMPMED

unemploy number of unemployed in thousands, https://fred.stlouisfed.org/series/UNEMPLOY

library(tidyr)

# Reshape two of the series from wide to long format: one row per
# (date, variable) pair, with the series name in `variable` and its
# measurement in `value`.
# - dplyr::select is namespaced because raster, attached later in this file,
#   masks `select`.
# - pivot_longer() replaces the superseded gather(); the rows come out in a
#   different order, which does not affect the plots drawn from `df`.
df <- economics %>%
  dplyr::select(date, psavert, uempmed) %>%
  pivot_longer(cols = -date, names_to = "variable", values_to = "value")

# One line per variable, each with a manually chosen colour.
ggplot(df, aes(x = date, y = value, color = variable)) +
  geom_line(size = 1) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  theme_minimal()

# Area plot
# Semi-transparent areas, one per variable; outline and fill use the same
# manual palette.
ggplot(df, aes(x = date, y = value)) +
  geom_area(
    aes(color = variable, fill = variable),
    alpha = 0.5,
    position = position_dodge(0.8)
  ) +
  scale_color_manual(values = c("#00AFBB", "#E7B800")) +
  scale_fill_manual(values = c("#00AFBB", "#E7B800"))

# Base plot with date axis
# Stored in `p` so that later examples can add layers and scales to it.
p <- ggplot(data = economics, mapping = aes(x = date, y = psavert)) +
  geom_line(color = "#00AFBB", size = 1)
p

# Set axis limits c(lower, upper)
# The limits are stored under descriptive names rather than `min`/`max`, which
# would shadow the base functions of the same name.
# An NA limit leaves that end of the axis to be taken from the data.
date_min <- as.Date("2002-1-1")
date_max <- NA
p + scale_x_date(limits = c(date_min, date_max))
## Warning: Removed 414 row(s) containing missing values (geom_path).

# Overlay a loess smoother, using the same colour for the line and its band.
p +
  stat_smooth(method = "loess", color = "#FC4E07", fill = "#FC4E07")
## `geom_smooth()` using formula 'y ~ x'

C.3 Menggunakan Package TSstudio

Materi ini, diambil dari sini.

# Install TSstudio only when it is not already available, so that re-running
# the script does not reinstall the package every time.
if (!requireNamespace("TSstudio", quietly = TRUE)) {
  install.packages("TSstudio")
}
# Monthly time series (frequency = 12) starting in July 1967, built from the
# unemploy column; the column is selected by name rather than by position 6.
eco.ts <- ts(economics[, "unemploy"], start = c(1967, 7), frequency = 12)
library(TSstudio)
## Warning: package 'TSstudio' was built under R version 4.0.5
ts_info(eco.ts)
##  The eco.ts series is a ts object with 1 variable and 574 observations
##  Frequency: 12 
##  Start time: 1967 7 
##  End time: 2015 4
ts_plot(eco.ts,
        title = "US Monthly Number of Unemployed",
        Ytitle = "in Thousands",
        Xtitle = "Source: https://fred.stlouisfed.org/series/UNEMPLOY", 
        slider = TRUE)
ts_decompose(eco.ts)
ts_seasonal(eco.ts, type = "all")
ts_heatmap(eco.ts)

D. Spatial Data

library(raster)
## Loading required package: sp
## 
## Attaching package: 'raster'
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:dplyr':
## 
##     select
library(rgdal)
## rgdal: version: 1.5-23, (SVN revision 1121)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 3.2.1, released 2020/12/29
## Path to GDAL shared files: C:/Users/NrSD/Documents/R/win-library/4.0/rgdal/gdal
## GDAL binary built with GEOS: TRUE 
## Loaded PROJ runtime: Rel. 7.2.1, January 1st, 2021, [PJ_VERSION: 721]
## Path to PROJ shared files: C:/Users/NrSD/Documents/R/win-library/4.0/rgdal/proj
## PROJ CDN enabled: FALSE
## Linking to sp version:1.4-5
## To mute warnings of possible GDAL/OSR exportToProj4() degradation,
## use options("rgdal_show_exportToProj4_warnings"="none") before loading rgdal.
## Overwritten PROJ_LIB was C:/Users/NrSD/Documents/R/win-library/4.0/rgdal/proj
library(sp)

Materi ini bersumber dari sini.

# Fetch the GADM level-1 boundaries for Indonesia.
ID_1 <- getData(name = "GADM", country = "Indonesia", level = 1)

# Attribute table of the spatial object: one row per level-1 unit.
head(ID_1@data)
##    GID_0    NAME_0   GID_1          NAME_1 VARNAME_1 NL_NAME_1    TYPE_1
## 1    IDN Indonesia IDN.1_1            Aceh      <NA>      <NA> Propinisi
## 12   IDN Indonesia IDN.2_1            Bali      <NA>      <NA> Propinisi
## 23   IDN Indonesia IDN.3_1 Bangka Belitung      <NA>      <NA> Propinisi
## 28   IDN Indonesia IDN.4_1          Banten      <NA>      <NA> Propinisi
## 29   IDN Indonesia IDN.5_1        Bengkulu      <NA>      <NA> Propinisi
## 30   IDN Indonesia IDN.6_1       Gorontalo      <NA>      <NA> Propinisi
##    ENGTYPE_1 CC_1 HASC_1
## 1   Province   11  ID.AC
## 12  Province   51  ID.BA
## 23  Province   19  ID.BB
## 28  Province   36  ID.BT
## 29  Province   17  ID.BE
## 30  Province   75  ID.GO
# Flatten the spatial polygons into a plain data frame of vertices
# (long, lat, order, hole, piece, id, group) that ggplot2 can draw.
# No `region` argument is given, so ids are assigned per polygon, as the
# message below reports. A misspelled `regions = ...` argument would be
# swallowed silently and give this same result.
id <- fortify(ID_1)
## Regions defined for each Polygons
head(id)
##       long      lat order  hole piece id group
## 1 98.17778 2.238341     1 FALSE     1  1   1.1
## 2 98.15694 2.206381     2 FALSE     1  1   1.1
## 3 98.15770 2.206381     3 FALSE     1  1   1.1
## 4 98.15590 2.203519     4 FALSE     1  1   1.1
## 5 98.15517 2.203666     5 FALSE     1  1   1.1
## 6 98.14831 2.193152     6 FALSE     1  1   1.1
tail(id)
##             long       lat order  hole piece id group
## 1592458 110.5537 -8.134108  9349 FALSE    15 27 27.15
## 1592459 110.5536 -8.134037  9350 FALSE    15 27 27.15
## 1592460 110.5536 -8.133841  9351 FALSE    15 27 27.15
## 1592461 110.5536 -8.133802  9352 FALSE    15 27 27.15
## 1592462 110.5537 -8.133867  9353 FALSE    15 27 27.15
## 1592463 110.5538 -8.133986  9354 FALSE    15 27 27.15
# Draw every boundary vertex as a blue point.
# The default point shape has no fill, so `fill = "blue"` would leave the
# points black; `color` is the aesthetic that takes effect here.
ggplot(data = id,
       aes(x = long,
           y = lat,
           group = group)) +
  geom_point(color = "blue")

# Draw the boundaries as filled polygons, one fill colour per id, with gray
# outlines, no legend, and a centred title.
ggplot(data = id,
       aes(x = long,
           y = lat,
           group = group)) +
  geom_polygon(aes(fill = id),
               color = "gray40",
               show.legend = FALSE) +   # FALSE spelled out: `F` can be reassigned
  labs(title = "Indonesia Map",
       x = "Longitude",
       y = "Latitude") +
  theme(plot.title = element_text(hjust = 0.5))

# Same polygon map with coord_quickmap() added as the coordinate system.
ggplot(data = id,
       aes(x = long,
           y = lat,
           group = group)) +
  geom_polygon(aes(fill = id),
               color = "gray40",
               show.legend = FALSE) +   # FALSE spelled out: `F` can be reassigned
  labs(title = "Indonesia Map",
       x = "Longitude",
       y = "Latitude") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_quickmap()