Indonesia Database for Policy and Economic Research (INDO-DAPOER)

Dataset

Indonesia Database for Policy and Economic Research (INDO-DAPOER) berisi lebih dari 200 indikator tingkat provinsi dan kabupaten yang mencakup empat kategori utama: fiskal, ekonomi, sosial dan demografi, serta infrastruktur. Dalam kesempatan ini akan dilakukan eksplorasi data INDO-DAPOER dengan berfokus pada bidang sosial-ekonomi pada tahun 2018.

Dari lebih dari 200 indikator yang ada, akan dipilih 7 variabel, yaitu:
1. Human Development Index
2. Literacy Rate
3. Number of people employed
4. Number of people underemployed
5. Number of people unemployed
6. Poverty Gap
7. Total Population

library(readxl)
library(ggplot2)

## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.1.2

## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'hms'

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(data.table)

## Warning: package 'data.table' was built under R version 4.1.2

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(RColorBrewer)
library(sm)

## Warning: package 'sm' was built under R version 4.1.3

## Package 'sm', version 2.2-5.7: type help(sm) for summary information

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.1.2

## corrplot 0.92 loaded

library(PerformanceAnalytics)

## Warning: package 'PerformanceAnalytics' was built under R version 4.1.3

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## Attaching package: 'xts'

## The following objects are masked from 'package:data.table':
## 
##     first, last

## The following objects are masked from 'package:dplyr':
## 
##     first, last

## 
## Attaching package: 'PerformanceAnalytics'

## The following object is masked from 'package:graphics':
## 
##     legend

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.3

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v tibble  3.1.7     v purrr   0.3.4
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1

## Warning: package 'tibble' was built under R version 4.1.3

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'forcats' was built under R version 4.1.2

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x dplyr::filter()       masks stats::filter()
## x xts::first()          masks data.table::first(), dplyr::first()
## x dplyr::lag()          masks stats::lag()
## x xts::last()           masks data.table::last(), dplyr::last()
## x purrr::transpose()    masks data.table::transpose()

library(hrbrthemes)

## Warning: package 'hrbrthemes' was built under R version 4.1.2

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

library(viridis)

## Warning: package 'viridis' was built under R version 4.1.3

## Loading required package: viridisLite

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.1.2

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Load the INDO-DAPOER extract from the Excel workbook and preview the first
# rows.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where the workbook exists at exactly this location.
data_path <- "D:/Kuliah/STA563 - Eksplorasi dan Visualisasi Data/Data Indo Dapoer.xlsx"
df <- read_excel(path = data_path)
head(df)

## # A tibble: 6 x 9
##   ProvincesName  Time IDX.HDI.REV SE.LITR.15UP.ZS SL.EMP.TOTL SL.EMP.UNDR
##   <chr>         <dbl>       <dbl>           <dbl>       <dbl>       <dbl>
## 1 Bali, Prop.    2015        73.3           0.928       0.560       0.115
## 2 Bali, Prop.    2016        73.6           0.928       0.575       0.123
## 3 Bali, Prop.    2017        74.3           0.929       0.565       0.133
## 4 Bali, Prop.    2018        74.8           0.930       0.580       0.141
## 5 Banten, Prop.  2015        70.3           0.974       0.404       0.213
## 6 Banten, Prop.  2016        71.0           0.975       0.417       0.226
## # ... with 3 more variables: SL.UEM.TOTL <dbl>, SI.POV.NGAP <dbl>,
## #   SP.POP.TOTL <dbl>

Melihat summary data

# Keep only the 2018 rows (the cross-section used by the rest of the script),
# then print per-column summary statistics.
df1 <- df %>%
  filter(Time == 2018)
summary(df1)

##  ProvincesName           Time       IDX.HDI.REV    SE.LITR.15UP.ZS 
##  Length:34          Min.   :2018   Min.   :60.06   Min.   :0.7679  
##  Class :character   1st Qu.:2018   1st Qu.:68.87   1st Qu.:0.9370  
##  Mode  :character   Median :2018   Median :70.64   Median :0.9789  
##                     Mean   :2018   Mean   :70.39   Mean   :0.9600  
##                     3rd Qu.:2018   3rd Qu.:71.62   3rd Qu.:0.9884  
##                     Max.   :2018   Max.   :80.47   Max.   :0.9987  
##   SL.EMP.TOTL      SL.EMP.UNDR       SL.UEM.TOTL        SI.POV.NGAP     
##  Min.   :0.3947   Min.   :0.02396   Min.   :0.004286   Min.   :0.00510  
##  1st Qu.:0.4330   1st Qu.:0.07429   1st Qu.:0.010454   1st Qu.:0.01010  
##  Median :0.4560   Median :0.14800   Median :0.018721   Median :0.01555  
##  Mean   :0.4644   Mean   :0.25201   Mean   :0.049707   Mean   :0.01965  
##  3rd Qu.:0.4830   3rd Qu.:0.24699   3rd Qu.:0.042207   3rd Qu.:0.02525  
##  Max.   :0.5803   Max.   :1.48480   Max.   :0.445903   Max.   :0.06730  
##   SP.POP.TOTL      
##  Min.   :  716407  
##  1st Qu.: 2223489  
##  Median : 3992784  
##  Mean   : 7794568  
##  3rd Qu.: 7981467  
##  Max.   :48683861

Bar Chart

# Rank provinces by their 2018 HDI, highest first, and show the top rows.
hdi <- df1 %>%
  select(ProvincesName, IDX.HDI.REV) %>%
  arrange(desc(IDX.HDI.REV))
head(hdi)

## # A tibble: 6 x 2
##   ProvincesName           IDX.HDI.REV
##   <chr>                         <dbl>
## 1 DKI Jakarta, Prop.             80.5
## 2 DI Yogyakarta, Prop.           79.5
## 3 Kalimantan Timur, Prop.        75.8
## 4 Kepulauan Riau, Prop.          74.8
## 5 Bali, Prop.                    74.8
## 6 Riau, Prop.                    72.4

Human Development Index (HDI) adalah pengukuran perbandingan dari harapan hidup, melek huruf, pendidikan dan standar hidup. Tiga provinsi dengan HDI terbesar di Indonesia pada tahun 2018 adalah DKI Jakarta, DI Yogyakarta, dan Kalimantan Timur. Jika dilihat dari tahun ke tahun, berikut HDI masing-masing provinsi.

# Year-by-year HDI bar charts for the three provinces with the highest 2018
# HDI. All years present in `df` are kept for each province.
dfj <- filter(df, ProvincesName == "DKI Jakarta, Prop.")
dfy <- filter(df, ProvincesName == "DI Yogyakarta, Prop.")
dfkt <- filter(df, ProvincesName == "Kalimantan Timur, Prop.")

# Layers shared by every panel: one bar per year, with the HDI value printed
# in white just below the top of the bar.
hdi_bar_layers <- list(
  geom_col(fill = "steelblue"),
  geom_text(aes(label = IDX.HDI.REV), vjust = 1.6, color = "white", size = 3.5),
  theme_minimal()
)

# Each panel carries a title; without it the three charts in the grid are
# indistinguishable from one another.
jkt <- ggplot(data = dfj, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  ggtitle("DKI Jakarta")

ygy <- ggplot(data = dfy, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  ggtitle("DI Yogyakarta")

kalt <- ggplot(data = dfkt, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  ggtitle("Kalimantan Timur")

ggarrange(jkt, ygy, kalt)

# Rank provinces by their 2018 HDI, lowest first, and show the top rows.
hdi2 <- df1 %>%
  select(ProvincesName, IDX.HDI.REV) %>%
  arrange(IDX.HDI.REV)
head(hdi2)

## # A tibble: 6 x 2
##   ProvincesName              IDX.HDI.REV
##   <chr>                            <dbl>
## 1 Papua, Prop.                      60.1
## 2 Papua Barat, Prop.                63.7
## 3 Nusa Tenggara Timur, Prop.        64.4
## 4 Sulawesi Barat, Prop.             65.1
## 5 Kalimantan Barat, Prop.           67.0
## 6 Nusa Tenggara Barat, Prop.        67.3

Sementara 3 provinsi dengan HDI terendah adalah Papua, Papua Barat, dan Nusa Tenggara Timur.

# Year-by-year HDI bar charts for the three provinces with the lowest 2018
# HDI. All years present in `df` are kept for each province.
dfp <- filter(df, ProvincesName == "Papua, Prop.")
dfpb <- filter(df, ProvincesName == "Papua Barat, Prop.")
dfntt <- filter(df, ProvincesName == "Nusa Tenggara Timur, Prop.")

# Layers shared by every panel: one bar per year, with the HDI value printed
# in white just below the top of the bar.
hdi_bar_layers <- list(
  geom_col(fill = "steelblue"),
  geom_text(aes(label = IDX.HDI.REV), vjust = 1.6, color = "white", size = 3.5),
  theme_minimal()
)

# Each panel carries a title; without it the three charts in the grid are
# indistinguishable from one another.
pap <- ggplot(data = dfp, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  ggtitle("Papua")

papb <- ggplot(data = dfpb, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  ggtitle("Papua Barat")

ntt <- ggplot(data = dfntt, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  ggtitle("Nusa Tenggara Timur")

ggarrange(pap, papb, ntt)

# Pie Chart

# Rank provinces by 2018 total population, largest first, and show the top
# rows.
pop <- df1 %>%
  select(ProvincesName, SP.POP.TOTL) %>%
  arrange(desc(SP.POP.TOTL))
head(pop)

## # A tibble: 6 x 2
##   ProvincesName         SP.POP.TOTL
##   <chr>                       <dbl>
## 1 Jawa Barat, Prop.        48683861
## 2 Jawa Timur, Prop.        39500851
## 3 Jawa Tengah, Prop.       34490835
## 4 Sumatera Utara, Prop.    14415391
## 5 Banten, Prop.            12689736
## 6 DKI Jakarta, Prop.       10467629

# Pie charts of the six most and six least populous provinces in 2018.
# `pop` is sorted by population in descending order, so head() returns the
# largest provinces and tail() the smallest.
top <- head(pop, 6)
bottom <- tail(pop, 6)
pals <- brewer.pal(6, "Paired")

# Slice labels come from the data instead of a hand-typed vector, so they
# always match the rows that are actually plotted.
# NOTE(review): pie() is called for its drawing side effect; `ph` and `pt` are
# kept only because the original script assigned them.
ph <- pie(top$SP.POP.TOTL, labels = top$ProvincesName,
          border = "white", col = pals)

pt <- pie(bottom$SP.POP.TOTL, labels = bottom$ProvincesName, col = pals)

Histogram

# Histograms of six 2018 indicators, arranged in a single grid.
# The 2018 subset has 34 provinces, so the default of 30 bins leaves most bins
# empty or holding one observation (ggplot2 also emits a "Pick better value"
# message). An explicit, smaller bin count is used instead.
# NOTE(review): 10 bins is a judgment call; tune per variable if needed.
n_bins <- 10

# Descriptive names replace single letters; in particular a plot object named
# `c` would shadow base::c.
hist_hdi <- ggplot(data = df1, aes(x = IDX.HDI.REV)) +
  geom_histogram(bins = n_bins, colour = "black", fill = "dark red")
hist_literacy <- ggplot(data = df1, aes(x = SE.LITR.15UP.ZS)) +
  geom_histogram(bins = n_bins, colour = "black", fill = "dark green")
hist_employed <- ggplot(data = df1, aes(x = SL.EMP.TOTL)) +
  geom_histogram(bins = n_bins, colour = "black", fill = "light blue")
hist_underemployed <- ggplot(data = df1, aes(x = SL.EMP.UNDR)) +
  geom_histogram(bins = n_bins, colour = "black", fill = "yellow")
hist_unemployed <- ggplot(data = df1, aes(x = SL.UEM.TOTL)) +
  geom_histogram(bins = n_bins, colour = "black", fill = "tan")
hist_poverty_gap <- ggplot(data = df1, aes(x = SI.POV.NGAP)) +
  geom_histogram(bins = n_bins, colour = "black", fill = "purple")

ggarrange(hist_hdi, hist_literacy, hist_employed,
          hist_underemployed, hist_unemployed, hist_poverty_gap)

Density Plot

# Kernel density plots of the same six 2018 indicators, one panel each,
# using the same fill colour per variable as the histograms.
m <- ggplot(df1, aes(x = IDX.HDI.REV)) +
  geom_density(colour = "black", fill = "dark red")
n <- ggplot(df1, aes(x = SE.LITR.15UP.ZS)) +
  geom_density(colour = "black", fill = "dark green")
o <- ggplot(df1, aes(x = SL.EMP.TOTL)) +
  geom_density(colour = "black", fill = "light blue")
p <- ggplot(df1, aes(x = SL.EMP.UNDR)) +
  geom_density(colour = "black", fill = "yellow")
q <- ggplot(df1, aes(x = SL.UEM.TOTL)) +
  geom_density(colour = "black", fill = "tan")
r <- ggplot(df1, aes(x = SI.POV.NGAP)) +
  geom_density(colour = "black", fill = "purple")

# Combine the six panels into one figure.
ggarrange(m, n, o, p, q, r)

# Box Plot

# Box plots of the six 2018 indicators. Only the x aesthetic is mapped, so
# each box is drawn horizontally along the variable's own scale.
g <- ggplot(df1, aes(x = IDX.HDI.REV)) +
  geom_boxplot(colour = "black", fill = "dark red")
h <- ggplot(df1, aes(x = SE.LITR.15UP.ZS)) +
  geom_boxplot(colour = "black", fill = "dark green")
i <- ggplot(df1, aes(x = SL.EMP.TOTL)) +
  geom_boxplot(colour = "black", fill = "light blue")
j <- ggplot(df1, aes(x = SL.EMP.UNDR)) +
  geom_boxplot(colour = "black", fill = "yellow")
k <- ggplot(df1, aes(x = SL.UEM.TOTL)) +
  geom_boxplot(colour = "black", fill = "tan")
l <- ggplot(df1, aes(x = SI.POV.NGAP)) +
  geom_boxplot(colour = "black", fill = "purple")

# Combine the six panels into one figure.
ggarrange(g, h, i, j, k, l)

Scatter Plot

# Scatter plot: HDI vs. literacy rate (2018).
# Columns are referenced through df1 explicitly rather than via attach(),
# which would leave a copy of df1 on the search path for the rest of the
# session.
plot(df1$IDX.HDI.REV, df1$SE.LITR.15UP.ZS,
     ylab = "Literacy Rate",
     xlab = "Human Development Index",
     pch = 19, col = "navy", cex = 1.5)

# Scatter plot: HDI vs. employment indicator (2018).
# Columns are referenced through df1 explicitly; re-attaching df1 only stacks
# another copy on the search path and triggers masking messages.
plot(df1$IDX.HDI.REV, df1$SL.EMP.TOTL,
     ylab = "Number of People Employed",
     xlab = "Human Development Index",
     pch = 19, col = "navy", cex = 1.5)

# Scatter plot: HDI vs. underemployment indicator (2018).
# Columns are referenced through df1 explicitly; re-attaching df1 only stacks
# another copy on the search path and triggers masking messages.
plot(df1$IDX.HDI.REV, df1$SL.EMP.UNDR,
     ylab = "Number of People Under Employed",
     xlab = "Human Development Index",
     pch = 19, col = "navy", cex = 1.5)

# Scatter plot: HDI vs. unemployment indicator (2018).
# Columns are referenced through df1 explicitly; re-attaching df1 only stacks
# another copy on the search path and triggers masking messages.
# The y-axis label also fixes the "Unmployed" typo.
plot(df1$IDX.HDI.REV, df1$SL.UEM.TOTL,
     ylab = "Number of People Unemployed",
     xlab = "Human Development Index",
     pch = 19, col = "navy", cex = 1.5)

# Scatter plot: HDI vs. poverty gap (2018).
# Columns are referenced through df1 explicitly; re-attaching df1 only stacks
# another copy on the search path and triggers masking messages.
plot(df1$IDX.HDI.REV, df1$SI.POV.NGAP,
     ylab = "Poverty Gap",
     xlab = "Human Development Index",
     pch = 19, col = "navy", cex = 1.5)

Matrix Correlation

# Drop the two leading identifier columns (province name and year) so only
# the numeric indicators remain, then compute and print their pairwise
# correlation matrix.
dfc <- df1 %>%
  select(-ProvincesName, -Time)
matcorr <- cor(dfc)
matcorr

##                 IDX.HDI.REV SE.LITR.15UP.ZS SL.EMP.TOTL SL.EMP.UNDR SL.UEM.TOTL
## IDX.HDI.REV      1.00000000      0.50914054  0.08437649  0.04092159  0.14931583
## SE.LITR.15UP.ZS  0.50914054      1.00000000 -0.38796882 -0.15628285  0.04508805
## SL.EMP.TOTL      0.08437649     -0.38796882  1.00000000  0.18251748 -0.07200936
## SL.EMP.UNDR      0.04092159     -0.15628285  0.18251748  1.00000000  0.81529331
## SL.UEM.TOTL      0.14931583      0.04508805 -0.07200936  0.81529331  1.00000000
## SI.POV.NGAP     -0.67677347     -0.53145049  0.07476628 -0.05124510 -0.15133912
## SP.POP.TOTL      0.13330845     -0.04680084  0.06258340  0.94440144  0.95705430
##                 SI.POV.NGAP SP.POP.TOTL
## IDX.HDI.REV     -0.67677347  0.13330845
## SE.LITR.15UP.ZS -0.53145049 -0.04680084
## SL.EMP.TOTL      0.07476628  0.06258340
## SL.EMP.UNDR     -0.05124510  0.94440144
## SL.UEM.TOTL     -0.15133912  0.95705430
## SI.POV.NGAP      1.00000000 -0.12725757
## SP.POP.TOTL     -0.12725757  1.00000000

# Correlogram of the upper triangle, with variables ordered by hierarchical
# clustering and axis labels drawn in black at 45 degrees.
corrplot(
  matcorr,
  type = "upper",
  order = "hclust",
  tl.col = "black",
  tl.srt = 45
)

# Scatter-plot matrix of the same indicators with histograms requested and
# solid points.
chart.Correlation(dfc, histogram = TRUE, pch = 19)

Bubble Plot

# Bubble plot: literacy rate vs. HDI for 2018, one point per province, with
# point size mapped to population.
df1 %>%
  # Express population in millions so the values agree with the
  # "Population (M)" legend title (the earlier divisor of 100000 produced
  # hundred-thousands).
  mutate(SP.POP.TOTL = SP.POP.TOTL / 1e6) %>%
  # Largest populations first, so big bubbles are drawn before small ones.
  arrange(desc(SP.POP.TOTL)) %>%
  ggplot(aes(x = SE.LITR.15UP.ZS, y = IDX.HDI.REV,
             size = SP.POP.TOTL, color = ProvincesName)) +
    geom_point(alpha = 0.7) +
    scale_size(range = c(1.4, 10), name = "Population (M)") +
    # guide = "none" hides the per-province colour legend; guide = FALSE is
    # deprecated and raises a warning.
    scale_color_viridis(discrete = TRUE, guide = "none") +
    theme_ipsum() +
    theme(legend.position = "bottom") +
    geom_text_repel(aes(label = ProvincesName), size = 3)

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning: It is deprecated to specify `guide = FALSE` to remove a guide. Please
## use `guide = "none"` instead.

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning: ggrepel: 14 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

Local Regression

# Local regression (loess) of poverty gap on HDI for 2018, comparing three
# smoothing spans on top of the raw scatter.
lrhdi <- range(df1$IDX.HDI.REV)
plot(df1$IDX.HDI.REV, df1$SI.POV.NGAP,
     xlim = lrhdi, cex = .5, col = "black",
     ylab = "Poverty Gap", xlab = "Human Development Index")
title(" Local Regression ")

# A larger span uses more neighbouring points per local fit.
fit <- loess(SI.POV.NGAP ~ IDX.HDI.REV, span = .5, data = df1)
fit2 <- loess(SI.POV.NGAP ~ IDX.HDI.REV, span = .7, data = df1)
fit3 <- loess(SI.POV.NGAP ~ IDX.HDI.REV, span = .9, data = df1)

# Evaluate the fits on a dense grid spanning the full observed HDI range.
# The earlier unit-step seq() started at a non-integer minimum, so it stopped
# short of the maximum and gave only about twenty points per curve.
hdi.grid <- seq(from = lrhdi[1], to = lrhdi[2], length.out = 200)
grid_df <- data.frame(IDX.HDI.REV = hdi.grid)

span_cols <- c("red", "blue", "dark green")
fits <- list(fit, fit2, fit3)
for (idx in seq_along(fits)) {
  lines(hdi.grid, predict(fits[[idx]], grid_df), col = span_cols[idx], lwd = 2)
}

# NOTE(review): per the package load messages, legend() here resolves to the
# PerformanceAnalytics version that masks graphics::legend.
legend("topright",
       legend = c("Span=0.5", "Span=0.7", "Span=0.9"),
       col = span_cols, lty = 1, lwd = 2, cex = .8)