Indonesia Database for Policy and Economic Research (INDO-DAPOER)
Dataset
Indonesia Database for Policy and Economic Research (INDO-DAPOER) berisi lebih dari 200 indikator tingkat provinsi dan kabupaten yang mencakup empat kategori utama: fiskal, ekonomi, sosial dan demografi, serta infrastruktur. Dalam kesempatan ini akan dilakukan eksplorasi data INDO-DAPOER dengan berfokus pada bidang sosial-ekonomi pada tahun 2018.
Dari lebih dari 200 indikator yang ada, akan dipilih 7 variabel, yaitu:
1. Human Development Index
2. Literacy Rate
3. Number of people employed
4. Number of people underemployed
5. Number of people unemployed
6. Poverty Gap
7. Total Population
library(readxl)
library(ggplot2)## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'pillar'
library(ggpubr)## Warning: package 'ggpubr' was built under R version 4.1.2
## Warning: replacing previous import 'lifecycle::last_warnings' by
## 'rlang::last_warnings' when loading 'hms'
library(dplyr)##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(data.table)## Warning: package 'data.table' was built under R version 4.1.2
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(RColorBrewer)
library(sm)## Warning: package 'sm' was built under R version 4.1.3
## Package 'sm', version 2.2-5.7: type help(sm) for summary information
library(corrplot)## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
library(PerformanceAnalytics)## Warning: package 'PerformanceAnalytics' was built under R version 4.1.3
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
library(tidyverse)## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble 3.1.7 v purrr 0.3.4
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x data.table::between() masks dplyr::between()
## x dplyr::filter() masks stats::filter()
## x xts::first() masks data.table::first(), dplyr::first()
## x dplyr::lag() masks stats::lag()
## x xts::last() masks data.table::last(), dplyr::last()
## x purrr::transpose() masks data.table::transpose()
library(hrbrthemes)## Warning: package 'hrbrthemes' was built under R version 4.1.2
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(viridis)## Warning: package 'viridis' was built under R version 4.1.3
## Loading required package: viridisLite
library(gridExtra)##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(ggrepel)## Warning: package 'ggrepel' was built under R version 4.1.2
library(plotly)##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Load the INDO-DAPOER extract from an Excel workbook.
# NOTE(review): this is an absolute, machine-specific path, so the script only
# runs where the workbook exists at exactly this location.
df <- read_excel("D:/Kuliah/STA563 - Eksplorasi dan Visualisasi Data/Data Indo Dapoer.xlsx")
head(df)## # A tibble: 6 x 9
## ProvincesName Time IDX.HDI.REV SE.LITR.15UP.ZS SL.EMP.TOTL SL.EMP.UNDR
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Bali, Prop. 2015 73.3 0.928 0.560 0.115
## 2 Bali, Prop. 2016 73.6 0.928 0.575 0.123
## 3 Bali, Prop. 2017 74.3 0.929 0.565 0.133
## 4 Bali, Prop. 2018 74.8 0.930 0.580 0.141
## 5 Banten, Prop. 2015 70.3 0.974 0.404 0.213
## 6 Banten, Prop. 2016 71.0 0.975 0.417 0.226
## # ... with 3 more variables: SL.UEM.TOTL <dbl>, SI.POV.NGAP <dbl>,
## # SP.POP.TOTL <dbl>
Melihat summary data
# Restrict the data to the rows whose Time is 2018.
df1 <- filter(df, Time == 2018)
summary(df1)## ProvincesName Time IDX.HDI.REV SE.LITR.15UP.ZS
## Length:34 Min. :2018 Min. :60.06 Min. :0.7679
## Class :character 1st Qu.:2018 1st Qu.:68.87 1st Qu.:0.9370
## Mode :character Median :2018 Median :70.64 Median :0.9789
## Mean :2018 Mean :70.39 Mean :0.9600
## 3rd Qu.:2018 3rd Qu.:71.62 3rd Qu.:0.9884
## Max. :2018 Max. :80.47 Max. :0.9987
## SL.EMP.TOTL SL.EMP.UNDR SL.UEM.TOTL SI.POV.NGAP
## Min. :0.3947 Min. :0.02396 Min. :0.004286 Min. :0.00510
## 1st Qu.:0.4330 1st Qu.:0.07429 1st Qu.:0.010454 1st Qu.:0.01010
## Median :0.4560 Median :0.14800 Median :0.018721 Median :0.01555
## Mean :0.4644 Mean :0.25201 Mean :0.049707 Mean :0.01965
## 3rd Qu.:0.4830 3rd Qu.:0.24699 3rd Qu.:0.042207 3rd Qu.:0.02525
## Max. :0.5803 Max. :1.48480 Max. :0.445903 Max. :0.06730
## SP.POP.TOTL
## Min. : 716407
## 1st Qu.: 2223489
## Median : 3992784
## Mean : 7794568
## 3rd Qu.: 7981467
## Max. :48683861
Bar Chart
# Provinces ranked from highest to lowest 2018 HDI.
hdi <- df1 %>%
  select(ProvincesName, IDX.HDI.REV) %>%
  arrange(desc(IDX.HDI.REV))
head(hdi)## # A tibble: 6 x 2
## ProvincesName IDX.HDI.REV
## <chr> <dbl>
## 1 DKI Jakarta, Prop. 80.5
## 2 DI Yogyakarta, Prop. 79.5
## 3 Kalimantan Timur, Prop. 75.8
## 4 Kepulauan Riau, Prop. 74.8
## 5 Bali, Prop. 74.8
## 6 Riau, Prop. 72.4
Human Development Index (HDI) adalah pengukuran perbandingan dari harapan hidup, melek huruf, pendidikan dan standar hidup. Tiga provinsi dengan HDI terbesar di Indonesia pada tahun 2018 adalah DKI Jakarta, DI Yogyakarta, dan Kalimantan Timur. Jika dilihat dari tahun ke tahun, berikut HDI masing-masing provinsi.
# Yearly HDI for the three provinces with the highest 2018 HDI.
# `filter` is namespaced because stats, dplyr and plotly all provide one.
dfj <- dplyr::filter(df, ProvincesName == "DKI Jakarta, Prop.")
dfy <- dplyr::filter(df, ProvincesName == "DI Yogyakarta, Prop.")
dfkt <- dplyr::filter(df, ProvincesName == "Kalimantan Timur, Prop.")

# Layers shared by every HDI bar chart. Adding a list to a ggplot adds each
# element in order. geom_col() is the idiomatic form of
# geom_bar(stat = "identity"); labels are rounded so they stay readable if the
# raw values carry more decimals.
hdi_bar_layers <- list(
  geom_col(fill = "steelblue"),
  geom_text(
    aes(label = round(IDX.HDI.REV, 2)),
    vjust = 1.6, color = "white", size = 3.5
  ),
  theme_minimal()
)

# Each panel is titled so the provinces can be told apart in the grid.
jkt <- ggplot(data = dfj, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  labs(title = "DKI Jakarta")
ygy <- ggplot(data = dfy, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  labs(title = "DI Yogyakarta")
kalt <- ggplot(data = dfkt, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  labs(title = "Kalimantan Timur")
ggarrange(jkt, ygy, kalt)

# Provinces ranked from lowest to highest 2018 HDI.
hdi2 <- df1 %>%
  select(ProvincesName, IDX.HDI.REV) %>%
  arrange(IDX.HDI.REV)
head(hdi2)## # A tibble: 6 x 2
## ProvincesName IDX.HDI.REV
## <chr> <dbl>
## 1 Papua, Prop. 60.1
## 2 Papua Barat, Prop. 63.7
## 3 Nusa Tenggara Timur, Prop. 64.4
## 4 Sulawesi Barat, Prop. 65.1
## 5 Kalimantan Barat, Prop. 67.0
## 6 Nusa Tenggara Barat, Prop. 67.3
Sementara 3 provinsi dengan HDI terendah adalah Papua, Papua Barat, dan Nusa Tenggara Timur.
# Yearly HDI for the three provinces with the lowest 2018 HDI.
# `filter` is namespaced because stats, dplyr and plotly all provide one.
dfp <- dplyr::filter(df, ProvincesName == "Papua, Prop.")
dfpb <- dplyr::filter(df, ProvincesName == "Papua Barat, Prop.")
dfntt <- dplyr::filter(df, ProvincesName == "Nusa Tenggara Timur, Prop.")

# Layers shared by every HDI bar chart. Adding a list to a ggplot adds each
# element in order. geom_col() is the idiomatic form of
# geom_bar(stat = "identity"); labels are rounded so they stay readable if the
# raw values carry more decimals.
hdi_bar_layers <- list(
  geom_col(fill = "steelblue"),
  geom_text(
    aes(label = round(IDX.HDI.REV, 2)),
    vjust = 1.6, color = "white", size = 3.5
  ),
  theme_minimal()
)

# Each panel is titled so the provinces can be told apart in the grid.
pap <- ggplot(data = dfp, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  labs(title = "Papua")
papb <- ggplot(data = dfpb, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  labs(title = "Papua Barat")
ntt <- ggplot(data = dfntt, aes(x = Time, y = IDX.HDI.REV)) +
  hdi_bar_layers +
  labs(title = "Nusa Tenggara Timur")
ggarrange(pap, papb, ntt)
# Pie Chart
# Provinces ranked from most to least populous in 2018.
pop <- df1 %>%
  select(ProvincesName, SP.POP.TOTL) %>%
  arrange(desc(SP.POP.TOTL))
head(pop)## # A tibble: 6 x 2
## ProvincesName SP.POP.TOTL
## <chr> <dbl>
## 1 Jawa Barat, Prop. 48683861
## 2 Jawa Timur, Prop. 39500851
## 3 Jawa Tengah, Prop. 34490835
## 4 Sumatera Utara, Prop. 14415391
## 5 Banten, Prop. 12689736
## 6 DKI Jakarta, Prop. 10467629
# First six and last six rows of `pop` (which is sorted by descending
# population), i.e. the most and least populous provinces.
top <- head(pop, 6)
bottom <- tail(pop, 6)
pals <- brewer.pal(6, "Paired")

# Slice labels are read from the data instead of being typed by hand, so they
# cannot drift out of sync with the row order of `top` / `bottom`.
# `ph` and `pt` are kept so the names this script defined still exist.
ph <- pie(
  top$SP.POP.TOTL,
  labels = top$ProvincesName,
  border = "white", col = pals
)
pt <- pie(
  bottom$SP.POP.TOTL,
  labels = bottom$ProvincesName,
  col = pals
)

# Histogram
# Histogram of each 2018 indicator, one panel per variable.
# The bin count is set explicitly: the 2018 subset has 34 rows, so ggplot's
# default of 30 bins leaves most bins with zero or one observation.
hist_bins <- 10
a <- ggplot(data = df1, aes(x = IDX.HDI.REV)) +
  geom_histogram(bins = hist_bins, colour = "black", fill = "dark red")
b <- ggplot(data = df1, aes(x = SE.LITR.15UP.ZS)) +
  geom_histogram(bins = hist_bins, colour = "black", fill = "dark green")
c <- ggplot(data = df1, aes(x = SL.EMP.TOTL)) +
  geom_histogram(bins = hist_bins, colour = "black", fill = "light blue")
d <- ggplot(data = df1, aes(x = SL.EMP.UNDR)) +
  geom_histogram(bins = hist_bins, colour = "black", fill = "yellow")
e <- ggplot(data = df1, aes(x = SL.UEM.TOTL)) +
  geom_histogram(bins = hist_bins, colour = "black", fill = "tan")
f <- ggplot(data = df1, aes(x = SI.POV.NGAP)) +
  geom_histogram(bins = hist_bins, colour = "black", fill = "purple")
ggarrange(a, b, c, d, e, f)
Density Plot
# Density curve of each 2018 indicator, one panel per variable.
m <- ggplot(df1, aes(x = IDX.HDI.REV)) +
  geom_density(colour = "black", fill = "dark red")
n <- ggplot(df1, aes(x = SE.LITR.15UP.ZS)) +
  geom_density(colour = "black", fill = "dark green")
o <- ggplot(df1, aes(x = SL.EMP.TOTL)) +
  geom_density(colour = "black", fill = "light blue")
p <- ggplot(df1, aes(x = SL.EMP.UNDR)) +
  geom_density(colour = "black", fill = "yellow")
q <- ggplot(df1, aes(x = SL.UEM.TOTL)) +
  geom_density(colour = "black", fill = "tan")
r <- ggplot(df1, aes(x = SI.POV.NGAP)) +
  geom_density(colour = "black", fill = "purple")
ggarrange(m, n, o, p, q, r)
# Box Plot
# Box plot of each 2018 indicator, one panel per variable.
g <- ggplot(data = df1, aes(x = IDX.HDI.REV)) +
  geom_boxplot(colour = "black", fill = "dark red")
h <- ggplot(data = df1, aes(x = SE.LITR.15UP.ZS)) +
  geom_boxplot(colour = "black", fill = "dark green")
i <- ggplot(data = df1, aes(x = SL.EMP.TOTL)) +
  geom_boxplot(colour = "black", fill = "light blue")
j <- ggplot(data = df1, aes(x = SL.EMP.UNDR)) +
  geom_boxplot(colour = "black", fill = "yellow")
k <- ggplot(data = df1, aes(x = SL.UEM.TOTL)) +
  geom_boxplot(colour = "black", fill = "tan")
l <- ggplot(data = df1, aes(x = SI.POV.NGAP)) +
  geom_boxplot(colour = "black", fill = "purple")
ggarrange(g, h, i, j, k, l)

# Scatter Plot
# Scatter plot of each 2018 indicator against the Human Development Index.
# Columns are read from `df1` directly instead of calling attach(df1) before
# every plot: each repeated attach() pushed another copy of the data frame
# onto the search path and masked the previous one.
# Names are the `df1` columns to plot on the y axis; values are the axis labels.
hdi_scatter_labels <- c(
  SE.LITR.15UP.ZS = "Literacy Rate",
  SL.EMP.TOTL = "Number of People Employed",
  SL.EMP.UNDR = "Number of People Under Employed",
  SL.UEM.TOTL = "Number of People Unemployed",
  SI.POV.NGAP = "Poverty Gap"
)

for (indicator in names(hdi_scatter_labels)) {
  plot(
    df1$IDX.HDI.REV, df1[[indicator]],
    ylab = hdi_scatter_labels[[indicator]],
    xlab = "Human Development Index",
    pch = 19, col = "navy", cex = 1.5
  )
}

# Matrix Correlation
# Correlation matrix of the 2018 indicators.
# The two identifier columns are dropped by name rather than by position, so
# the selection stays correct if the column order of `df1` changes.
dfc <- dplyr::select(df1, -ProvincesName, -Time)
matcorr <- cor(dfc)
matcorr## IDX.HDI.REV SE.LITR.15UP.ZS SL.EMP.TOTL SL.EMP.UNDR SL.UEM.TOTL
## IDX.HDI.REV 1.00000000 0.50914054 0.08437649 0.04092159 0.14931583
## SE.LITR.15UP.ZS 0.50914054 1.00000000 -0.38796882 -0.15628285 0.04508805
## SL.EMP.TOTL 0.08437649 -0.38796882 1.00000000 0.18251748 -0.07200936
## SL.EMP.UNDR 0.04092159 -0.15628285 0.18251748 1.00000000 0.81529331
## SL.UEM.TOTL 0.14931583 0.04508805 -0.07200936 0.81529331 1.00000000
## SI.POV.NGAP -0.67677347 -0.53145049 0.07476628 -0.05124510 -0.15133912
## SP.POP.TOTL 0.13330845 -0.04680084 0.06258340 0.94440144 0.95705430
## SI.POV.NGAP SP.POP.TOTL
## IDX.HDI.REV -0.67677347 0.13330845
## SE.LITR.15UP.ZS -0.53145049 -0.04680084
## SL.EMP.TOTL 0.07476628 0.06258340
## SL.EMP.UNDR -0.05124510 0.94440144
## SL.UEM.TOTL -0.15133912 0.95705430
## SI.POV.NGAP 1.00000000 -0.12725757
## SP.POP.TOTL -0.12725757 1.00000000
# Upper-triangle correlogram with variables ordered by hierarchical clustering.
corrplot(
  matcorr,
  type = "upper", order = "hclust",
  tl.col = "black", tl.srt = 45
)

# Pairwise chart of the same variables, with histograms requested.
chart.Correlation(dfc, histogram = TRUE, pch = 19)

# Bubble Plot
# Bubble plot of HDI against literacy rate; bubble size is population.
# - Population is divided by 1e6 so the values match the "Population (M)"
#   legend title (the previous divisor of 100000 was off by a factor of ten).
# - Rows are sorted by descending population so large bubbles are drawn first
#   and do not cover smaller ones.
# - guide = "none" replaces the deprecated guide = FALSE.
# - max.overlaps = Inf makes ggrepel label every province instead of dropping
#   the labels it considers too crowded.
df1 %>%
  mutate(SP.POP.TOTL = SP.POP.TOTL / 1e6) %>%
  arrange(desc(SP.POP.TOTL)) %>%
  ggplot(aes(
    x = SE.LITR.15UP.ZS, y = IDX.HDI.REV,
    size = SP.POP.TOTL, color = ProvincesName
  )) +
  geom_point(alpha = 0.7) +
  scale_size(range = c(1.4, 10), name = "Population (M)") +
  scale_color_viridis(discrete = TRUE, guide = "none") +
  theme_ipsum() +
  theme(legend.position = "bottom") +
  geom_text_repel(aes(label = ProvincesName), size = 3, max.overlaps = Inf)
Local Regression
# Poverty gap against HDI for 2018, with loess fits at three spans.
lrhdi <- range(df1$IDX.HDI.REV)
plot(
  df1$IDX.HDI.REV, df1$SI.POV.NGAP,
  xlim = lrhdi, cex = .5, col = "black",
  ylab = "Poverty Gap", xlab = "Human Development Index"
)
title(" Local Regression ")

fit <- loess(SI.POV.NGAP ~ IDX.HDI.REV, span = .5, data = df1)
fit2 <- loess(SI.POV.NGAP ~ IDX.HDI.REV, span = .7, data = df1)
fit3 <- loess(SI.POV.NGAP ~ IDX.HDI.REV, span = .9, data = df1)

# seq(from, to) steps by 1, which gave a coarse grid that stopped short of the
# maximum HDI whenever the range was not a whole number. A fixed-length grid
# covers the full observed range and draws smooth curves.
hdi.grid <- seq(from = lrhdi[1], to = lrhdi[2], length.out = 200)
grid_df <- data.frame(IDX.HDI.REV = hdi.grid)

lines(hdi.grid, predict(fit, grid_df), col = "red", lwd = 2)
lines(hdi.grid, predict(fit2, grid_df), col = "blue", lwd = 2)
lines(hdi.grid, predict(fit3, grid_df), col = "dark green", lwd = 2)

# Namespaced because PerformanceAnalytics masks graphics::legend.
graphics::legend(
  "topright",
  legend = c("Span=0.5", "Span=0.7", "Span=0.9"),
  col = c("red", "blue", "dark green"),
  lty = 1, lwd = 2, cex = .8
)