Data Exploration Using World Bank

The Indonesia Database for Policy and Economic Research (INDO-DAPOER) contains relevant economic and social indicators at the province- and district-level, which span across four main categories: fiscal, economic, social and demographic, as well as infrastructure. This report contains exploration, such as:

Histogram
Boxplot
Density Plot
Pie Chart
Scatter Plot
Matrix Correlation
Bubble Plot
Local Regression

Data Cleansing

library(readxl)
datadapoer <- read_excel("D:/S2 Statistika/Ekplorasi dan Visualisasi/datadapoer.xlsx")
head(datadapoer)

## # A tibble: 6 × 14
##   Provinces Na…¹ Provi…² Serie…³ Serie…⁴ 2011 …⁵ 2012 …⁶ 2013 …⁷ 2014 …⁸ 2015 …⁹
##   <chr>          <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1 Bali, Prop.    ID.BA   Agricu… FC.XPD… 326780… 149132… ..      ..      ..     
## 2 Bali, Prop.    ID.BA   Averag… SE.NEX… ..      ..      ..      ..      ..     
## 3 Bali, Prop.    ID.BA   Averag… SE.NEX… ..      ..      ..      ..      ..     
## 4 Bali, Prop.    ID.BA   Averag… SE.NEX… ..      ..      ..      ..      ..     
## 5 Bali, Prop.    ID.BA   Birth … SH.STA… 96.457  96.9896 98.494… 98.354… 99.809…
## 6 Bali, Prop.    ID.BA   BPK Au… BPK.AU… 2       2       5       1       1      
## # … with 5 more variables: `2016 [YR2016]` <chr>, `2017 [YR2017]` <chr>,
## #   `2018 [YR2018]` <chr>, `2019 [YR2019]` <chr>, `2020 [YR2020]` <chr>, and
## #   abbreviated variable names ¹`Provinces Name`, ²`Provinces Code`,
## #   ³`Series Name`, ⁴`Series Code`, ⁵`2011 [YR2011]`, ⁶`2012 [YR2012]`,
## #   ⁷`2013 [YR2013]`, ⁸`2014 [YR2014]`, ⁹`2015 [YR2015]`

library(tidyverse)
names(datadapoer) <- names(datadapoer) %>% make.names()
head(datadapoer)

## # A tibble: 6 × 14
##   Provinces.Name Provi…¹ Serie…² Serie…³ X2011…⁴ X2012…⁵ X2013…⁶ X2014…⁷ X2015…⁸
##   <chr>          <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
## 1 Bali, Prop.    ID.BA   Agricu… FC.XPD… 326780… 149132… ..      ..      ..     
## 2 Bali, Prop.    ID.BA   Averag… SE.NEX… ..      ..      ..      ..      ..     
## 3 Bali, Prop.    ID.BA   Averag… SE.NEX… ..      ..      ..      ..      ..     
## 4 Bali, Prop.    ID.BA   Averag… SE.NEX… ..      ..      ..      ..      ..     
## 5 Bali, Prop.    ID.BA   Birth … SH.STA… 96.457  96.9896 98.494… 98.354… 99.809…
## 6 Bali, Prop.    ID.BA   BPK Au… BPK.AU… 2       2       5       1       1      
## # … with 5 more variables: X2016..YR2016. <chr>, X2017..YR2017. <chr>,
## #   X2018..YR2018. <chr>, X2019..YR2019. <chr>, X2020..YR2020. <chr>, and
## #   abbreviated variable names ¹Provinces.Code, ²Series.Name, ³Series.Code,
## #   ⁴X2011..YR2011., ⁵X2012..YR2012., ⁶X2013..YR2013., ⁷X2014..YR2014.,
## #   ⁸X2015..YR2015.

Data Filtering

1. Goods and services expenditure

gos <- datadapoer %>% filter(Series.Name=="Goods and services expenditure (in IDR)")  %>% 
  select(c(Provinces.Name,5:14))
colnames(gos) <- c('Provinces', 2011:2020)
library(data.table)
gos <- melt(setDT(gos), id.vars = c("Provinces"), variable.name = "Year")
gos$value <- as.numeric(gos$value)/(10^12)
gos2 <- gos
head(gos)

##               Provinces Year     value
## 1:          Bali, Prop. 2011 0.5093046
## 2:        Banten, Prop. 2011 0.9726174
## 3:      Bengkulu, Prop. 2011 0.2375907
## 4: DI Yogyakarta, Prop. 2011 0.3743235
## 5:   DKI Jakarta, Prop. 2011 8.3418312
## 6:     Gorontalo, Prop. 2011 0.2254455

2. Household Access to Electricity: Total (in % of total household)

elec <- datadapoer %>% filter(Series.Name=="Household Access to Electricity: Total (in % of total household)")  %>% 
  select(c(Provinces.Name,5:14))
colnames(elec) <- c('Provinces', 2011:2020)
library(data.table)
elec <- melt(setDT(elec), id.vars = c("Provinces"), variable.name = "Year")
elec$value <- as.numeric(elec$value)
head(elec)

##               Provinces Year    value
## 1:          Bali, Prop. 2011 98.85030
## 2:        Banten, Prop. 2011 99.27480
## 3:      Bengkulu, Prop. 2011 87.69487
## 4: DI Yogyakarta, Prop. 2011 99.54836
## 5:   DKI Jakarta, Prop. 2011 99.94611
## 6:     Gorontalo, Prop. 2011 80.87733

3. Household per capita expenditure (in IDR)

house <- datadapoer %>% filter(Series.Name=="Household per capita expenditure (in IDR)")  %>% 
  select(c(Provinces.Name,5:14))
colnames(house) <- c('Provinces', 2011:2020)
library(data.table)
house <- melt(setDT(house), id.vars = c("Provinces"), variable.name = "Year")
house$value <- as.numeric(house$value)
head(house)

##               Provinces Year   value
## 1:          Bali, Prop. 2011  760449
## 2:        Banten, Prop. 2011  694545
## 3:      Bengkulu, Prop. 2011  563066
## 4: DI Yogyakarta, Prop. 2011  649901
## 5:   DKI Jakarta, Prop. 2011 1223955
## 6:     Gorontalo, Prop. 2011  480449

4. Tourism and culture function expenditure (in IDR)

tourism <- datadapoer %>% filter(Series.Name=="Tourism and culture function expenditure (in IDR)")  %>% 
  select(c(Provinces.Name,5:14))
colnames(tourism) <- c('Provinces', 2011:2020)
library(data.table)
tourism <- melt(setDT(tourism), id.vars = c("Provinces"), variable.name = "Year")
tourism$value <- as.numeric(tourism$value)

target <- c("Bali, Prop.","DI Yogyakarta, Prop.", 'Jawa Timur, Prop.', "Nusa Tenggara Timur, Prop.")
tourism <- tourism %>% filter(Provinces%in% target) %>% filter(Year==2020)
head(tourism)

##                     Provinces Year        value
## 1:                Bali, Prop. 2020  58659967293
## 2:       DI Yogyakarta, Prop. 2020 313198463331
## 3:          Jawa Timur, Prop. 2020  90073235455
## 4: Nusa Tenggara Timur, Prop. 2020  25083710468

Exploration

1. Histogram

library(ggplot2)
library(hrbrthemes)
target <- c(2019,2020)
gos %>% filter(Year %in% target) %>%
  ggplot( aes(x=value, fill=Year)) +
  geom_histogram( color="#e9ecef", alpha=0.6, position = 'identity') + 
  scale_fill_manual(values=c("#69b3a2", "#404080")) +
  theme_ipsum() +
  ggtitle("Goods and services expenditure \n(in trillion IDR)") + ylab("GOS") + theme(plot.title = element_text(color="black", size=12, face="bold.italic"))

2. Boxplot

library(viridis)
target <- c(2017:2020)
elec %>% filter(Year %in% target) %>%
  ggplot( aes(x=Year, y=value, fill=Year)) +
    geom_boxplot() +
    scale_fill_viridis(discrete = TRUE, alpha=0.6, option="A") +
    theme_ipsum() +
    theme(
      legend.position="none",
      plot.title = element_text(size=11)) +
    ggtitle("Household Access to Electricity: Total (in % of total household)") +
    ylab("elec")

3. Density Plot

library(ggridges)
target <- c(2011,2020)
house %>% filter(Year %in% target) %>% 
  ggplot(aes(x = value, y = Year, fill=Year)) + 
  geom_density_ridges() +
  theme_ridges() +
  theme_ipsum() +
  theme(legend.position = "none")+theme(
      legend.position="none",
      plot.title = element_text(size=11)) +
    ggtitle("Comparison Household per capita expenditure (in IDR) 2011 & 2020")

4. Pie Chart

library(RColorBrewer)
myPalette <- brewer.pal(4, "Set2") 
pie(tourism$value, labels = c("Bali","DI Yogyakarta", 'Jawa Timur', "Nusa Tenggara Timur"), border="white", col=myPalette, 
    main="Tourism and culture function \nexpenditure (Share) 2020")

5. Scatter Plot

# library
elec <- elec %>% filter(Year == 2020)
house <- house %>% filter(Year == 2020)
scat <- cbind(elec$value, house$value)
colnames(scat) <- c("elec","house")
scat <- as.data.frame(scat)
library(ggplot2)
ggplot(scat, aes(x=house, y=elec)) + 
    geom_point(
        color="orange",
        fill="#69b3a2",
        shape=21,
        alpha=0.5,
        size=6,
        stroke = 2) +
         theme(
      legend.position="none",
      plot.title = element_text(size=11)) +
    ggtitle("Relationship between Household Access to Electricity(elec) \nand Household per capita expenditure (house)")

6. Matrix Correlation

elec <- elec %>% filter(Year == 2020)
house <- house %>% filter(Year == 2020)
gos <- gos %>% filter(Year == 2020)
korelasi <- cbind(elec$value, house$value, gos$value)
colnames(korelasi) <- c("elec","house","gos")
korelasi <- as.data.frame(korelasi)

library(corrgram)
corrgram(korelasi, order=NULL, lower.panel=panel.shade, upper.panel=NULL, text.panel=panel.txt, main="Correlation Matrix")

library(PerformanceAnalytics)
chart.Correlation(korelasi, histogram = TRUE, pch= 19)

7. Bubble Plot

gos2 <- gos2 %>% filter(Year == 2020)
korelasi <- cbind(korelasi, gos2$Provinces)

tmp <- korelasi %>%
 dplyr::mutate(
   annotation = case_when(
    house > 1800000 & elec > 99 ~ "yes",
    house < 1000000 ~ "yes",
     elec < 80 ~ "yes"
    )
) %>% mutate(provinces = factor(gos2$Provinces, gos2$Provinces))

library(gridExtra)
library(ggrepel)
library(plotly)

tmp %>%
  ggplot(aes(x=house, y=elec, size = gos,color=provinces )) +
    geom_point(alpha=0.7) +
    scale_size(range = c(1.4, 19), name="GOS") +
    scale_color_viridis(discrete=TRUE, guide=FALSE) +
    theme_ipsum() +
    theme(legend.position="none") +
    geom_text_repel(data=tmp %>% filter(annotation=="yes"), aes(label=provinces), size=4 )

8. Local Regression

gos3 <- datadapoer %>% filter(Series.Name=="Goods and services expenditure (in IDR)")  %>%  select(c(Provinces.Name,5:14))
colnames(gos3) <- c('Provinces', 2011:2020)
gos3 <- melt(setDT(gos3), id.vars = c("Provinces"), variable.name = "Year")
gos3$value <- as.numeric(gos3$value)/(10^9)
gos3 <- gos3 %>% filter(Provinces=="Bali, Prop.")
gos3$index <- 1:nrow(gos3)  # create index variable

loessMod25 <- loess(value ~ index, data=gos3, span=0.25) # 25% smoothing span
loessMod50 <- loess(value ~ index, data=gos3, span=0.50) # 50% smoothing span
loessMod75 <- loess(value ~ index, data=gos3, span=0.75) # 75% smoothing span

smoothed25 <- predict(loessMod25) 
smoothed50 <- predict(loessMod50) 
smoothed75 <- predict(loessMod75)

plot(gos3$value, x=gos3$Year, type="l", main="Bali, Prop. Goods and services expenditure (in billion IDR)", xlab="Year", ylab="GOS")
lines(smoothed25, x=gos3$Year, col="red")
lines(smoothed50, x=gos3$Year, col="green")
lines(smoothed75, x=gos3$Year, col="blue")
legend('bottomright', legend=c('.25', '.50', '.75'),
        col=c('red', 'green', 'blue'), pch=19, title='Smoothing Span')

Data Exploration Using World Bank - INDODAPOER

Deri Siswara

G1501211036