Tugas Praktikum 1 STA1563

Import Library & Data

library(ggplot2)
library(dplyr)
library(ggcorrplot)
library(plotly)
library(scales)

# Load the assignment data set; the CSV file uses ";" as its field separator.
data_tugas <- read.csv(
  file = "D:/Kuliah Pasca/2021/STA1563 Eksplorasi dan Visualisasi Data/Praktikum/Data Tugas 1.csv",
  sep = ";"
)

Data tugas berasal dari Indonesia Database for Policy and Economic Research. Data tersebut berisi informasi total pengeluaran, pengeluaran modal, dan pengeluaran fungsi pendidikan seluruh provinsi di Indonesia dari tahun 2017 sampai dengan 2020. Selain itu, ditambahkan kolom Island_Group yang berisi pengelompokan provinsi berdasarkan pulau.

# Turn the two categorical columns into factors, then print the structure of
# the data frame to check the resulting column types.
cols <- c("Provinces_Name", "Island_Group")
data_tugas <- data_tugas %>%
  mutate(across(all_of(cols), factor))
str(data_tugas)

## 'data.frame':    34 obs. of  15 variables:
##  $ Provinces_Name: Factor w/ 34 levels "Bali, Prop.",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Provinces_Code: chr  "ID.BA" "ID.BT" "ID.BE" "ID.YO" ...
##  $ Island_Group  : Factor w/ 6 levels "Bali Nusra","Jawa",..: 1 2 6 2 2 5 6 2 2 2 ...
##  $ Total_Ex_2017 : num  6.07e+12 9.51e+12 2.87e+12 4.92e+12 5.11e+13 ...
##  $ Total_Ex_2018 : num  6.00e+12 9.99e+12 2.98e+12 5.30e+12 6.14e+13 ...
##  $ Total_Ex_2019 : num  6.52e+12 1.13e+13 3.12e+12 5.54e+12 6.49e+13 ...
##  $ Total_Ex_2020 : num  6.36e+12 9.88e+12 2.70e+12 5.43e+12 5.21e+13 ...
##  $ Cap_Ex_2017   : num  6.60e+11 1.35e+12 7.11e+11 1.05e+12 1.10e+13 ...
##  $ Cap_Ex_2018   : num  4.41e+11 1.36e+12 6.18e+11 1.13e+12 1.41e+13 ...
##  $ Cap_Ex_2019   : num  5.57e+11 1.38e+12 7.42e+11 1.04e+12 1.16e+13 ...
##  $ Cap_Ex_2020   : num  4.58e+11 9.94e+11 4.17e+11 9.41e+11 3.17e+12 ...
##  $ Edu_Ex_2017   : num  1.84e+12 3.33e+12 6.80e+11 1.46e+12 1.33e+13 ...
##  $ Edu_Ex_2018   : num  1.82e+12 3.54e+12 1.03e+12 1.27e+12 1.58e+13 ...
##  $ Edu_Ex_2019   : num  1.86e+12 3.92e+12 1.07e+12 1.29e+12 2.18e+13 ...
##  $ Edu_Ex_2020   : num  1.90e+12 4.30e+12 1.15e+12 1.75e+12 1.46e+13 ...

Histogram

# Histogram of the 2020 total expenditure per province. Values are divided by
# 1e12 so the axis reads in trillions, matching the axis label.
hist(
  x = data_tugas$Total_Ex_2020 / 1e12,
  breaks = 9,
  col = "lightblue",
  main = "Indonesia's Province Total Expenditure Distribution in 2020",
  xlab = "Total Expenditure (Rp Trillion)"
)

Dari histogram di atas dapat dilihat bahwa sebaran data total pengeluaran provinsi di Indonesia pada tahun 2020 menjulur ke kanan. Selain itu, terdapat indikasi pencilan untuk provinsi dengan pengeluaran di atas 20 triliun rupiah.

Boxplot

# Boxplot of the 2020 total expenditure with the outlying provinces labelled.

# Rescale to trillions once so the statistics, the plot and the labels all use
# exactly the same values.
total_ex_trillion <- data_tugas$Total_Ex_2020 / 1e12

# boxplot() is called only for its statistics (plot = FALSE); a$out holds the
# values it classifies as outliers.
a <- boxplot(total_ex_trillion, plot = FALSE)

# Take the outlier values and their province names from the same logical index
# so each label is always paired with its own point.
is_outlier <- total_ex_trillion %in% a$out
outlier_labels <- data.frame(
  value = total_ex_trillion[is_outlier],
  province = as.character(data_tugas$Provinces_Name[is_outlier])
)

# ggplot() + geom_boxplot() replaces qplot(), which is deprecated as of
# ggplot2 3.4.0.
# NOTE(review): geom_boxplot() computes its own box statistics; confirm the
# points it draws as outliers coincide with the ones labelled from boxplot().
boxplot_2020 <- ggplot(
  data.frame(total_ex = total_ex_trillion),
  aes(y = total_ex)
) +
  geom_boxplot() +
  labs(y = "Total Expenditure (Rp Trillion)") +
  theme_classic()

# Only add the text layer when there is at least one outlier to label.
if (nrow(outlier_labels) > 0) {
  boxplot_2020 <- boxplot_2020 +
    annotate(
      geom = "text",
      # Shift the labels slightly to the right of the points (drawn at x = 0).
      x = rep(0.1, nrow(outlier_labels)),
      y = outlier_labels$value,
      label = outlier_labels$province,
      size = 2.5
    )
}

boxplot_2020

Dari box plot dapat dilihat bahwa median total pengeluaran provinsi di Indonesia pada tahun 2020 ada di sekitar 6 triliun rupiah. Ada 4 provinsi yang termasuk ke dalam pencilan, yaitu DKI Jakarta, Jawa Barat, Jawa Tengah, dan Jawa Timur, dengan total pengeluaran di atas 20 triliun rupiah.

Density Plot

# Density-scaled histogram of the 2020 total expenditure (in trillions) with a
# kernel density estimate drawn on top.
ggplot(data_tugas, aes(x = Total_Ex_2020 / 1e12)) +
  # after_stat(density) replaces the deprecated `..density..` notation. It puts
  # the bar heights on the density scale so they share a y-axis with
  # geom_density().
  geom_histogram(
    aes(y = after_stat(density)),
    colour = "darkblue",
    fill = "lightblue"
  ) +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  labs(x = "Total Expenditure (Rp Trillion)") +
  theme_classic()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Bar Chart

# Bar chart of the 2020 total expenditure (in trillions) summed per island
# group. geom_col() draws bar heights directly from the summarised values.
data_tugas %>%
  group_by(Island_Group) %>%
  summarise(Total_Ex_2020 = sum(Total_Ex_2020 / 1e12)) %>%
  ggplot(aes(x = Island_Group, y = Total_Ex_2020)) +
  geom_col() +
  labs(x = "Group", y = "Total Expenditure (Rp Trillion)") +
  theme_classic()

Dari bar chart dapat dilihat bahwa total pengeluaran provinsi di Pulau Jawa jauh lebih tinggi dibandingkan pulau besar lainnya.

Pie Chart

# Per-island-group share of the 2020 total expenditure (in trillions).
# `perc` is each group's fraction of the overall total and `labels` is that
# fraction formatted as a percentage with two decimals; rows are sorted by
# ascending share.
data_pie <- data_tugas %>%
  group_by(Island_Group) %>%
  summarise(Total_Ex_2020 = sum(Total_Ex_2020 / 1e12)) %>%
  ungroup() %>%
  mutate(perc = Total_Ex_2020 / sum(Total_Ex_2020)) %>%
  arrange(perc) %>%
  mutate(labels = percent(perc, accuracy = 0.01))

# Pie chart with the group name and percentage written inside each slice.
# The plot_ly() call reads only Island_Group and Total_Ex_2020; the `perc` and
# `labels` columns are not referenced here.
plot_ly(
  data_pie,
  labels = ~Island_Group,
  values = ~Total_Ex_2020,
  type = "pie",
  textposition = "inside",
  textinfo = "label+percent"
) %>%
  layout(
    # FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
    showlegend = FALSE,
    xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
    yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE)
  )

Total pengeluaran provinsi di Pulau Jawa pada tahun 2020 hampir mencapai 50% dari total pengeluaran seluruh provinsi di Indonesia.

Scatter Plot

# Scatter plot of 2020 education expenditure against 2020 total expenditure.
# The data stay in their original unit; only the axis tick labels are rescaled
# by 1 / 1e12 (trillions) and printed with two decimals and a decimal comma.
data_tugas %>%
  ggplot(aes(x = Total_Ex_2020, y = Edu_Ex_2020)) +
  geom_point() +
  scale_x_continuous(
    labels = scales::number_format(
      accuracy = 0.01,
      scale = 1 / 1e12,
      decimal.mark = ","
    )
  ) +
  scale_y_continuous(
    labels = scales::number_format(
      accuracy = 0.01,
      scale = 1 / 1e12,
      decimal.mark = ","
    )
  ) +
  labs(
    x = "Total Expenditure (Rp Trillion)",
    y = "Education Function Expenditure (Rp Trillion)"
  )

Dari scatter plot dapat dilihat bahwa hubungan antara total pengeluaran dan pengeluaran untuk pendidikan adalah linier positif.

Correlation Matrix

# Correlation matrix of the 2018-2020 total, capital and education expenditure
# columns, rounded to one decimal and drawn as circles with the values printed.
num_col <- c(
  "Total_Ex_2020", "Total_Ex_2019", "Total_Ex_2018",
  "Cap_Ex_2020", "Cap_Ex_2019", "Cap_Ex_2018",
  "Edu_Ex_2020", "Edu_Ex_2019", "Edu_Ex_2018"
)
corr <- data_tugas[num_col] %>%
  cor() %>%
  round(digits = 1)
ggcorrplot(corr, method = "circle", lab = TRUE)

## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

Dari matriks korelasi di atas dapat disimpulkan bahwa semua data pengeluaran saling berkorelasi positif, di mana korelasi yang paling rendah adalah antara pengeluaran untuk pendidikan pada tahun 2020 dan pengeluaran modal pada tahun 2019 dan 2018.

Bubble Plot

# Load the second assignment data set; the CSV file uses ";" as its field
# separator.
data_tugas1B <- read.csv(
  file = "D:/Kuliah Pasca/2021/STA1563 Eksplorasi dan Visualisasi Data/Praktikum/Data Tugas 1B.csv",
  sep = ";"
)

# Turn the two categorical columns into factors, then print the structure of
# the data frame to check the resulting column types.
cols <- c("Provinces_Name", "Island_Group")
data_tugas1B <- data_tugas1B %>%
  mutate(across(all_of(cols), factor))
str(data_tugas1B)

## 'data.frame':    34 obs. of  7 variables:
##  $ Provinces_Name : Factor w/ 34 levels "Bali, Prop.",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Provinces_Code : chr  "ID.BA" "ID.BT" "ID.BE" "ID.YO" ...
##  $ Total_Ex_2018  : num  6.00e+12 9.99e+12 2.98e+12 5.30e+12 6.14e+13 ...
##  $ Total_Pop_2018 : int  4292154 12689736 1963300 3802872 10467629 1185492 3570272 48683861 34490835 39500851 ...
##  $ Total_Rev_2018 : num  6.26e+12 1.03e+13 2.85e+12 5.44e+12 6.12e+13 ...
##  $ Num_PeoEmp_2018: int  2490870 5332496 963463 2118392 4726779 555533 1721362 20779888 17245548 20449949 ...
##  $ Island_Group   : Factor w/ 6 levels "Bali Nusra","Jawa",..: 1 2 6 2 2 5 6 2 2 2 ...

# Bubble plot for 2018: total expenditure (x, in trillions) against number of
# people employed (y, in millions). Point size encodes population in millions
# and point colour encodes the island group.
ggplot(
  data_tugas1B,
  aes(
    x = Total_Ex_2018 / 1e12,
    y = Num_PeoEmp_2018 / 1e6,
    size = Total_Pop_2018 / 1e6,
    color = Island_Group
  )
) +
  # Semi-transparent points keep overlapping bubbles visible.
  geom_point(alpha = 0.3) +
  scale_size(range = c(1.4, 12), name = "Population (Million)") +
  labs(
    x = "Total Expenditure (Rp Trillion)",
    y = "Number of People Employed (Million)"
  )

Local Regression

# Local regression (loess) of 2018 total revenue on the number of people
# employed, fitted with three smoothing spans.
# NOTE(review): str() reports 34 rows, so span = 0.10 leaves only a handful of
# points for each local fit - confirm loess() runs without warnings there.
loessMod10 <- loess(Total_Rev_2018 ~ Num_PeoEmp_2018, data = data_tugas1B, span = 0.10) # 10% smoothing span
loessMod25 <- loess(Total_Rev_2018 ~ Num_PeoEmp_2018, data = data_tugas1B, span = 0.25) # 25% smoothing span
loessMod50 <- loess(Total_Rev_2018 ~ Num_PeoEmp_2018, data = data_tugas1B, span = 0.50) # 50% smoothing span

# Fitted values at the observed x values (same order as the rows used in the
# fit; assumes no rows were dropped for missing values).
smoothed10 <- predict(loessMod10)
smoothed25 <- predict(loessMod25)
smoothed50 <- predict(loessMod50)

# lines() joins points in the order given, and the rows of data_tugas1B are not
# sorted by Num_PeoEmp_2018, so the curves are drawn in increasing x to avoid
# zig-zagging back and forth across the plot.
ord <- order(data_tugas1B$Num_PeoEmp_2018)
emp_sorted <- data_tugas1B$Num_PeoEmp_2018[ord]

# The raw observations are drawn as points: joining unsorted observations with
# a line would suggest an ordering that the data do not have.
plot(
  x = data_tugas1B$Num_PeoEmp_2018,
  y = data_tugas1B$Total_Rev_2018,
  type = "p",
  main = "Loess Smoothing and Prediction",
  xlab = "Number of People Employed",
  ylab = "Total Revenue (Rp)"
)
lines(x = emp_sorted, y = smoothed10[ord], col = "red")
lines(x = emp_sorted, y = smoothed25[ord], col = "green")
lines(x = emp_sorted, y = smoothed50[ord], col = "blue")

# The legend keys use line segments (lty) because the fits are drawn as lines.
legend(
  "bottomright",
  legend = c(".1", ".25", ".5"),
  col = c("red", "green", "blue"),
  lty = 1,
  title = "Smoothing Span"
)