# activate package foreign
library(foreign)
# import the .dbf table from the current working directory;
# as.is = FALSE lets read.dbf convert character columns to factors
mydata <- read.dbf("indonesia.dbf", as.is = FALSE)
# show the first rows of the data
head(mydata)
##   GID_0    NAME_0   GID_1          NAME_1 VARNAME_1 NL_NAME_1    TYPE_1
## 1   IDN Indonesia IDN.1_1            Aceh      <NA>      <NA> Propinisi
## 2   IDN Indonesia IDN.2_1            Bali      <NA>      <NA> Propinisi
## 3   IDN Indonesia IDN.3_1 Bangka Belitung      <NA>      <NA> Propinisi
## 4   IDN Indonesia IDN.4_1          Banten      <NA>      <NA> Propinisi
## 5   IDN Indonesia IDN.5_1        Bengkulu      <NA>      <NA> Propinisi
## 6   IDN Indonesia IDN.6_1       Gorontalo      <NA>      <NA> Propinisi
##   ENGTYPE_1 CC_1 HASC_1
## 1  Province   11  ID.AC
## 2  Province   51  ID.BA
## 3  Province   19  ID.BB
## 4  Province   36  ID.BT
## 5  Province   17  ID.BE
## 6  Province   75  ID.GO
# list the column (variable) names available in mydata
colnames(mydata)
##  [1] "GID_0"     "NAME_0"    "GID_1"     "NAME_1"    "VARNAME_1" "NL_NAME_1"
##  [7] "TYPE_1"    "ENGTYPE_1" "CC_1"      "HASC_1"
# load the packages used for scraping BPS statistics data
library(rvest)
## Warning: package 'rvest' was built under R version 4.0.5
library(xml2)
## Warning: package 'xml2' was built under R version 4.0.4
library(stringr)
## Warning: package 'stringr' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# download and parse the BPS page on average hourly wage by province
url <- "https://www.archive.bps.go.id/indicator/19/1172/1/upah-rata---rata-per-jam-pekerja-menurut-provinsi.html" %>%
  read_html()
# display a summary of the parsed HTML document
print(url)
## {html_document}
## <html lang="en" class="sb-init">
##  [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF- ...
##  [2] <body id="top" class="no-touch">\n\t<!-- popup baru -->\n\t<div style="d ...
##  [3] <script type="text/javascript">\n\t\n    $(document).ready(function() {\ ...
##  [4] <script src="/js/mobile/bootstrap.min.js"></script>
##  [5] <script type="text/javascript" src="/js/accordion/responsive-accordion.j ...
##  [6] <script src="/js/mobile/slidebars.min.js"></script>
##  [7] <script type="text/javascript" src="/slick/slick.js"></script>
##  [8] <script type="text/javascript" src="/mod/simplemodal/js/jquery.simplemod ...
##  [9] <script>\n\t\t\t(function($) {\n\t\t\t\t$(document).ready(function() {\n ...
## [10] <script>\n\t\t\t(function($) {\n\t\t\t\t$(document).ready(function() {\n ...
## [11] <script src="/js/mobile/form-validation.js"></script>
## [12] <script>/*\n\t\t\t(function($) {\n\t\t\t\t$(document).ready(function() { ...
# extract the text of every node with class "vervar" (the province names)
prov <- html_text(html_nodes(url, ".vervar"))
# keep only the first 34 matches and convert them to title case
prov <- str_to_title(prov[1:34])
# title case lower-cases the abbreviations, so restore "DKI" and "DI"
prov <- str_replace(prov, "Dki Jakarta", "DKI Jakarta")
prov <- str_replace(prov, "Di Yogyakarta", "DI Yogyakarta")
prov
##  [1] "Aceh"                 "Sumatera Utara"       "Sumatera Barat"      
##  [4] "Riau"                 "Jambi"                "Sumatera Selatan"    
##  [7] "Bengkulu"             "Lampung"              "Kep. Bangka Belitung"
## [10] "Kep. Riau"            "DKI Jakarta"          "Jawa Barat"          
## [13] "Jawa Tengah"          "DI Yogyakarta"        "Jawa Timur"          
## [16] "Banten"               "Bali"                 "Nusa Tenggara Barat" 
## [19] "Nusa Tenggara Timur"  "Kalimantan Barat"     "Kalimantan Tengah"   
## [22] "Kalimantan Selatan"   "Kalimantan Timur"     "Kalimantan Utara"    
## [25] "Sulawesi Utara"       "Sulawesi Tengah"      "Sulawesi Selatan"    
## [28] "Sulawesi Tenggara"    "Gorontalo"            "Sulawesi Barat"      
## [31] "Maluku"               "Maluku Utara"         "Papua Barat"         
## [34] "Papua"
# extract the average salary per hour for each province (as text)
salary <- html_text(html_nodes(url, ".text-right~ .text-right+ td"))
# keep only the first 34 values, matching the 34 province names kept in prov
salary <- salary[seq_len(34)]
salary
##  [1] "16772" "15131" "15887" "18626" "16042" "15978" "16501" "13218" "18132"
## [10] "23528" "32685" "19038" "12604" "14916" "15119" "23880" "16857" "11734"
## [19] "13012" "16337" "19795" "18661" "22281" "20320" "18335" "15890" "17257"
## [28] "17542" "14289" "14774" "17006" "18278" "24128" "24097"
# build a data frame from the scraped data: start with the province names ...
mydata2 <- data.frame(NAME_1 = prov)
# ... then add the salary, converted from text to numeric
mydata2$SALARY <- as.numeric(salary)
mydata2
##                  NAME_1 SALARY
## 1                  Aceh  16772
## 2        Sumatera Utara  15131
## 3        Sumatera Barat  15887
## 4                  Riau  18626
## 5                 Jambi  16042
## 6      Sumatera Selatan  15978
## 7              Bengkulu  16501
## 8               Lampung  13218
## 9  Kep. Bangka Belitung  18132
## 10            Kep. Riau  23528
## 11          DKI Jakarta  32685
## 12           Jawa Barat  19038
## 13          Jawa Tengah  12604
## 14        DI Yogyakarta  14916
## 15           Jawa Timur  15119
## 16               Banten  23880
## 17                 Bali  16857
## 18  Nusa Tenggara Barat  11734
## 19  Nusa Tenggara Timur  13012
## 20     Kalimantan Barat  16337
## 21    Kalimantan Tengah  19795
## 22   Kalimantan Selatan  18661
## 23     Kalimantan Timur  22281
## 24     Kalimantan Utara  20320
## 25       Sulawesi Utara  18335
## 26      Sulawesi Tengah  15890
## 27     Sulawesi Selatan  17257
## 28    Sulawesi Tenggara  17542
## 29            Gorontalo  14289
## 30       Sulawesi Barat  14774
## 31               Maluku  17006
## 32         Maluku Utara  18278
## 33          Papua Barat  24128
## 34                Papua  24097
# rename some provinces in mydata so they match the names scraped into prov
mydata$NAME_1 <- str_replace(mydata$NAME_1, "Yogyakarta", "DI Yogyakarta")
mydata$NAME_1 <- str_replace(mydata$NAME_1, "Kepulauan Riau", "Kep. Riau")
mydata$NAME_1 <- str_replace(mydata$NAME_1, "Jakarta Raya", "DKI Jakarta")
mydata$NAME_1 <- str_replace(
  mydata$NAME_1, "Bangka Belitung", "Kep. Bangka Belitung"
)
mydata$NAME_1
##  [1] "Aceh"                 "Bali"                 "Kep. Bangka Belitung"
##  [4] "Banten"               "Bengkulu"             "Gorontalo"           
##  [7] "DKI Jakarta"          "Jambi"                "Jawa Barat"          
## [10] "Jawa Tengah"          "Jawa Timur"           "Kalimantan Barat"    
## [13] "Kalimantan Selatan"   "Kalimantan Tengah"    "Kalimantan Timur"    
## [16] "Kep. Riau"            "Lampung"              "Maluku"              
## [19] "Maluku Utara"         "Nusa Tenggara Barat"  "Nusa Tenggara Timur" 
## [22] "Papua"                "Papua Barat"          "Riau"                
## [25] "Sulawesi Barat"       "Sulawesi Selatan"     "Sulawesi Tengah"     
## [28] "Sulawesi Tenggara"    "Sulawesi Utara"       "Sumatera Barat"      
## [31] "Sumatera Selatan"     "Sumatera Utara"       "DI Yogyakarta"
# join data using left_join: every row of mydata2 is kept and matched to
# mydata by NAME_1; only the NAME_1 and SALARY columns are retained afterwards
myjoindata <- left_join(mydata2, mydata, by = "NAME_1") %>%
  select(NAME_1, SALARY)
myjoindata
##                  NAME_1 SALARY
## 1                  Aceh  16772
## 2        Sumatera Utara  15131
## 3        Sumatera Barat  15887
## 4                  Riau  18626
## 5                 Jambi  16042
## 6      Sumatera Selatan  15978
## 7              Bengkulu  16501
## 8               Lampung  13218
## 9  Kep. Bangka Belitung  18132
## 10            Kep. Riau  23528
## 11          DKI Jakarta  32685
## 12           Jawa Barat  19038
## 13          Jawa Tengah  12604
## 14        DI Yogyakarta  14916
## 15           Jawa Timur  15119
## 16               Banten  23880
## 17                 Bali  16857
## 18  Nusa Tenggara Barat  11734
## 19  Nusa Tenggara Timur  13012
## 20     Kalimantan Barat  16337
## 21    Kalimantan Tengah  19795
## 22   Kalimantan Selatan  18661
## 23     Kalimantan Timur  22281
## 24     Kalimantan Utara  20320
## 25       Sulawesi Utara  18335
## 26      Sulawesi Tengah  15890
## 27     Sulawesi Selatan  17257
## 28    Sulawesi Tenggara  17542
## 29            Gorontalo  14289
## 30       Sulawesi Barat  14774
## 31               Maluku  17006
## 32         Maluku Utara  18278
## 33          Papua Barat  24128
## 34                Papua  24097
# activate library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
# Draw one box plot summarising the SALARY values of all provinces
# (x = NULL, so the data are not split into groups along the x axis)
myjoindata %>%
  ggplot(aes(x = NULL, y = SALARY)) +
  geom_boxplot(color = "darkblue", fill = "skyblue") +
  labs(
    title = "Box Plot Salary per Hour by Provinsi in Indonesia, 2022",
    caption = "Data Source: bps.go.id",
    y = "IDR/H"
  ) +
  theme_minimal()