# load the foreign package (provides read.dbf)
library(foreign)
# import the province attribute table (.dbf) from the working directory;
# as.is = FALSE lets read.dbf convert character columns to factors
mydata <- read.dbf("indonesia.dbf", as.is = FALSE)
# show the first rows of the data
head(mydata)
## GID_0 NAME_0 GID_1 NAME_1 VARNAME_1 NL_NAME_1 TYPE_1
## 1 IDN Indonesia IDN.1_1 Aceh <NA> <NA> Propinisi
## 2 IDN Indonesia IDN.2_1 Bali <NA> <NA> Propinisi
## 3 IDN Indonesia IDN.3_1 Bangka Belitung <NA> <NA> Propinisi
## 4 IDN Indonesia IDN.4_1 Banten <NA> <NA> Propinisi
## 5 IDN Indonesia IDN.5_1 Bengkulu <NA> <NA> Propinisi
## 6 IDN Indonesia IDN.6_1 Gorontalo <NA> <NA> Propinisi
## ENGTYPE_1 CC_1 HASC_1
## 1 Province 11 ID.AC
## 2 Province 51 ID.BA
## 3 Province 19 ID.BB
## 4 Province 36 ID.BT
## 5 Province 17 ID.BE
## 6 Province 75 ID.GO
# list the column names available in mydata
colnames(mydata)
## [1] "GID_0" "NAME_0" "GID_1" "NAME_1" "VARNAME_1" "NL_NAME_1"
## [7] "TYPE_1" "ENGTYPE_1" "CC_1" "HASC_1"
# load the packages used to scrape and tidy data from BPS (Statistics Indonesia)
library(rvest)
## Warning: package 'rvest' was built under R version 4.0.5
library(xml2)
## Warning: package 'xml2' was built under R version 4.0.4
library(stringr)
## Warning: package 'stringr' was built under R version 4.0.5
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# download and parse the BPS indicator page (average hourly wage by province);
# note that `url` holds the parsed HTML document, not the address string
url <- "https://www.archive.bps.go.id/indicator/19/1172/1/upah-rata---rata-per-jam-pekerja-menurut-provinsi.html" %>%
  read_html()
url
## {html_document}
## <html lang="en" class="sb-init">
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF- ...
## [2] <body id="top" class="no-touch">\n\t<!-- popup baru -->\n\t<div style="d ...
## [3] <script type="text/javascript">\n\t\n $(document).ready(function() {\ ...
## [4] <script src="/js/mobile/bootstrap.min.js"></script>
## [5] <script type="text/javascript" src="/js/accordion/responsive-accordion.j ...
## [6] <script src="/js/mobile/slidebars.min.js"></script>
## [7] <script type="text/javascript" src="/slick/slick.js"></script>
## [8] <script type="text/javascript" src="/mod/simplemodal/js/jquery.simplemod ...
## [9] <script>\n\t\t\t(function($) {\n\t\t\t\t$(document).ready(function() {\n ...
## [10] <script>\n\t\t\t(function($) {\n\t\t\t\t$(document).ready(function() {\n ...
## [11] <script src="/js/mobile/form-validation.js"></script>
## [12] <script>/*\n\t\t\t(function($) {\n\t\t\t\t$(document).ready(function() { ...
# get the province names from the scraped page
prov <- url %>%
  html_nodes(".vervar") %>%
  html_text()
# keep at most the first 34 entries: unlike prov[1:34], head() never pads the
# result with NA when the page returns fewer nodes than expected.
# str_to_title() lower-cases the "DKI"/"DI" prefixes, so restore them afterwards
prov <- head(prov, 34L) %>%
  str_to_title() %>%
  str_replace("Dki Jakarta", "DKI Jakarta") %>%
  str_replace("Di Yogyakarta", "DI Yogyakarta")
prov
## [1] "Aceh" "Sumatera Utara" "Sumatera Barat"
## [4] "Riau" "Jambi" "Sumatera Selatan"
## [7] "Bengkulu" "Lampung" "Kep. Bangka Belitung"
## [10] "Kep. Riau" "DKI Jakarta" "Jawa Barat"
## [13] "Jawa Tengah" "DI Yogyakarta" "Jawa Timur"
## [16] "Banten" "Bali" "Nusa Tenggara Barat"
## [19] "Nusa Tenggara Timur" "Kalimantan Barat" "Kalimantan Tengah"
## [22] "Kalimantan Selatan" "Kalimantan Timur" "Kalimantan Utara"
## [25] "Sulawesi Utara" "Sulawesi Tengah" "Sulawesi Selatan"
## [28] "Sulawesi Tenggara" "Gorontalo" "Sulawesi Barat"
## [31] "Maluku" "Maluku Utara" "Papua Barat"
## [34] "Papua"
# get the average salary per hour by province (still as text) from the page
salary <- url %>%
  html_nodes(".text-right~ .text-right+ td") %>%
  html_text()
# keep at most the first 34 values: unlike salary[1:34], head() never pads the
# result with NA when the page returns fewer cells than expected
salary <- head(salary, 34L)
salary
## [1] "16772" "15131" "15887" "18626" "16042" "15978" "16501" "13218" "18132"
## [10] "23528" "32685" "19038" "12604" "14916" "15119" "23880" "16857" "11734"
## [19] "13012" "16337" "19795" "18661" "22281" "20320" "18335" "15890" "17257"
## [28] "17542" "14289" "14774" "17006" "18278" "24128" "24097"
# combine the scraped vectors into a data frame, converting the salary
# text to numeric values
mydata2 <- data.frame(
  NAME_1 = prov,
  SALARY = as.double(salary)
)
mydata2
## NAME_1 SALARY
## 1 Aceh 16772
## 2 Sumatera Utara 15131
## 3 Sumatera Barat 15887
## 4 Riau 18626
## 5 Jambi 16042
## 6 Sumatera Selatan 15978
## 7 Bengkulu 16501
## 8 Lampung 13218
## 9 Kep. Bangka Belitung 18132
## 10 Kep. Riau 23528
## 11 DKI Jakarta 32685
## 12 Jawa Barat 19038
## 13 Jawa Tengah 12604
## 14 DI Yogyakarta 14916
## 15 Jawa Timur 15119
## 16 Banten 23880
## 17 Bali 16857
## 18 Nusa Tenggara Barat 11734
## 19 Nusa Tenggara Timur 13012
## 20 Kalimantan Barat 16337
## 21 Kalimantan Tengah 19795
## 22 Kalimantan Selatan 18661
## 23 Kalimantan Timur 22281
## 24 Kalimantan Utara 20320
## 25 Sulawesi Utara 18335
## 26 Sulawesi Tengah 15890
## 27 Sulawesi Selatan 17257
## 28 Sulawesi Tenggara 17542
## 29 Gorontalo 14289
## 30 Sulawesi Barat 14774
## 31 Maluku 17006
## 32 Maluku Utara 18278
## 33 Papua Barat 24128
## 34 Papua 24097
# rename provinces in mydata so they match the names scraped from BPS.
# NAME_1 is coerced to character first because read.dbf may return a factor.
# The patterns are anchored (^...$) so only exact names are rewritten and
# re-running this step cannot stack prefixes (e.g. "DI DI Yogyakarta")
mydata$NAME_1 <- as.character(mydata$NAME_1) %>%
  str_replace("^Yogyakarta$", "DI Yogyakarta") %>%
  str_replace("^Kepulauan Riau$", "Kep. Riau") %>%
  str_replace("^Jakarta Raya$", "DKI Jakarta") %>%
  str_replace("^Bangka Belitung$", "Kep. Bangka Belitung")
mydata$NAME_1
## [1] "Aceh" "Bali" "Kep. Bangka Belitung"
## [4] "Banten" "Bengkulu" "Gorontalo"
## [7] "DKI Jakarta" "Jambi" "Jawa Barat"
## [10] "Jawa Tengah" "Jawa Timur" "Kalimantan Barat"
## [13] "Kalimantan Selatan" "Kalimantan Tengah" "Kalimantan Timur"
## [16] "Kep. Riau" "Lampung" "Maluku"
## [19] "Maluku Utara" "Nusa Tenggara Barat" "Nusa Tenggara Timur"
## [22] "Papua" "Papua Barat" "Riau"
## [25] "Sulawesi Barat" "Sulawesi Selatan" "Sulawesi Tengah"
## [28] "Sulawesi Tenggara" "Sulawesi Utara" "Sumatera Barat"
## [31] "Sumatera Selatan" "Sumatera Utara" "DI Yogyakarta"
# join the scraped salaries to the province table using left_join, which keeps
# every scraped province even when mydata has no matching NAME_1
# (e.g. Kalimantan Utara); then keep only the name and salary columns,
# selected by name rather than by position
myjoindata <- left_join(mydata2, mydata, by = "NAME_1") %>%
  select(NAME_1, SALARY)
myjoindata
## NAME_1 SALARY
## 1 Aceh 16772
## 2 Sumatera Utara 15131
## 3 Sumatera Barat 15887
## 4 Riau 18626
## 5 Jambi 16042
## 6 Sumatera Selatan 15978
## 7 Bengkulu 16501
## 8 Lampung 13218
## 9 Kep. Bangka Belitung 18132
## 10 Kep. Riau 23528
## 11 DKI Jakarta 32685
## 12 Jawa Barat 19038
## 13 Jawa Tengah 12604
## 14 DI Yogyakarta 14916
## 15 Jawa Timur 15119
## 16 Banten 23880
## 17 Bali 16857
## 18 Nusa Tenggara Barat 11734
## 19 Nusa Tenggara Timur 13012
## 20 Kalimantan Barat 16337
## 21 Kalimantan Tengah 19795
## 22 Kalimantan Selatan 18661
## 23 Kalimantan Timur 22281
## 24 Kalimantan Utara 20320
## 25 Sulawesi Utara 18335
## 26 Sulawesi Tengah 15890
## 27 Sulawesi Selatan 17257
## 28 Sulawesi Tenggara 17542
## 29 Gorontalo 14289
## 30 Sulawesi Barat 14774
## 31 Maluku 17006
## 32 Maluku Utara 18278
## 33 Papua Barat 24128
## 34 Papua 24097
# load ggplot2 for plotting
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
# draw a single box plot summarising the hourly salary across all provinces
ggplot(data = myjoindata, mapping = aes(x = NULL, y = SALARY)) +
  geom_boxplot(color = "darkblue", fill = "skyblue") +
  theme_minimal() +
  ggtitle("Box Plot Salary per Hour by Provinsi in Indonesia, 2022") +
  ylab("IDR/H") +
  labs(caption = "Data Source: bps.go.id")
