library(dplyr)
library(lubridate)
library(ggplot2)
library(plotly)
library(glue)
library(tidyr)
library(tidytext)
library(scales)
library(hrbrthemes)

1. Data Source

For LBB (learning by building) - “Programming for Data Science” assignment , i’m using Export & Import data which classified by SITC (Standard International Trade Classification) with period ranging from the year 2011 up to 2020. Data was taken from Indonesia Central Bureau of Statistics website, export & import section.


2. Read, Check and Clean Data

Importing Data

Import data and assign it into object named “exim”. Data format is in CSV, but all information are stored in separate columns, thus i use additional import function (“sep=”;“), otherwise data will be imported to RMD and combined into single column.

exim <- read.csv("exim-menurut-golongan.csv", sep=";")

Checking Data

#checking data type
str(exim)
## 'data.frame':    200 obs. of  6 variables:
##  $ Exim                       : chr  "Export" "Export" "Export" "Export" ...
##  $ nama_item_vertical_variabel: chr  "0. Bahan makanan dan binatang hidup" "1. Minuman dan tembakau" "2. Bahan-bahan mentah, tidak untuk dimakan" "3. Bahan bakar pelikan, bahan penyemir dan bahan-bahan yang berkenaan dengan itu" ...
##  $ nama_tahun                 : int  2020 2020 2020 2020 2020 2020 2020 2020 2020 2020 ...
##  $ nama_turunan_tahun         : chr  "Tahun" "Tahun" "Tahun" "Tahun" ...
##  $ nilai                      : num  14565 1244 14034 25502 19710 ...
##  $ volume                     : num  12862 324 48568 436554 28776 ...
#checking missing value
colSums(is.na(exim))
##                        Exim nama_item_vertical_variabel 
##                           0                           0 
##                  nama_tahun          nama_turunan_tahun 
##                           0                           0 
##                       nilai                      volume 
##                           0                           0

Data Cleaning

Based on data checking above, there are several data type “character” that need to be converted into “factor”.

exim <- exim %>% 
        mutate_if(is.character, as.factor) %>% 
        mutate(nama_tahun = as.factor(nama_tahun))

nama_item_vertical_variabel need to be clean and modify to be more intuitive.

#To shorten information in column "nama_item_vertical_variabel"
exim$nama_item_vertical_variabel <- sapply(as.character(exim$nama_item_vertical_variabel), switch,
                                  "0. Bahan makanan dan binatang hidup" = "Food and lives animal",
                                  "1. Minuman dan tembakau" = "Beverages and tobacco",
                                  "2. Bahan-bahan mentah, tidak untuk dimakan" = "Crude materials, inedible, except fuels",
                                  "3. Bahan bakar pelikan, bahan penyemir dan bahan-bahan yang berkenaan dengan itu" = "Minerals fuels, lubricants and related materials",
                                  "4. Lemak serta minyak hewan dan nabati" = "Animal and vegetable oil, fats and waxes",
                                  "5. Bahan-bahan kimia" = "Chemical and related products",
                                  "6. Barang-barang buatan pabrik dirinci menurut bahan" = "Manufactured goods classified by material",
                                  "7. Mesin dan alat pengangkutan" = "Machinery and transport equipment",
                                  "8. Berbagai jenis barang buatan pabrik" = "Miscellaneous manufactured articles",
                                  "9. Barang-barang transaksi tidak dirinci" = "Commodities and transaction not classified")

exim$nama_item_vertical_variabel <- as.factor(exim$nama_item_vertical_variabel)
unique(exim$nama_turunan_tahun) #To check unique value in `nama_turunan_tahun`
## [1] Tahun
## Levels: Tahun

Noted nama_tunuran_tahun column only contain single information “Tahun”, thus we will delete it from DF.

exim <- exim[-c(4)] #To delete "nama_turunan_tahun" column, since it contain unnecessary information.
#To change columns name to be more understandable / intuitive
names(exim)[1] <- "Category"
names(exim)[2] <- "Classification_SITC"
names(exim)[3] <- "Years"
names(exim)[4] <- "Value"
names(exim)[5] <- "Volume"

3. Data Explanation

#To re-check data type, column name and "classification_SITC" information.
glimpse(exim)
## Rows: 200
## Columns: 5
## $ Category            <fct> Export, Export, Export, Export, Export, Export, Ex~
## $ Classification_SITC <fct> "Food and lives animal", "Beverages and tobacco", ~
## $ Years               <fct> 2020, 2020, 2020, 2020, 2020, 2020, 2020, 2020, 20~
## $ Value               <dbl> 14565.1, 1243.6, 14034.4, 25501.8, 19709.5, 12659.~
## $ Volume              <dbl> 12861.5, 324.1, 48568.0, 436554.2, 28775.7, 16692.~

Category : Categorize by Export / Import.

Classification_SITC : information of Export / Import classified based on SITC.

Years : Categorize by Years.

Value : Export / Import value in $ million.

Volume : Export / Import volume in kilotonne.

The Standard international trade classification, abbreviated as SITC, is a product classification of the United Nations (UN) used for external trade statistics (export and Import values and volumes of goods), allowing for international comparisons of commodities and manufactured goods.

The groupings of SITC reflect:

-the production materials

-the processing stage

-market practices and uses of the products

-the importance of the goods in world trade

-technological changes

SITC Source

4. Plot and Analysis

Total Export & Import 2011 - 2020

EXIM Value

exim_val <- exim %>% 
  group_by(Years ,Category) %>% 
  summarise(total_value = sum(Value)) %>% 
  mutate(label = glue("Value = {comma(round(total_value,0))}"))


exim_val_plot <- ggplot(exim_val, aes(fill = Category, x = Years, y = total_value, text = label))+
  geom_bar(position = "dodge", stat = "identity")+
  scale_fill_manual(values=c("lightseagreen", "navyblue"))+
  scale_y_continuous(labels = scales::comma)+
  ggtitle("Indonesia Export & Import (Value)")+
  labs(x = "Years",
       y = "Value in $ Million", fill = "")+
  theme_minimal()+
  theme(axis.text = element_text(colour = "royalblue"), 
        plot.title = element_text(hjust = 0.5))

ggplotly(exim_val_plot, tooltip = "label")

EXIM Volume

exim_vol <- exim %>% 
  group_by(Years ,Category) %>% 
  summarise(total_vol = sum(Volume)) %>% 
  mutate(label = glue("Value = {comma(round(total_vol,0))}"))

exim_vol_plot <- ggplot(data = exim_vol, aes(fill = Category, x = Years, y = total_vol, text = label))+
  geom_bar(position = "dodge", stat = "identity")+
  scale_fill_manual(values=c("burlywood1", "darkorange"))+
  scale_y_continuous(labels = scales::comma)+
  ggtitle("Indonesia Export & Import (Volume)")+
  labs(x = "Years",
       y = "Volume in Killo Tonne", fill = "")+
  theme_minimal()+
  theme(axis.text = element_text(colour = "royalblue"), 
        plot.title = element_text(hjust = 0.5))

ggplotly(exim_vol_plot, tooltip = "label") 

Interpretations:

  • Export Value tend to decrease from its peak in 2011 to 2016, this is due to government regulation in mining industry related to mineral ore export policy. In 2017, mining export policy is unveiled, whereas under certain condition export of mineral ore is re-allowed. Source
  • Import Value also tend to decrease from its peak in 2012 to 2016, main reason due to decrease of production from import country of origin such as US and China. Source

EXIM Value and Volume Correlation

Export

exim_scat_ex <- exim %>%  
  filter(Category == "Export") %>% 
  group_by(Category, Years) %>% 
  summarise(val = sum(Value), vol = sum(Volume)) %>% 
  mutate(label = glue("Year = {Years}
                      Value = {comma(round(val,2))}
                      Volume = {comma(round(vol,2))}"))

exim_scat_imp_plot <- ggplot(data = exim_scat_ex, aes(x = val, y = vol, colour = Years, text = label)) +
  geom_point(size = 6)+
  scale_y_continuous(labels = scales::comma)+
  scale_x_continuous(labels = scales::comma)+
  ggtitle("Export Value & Volume Correlation")+
  labs(x = "Value in $ Million",
       y = "Volume in Kilotonne ")+
  theme_minimal()+
    theme(legend.position = "none",
          plot.title = element_text(hjust = 0.5))

ggplotly(exim_scat_imp_plot, tooltip = "label")

Import

exim_scat_im <- exim %>% 
  filter(Category == "Import") %>% 
  group_by(Category, Years) %>% 
  summarise(val = sum(Value), vol = sum(Volume)) %>% 
  mutate(label = glue("Year = {Years}
                      Value = {comma(round(val,2))}
                      Volume = {comma(round(vol,2))}"))

exim_scat_im_plot <- ggplot(data = exim_scat_im, aes(x = val, y = vol, color = Years, text = label)) +
  geom_point(size = 6)+
  scale_y_continuous(labels = scales::comma)+
  scale_x_continuous(labels = scales::comma)+
  ggtitle("Import Value & Volume Correlation")+
  labs(x = "Value in $ Million",
       y = "Volume in Kilotonne ")+
  theme_minimal()+
    theme(legend.position = "none", 
          plot.title = element_text(hjust = 0.5))

ggplotly(exim_scat_im_plot, tooltip = "label")

Interpretations:

  • From graphic above we can assume that price per volume have a high fluctuations.
  • In Export chart, year “2013” have the highest Volume but not in Value, this has contra condition with the year “2011”.
  • Same case also occur in Import chart, where in the year “2018”, “2012” and “2013” Value amount are close, but not in Volume.

EXIM SITC Contibution

Due to there is a large range gap between Export and Import Volume, instead of using facet_wrap, we decide to split the chart.

By Value EXIM

exim_stack <- exim %>% 
  mutate(label = glue("Category = {Classification_SITC}
                      Value = {comma(round(Value,0))}"))


exim_stack_plot <- ggplot(data = exim_stack, aes(x = Years, y = Value, fill = Classification_SITC, text = label)) +
  geom_bar(position = "stack", stat = "identity")+
  scale_y_continuous(labels = scales::comma)+
  ggtitle("Export - Import by SITC (Value)")+
  labs(x = "Year",
       y = "Value in $ Million", fill = "")+
  theme_minimal()+
  theme(legend.position = "none", 
        plot.title = element_text(hjust = 0.5))+
  facet_wrap(~ Category) 

ggplotly(exim_stack_plot, tooltip = "label")

By Volume Export

exim_stack_ex <- exim %>% 
  filter(Category == "Export") %>% 
  group_by(Category, Years, Classification_SITC) %>% 
  summarise(val_ex = sum(Volume)) %>% 
  mutate(label = glue("Volume = {comma(round(val_ex,0))}"))


exim_stack_ex_plot <- ggplot(data = exim_stack_ex, aes(x = Years, y = val_ex, fill = Classification_SITC, text = label)) +
  geom_bar(position = "stack", stat = "identity")+
  scale_y_continuous(labels = scales::comma)+
  ggtitle("Export by SITC (Volume)")+
  labs(x = "Year",
       y = "Volume in Killotonne", fill = "")+
  theme_minimal()+
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(exim_stack_ex_plot, tooltip = "label")

By Volume Import

exim_stack_imp <- exim %>% 
  filter(Category == "Import") %>% 
  group_by(Category, Years, Classification_SITC) %>% 
  summarise(val_imp = sum(Volume)) %>% 
  mutate(label = glue("Volume = {comma(round(val_imp,0))}"))


exim_stack_imp_plot <- ggplot(data = exim_stack_imp, aes(x = Years, y = val_imp, fill = Classification_SITC, text = label)) +
  geom_bar(position = "stack", stat = "identity")+
  scale_y_continuous(labels = scales::comma)+
  ggtitle("Import by SITC (Volume)")+
  labs(x = "Year",
       y = "Volume in Killotonne", fill = "")+
  theme_minimal()+
  theme(plot.title = element_text(hjust = 0.5))

ggplotly(exim_stack_imp_plot, tooltip = "label")

Interpretations:

  • “Minerals fuels, lubricants and related materials” class has significant contribution in both Export-Import Value and Volume. As seen from chart above, if “Minerals fuels, lubricants and related materials” class decrease, it affecting the whole “Bar” in each Years.
  • From the Import side, “Animal and vegetable oil, fats and waxes” has the lowest or almost no contribution.
  • Import “Machinery and transport equipment” contribute huge amount of Value but low in Volume, as machinery part and equipment is considered as an expensive goods.