EDA example

This is a quick, simple example of the exploratory data analysis (EDA) step: cleaning a dataset and building some plots.
I used a dataset from here, which is open government data from Thailand. The dataset covers long-term unemployment in 2015-2016.

## Set up the environment by loading the tidyverse package.
library(tidyverse)
## Download the CSV from the Thai National Statistical Office API into a
## data frame. The first column arrives named `X.U.FEFF.year` (see the glimpse
## output); the cleaning step further down relies on that exact name.
unemploy <- read.csv(
  file = "https://apis1.nso.go.th/data?table=OS_02_0021_01&format=csv",
  encoding = "UTF-8"
)
## Take a quick look at the column names, types and first values.
unemploy %>% glimpse()

## Rows: 3,140
## Columns: 9
## $ X.U.FEFF.year <int> 2558, 2558, 2558, 2558, 2558, 2558, 2558, 2558, 2558, 25~
## $ quarter       <chr> "ไตรมาสที่ 1", "ไตรมาสที่ 1", "ไตรมาสที่ 1", "ไตรมาสที่ 1", "ไตร~
## $ region        <chr> "ทั่วประเทศ", "ทั่วประเทศ", "ทั่วประเทศ", "กรุงเทพมหานคร", "ภาค~
## $ area          <chr> "รวม", "ในเขตเทศบาล", "นอกเขตเทศบาล", "ในเขตเทศบาล", "รว~
## $ sex           <chr> "รวม", "รวม", "รวม", "รวม", "รวม", "รวม", "รวม", "รวม", ~
## $ age_group     <chr> "รวม", "รวม", "รวม", "รวม", "รวม", "รวม", "รวม", "รวม", ~
## $ value         <dbl> 0.9, 0.9, 0.9, 1.1, 1.0, 1.0, 1.1, 0.8, 0.9, 0.8, 0.8, 0~
## $ unit          <chr> "ร้อยละ", "ร้อยละ", "ร้อยละ", "ร้อยละ", "ร้อยละ", "ร้อยละ", "ร้~
## $ source        <chr> "สำนักงานสถิติแห่งชาติ", "สำนักงานสถิติแห่งชาติ", "สำนักงานสถิติแห่งชา~

## Preview the first ten rows of the raw data.
unemploy %>% head(n = 10)

##    X.U.FEFF.year     quarter        region         area sex age_group value
## 1           2558 ไตรมาสที่ 1    ทั่วประเทศ          รวม รวม       รวม   0.9
## 2           2558 ไตรมาสที่ 1    ทั่วประเทศ  ในเขตเทศบาล รวม       รวม   0.9
## 3           2558 ไตรมาสที่ 1    ทั่วประเทศ นอกเขตเทศบาล รวม       รวม   0.9
## 4           2558 ไตรมาสที่ 1 กรุงเทพมหานคร  ในเขตเทศบาล รวม       รวม   1.1
## 5           2558 ไตรมาสที่ 1       ภาคกลาง          รวม รวม       รวม   1.0
## 6           2558 ไตรมาสที่ 1       ภาคกลาง  ในเขตเทศบาล รวม       รวม   1.0
## 7           2558 ไตรมาสที่ 1       ภาคกลาง นอกเขตเทศบาล รวม       รวม   1.1
## 8           2558 ไตรมาสที่ 1      ภาคเหนือ          รวม รวม       รวม   0.8
## 9           2558 ไตรมาสที่ 1      ภาคเหนือ  ในเขตเทศบาล รวม       รวม   0.9
## 10          2558 ไตรมาสที่ 1      ภาคเหนือ นอกเขตเทศบาล รวม       รวม   0.8
##      unit                source
## 1  ร้อยละ สำนักงานสถิติแห่งชาติ
## 2  ร้อยละ สำนักงานสถิติแห่งชาติ
## 3  ร้อยละ สำนักงานสถิติแห่งชาติ
## 4  ร้อยละ สำนักงานสถิติแห่งชาติ
## 5  ร้อยละ สำนักงานสถิติแห่งชาติ
## 6  ร้อยละ สำนักงานสถิติแห่งชาติ
## 7  ร้อยละ สำนักงานสถิติแห่งชาติ
## 8  ร้อยละ สำนักงานสถิติแห่งชาติ
## 9  ร้อยละ สำนักงานสถิติแห่งชาติ
## 10 ร้อยละ สำนักงานสถิติแห่งชาติ

## The raw data uses Buddhist Era (BE) years and Thai-language labels,
## so clean it up before doing any analysis.
unemploy <- unemploy %>%
  mutate(
    ## Convert the BE year to a CE year by subtracting 543.
    year = X.U.FEFF.year - 543,
    ## Strip the Thai prefix so only the quarter number is left.
    Q = as.numeric(str_remove(quarter, "ไตรมาสที่ "))
  ) %>%
  ## Drop the original year/quarter columns and the unused metadata columns.
  select(-X.U.FEFF.year, -quarter, -unit, -source) %>%
  ## Give the measurement column a descriptive name.
  rename(percent = value)

## List the distinct region labels present in the data.
unemploy %>% pull(region) %>% unique()

## [1] "ทั่วประเทศ"         "กรุงเทพมหานคร"      "ภาคกลาง"           
## [4] "ภาคเหนือ"           "ภาคตะวันออกเฉียงเหนือ" "ภาคใต้"

## Translate the Thai region labels to English.
## Any label not listed below becomes NA.
unemploy <- unemploy %>%
  mutate(
    region = case_when(
      region == "ทั่วประเทศ" ~ "all",
      region == "กรุงเทพมหานคร" ~ "bangkok",
      region == "ภาคกลาง" ~ "middle",
      region == "ภาคเหนือ" ~ "north",
      region == "ภาคตะวันออกเฉียงเหนือ" ~ "northeast",
      region == "ภาคใต้" ~ "south"
    )
  )

## List the distinct area labels present in the data.
unemploy %>% pull(area) %>% unique()

## [1] "รวม"          "ในเขตเทศบาล"  "นอกเขตเทศบาล"

## Translate the Thai area labels to English.
## Any label not listed below becomes NA.
unemploy <- unemploy %>%
  mutate(
    area = case_when(
      area == "ในเขตเทศบาล" ~ "municipal",
      area == "นอกเขตเทศบาล" ~ "nonmunicipal",
      area == "รวม" ~ "both"
    )
  )

## List the distinct sex labels present in the data.
unemploy %>% pull(sex) %>% unique()

## [1] "รวม" "ชาย" "หญิง"

## Translate the Thai sex labels to English.
## Any label not listed below becomes NA.
unemploy <- unemploy %>%
  mutate(
    sex = case_when(
      sex == "ชาย" ~ "male",
      sex == "หญิง" ~ "female",
      sex == "รวม" ~ "both"
    )
  )

## List the distinct age-group labels present in the data.
unemploy %>% pull(age_group) %>% unique()

## [1] "รวม"      "15-19 ปี" "20-24 ปี" "25-29 ปี" "30-34 ปี" "35-39 ปี" "40-49 ปี"
## [8] "50-59 ปี" "60 ปีขึ้นไป"

## Translate the age-group labels to English and fix their display order.
unemploy <- unemploy %>%
  mutate(
    ## Drop the Thai "years" suffix, e.g. "15-19 ปี" becomes "15-19".
    age_group = str_remove(age_group, " ปี"),
    age_group = case_when(
      age_group == "รวม" ~ "all",
      ## The top bracket is open-ended ("60 years and over"), so label it
      ## "60+"; ">60" would wrongly exclude people aged exactly 60.
      age_group == "60ขึ้นไป" ~ "60+",
      ## Every other label is already an English-readable range.
      TRUE ~ age_group
    ),
    ## Explicit levels keep the brackets in ascending age order (with the
    ## "all" total last) instead of alphabetical order when sorting/plotting.
    age_group = factor(
      age_group,
      levels = c(
        "15-19", "20-24", "25-29", "30-34", "35-39",
        "40-49", "50-59", "60+", "all"
      )
    )
  )

### Keep only the most detailed rows, then order columns and rows for tidiness.
## Rows labelled "both"/"all" are totals over the other categories; they are
## removed so the summaries in the next EDA steps do not count them twice.
unemploy <- unemploy %>%
  filter(
    sex != "both",
    area != "both",
    age_group != "all",
    region != "all"
  ) %>%
  relocate(year, Q, region, area, age_group) %>%
  arrange(year, Q, region, area, age_group, sex)


## Show the final cleaned dataset as an interactive table.
unemploy %>% DT::datatable()

## Explore how unemployment relates to age group within each region.
## NOTE(review): `percent_age` is the sum of every `percent` row in the group
## (all years, quarters, areas and sexes), so bar heights are totals of
## percentages rather than a single rate -- confirm this is what the
## "Unemploy rate" axis is meant to show.
age_region <- unemploy %>%
  group_by(region, age_group) %>%
  summarise(percent_age = sum(percent))

## One panel per region, one bar per age group.
ggplot(age_region, aes(x = age_group, y = percent_age)) +
  geom_col() +
  facet_wrap(~region) +
  ## Rotate the x labels so the age brackets do not overlap.
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Unemploy rate by age group of each region in Thailand",
    x = "Age group",
    y = "Unemploy rate"
  )

## Explore the quarterly trend of unemployment, comparing 2015 with 2016.
## `percent_Q` is the sum of all `percent` rows in each year/quarter.
trend <- unemploy %>%
  ## Guard against total rows; a no-op if they were already filtered out.
  filter(sex != "both", area != "both", age_group != "all", region != "all") %>%
  group_by(year, Q) %>%
  summarise(percent_Q = sum(percent))

## Map colour to the year itself so the legend is derived from the data.
## Writing color = "red" / "blue" inside aes() creates two categories that
## ggplot2 sorts alphabetically ("blue", "red"); positional labels
## c("2015", "2016") then attach "2015" to the 2016 line and vice versa, and
## the literal colour names are never used as colours.
ggplot(trend, aes(x = Q, y = percent_Q, color = factor(year))) +
  geom_line() +
  ylim(0, 300) +
  ## Named values pin each year to its intended colour.
  scale_color_manual(
    name = "year",
    values = c("2015" = "red", "2016" = "blue")
  ) +
  labs(
    title = "Unemploy rate of 2015 compare with 2016 in Thailand",
    x = "Quater of year",
    y = "Unemploy rate"
  )

## Explore how unemployment differs between municipal and non-municipal areas
## within each region. Bangkok is left out of this comparison.
## NOTE(review): `percent_area` is the sum of every `percent` row in the group,
## not a single rate -- confirm this is what the axis is meant to show.
municipal_eda <- unemploy %>%
  filter(region != "bangkok") %>%
  group_by(region, area) %>%
  summarise(percent_area = sum(percent))

## One panel per region, one bar per area type.
ggplot(municipal_eda, aes(x = area, y = percent_area)) +
  geom_col() +
  facet_wrap(~region) +
  labs(
    title = "Unemploy rate of municipal area versus non municipal area for each region in Thailand",
    x = "Area",
    y = "Unemploy rate"
  )

## Explore how unemployment differs by sex within each region.
## NOTE(review): `percent_sex` is the sum of every `percent` row in the group
## (all years, quarters, areas and age groups), not a single rate -- confirm
## this is what the axis is meant to show.
sex_eda <- unemploy %>%
  group_by(region, sex) %>%
  summarise(percent_sex = sum(percent))

## One panel per region, one bar per sex.
ggplot(sex_eda, aes(x = sex, y = percent_sex)) +
  geom_col() +
  facet_wrap(~region) +
  labs(
    title = "Unemploy rate by sex of each region in Thailand",
    x = "Sex",
    y = "Unemploy  rate"
  )

EDA example

Jirapanakorn Sutham

2022-06-11