Kiểm tra dữ liệu thiếu (missing data) theo từng năm.

library(VIM)
library(dplyr)

# Sorted vector of the distinct values found in the `year` column of `raw`.
# distinct() keeps one row per year value, arrange() orders them ascending,
# and pull() extracts the column as a plain vector.
year <- raw %>%
  distinct(year) %>%
  arrange(year) %>%
  pull(year)

# For every value in `year`, run VIM's aggr() on the rows of `raw` belonging
# to that year, then print the year so each block of output can be identified.
for (i in year) {
  aggr(
    # Inside filter(), `year` is resolved as the column of `raw` (data
    # masking), not the global vector being iterated over.
    filter(raw, year == i),
    col = c('navyblue', 'yellow'),
    numbers = TRUE,
    sortVars = TRUE,  # prints the "Variables sorted by number of missings" table
    labels = names(raw),
    cex.axis = 0.7,
    gap = 3,
    ylab = c("Du Lieu Thieu", "Pattern")
  )
  print(i)
}

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##                     rural 1.000000000
##                      poor 1.000000000
##          no_boards_member 1.000000000
##        social_orientation 1.000000000
##          social_committee 1.000000000
##                social_exp 1.000000000
##                     infra 1.000000000
##           cost_per_borrow 0.473118280
##                 bhr_scale 0.231182796
##         cost_per_borrower 0.043010753
##                       nab 0.021505376
##              ln_borrowers 0.021505376
##                 ln_assets 0.005376344
##         gross_loan_profit 0.005376344
##                        id 0.000000000
##                   id_name 0.000000000
##                      year 0.000000000
##                id_country 0.000000000
##                       roa 0.000000000
##                       roe 0.000000000
##                       oss 0.000000000
##  avg_loan_gni_per_capital 0.000000000
##       capital_asset_ratio 0.000000000
##           deposit_to_loan 0.000000000
##   female_borrowers_percen 0.000000000
##   no_boards_member_female 0.000000000
##                     lngdp 0.000000000
##                      ggdp 0.000000000
##            internet_users 0.000000000
##            portfolio_risk 0.000000000
##            loan_loss_rate 0.000000000
## [1] 2006

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##          no_boards_member 1.000000000
##                     rural 0.971563981
##                      poor 0.971563981
##        social_orientation 0.971563981
##          social_committee 0.971563981
##                social_exp 0.971563981
##           cost_per_borrow 0.322274882
##                 bhr_scale 0.270142180
##         cost_per_borrower 0.052132701
##                       nab 0.037914692
##              ln_borrowers 0.037914692
##                     infra 0.037914692
##                 ln_assets 0.004739336
##         gross_loan_profit 0.004739336
##                        id 0.000000000
##                   id_name 0.000000000
##                      year 0.000000000
##                id_country 0.000000000
##                       roa 0.000000000
##                       roe 0.000000000
##                       oss 0.000000000
##  avg_loan_gni_per_capital 0.000000000
##       capital_asset_ratio 0.000000000
##           deposit_to_loan 0.000000000
##   female_borrowers_percen 0.000000000
##   no_boards_member_female 0.000000000
##                     lngdp 0.000000000
##                      ggdp 0.000000000
##            internet_users 0.000000000
##            portfolio_risk 0.000000000
##            loan_loss_rate 0.000000000
## [1] 2007

## 
##  Variables sorted by number of missings: 
##                  Variable      Count
##          no_boards_member 1.00000000
##                     rural 0.84753363
##                      poor 0.84753363
##        social_orientation 0.84753363
##          social_committee 0.84753363
##                social_exp 0.84753363
##                 bhr_scale 0.26905830
##           cost_per_borrow 0.15695067
##         cost_per_borrower 0.08968610
##                     infra 0.03587444
##                       nab 0.01793722
##              ln_borrowers 0.01793722
##                        id 0.00000000
##                   id_name 0.00000000
##                      year 0.00000000
##                id_country 0.00000000
##                       roa 0.00000000
##                       roe 0.00000000
##                       oss 0.00000000
##  avg_loan_gni_per_capital 0.00000000
##                 ln_assets 0.00000000
##       capital_asset_ratio 0.00000000
##           deposit_to_loan 0.00000000
##   female_borrowers_percen 0.00000000
##   no_boards_member_female 0.00000000
##                     lngdp 0.00000000
##                      ggdp 0.00000000
##            internet_users 0.00000000
##         gross_loan_profit 0.00000000
##            portfolio_risk 0.00000000
##            loan_loss_rate 0.00000000
## [1] 2008

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##          no_boards_member 0.991769547
##                     rural 0.740740741
##                      poor 0.740740741
##        social_orientation 0.740740741
##          social_committee 0.740740741
##                social_exp 0.740740741
##                 bhr_scale 0.242798354
##           cost_per_borrow 0.111111111
##                     infra 0.041152263
##         cost_per_borrower 0.041152263
##                       nab 0.012345679
##              ln_borrowers 0.012345679
##                 ln_assets 0.004115226
##         gross_loan_profit 0.004115226
##                        id 0.000000000
##                   id_name 0.000000000
##                      year 0.000000000
##                id_country 0.000000000
##                       roa 0.000000000
##                       roe 0.000000000
##                       oss 0.000000000
##  avg_loan_gni_per_capital 0.000000000
##       capital_asset_ratio 0.000000000
##           deposit_to_loan 0.000000000
##   female_borrowers_percen 0.000000000
##   no_boards_member_female 0.000000000
##                     lngdp 0.000000000
##                      ggdp 0.000000000
##            internet_users 0.000000000
##            portfolio_risk 0.000000000
##            loan_loss_rate 0.000000000
## [1] 2009

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##                     rural 0.625000000
##                      poor 0.625000000
##        social_orientation 0.625000000
##          social_committee 0.625000000
##                social_exp 0.625000000
##          no_boards_member 0.246323529
##                 bhr_scale 0.231617647
##           cost_per_borrow 0.102941176
##                     infra 0.058823529
##         cost_per_borrower 0.025735294
##                       nab 0.018382353
##              ln_borrowers 0.018382353
##                 ln_assets 0.003676471
##         gross_loan_profit 0.003676471
##                        id 0.000000000
##                   id_name 0.000000000
##                      year 0.000000000
##                id_country 0.000000000
##                       roa 0.000000000
##                       roe 0.000000000
##                       oss 0.000000000
##  avg_loan_gni_per_capital 0.000000000
##       capital_asset_ratio 0.000000000
##           deposit_to_loan 0.000000000
##   female_borrowers_percen 0.000000000
##   no_boards_member_female 0.000000000
##                     lngdp 0.000000000
##                      ggdp 0.000000000
##            internet_users 0.000000000
##            portfolio_risk 0.000000000
##            loan_loss_rate 0.000000000
## [1] 2010

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##                     rural 0.498233216
##                      poor 0.498233216
##        social_orientation 0.498233216
##          social_committee 0.498233216
##                social_exp 0.498233216
##                 bhr_scale 0.208480565
##          no_boards_member 0.183745583
##           cost_per_borrow 0.074204947
##                     infra 0.067137809
##         cost_per_borrower 0.056537102
##                       nab 0.038869258
##              ln_borrowers 0.038869258
##                 ln_assets 0.003533569
##         gross_loan_profit 0.003533569
##                        id 0.000000000
##                   id_name 0.000000000
##                      year 0.000000000
##                id_country 0.000000000
##                       roa 0.000000000
##                       roe 0.000000000
##                       oss 0.000000000
##  avg_loan_gni_per_capital 0.000000000
##       capital_asset_ratio 0.000000000
##           deposit_to_loan 0.000000000
##   female_borrowers_percen 0.000000000
##   no_boards_member_female 0.000000000
##                     lngdp 0.000000000
##                      ggdp 0.000000000
##            internet_users 0.000000000
##            portfolio_risk 0.000000000
##            loan_loss_rate 0.000000000
## [1] 2011

## 
##  Variables sorted by number of missings: 
##                  Variable      Count
##                     rural 0.47417840
##                      poor 0.47417840
##        social_orientation 0.47417840
##          social_committee 0.47417840
##                social_exp 0.47417840
##          no_boards_member 0.16901408
##                 bhr_scale 0.10328638
##                     infra 0.08920188
##           cost_per_borrow 0.02347418
##         cost_per_borrower 0.01877934
##                       nab 0.01408451
##              ln_borrowers 0.01408451
##                        id 0.00000000
##                   id_name 0.00000000
##                      year 0.00000000
##                id_country 0.00000000
##                       roa 0.00000000
##                       roe 0.00000000
##                       oss 0.00000000
##  avg_loan_gni_per_capital 0.00000000
##                 ln_assets 0.00000000
##       capital_asset_ratio 0.00000000
##           deposit_to_loan 0.00000000
##   female_borrowers_percen 0.00000000
##   no_boards_member_female 0.00000000
##                     lngdp 0.00000000
##                      ggdp 0.00000000
##            internet_users 0.00000000
##         gross_loan_profit 0.00000000
##            portfolio_risk 0.00000000
##            loan_loss_rate 0.00000000
## [1] 2012

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##                     rural 0.303482587
##                      poor 0.303482587
##        social_orientation 0.303482587
##          social_committee 0.303482587
##                social_exp 0.303482587
##          no_boards_member 0.134328358
##                 bhr_scale 0.099502488
##                     infra 0.049751244
##           cost_per_borrow 0.029850746
##                       nab 0.024875622
##              ln_borrowers 0.024875622
##         cost_per_borrower 0.004975124
##                        id 0.000000000
##                   id_name 0.000000000
##                      year 0.000000000
##                id_country 0.000000000
##                       roa 0.000000000
##                       roe 0.000000000
##                       oss 0.000000000
##  avg_loan_gni_per_capital 0.000000000
##                 ln_assets 0.000000000
##       capital_asset_ratio 0.000000000
##           deposit_to_loan 0.000000000
##   female_borrowers_percen 0.000000000
##   no_boards_member_female 0.000000000
##                     lngdp 0.000000000
##                      ggdp 0.000000000
##            internet_users 0.000000000
##         gross_loan_profit 0.000000000
##            portfolio_risk 0.000000000
##            loan_loss_rate 0.000000000
## [1] 2013

## 
##  Variables sorted by number of missings: 
##                  Variable      Count
##                 bhr_scale 0.46666667
##                     rural 0.18095238
##                      poor 0.18095238
##        social_orientation 0.18095238
##          social_committee 0.18095238
##                social_exp 0.18095238
##          no_boards_member 0.09523810
##                     infra 0.04761905
##           cost_per_borrow 0.01904762
##                       nab 0.00952381
##              ln_borrowers 0.00952381
##         cost_per_borrower 0.00952381
##                        id 0.00000000
##                   id_name 0.00000000
##                      year 0.00000000
##                id_country 0.00000000
##                       roa 0.00000000
##                       roe 0.00000000
##                       oss 0.00000000
##  avg_loan_gni_per_capital 0.00000000
##                 ln_assets 0.00000000
##       capital_asset_ratio 0.00000000
##           deposit_to_loan 0.00000000
##   female_borrowers_percen 0.00000000
##   no_boards_member_female 0.00000000
##                     lngdp 0.00000000
##                      ggdp 0.00000000
##            internet_users 0.00000000
##         gross_loan_profit 0.00000000
##            portfolio_risk 0.00000000
##            loan_loss_rate 0.00000000
## [1] 2014

## 
##  Variables sorted by number of missings: 
##                  Variable       Count
##                     rural 0.945945946
##                      poor 0.945945946
##        social_orientation 0.945945946
##          social_committee 0.945945946
##                social_exp 0.945945946
##                 bhr_scale 0.524324324
##          no_boards_member 0.129729730
##                     infra 0.064864865
##                     lngdp 0.010810811
##                      ggdp 0.010810811
##                       nab 0.005405405
##              ln_borrowers 0.005405405
##         cost_per_borrower 0.005405405
##           cost_per_borrow 0.005405405
##                        id 0.000000000
##                   id_name 0.000000000
##                      year 0.000000000
##                id_country 0.000000000
##                       roa 0.000000000
##                       roe 0.000000000
##                       oss 0.000000000
##  avg_loan_gni_per_capital 0.000000000
##                 ln_assets 0.000000000
##       capital_asset_ratio 0.000000000
##           deposit_to_loan 0.000000000
##   female_borrowers_percen 0.000000000
##   no_boards_member_female 0.000000000
##            internet_users 0.000000000
##         gross_loan_profit 0.000000000
##            portfolio_risk 0.000000000
##            loan_loss_rate 0.000000000
## [1] 2015