Kiểm tra missing data.
library(VIM)
library(dplyr)
# Distinct values of the `year` column of `raw`, sorted ascending.
year <- raw %>%
  distinct(year) %>%
  arrange(year) %>%
  pull("year")

# For each year, draw the VIM missingness plot for that year's rows.
# With `sortVars = TRUE`, aggr() also prints the variables ordered by their
# share of missing values.
for (yr in year) {
  raw %>%
    # Inside filter(), `year` is the column of `raw`; `yr` is the loop value.
    filter(year == yr) %>%
    aggr(
      col = c("navyblue", "yellow"),
      numbers = TRUE,
      sortVars = TRUE,
      labels = names(raw),
      cex.axis = .7,
      gap = 3,
      ylab = c("Du Lieu Thieu", "Pattern")
    )
  # Print the year after its summary so each output section ends with it.
  print(yr)
}

##
## Variables sorted by number of missings:
## Variable Count
## rural 1.000000000
## poor 1.000000000
## no_boards_member 1.000000000
## social_orientation 1.000000000
## social_committee 1.000000000
## social_exp 1.000000000
## infra 1.000000000
## cost_per_borrow 0.473118280
## bhr_scale 0.231182796
## cost_per_borrower 0.043010753
## nab 0.021505376
## ln_borrowers 0.021505376
## ln_assets 0.005376344
## gross_loan_profit 0.005376344
## id 0.000000000
## id_name 0.000000000
## year 0.000000000
## id_country 0.000000000
## roa 0.000000000
## roe 0.000000000
## oss 0.000000000
## avg_loan_gni_per_capital 0.000000000
## capital_asset_ratio 0.000000000
## deposit_to_loan 0.000000000
## female_borrowers_percen 0.000000000
## no_boards_member_female 0.000000000
## lngdp 0.000000000
## ggdp 0.000000000
## internet_users 0.000000000
## portfolio_risk 0.000000000
## loan_loss_rate 0.000000000
## [1] 2006

##
## Variables sorted by number of missings:
## Variable Count
## no_boards_member 1.000000000
## rural 0.971563981
## poor 0.971563981
## social_orientation 0.971563981
## social_committee 0.971563981
## social_exp 0.971563981
## cost_per_borrow 0.322274882
## bhr_scale 0.270142180
## cost_per_borrower 0.052132701
## nab 0.037914692
## ln_borrowers 0.037914692
## infra 0.037914692
## ln_assets 0.004739336
## gross_loan_profit 0.004739336
## id 0.000000000
## id_name 0.000000000
## year 0.000000000
## id_country 0.000000000
## roa 0.000000000
## roe 0.000000000
## oss 0.000000000
## avg_loan_gni_per_capital 0.000000000
## capital_asset_ratio 0.000000000
## deposit_to_loan 0.000000000
## female_borrowers_percen 0.000000000
## no_boards_member_female 0.000000000
## lngdp 0.000000000
## ggdp 0.000000000
## internet_users 0.000000000
## portfolio_risk 0.000000000
## loan_loss_rate 0.000000000
## [1] 2007

##
## Variables sorted by number of missings:
## Variable Count
## no_boards_member 1.00000000
## rural 0.84753363
## poor 0.84753363
## social_orientation 0.84753363
## social_committee 0.84753363
## social_exp 0.84753363
## bhr_scale 0.26905830
## cost_per_borrow 0.15695067
## cost_per_borrower 0.08968610
## infra 0.03587444
## nab 0.01793722
## ln_borrowers 0.01793722
## id 0.00000000
## id_name 0.00000000
## year 0.00000000
## id_country 0.00000000
## roa 0.00000000
## roe 0.00000000
## oss 0.00000000
## avg_loan_gni_per_capital 0.00000000
## ln_assets 0.00000000
## capital_asset_ratio 0.00000000
## deposit_to_loan 0.00000000
## female_borrowers_percen 0.00000000
## no_boards_member_female 0.00000000
## lngdp 0.00000000
## ggdp 0.00000000
## internet_users 0.00000000
## gross_loan_profit 0.00000000
## portfolio_risk 0.00000000
## loan_loss_rate 0.00000000
## [1] 2008

##
## Variables sorted by number of missings:
## Variable Count
## no_boards_member 0.991769547
## rural 0.740740741
## poor 0.740740741
## social_orientation 0.740740741
## social_committee 0.740740741
## social_exp 0.740740741
## bhr_scale 0.242798354
## cost_per_borrow 0.111111111
## infra 0.041152263
## cost_per_borrower 0.041152263
## nab 0.012345679
## ln_borrowers 0.012345679
## ln_assets 0.004115226
## gross_loan_profit 0.004115226
## id 0.000000000
## id_name 0.000000000
## year 0.000000000
## id_country 0.000000000
## roa 0.000000000
## roe 0.000000000
## oss 0.000000000
## avg_loan_gni_per_capital 0.000000000
## capital_asset_ratio 0.000000000
## deposit_to_loan 0.000000000
## female_borrowers_percen 0.000000000
## no_boards_member_female 0.000000000
## lngdp 0.000000000
## ggdp 0.000000000
## internet_users 0.000000000
## portfolio_risk 0.000000000
## loan_loss_rate 0.000000000
## [1] 2009

##
## Variables sorted by number of missings:
## Variable Count
## rural 0.625000000
## poor 0.625000000
## social_orientation 0.625000000
## social_committee 0.625000000
## social_exp 0.625000000
## no_boards_member 0.246323529
## bhr_scale 0.231617647
## cost_per_borrow 0.102941176
## infra 0.058823529
## cost_per_borrower 0.025735294
## nab 0.018382353
## ln_borrowers 0.018382353
## ln_assets 0.003676471
## gross_loan_profit 0.003676471
## id 0.000000000
## id_name 0.000000000
## year 0.000000000
## id_country 0.000000000
## roa 0.000000000
## roe 0.000000000
## oss 0.000000000
## avg_loan_gni_per_capital 0.000000000
## capital_asset_ratio 0.000000000
## deposit_to_loan 0.000000000
## female_borrowers_percen 0.000000000
## no_boards_member_female 0.000000000
## lngdp 0.000000000
## ggdp 0.000000000
## internet_users 0.000000000
## portfolio_risk 0.000000000
## loan_loss_rate 0.000000000
## [1] 2010

##
## Variables sorted by number of missings:
## Variable Count
## rural 0.498233216
## poor 0.498233216
## social_orientation 0.498233216
## social_committee 0.498233216
## social_exp 0.498233216
## bhr_scale 0.208480565
## no_boards_member 0.183745583
## cost_per_borrow 0.074204947
## infra 0.067137809
## cost_per_borrower 0.056537102
## nab 0.038869258
## ln_borrowers 0.038869258
## ln_assets 0.003533569
## gross_loan_profit 0.003533569
## id 0.000000000
## id_name 0.000000000
## year 0.000000000
## id_country 0.000000000
## roa 0.000000000
## roe 0.000000000
## oss 0.000000000
## avg_loan_gni_per_capital 0.000000000
## capital_asset_ratio 0.000000000
## deposit_to_loan 0.000000000
## female_borrowers_percen 0.000000000
## no_boards_member_female 0.000000000
## lngdp 0.000000000
## ggdp 0.000000000
## internet_users 0.000000000
## portfolio_risk 0.000000000
## loan_loss_rate 0.000000000
## [1] 2011

##
## Variables sorted by number of missings:
## Variable Count
## rural 0.47417840
## poor 0.47417840
## social_orientation 0.47417840
## social_committee 0.47417840
## social_exp 0.47417840
## no_boards_member 0.16901408
## bhr_scale 0.10328638
## infra 0.08920188
## cost_per_borrow 0.02347418
## cost_per_borrower 0.01877934
## nab 0.01408451
## ln_borrowers 0.01408451
## id 0.00000000
## id_name 0.00000000
## year 0.00000000
## id_country 0.00000000
## roa 0.00000000
## roe 0.00000000
## oss 0.00000000
## avg_loan_gni_per_capital 0.00000000
## ln_assets 0.00000000
## capital_asset_ratio 0.00000000
## deposit_to_loan 0.00000000
## female_borrowers_percen 0.00000000
## no_boards_member_female 0.00000000
## lngdp 0.00000000
## ggdp 0.00000000
## internet_users 0.00000000
## gross_loan_profit 0.00000000
## portfolio_risk 0.00000000
## loan_loss_rate 0.00000000
## [1] 2012

##
## Variables sorted by number of missings:
## Variable Count
## rural 0.303482587
## poor 0.303482587
## social_orientation 0.303482587
## social_committee 0.303482587
## social_exp 0.303482587
## no_boards_member 0.134328358
## bhr_scale 0.099502488
## infra 0.049751244
## cost_per_borrow 0.029850746
## nab 0.024875622
## ln_borrowers 0.024875622
## cost_per_borrower 0.004975124
## id 0.000000000
## id_name 0.000000000
## year 0.000000000
## id_country 0.000000000
## roa 0.000000000
## roe 0.000000000
## oss 0.000000000
## avg_loan_gni_per_capital 0.000000000
## ln_assets 0.000000000
## capital_asset_ratio 0.000000000
## deposit_to_loan 0.000000000
## female_borrowers_percen 0.000000000
## no_boards_member_female 0.000000000
## lngdp 0.000000000
## ggdp 0.000000000
## internet_users 0.000000000
## gross_loan_profit 0.000000000
## portfolio_risk 0.000000000
## loan_loss_rate 0.000000000
## [1] 2013

##
## Variables sorted by number of missings:
## Variable Count
## bhr_scale 0.46666667
## rural 0.18095238
## poor 0.18095238
## social_orientation 0.18095238
## social_committee 0.18095238
## social_exp 0.18095238
## no_boards_member 0.09523810
## infra 0.04761905
## cost_per_borrow 0.01904762
## nab 0.00952381
## ln_borrowers 0.00952381
## cost_per_borrower 0.00952381
## id 0.00000000
## id_name 0.00000000
## year 0.00000000
## id_country 0.00000000
## roa 0.00000000
## roe 0.00000000
## oss 0.00000000
## avg_loan_gni_per_capital 0.00000000
## ln_assets 0.00000000
## capital_asset_ratio 0.00000000
## deposit_to_loan 0.00000000
## female_borrowers_percen 0.00000000
## no_boards_member_female 0.00000000
## lngdp 0.00000000
## ggdp 0.00000000
## internet_users 0.00000000
## gross_loan_profit 0.00000000
## portfolio_risk 0.00000000
## loan_loss_rate 0.00000000
## [1] 2014

##
## Variables sorted by number of missings:
## Variable Count
## rural 0.945945946
## poor 0.945945946
## social_orientation 0.945945946
## social_committee 0.945945946
## social_exp 0.945945946
## bhr_scale 0.524324324
## no_boards_member 0.129729730
## infra 0.064864865
## lngdp 0.010810811
## ggdp 0.010810811
## nab 0.005405405
## ln_borrowers 0.005405405
## cost_per_borrower 0.005405405
## cost_per_borrow 0.005405405
## id 0.000000000
## id_name 0.000000000
## year 0.000000000
## id_country 0.000000000
## roa 0.000000000
## roe 0.000000000
## oss 0.000000000
## avg_loan_gni_per_capital 0.000000000
## ln_assets 0.000000000
## capital_asset_ratio 0.000000000
## deposit_to_loan 0.000000000
## female_borrowers_percen 0.000000000
## no_boards_member_female 0.000000000
## internet_users 0.000000000
## gross_loan_profit 0.000000000
## portfolio_risk 0.000000000
## loan_loss_rate 0.000000000
## [1] 2015