Load packages

library(dplyr)
library(ggplot2)
library(statsr)

Load Data

# Read the training data. stringsAsFactors = TRUE converts text columns to
# factors, which the is.factor() check in the loop further down depends on.
dataset <- read.csv("Train.csv", stringsAsFactors = TRUE)

Part 1: Research question

Analyzing the sample of people interviewed in Kenya, Rwanda, Tanzania, and Uganda, we want to show that people with “Tertiary education” or “Vocational/Specialised training”, or whose job type is “Formally employed Government” or “Formally employed Private”, are more likely to have a bank account than not, unlike respondents in the other education and job categories.

Part 2: Exploratory data analysis

# Per-column overview of the data set (level counts for factor columns,
# min/quartiles/mean/max for numeric columns).
summary(dataset)
##      country          year               uniqueid     bank_account
##  Kenya   :6068   Min.   :2016   uniqueid_1   :    4   No :20212   
##  Rwanda  :8735   1st Qu.:2016   uniqueid_10  :    4   Yes: 3312   
##  Tanzania:6620   Median :2017   uniqueid_100 :    4               
##  Uganda  :2101   Mean   :2017   uniqueid_1000:    4               
##                  3rd Qu.:2018   uniqueid_1001:    4               
##                  Max.   :2018   uniqueid_1002:    4               
##                                 (Other)      :23500               
##  location_type cellphone_access household_size   age_of_respondent
##  Rural:14343   No : 6070        Min.   : 1.000   Min.   : 16.00   
##  Urban: 9181   Yes:17454        1st Qu.: 2.000   1st Qu.: 26.00   
##                                 Median : 3.000   Median : 35.00   
##                                 Mean   : 3.797   Mean   : 38.81   
##                                 3rd Qu.: 5.000   3rd Qu.: 49.00   
##                                 Max.   :21.000   Max.   :100.00   
##                                                                   
##  gender_of_respondent         relationship_with_head
##  Female:13877         Child              : 2229     
##  Male  : 9647         Head of Household  :12831     
##                       Other non-relatives:  190     
##                       Other relative     :  668     
##                       Parent             : 1086     
##                       Spouse             : 6520     
##                                                     
##                  marital_status                         education_level 
##  Divorced/Seperated     : 2076   No formal education            : 4515  
##  Dont know              :    8   Other/Dont know/RTA            :   35  
##  Married/Living together:10749   Primary education              :12791  
##  Single/Never Married   : 7983   Secondary education            : 4223  
##  Widowed                : 2708   Tertiary education             : 1157  
##                                  Vocational/Specialised training:  803  
##                                                                         
##                       job_type   
##  Self employed            :6437  
##  Informally employed      :5597  
##  Farming and Fishing      :5441  
##  Remittance Dependent     :2527  
##  Other Income             :1080  
##  Formally employed Private:1055  
##  (Other)                  :1387
# Number of missing (NA) values in each column.
colSums(is.na(dataset))
##                country                   year               uniqueid 
##                      0                      0                      0 
##           bank_account          location_type       cellphone_access 
##                      0                      0                      0 
##         household_size      age_of_respondent   gender_of_respondent 
##                      0                      0                      0 
## relationship_with_head         marital_status        education_level 
##                      0                      0                      0 
##               job_type 
##                      0
# Bar chart of the number of respondents with and without a bank account.
# Bars are ordered by count and each bar carries its count as a label.
dataset %>%
  count(bank_account, name = "total") %>%
  mutate(bank_account = reorder(bank_account, total)) %>%
  ggplot(aes(x = bank_account, y = total, fill = bank_account)) +
  geom_col() +
  geom_text(aes(label = total), vjust = 0) +
  ylim(0, 21000) +
  ggtitle("Total number of persons by bank_account")

# Respondents per country, split by bank-account status.
df <- dataset %>%
  group_by(country, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# Stacked bars. geom_text() defaults to position = "identity", which places
# every label at its raw count instead of on its stacked segment, so the
# labels are stacked like the bars and centred inside each segment.
df %>%
  ggplot(aes(country, total, fill = bank_account)) +
  geom_col() +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by region and bank_account")

# Respondents per location type (Rural/Urban), split by bank-account status.
df <- dataset %>%
  group_by(location_type, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# Stacked bars. geom_text() defaults to position = "identity", which places
# every label at its raw count instead of on its stacked segment, so the
# labels are stacked like the bars and centred inside each segment.
df %>%
  ggplot(aes(location_type, total, fill = bank_account)) +
  geom_col() +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by location_type and bank_account")

# Respondents per cellphone-access status, split by bank-account status.
df <- dataset %>%
  group_by(cellphone_access, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# Stacked bars. geom_text() defaults to position = "identity", which places
# every label at its raw count instead of on its stacked segment, so the
# labels are stacked like the bars and centred inside each segment.
df %>%
  ggplot(aes(cellphone_access, total, fill = bank_account)) +
  geom_col() +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by cellphone_access and bank_account")

# Respondents per marital status, split by bank-account status
# (result stays grouped by marital_status).
df <- dataset %>%
  group_by(marital_status) %>%
  count(bank_account, name = "total")

# Horizontal stacked bars via coord_flip(); count labels are disabled here.
ggplot(df, aes(x = marital_status, y = total, fill = bank_account)) +
  geom_col() +
  coord_flip() +
  ggtitle("Total number of persons by marital_status and bank_account")

# Respondents per education level, split by bank-account status
# (result stays grouped by education_level).
df <- dataset %>%
  group_by(education_level) %>%
  count(bank_account, name = "total")

# Horizontal stacked bars via coord_flip(); count labels are disabled here.
ggplot(df, aes(x = education_level, y = total, fill = bank_account)) +
  geom_col() +
  coord_flip() +
  ggtitle("Total number of persons by education_level and bank_account")

# Respondents per job type, split by bank-account status
# (result stays grouped by job_type).
df <- dataset %>%
  group_by(job_type) %>%
  count(bank_account, name = "total")

# Horizontal stacked bars via coord_flip(); count labels are disabled here.
ggplot(df, aes(x = job_type, y = total, fill = bank_account)) +
  geom_col() +
  coord_flip() +
  ggtitle("Total number of persons by job_type and bank_account")

# Respondents per gender, split by bank-account status.
df <- dataset %>%
  group_by(gender_of_respondent, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# Stacked bars. geom_text() defaults to position = "identity", which places
# every label at its raw count instead of on its stacked segment, so the
# labels are stacked like the bars and centred inside each segment.
df %>%
  ggplot(aes(gender_of_respondent, total, fill = bank_account)) +
  geom_col() +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by gender_of_respondent  and bank_account")

# Box plot of respondent age for each bank-account status.
ggplot(
  dataset,
  aes(x = bank_account, y = age_of_respondent, fill = bank_account)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = -45, vjust = 0.5, hjust = 0)) +
  ggtitle("Age_of_respondent bank_account boxplot")

# For every factor column other than bank_account, print the levels in which
# respondents WITH a bank account outnumber those WITHOUT one.
# bank_account is excluded by name rather than by column position so the loop
# keeps working if the column order of the input file changes.
for (v in setdiff(names(dataset), "bank_account")) {
  if (!is.factor(dataset[[v]])) next

  # Respondents per (level of v, bank_account) pair; result stays grouped by v.
  df <- dataset %>%
    group_by(across(all_of(v)), bank_account) %>%
    summarise(total = n(), .groups = "drop_last")

  # Counts of account holders per level of v.
  fu <- df %>%
    filter(bank_account == "Yes") %>%
    select(all_of(v), total)

  # Counts of non-holders per level of v.
  non_fu <- df %>%
    filter(bank_account == "No") %>%
    select(all_of(v), total)

  # The element-wise comparison below is only meaningful when both tables list
  # exactly the same levels in the same order; comparing row counts alone would
  # let equally sized but differently composed tables be silently misaligned.
  if (!identical(as.character(fu[[v]]), as.character(non_fu[[v]]))) next

  # Levels where holders strictly outnumber non-holders.
  vet <- as.character(fu[[v]][fu$total > non_fu$total])

  if (length(vet) == 0) next

  print(df[as.character(df[[v]]) %in% vet, ])
  cat("\n\n")
}
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(v)` instead of `v` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## # A tibble: 4 × 3
## # Groups:   education_level [2]
##   education_level                 bank_account total
##   <fct>                           <fct>        <int>
## 1 Tertiary education              No             566
## 2 Tertiary education              Yes            591
## 3 Vocational/Specialised training No             345
## 4 Vocational/Specialised training Yes            458
## 
## 
## # A tibble: 4 × 3
## # Groups:   job_type [2]
##   job_type                     bank_account total
##   <fct>                        <fct>        <int>
## 1 Formally employed Government No              87
## 2 Formally employed Government Yes            300
## 3 Formally employed Private    No             484
## 4 Formally employed Private    Yes            571

Part 3: Inference

Finally, the inference shows that the proportion of people with a bank account in Kenya, Rwanda, Tanzania, and Uganda is greater in urban areas than in rural areas: since the test gives p_value < 0.05, we reject the null hypothesis (p_Rural = p_Urban) in favour of the alternative p_Rural < p_Urban:

# One-sided two-proportion test of H0: p_Rural = p_Urban against
# HA: p_Rural < p_Urban, where "success" is having a bank account ("Yes").
# null = 0 states the hypothesised difference explicitly instead of leaving
# inference() to default it with a "Missing null value" warning.
inference(
  y = bank_account, x = location_type, data = dataset,
  statistic = "proportion", type = "ht", null = 0,
  alternative = "less", method = "theoretical", success = "Yes"
)
## Warning: Missing null value, set to 0
## Response variable: categorical (2 levels, success: Yes)
## Explanatory variable: categorical (2 levels) 
## n_Rural = 14343, p_hat_Rural = 0.1165
## n_Urban = 9181, p_hat_Urban = 0.1787
## H0: p_Rural =  p_Urban
## HA: p_Rural < p_Urban
## z = -13.3879
## p_value = < 0.0001