library(dplyr)
library(ggplot2)
library(statsr)
# Load the survey data. Text columns are read as factors, which the
# per-level counts in summary() and the is.factor() checks further down
# rely on.
dataset <- read.csv(
  file = "Train.csv",
  stringsAsFactors = TRUE
)
Analyzing the sample of people interviewed in Kenya, Rwanda, Tanzania and Uganda, we want to highlight that respondents with “Tertiary education” or “Vocational/Specialised training”, or whose job type is “Formally employed Government” or “Formally employed Private”, are more likely to have a bank account than not: in each of these groups, account holders outnumber non-holders.
# Per-column overview of the data: level counts for factor columns and
# quartiles plus mean for numeric columns.
dataset %>%
  summary()
## country year uniqueid bank_account
## Kenya :6068 Min. :2016 uniqueid_1 : 4 No :20212
## Rwanda :8735 1st Qu.:2016 uniqueid_10 : 4 Yes: 3312
## Tanzania:6620 Median :2017 uniqueid_100 : 4
## Uganda :2101 Mean :2017 uniqueid_1000: 4
## 3rd Qu.:2018 uniqueid_1001: 4
## Max. :2018 uniqueid_1002: 4
## (Other) :23500
## location_type cellphone_access household_size age_of_respondent
## Rural:14343 No : 6070 Min. : 1.000 Min. : 16.00
## Urban: 9181 Yes:17454 1st Qu.: 2.000 1st Qu.: 26.00
## Median : 3.000 Median : 35.00
## Mean : 3.797 Mean : 38.81
## 3rd Qu.: 5.000 3rd Qu.: 49.00
## Max. :21.000 Max. :100.00
##
## gender_of_respondent relationship_with_head
## Female:13877 Child : 2229
## Male : 9647 Head of Household :12831
## Other non-relatives: 190
## Other relative : 668
## Parent : 1086
## Spouse : 6520
##
## marital_status education_level
## Divorced/Seperated : 2076 No formal education : 4515
## Dont know : 8 Other/Dont know/RTA : 35
## Married/Living together:10749 Primary education :12791
## Single/Never Married : 7983 Secondary education : 4223
## Widowed : 2708 Tertiary education : 1157
## Vocational/Specialised training: 803
##
## job_type
## Self employed :6437
## Informally employed :5597
## Farming and Fishing :5441
## Remittance Dependent :2527
## Other Income :1080
## Formally employed Private:1055
## (Other) :1387
# Number of missing values in each column.
dataset %>%
  is.na() %>%
  colSums()
## country year uniqueid
## 0 0 0
## bank_account location_type cellphone_access
## 0 0 0
## household_size age_of_respondent gender_of_respondent
## 0 0 0
## relationship_with_head marital_status education_level
## 0 0 0
## job_type
## 0
# Bar chart of the number of respondents with and without a bank account.
# The bars are ordered by their count and labelled with it.
dataset %>%
  count(bank_account, name = "total") %>%
  mutate(bank_account = reorder(bank_account, total)) %>%
  ggplot(aes(x = bank_account, y = total, fill = bank_account)) +
  geom_col() +
  geom_text(aes(label = total), vjust = 0) +
  ylim(0, 21000) +
  labs(title = "Total number of persons by bank_account")
# Respondents per country, split by bank-account ownership.
df <- dataset %>%
  group_by(country, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# geom_bar() stacks the bank_account segments of each country, so the
# labels have to be stacked too; without position_stack() they are drawn
# at the raw count rather than on the segment they describe.
# vjust = 0.5 centres every label inside its own segment.
df %>%
  ggplot(aes(country, total, fill = bank_account)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by region and bank_account")
# Respondents per location type (rural / urban), split by bank-account
# ownership.
df <- dataset %>%
  group_by(location_type, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# The bars are stacked, so the labels are stacked with position_stack()
# as well; otherwise they sit at the raw count instead of on their
# segment. vjust = 0.5 centres every label inside its own segment.
df %>%
  ggplot(aes(location_type, total, fill = bank_account)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by location_type and bank_account")
# Respondents with / without cellphone access, split by bank-account
# ownership.
df <- dataset %>%
  group_by(cellphone_access, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# The bars are stacked, so the labels are stacked with position_stack()
# as well; otherwise they sit at the raw count instead of on their
# segment. vjust = 0.5 centres every label inside its own segment.
df %>%
  ggplot(aes(cellphone_access, total, fill = bank_account)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by cellphone_access and bank_account")
# Respondents per marital status, split by bank-account ownership and
# drawn as horizontal stacked bars.
df <- dataset %>%
  group_by(marital_status, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

ggplot(df, aes(x = marital_status, y = total, fill = bank_account)) +
  geom_col() +
  # geom_text(aes(label = total), vjust = 0) +  # count labels disabled
  coord_flip() +
  labs(title = "Total number of persons by marital_status and bank_account")
# Respondents per education level, split by bank-account ownership and
# drawn as horizontal stacked bars.
df <- dataset %>%
  group_by(education_level, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

ggplot(df, aes(x = education_level, y = total, fill = bank_account)) +
  geom_col() +
  # geom_text(aes(label = total), vjust = 0) +  # count labels disabled
  coord_flip() +
  labs(title = "Total number of persons by education_level and bank_account")
# Respondents per job type, split by bank-account ownership and drawn as
# horizontal stacked bars.
df <- dataset %>%
  group_by(job_type, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

ggplot(df, aes(x = job_type, y = total, fill = bank_account)) +
  geom_col() +
  # geom_text(aes(label = total), vjust = 0) +  # count labels disabled
  coord_flip() +
  labs(title = "Total number of persons by job_type and bank_account")
# Respondents per gender, split by bank-account ownership.
df <- dataset %>%
  group_by(gender_of_respondent, bank_account) %>%
  summarise(total = n(), .groups = "drop_last")

# The bars are stacked, so the labels are stacked with position_stack()
# as well; otherwise they sit at the raw count instead of on their
# segment. vjust = 0.5 centres every label inside its own segment.
df %>%
  ggplot(aes(gender_of_respondent, total, fill = bank_account)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = total), position = position_stack(vjust = 0.5)) +
  ggtitle("Total number of persons by gender_of_respondent and bank_account")
# Age distribution of respondents with and without a bank account.
ggplot(
  dataset,
  aes(x = bank_account, y = age_of_respondent, fill = bank_account)
) +
  geom_boxplot() +
  theme(axis.text.x = element_text(angle = -45, vjust = 0.5, hjust = 0)) +
  labs(title = "Age_of_respondent bank_account boxplot")
# For every factor column other than bank_account, print the levels in
# which account holders ("Yes") outnumber non-holders ("No").
#
# The target column is excluded by name rather than by position, so the
# loop does not depend on the column order of the CSV.
for (v in setdiff(names(dataset), "bank_account")) {
  if (!is.factor(dataset[[v]])) next

  # Respondents per (level of v, bank_account). all_of() selects the
  # column named by the string in v; the result stays grouped by v.
  df <- dataset %>%
    group_by(across(all_of(v)), bank_account) %>%
    summarise(total = n(), .groups = "drop_last")

  fu <- df %>% filter(bank_account == "Yes")
  non_fu <- df %>% filter(bank_account == "No")

  yes_levels <- as.character(fu[[v]])
  no_levels <- as.character(non_fu[[v]])

  # Skip variables in which some level has only one of the two outcomes.
  # Comparing the level sets (not just the row counts) guarantees that
  # the "Yes" and "No" totals compared below belong to the same level.
  if (!setequal(yes_levels, no_levels)) next

  # Align the "No" totals to the "Yes" rows by level, not by row position.
  no_total <- non_fu$total[match(yes_levels, no_levels)]
  vet <- yes_levels[fu$total > no_total]
  if (length(vet) == 0) next

  print(df[as.character(df[[v]]) %in% vet, ])
  cat("\n\n")
}
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(v)` instead of `v` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## # A tibble: 4 × 3
## # Groups: education_level [2]
## education_level bank_account total
## <fct> <fct> <int>
## 1 Tertiary education No 566
## 2 Tertiary education Yes 591
## 3 Vocational/Specialised training No 345
## 4 Vocational/Specialised training Yes 458
##
##
## # A tibble: 4 × 3
## # Groups: job_type [2]
## job_type bank_account total
## <fct> <fct> <int>
## 1 Formally employed Government No 87
## 2 Formally employed Government Yes 300
## 3 Formally employed Private No 484
## 4 Formally employed Private Yes 571
Finally, the inference shows that the proportion of respondents with a bank account in Kenya, Rwanda, Tanzania and Uganda is greater in urban areas than in rural areas: since the test gives p_value < 0.05, we reject the null hypothesis (p_Rural = p_Urban) in favour of the alternative p_Rural < p_Urban:
# One-sided hypothesis test on the difference of two proportions:
# H0: p_Rural = p_Urban against HA: p_Rural < p_Urban, where the success
# is bank_account == "Yes".
# null = 0 states the hypothesised difference explicitly; leaving it out
# makes inference() fall back to 0 with a "Missing null value" warning.
inference(
  y = bank_account,
  x = location_type,
  data = dataset,
  statistic = "proportion",
  type = "ht",
  null = 0,
  alternative = "less",
  method = "theoretical",
  success = "Yes"
)
## Warning: Missing null value, set to 0
## Response variable: categorical (2 levels, success: Yes)
## Explanatory variable: categorical (2 levels)
## n_Rural = 14343, p_hat_Rural = 0.1165
## n_Urban = 9181, p_hat_Urban = 0.1787
## H0: p_Rural = p_Urban
## HA: p_Rural < p_Urban
## z = -13.3879
## p_value = < 0.0001