library(dplyr)
library(ggplot2)
library(corrplot)
library(statsr)
Data from : https://zindi.africa/competitions/be-a-trailblazer
dataset <- read.csv("Train.csv")
We want to highlight that in South Africa the share of individuals attending school and the share of low-income women-led households is strongly correlated.
summary(dataset)
## ward total_households total_individuals target
## Length:2822 Min. : 1 Min. : 402 Min. : 0.00
## Class :character 1st Qu.: 1779 1st Qu.: 7071 1st Qu.:16.75
## Mode :character Median : 2398 Median : 9367 Median :24.16
## Mean : 3665 Mean :12869 Mean :24.51
## 3rd Qu.: 3987 3rd Qu.:14241 3rd Qu.:32.23
## Max. :39685 Max. :91717 Max. :55.53
## dw_00 dw_01 dw_02 dw_03
## Min. :0.0000 Min. :0.000000 Min. :0.000000 Min. :0.0000000
## 1st Qu.:0.5942 1st Qu.:0.002895 1st Qu.:0.002407 1st Qu.:0.0000000
## Median :0.7668 Median :0.010425 Median :0.005762 Median :0.0008066
## Mean :0.7122 Mean :0.092616 Mean :0.032043 Mean :0.0060567
## 3rd Qu.:0.8817 3rd Qu.:0.068209 3rd Qu.:0.027913 3rd Qu.:0.0025383
## Max. :0.9950 Max. :0.931489 Max. :0.951806 Max. :0.2642393
## dw_04 dw_05 dw_06 dw_07
## Min. :0.0000000 Min. :0.0000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.0000000 1st Qu.:0.0000000 1st Qu.:0.002716 1st Qu.:0.004716
## Median :0.0006069 Median :0.0008654 Median :0.008639 Median :0.016295
## Mean :0.0086655 Mean :0.0062888 Mean :0.022374 Mean :0.039296
## 3rd Qu.:0.0022246 3rd Qu.:0.0030272 3rd Qu.:0.025218 3rd Qu.:0.048731
## Max. :0.3920853 Max. :0.4359115 Max. :0.412936 Max. :0.455815
## dw_08 dw_09 dw_10 dw_11
## Min. :0.000000 Min. :0.0000000 Min. :0.0000000 Min. :0.000000
## 1st Qu.:0.002888 1st Qu.:0.0002329 1st Qu.:0.0000000 1st Qu.:0.001991
## Median :0.014991 Median :0.0017552 Median :0.0003909 Median :0.004092
## Mean :0.064586 Mean :0.0068641 Mean :0.0011121 Mean :0.007902
## 3rd Qu.:0.074748 3rd Qu.:0.0065068 3rd Qu.:0.0010425 3rd Qu.:0.007803
## Max. :0.798479 Max. :0.2828433 Max. :0.0687517 Max. :1.000000
## dw_12 dw_13 psa_00 psa_01
## Min. :0 Min. :0 Min. :0.0000 Min. :0.001293
## 1st Qu.:0 1st Qu.:0 1st Qu.:0.2556 1st Qu.:0.467217
## Median :0 Median :0 Median :0.3017 Median :0.540874
## Mean :0 Mean :0 Mean :0.3113 Mean :0.526568
## 3rd Qu.:0 3rd Qu.:0 3rd Qu.:0.3712 3rd Qu.:0.586087
## Max. :0 Max. :0 Max. :0.5616 Max. :0.852493
## psa_02 psa_03 psa_04 stv_00
## Min. :0.0000000 Min. :0.00000 Min. :0.04279 Min. :0.0000
## 1st Qu.:0.0001326 1st Qu.:0.01698 1st Qu.:0.11014 1st Qu.:0.0982
## Median :0.0003381 Median :0.02705 Median :0.12576 Median :0.1728
## Mean :0.0005410 Mean :0.03369 Mean :0.12793 Mean :0.2259
## 3rd Qu.:0.0006835 3rd Qu.:0.04350 3rd Qu.:0.13973 3rd Qu.:0.3034
## Max. :0.0194420 Max. :0.26738 Max. :0.99871 Max. :0.8405
## stv_01 car_00 car_01 lln_00
## Min. :0.1595 Min. :0.0000 Min. :0.04133 Min. :0.00000
## 1st Qu.:0.6966 1st Qu.:0.1310 1st Qu.:0.71851 1st Qu.:0.01732
## Median :0.8272 Median :0.1780 Median :0.82197 Median :0.04014
## Mean :0.7741 Mean :0.2503 Mean :0.74969 Mean :0.09764
## 3rd Qu.:0.9018 3rd Qu.:0.2815 3rd Qu.:0.86902 3rd Qu.:0.12087
## Max. :1.0000 Max. :0.9587 Max. :1.00000 Max. :0.76261
## lln_01 lan_00 lan_01 lan_02
## Min. :0.2374 Min. :0.000000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.8791 1st Qu.:0.002842 1st Qu.:0.009433 1st Qu.:0.004081
## Median :0.9599 Median :0.007914 Median :0.017589 Median :0.008956
## Mean :0.9024 Mean :0.097603 Mean :0.058684 Mean :0.029416
## 3rd Qu.:0.9827 3rd Qu.:0.059327 3rd Qu.:0.036612 3rd Qu.:0.015081
## Max. :1.0000 Max. :0.979246 Max. :0.939549 Max. :0.895365
## lan_03 lan_04 lan_05 lan_06
## Min. :0.000000 Min. :0.00000 Min. :0.000000 Min. :0.000000
## 1st Qu.:0.001647 1st Qu.:0.01034 1st Qu.:0.001675 1st Qu.:0.002681
## Median :0.008835 Median :0.05253 Median :0.003986 Median :0.017154
## Mean :0.039983 Mean :0.28432 Mean :0.116773 Mean :0.108053
## 3rd Qu.:0.039564 3rd Qu.:0.56850 3rd Qu.:0.055631 3rd Qu.:0.066745
## Max. :0.852927 Max. :0.98616 Max. :0.978779 Max. :0.981207
## lan_07 lan_08 lan_09 lan_10
## Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.003906 1st Qu.:0.001675 1st Qu.:0.0002974 1st Qu.:0.0002999
## Median :0.008403 Median :0.003045 Median :0.0012675 Median :0.0012002
## Mean :0.130673 Mean :0.004621 Mean :0.0243186 Mean :0.0242625
## 3rd Qu.:0.065157 3rd Qu.:0.005782 3rd Qu.:0.0065378 3rd Qu.:0.0052470
## Max. :0.963219 Max. :0.034234 Max. :0.9812332 Max. :0.9828445
## lan_11 lan_12 lan_13 lan_14
## Min. :0.000000 Min. :0.000000 Min. :0 Min. :0.0000000
## 1st Qu.:0.000495 1st Qu.:0.002589 1st Qu.:0 1st Qu.:0.0000000
## Median :0.003261 Median :0.006394 Median :0 Median :0.0001459
## Mean :0.053985 Mean :0.012809 Mean :0 Mean :0.0145029
## 3rd Qu.:0.029783 3rd Qu.:0.013722 3rd Qu.:0 3rd Qu.:0.0121078
## Max. :0.991674 Max. :0.367785 Max. :0 Max. :0.9984484
## pg_00 pg_01 pg_02 pg_03
## Min. :0.01105 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.87528 1st Qu.:0.001015 1st Qu.:0.0008769 1st Qu.:0.0004514
## Median :0.98975 Median :0.003124 Median :0.0017966 Median :0.0012081
## Mean :0.86214 Mean :0.040938 Mean :0.0187979 Mean :0.0744293
## 3rd Qu.:0.99562 3rd Qu.:0.012582 3rd Qu.:0.0048827 3rd Qu.:0.0418406
## Max. :1.00000 Max. :0.969519 Max. :0.9395640 Max. :0.9405628
## pg_04 lgt_00 pw_00 pw_01
## Min. :0.0000000 Min. :0.001692 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0006644 1st Qu.:0.796471 1st Qu.:0.08764 1st Qu.:0.1113
## Median :0.0016958 Median :0.914061 Median :0.27800 Median :0.3021
## Mean :0.0036926 Mean :0.836432 Mean :0.35969 Mean :0.3297
## 3rd Qu.:0.0041264 3rd Qu.:0.964334 3rd Qu.:0.58295 3rd Qu.:0.5088
## Max. :0.3678423 Max. :1.000000 Max. :0.99591 Max. :0.9376
## pw_02 pw_03 pw_04 pw_05
## Min. :0.000000 Min. :0.000000 Min. :0.0000000 Min. :0.0000000
## 1st Qu.:0.008673 1st Qu.:0.002099 1st Qu.:0.0007147 1st Qu.:0.0001595
## Median :0.069065 Median :0.016496 Median :0.0051637 Median :0.0014590
## Mean :0.127555 Mean :0.041589 Mean :0.0196551 Mean :0.0110081
## 3rd Qu.:0.183384 3rd Qu.:0.058626 3rd Qu.:0.0250545 3rd Qu.:0.0094322
## Max. :1.000000 Max. :0.327393 Max. :0.3067867 Max. :0.2282606
## pw_06 pw_07 pw_08 ADM4_PCODE lat
## Min. :0.000000 Min. :0 Min. :0 Length:2822 Min. :-32.49
## 1st Qu.:0.005217 1st Qu.:0 1st Qu.:0 Class :character 1st Qu.:-28.57
## Median :0.025165 Median :0 Median :0 Mode :character Median :-26.55
## Mean :0.110818 Mean :0 Mean :0 Mean :-26.88
## 3rd Qu.:0.116638 3rd Qu.:0 3rd Qu.:0 3rd Qu.:-25.57
## Max. :0.961522 Max. :0 Max. :0 Max. :-22.33
## lon NL
## Min. :16.76 Min. : 0.000
## 1st Qu.:27.71 1st Qu.: 3.033
## Median :28.96 Median : 9.206
## Mean :28.67 Mean :17.438
## 3rd Qu.:30.44 3rd Qu.:26.891
## Max. :32.86 Max. :63.000
From the histogram of the target variable
“target” = “Percentage of women head households with income under R19.6k out of total number of households”
a normal distribution is noted, so for the Central Limit Theorem (CLT) the mean of the “target” variable for South Africa can be calculated by inference, with the relative confidence interval at 95%:
summary(dataset$target)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 16.75 24.16 24.51 32.23 55.53
dataset %>%
ggplot(aes(target))+
geom_histogram(bins = 50, colour="red")
inference(y = target, data = dataset, statistic = "mean", type = "ci", method = "theoretical")
## Single numerical variable
## n = 2822, y-bar = 24.5076, s = 10.2944
## 95% CI: (24.1276 , 24.8875)
Similarly from the histogram of the variable psa_00
“psa_00” = “Percentage listing present school attendance as: Yes”
we see a normal distribution so for the Central Limit Theorem (CLT) we can calculate by inference the mean of psa_00 for South Africa, with the relative confidence interval at 95%:
summary(dataset$psa_00)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.2556 0.3017 0.3113 0.3712 0.5616
dataset %>%
ggplot(aes(psa_00))+
geom_histogram(bins = 50, colour="red")
inference(y = psa_00, data = dataset, statistic = "mean", type = "ci", method = "theoretical")
## Single numerical variable
## n = 2822, y-bar = 0.3113, s = 0.0766
## 95% CI: (0.3084 , 0.3141)
By visualizing the correlations between the target variable and the other numeric variables of the dataset, it is possible to trace the correlogram of the most correlated variables:
print(cor(dataset$target,dataset[,2:59]))
## Warning in cor(dataset$target, dataset[, 2:59]): la deviazione standard è zero
## total_households total_individuals target dw_00 dw_01 dw_02
## [1,] -0.3748327 -0.2938275 1 0.01846126 0.4582063 -0.2327265
## dw_03 dw_04 dw_05 dw_06 dw_07 dw_08
## [1,] -0.3381375 -0.3855334 -0.1463565 -0.248573 -0.1777628 -0.1866269
## dw_09 dw_10 dw_11 dw_12 dw_13 psa_00 psa_01
## [1,] -0.2353309 -0.12854 -0.1094896 NA NA 0.7824722 -0.7075063
## psa_02 psa_03 psa_04 stv_00 stv_01 car_00 car_01
## [1,] -0.09761256 -0.3015718 0.2039986 -0.6641815 0.6641815 -0.7028313 0.7028313
## lln_00 lln_01 lan_00 lan_01 lan_02 lan_03
## [1,] -0.6378347 0.6378347 -0.5079421 -0.4387036 -0.02016267 -0.1426849
## lan_04 lan_05 lan_06 lan_07 lan_08 lan_09
## [1,] 0.2322655 0.2752633 -0.03856322 -0.04696761 -0.003929984 0.06623029
## lan_10 lan_11 lan_12 lan_13 lan_14 pg_00 pg_01
## [1,] 0.106548 0.2105966 -0.3253671 NA -0.2870036 0.6133458 -0.2327924
## pg_02 pg_03 pg_04 lgt_00 pw_00 pw_01 pw_02
## [1,] -0.2240053 -0.5839081 -0.2138123 -0.2602861 -0.7545358 0.1136461 0.4424407
## pw_03 pw_04 pw_05 pw_06 pw_07 pw_08
## [1,] 0.4409413 0.3894672 0.3496534 0.4706758 NA NA
corrplot(cor(dataset[,c(4,6,19,20,24,25,26,27,28,29,30,31,45,48,51,53,54)]), order = "hclust")
Analyzing the correlation between target and psa_00 it can be seen that R = 78.24% therefore it is a very strong correlation, so as the percentage of people attending school increases, the percentage of poor women head households increases:
“target” = “Percentage of women head households with income under R19.6k out of total number of households” “psa_00” = “Percentage listing present school attendance as: Yes”
cor(dataset$target,dataset$psa_00)
## [1] 0.7824722
dataset %>%
ggplot(aes(target,psa_00))+
geom_point()+
ggtitle("correlation between target and psa_00")
The correlation between target and pg_00 is R = 61.33%, while the correlation between target and pg_03 is R = -58.39% therefore poverty increases among Black Africans and decreases among Whites.
“target” = “Percentage of women head households with income under R19.6k out of total number of households” “pg_00” = “Percentage in population group: Black African” “pg_03” = “Percentage in population group: White”
cor(dataset$target,dataset$pg_00)
## [1] 0.6133458
dataset %>%
ggplot(aes(target,pg_00))+
geom_point()+
ggtitle("Positive correlation between target and pg_00")
cor(dataset$target,dataset$pg_03)
## [1] -0.5839081
dataset %>%
ggplot(aes(target,pg_03))+
geom_point()+
ggtitle("Negative correlation between target and pg_03")
As black Africans grow, the percentage of those who attend school increases being R = 47.67%, while as whites increase the percentage of those who go to school decreases, being R = -39.28%, but going to school increases the level of poverty as seen before.
“psa_00” = “Percentage listing present school attendance as: Yes” “pg_00” = “Percentage in population group: Black African” “pg_03” = “Percentage in population group: White”
cor(dataset$psa_00,dataset$pg_00)
## [1] 0.4767752
dataset %>%
ggplot(aes(psa_00,pg_00))+
geom_point()+
ggtitle("Positive correlation between psa_00 and pg_00")
cor(dataset$psa_00,dataset$pg_03)
## [1] -0.3928991
dataset %>%
ggplot(aes(psa_00,pg_03))+
geom_point()+
ggtitle("Negative correlation between psa_00 and pg_03")
As poverty increases, the percentage of households with a car decreases
“car_01” = “Percentage of households with a car: No”
cor(dataset$target,dataset$car_01)
## [1] 0.7028313
dataset %>%
ggplot(aes(target,car_01))+
geom_point()+
ggtitle("Positive correlation between target and car_01")
As poverty increases, the percentage of households with Satellite TV decreases
“stv_01”, “Percentage of households with Satellite TV: No”
cor(dataset$target,dataset$stv_01)
## [1] 0.6641815
dataset %>%
ggplot(aes(target,stv_01))+
geom_point()+
ggtitle("Positive correlation between target and stv_01")
As poverty increases, Percentage listing landline ownership decreases
“lln_01”, “Percentage listing landline ownership as: No” ,,
cor(dataset$target,dataset$lln_01)
## [1] 0.6378347
dataset %>%
ggplot(aes(target,lln_01))+
geom_point()+
ggtitle("Positive correlation between target and lln_01")
As poverty increases, the percentage with piped water access decreases: Piped (tap) water inside dwelling / institution
“pw_00” = “Percentage with piped water access: Piped (tap) water inside dwelling / institution” ,,
cor(dataset$target,dataset$pw_00)
## [1] -0.7545358
dataset %>%
ggplot(aes(target,pw_00))+
geom_point()+
ggtitle("Negative correlation between target and pw_00")
If you create a categorical variable that is School_Yes when percentage listing present school attendance as Yes is greater than mean, you can see that the average percentage of women head households with income under R19.6k out of total number of households grows both in the sample and in South Africa, in fact, p_value <0.05 implies that mu_School_No <mu_School_Yes, being mu the average in South Africa.
dataset$go_school <- as.factor(ifelse(dataset$psa_00>mean(dataset$psa_00),"School_Yes","School_No"))
dataset %>%
ggplot(aes(go_school,target, fill=go_school))+
geom_boxplot()+
ggtitle("target go_school Boxplot")
inference(y = target, x=go_school, data = dataset, statistic = "mean", type = "ht", alternative = "less", method = "theoretical")
## Warning: Missing null value, set to 0
## Response variable: numerical
## Explanatory variable: categorical (2 levels)
## n_School_No = 1524, y_bar_School_No = 17.8749, s_School_No = 7.0582
## n_School_Yes = 1298, y_bar_School_Yes = 32.2951, s_School_Yes = 7.721
## H0: mu_School_No = mu_School_Yes
## HA: mu_School_No < mu_School_Yes
## t = -51.4295, df = 1297
## p_value = < 0.0001