Load packages

library(dplyr)
library(ggplot2)
library(corrplot)
library(statsr)

Load Data

Data from : https://zindi.africa/competitions/be-a-trailblazer

dataset <- read.csv("Train.csv")

Part 1: Research question

We want to highlight that in South Africa the share of individuals attending school and the share of low-income women-led households is strongly correlated.

Part 2: Exploratory data analysis

summary(dataset)
##      ward           total_households total_individuals     target     
##  Length:2822        Min.   :    1    Min.   :  402     Min.   : 0.00  
##  Class :character   1st Qu.: 1779    1st Qu.: 7071     1st Qu.:16.75  
##  Mode  :character   Median : 2398    Median : 9367     Median :24.16  
##                     Mean   : 3665    Mean   :12869     Mean   :24.51  
##                     3rd Qu.: 3987    3rd Qu.:14241     3rd Qu.:32.23  
##                     Max.   :39685    Max.   :91717     Max.   :55.53  
##      dw_00            dw_01              dw_02              dw_03          
##  Min.   :0.0000   Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000  
##  1st Qu.:0.5942   1st Qu.:0.002895   1st Qu.:0.002407   1st Qu.:0.0000000  
##  Median :0.7668   Median :0.010425   Median :0.005762   Median :0.0008066  
##  Mean   :0.7122   Mean   :0.092616   Mean   :0.032043   Mean   :0.0060567  
##  3rd Qu.:0.8817   3rd Qu.:0.068209   3rd Qu.:0.027913   3rd Qu.:0.0025383  
##  Max.   :0.9950   Max.   :0.931489   Max.   :0.951806   Max.   :0.2642393  
##      dw_04               dw_05               dw_06              dw_07         
##  Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.0000000   1st Qu.:0.0000000   1st Qu.:0.002716   1st Qu.:0.004716  
##  Median :0.0006069   Median :0.0008654   Median :0.008639   Median :0.016295  
##  Mean   :0.0086655   Mean   :0.0062888   Mean   :0.022374   Mean   :0.039296  
##  3rd Qu.:0.0022246   3rd Qu.:0.0030272   3rd Qu.:0.025218   3rd Qu.:0.048731  
##  Max.   :0.3920853   Max.   :0.4359115   Max.   :0.412936   Max.   :0.455815  
##      dw_08              dw_09               dw_10               dw_11         
##  Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000   Min.   :0.000000  
##  1st Qu.:0.002888   1st Qu.:0.0002329   1st Qu.:0.0000000   1st Qu.:0.001991  
##  Median :0.014991   Median :0.0017552   Median :0.0003909   Median :0.004092  
##  Mean   :0.064586   Mean   :0.0068641   Mean   :0.0011121   Mean   :0.007902  
##  3rd Qu.:0.074748   3rd Qu.:0.0065068   3rd Qu.:0.0010425   3rd Qu.:0.007803  
##  Max.   :0.798479   Max.   :0.2828433   Max.   :0.0687517   Max.   :1.000000  
##      dw_12       dw_13       psa_00           psa_01        
##  Min.   :0   Min.   :0   Min.   :0.0000   Min.   :0.001293  
##  1st Qu.:0   1st Qu.:0   1st Qu.:0.2556   1st Qu.:0.467217  
##  Median :0   Median :0   Median :0.3017   Median :0.540874  
##  Mean   :0   Mean   :0   Mean   :0.3113   Mean   :0.526568  
##  3rd Qu.:0   3rd Qu.:0   3rd Qu.:0.3712   3rd Qu.:0.586087  
##  Max.   :0   Max.   :0   Max.   :0.5616   Max.   :0.852493  
##      psa_02              psa_03            psa_04            stv_00      
##  Min.   :0.0000000   Min.   :0.00000   Min.   :0.04279   Min.   :0.0000  
##  1st Qu.:0.0001326   1st Qu.:0.01698   1st Qu.:0.11014   1st Qu.:0.0982  
##  Median :0.0003381   Median :0.02705   Median :0.12576   Median :0.1728  
##  Mean   :0.0005410   Mean   :0.03369   Mean   :0.12793   Mean   :0.2259  
##  3rd Qu.:0.0006835   3rd Qu.:0.04350   3rd Qu.:0.13973   3rd Qu.:0.3034  
##  Max.   :0.0194420   Max.   :0.26738   Max.   :0.99871   Max.   :0.8405  
##      stv_01           car_00           car_01            lln_00       
##  Min.   :0.1595   Min.   :0.0000   Min.   :0.04133   Min.   :0.00000  
##  1st Qu.:0.6966   1st Qu.:0.1310   1st Qu.:0.71851   1st Qu.:0.01732  
##  Median :0.8272   Median :0.1780   Median :0.82197   Median :0.04014  
##  Mean   :0.7741   Mean   :0.2503   Mean   :0.74969   Mean   :0.09764  
##  3rd Qu.:0.9018   3rd Qu.:0.2815   3rd Qu.:0.86902   3rd Qu.:0.12087  
##  Max.   :1.0000   Max.   :0.9587   Max.   :1.00000   Max.   :0.76261  
##      lln_01           lan_00             lan_01             lan_02        
##  Min.   :0.2374   Min.   :0.000000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.8791   1st Qu.:0.002842   1st Qu.:0.009433   1st Qu.:0.004081  
##  Median :0.9599   Median :0.007914   Median :0.017589   Median :0.008956  
##  Mean   :0.9024   Mean   :0.097603   Mean   :0.058684   Mean   :0.029416  
##  3rd Qu.:0.9827   3rd Qu.:0.059327   3rd Qu.:0.036612   3rd Qu.:0.015081  
##  Max.   :1.0000   Max.   :0.979246   Max.   :0.939549   Max.   :0.895365  
##      lan_03             lan_04            lan_05             lan_06        
##  Min.   :0.000000   Min.   :0.00000   Min.   :0.000000   Min.   :0.000000  
##  1st Qu.:0.001647   1st Qu.:0.01034   1st Qu.:0.001675   1st Qu.:0.002681  
##  Median :0.008835   Median :0.05253   Median :0.003986   Median :0.017154  
##  Mean   :0.039983   Mean   :0.28432   Mean   :0.116773   Mean   :0.108053  
##  3rd Qu.:0.039564   3rd Qu.:0.56850   3rd Qu.:0.055631   3rd Qu.:0.066745  
##  Max.   :0.852927   Max.   :0.98616   Max.   :0.978779   Max.   :0.981207  
##      lan_07             lan_08             lan_09              lan_10         
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
##  1st Qu.:0.003906   1st Qu.:0.001675   1st Qu.:0.0002974   1st Qu.:0.0002999  
##  Median :0.008403   Median :0.003045   Median :0.0012675   Median :0.0012002  
##  Mean   :0.130673   Mean   :0.004621   Mean   :0.0243186   Mean   :0.0242625  
##  3rd Qu.:0.065157   3rd Qu.:0.005782   3rd Qu.:0.0065378   3rd Qu.:0.0052470  
##  Max.   :0.963219   Max.   :0.034234   Max.   :0.9812332   Max.   :0.9828445  
##      lan_11             lan_12             lan_13      lan_14         
##  Min.   :0.000000   Min.   :0.000000   Min.   :0   Min.   :0.0000000  
##  1st Qu.:0.000495   1st Qu.:0.002589   1st Qu.:0   1st Qu.:0.0000000  
##  Median :0.003261   Median :0.006394   Median :0   Median :0.0001459  
##  Mean   :0.053985   Mean   :0.012809   Mean   :0   Mean   :0.0145029  
##  3rd Qu.:0.029783   3rd Qu.:0.013722   3rd Qu.:0   3rd Qu.:0.0121078  
##  Max.   :0.991674   Max.   :0.367785   Max.   :0   Max.   :0.9984484  
##      pg_00             pg_01              pg_02               pg_03          
##  Min.   :0.01105   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
##  1st Qu.:0.87528   1st Qu.:0.001015   1st Qu.:0.0008769   1st Qu.:0.0004514  
##  Median :0.98975   Median :0.003124   Median :0.0017966   Median :0.0012081  
##  Mean   :0.86214   Mean   :0.040938   Mean   :0.0187979   Mean   :0.0744293  
##  3rd Qu.:0.99562   3rd Qu.:0.012582   3rd Qu.:0.0048827   3rd Qu.:0.0418406  
##  Max.   :1.00000   Max.   :0.969519   Max.   :0.9395640   Max.   :0.9405628  
##      pg_04               lgt_00             pw_00             pw_01       
##  Min.   :0.0000000   Min.   :0.001692   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.0006644   1st Qu.:0.796471   1st Qu.:0.08764   1st Qu.:0.1113  
##  Median :0.0016958   Median :0.914061   Median :0.27800   Median :0.3021  
##  Mean   :0.0036926   Mean   :0.836432   Mean   :0.35969   Mean   :0.3297  
##  3rd Qu.:0.0041264   3rd Qu.:0.964334   3rd Qu.:0.58295   3rd Qu.:0.5088  
##  Max.   :0.3678423   Max.   :1.000000   Max.   :0.99591   Max.   :0.9376  
##      pw_02              pw_03              pw_04               pw_05          
##  Min.   :0.000000   Min.   :0.000000   Min.   :0.0000000   Min.   :0.0000000  
##  1st Qu.:0.008673   1st Qu.:0.002099   1st Qu.:0.0007147   1st Qu.:0.0001595  
##  Median :0.069065   Median :0.016496   Median :0.0051637   Median :0.0014590  
##  Mean   :0.127555   Mean   :0.041589   Mean   :0.0196551   Mean   :0.0110081  
##  3rd Qu.:0.183384   3rd Qu.:0.058626   3rd Qu.:0.0250545   3rd Qu.:0.0094322  
##  Max.   :1.000000   Max.   :0.327393   Max.   :0.3067867   Max.   :0.2282606  
##      pw_06              pw_07       pw_08    ADM4_PCODE             lat        
##  Min.   :0.000000   Min.   :0   Min.   :0   Length:2822        Min.   :-32.49  
##  1st Qu.:0.005217   1st Qu.:0   1st Qu.:0   Class :character   1st Qu.:-28.57  
##  Median :0.025165   Median :0   Median :0   Mode  :character   Median :-26.55  
##  Mean   :0.110818   Mean   :0   Mean   :0                      Mean   :-26.88  
##  3rd Qu.:0.116638   3rd Qu.:0   3rd Qu.:0                      3rd Qu.:-25.57  
##  Max.   :0.961522   Max.   :0   Max.   :0                      Max.   :-22.33  
##       lon              NL        
##  Min.   :16.76   Min.   : 0.000  
##  1st Qu.:27.71   1st Qu.: 3.033  
##  Median :28.96   Median : 9.206  
##  Mean   :28.67   Mean   :17.438  
##  3rd Qu.:30.44   3rd Qu.:26.891  
##  Max.   :32.86   Max.   :63.000

From the histogram of the target variable

“target” = “Percentage of women head households with income under R19.6k out of total number of households”

a normal distribution is noted, so for the Central Limit Theorem (CLT) the mean of the “target” variable for South Africa can be calculated by inference, with the relative confidence interval at 95%:

summary(dataset$target)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   16.75   24.16   24.51   32.23   55.53
dataset %>%
  ggplot(aes(target))+
  geom_histogram(bins = 50, colour="red")

inference(y = target, data = dataset, statistic = "mean", type = "ci", method = "theoretical")
## Single numerical variable
## n = 2822, y-bar = 24.5076, s = 10.2944
## 95% CI: (24.1276 , 24.8875)

Similarly from the histogram of the variable psa_00

“psa_00” = “Percentage listing present school attendance as: Yes”

we see a normal distribution so for the Central Limit Theorem (CLT) we can calculate by inference the mean of psa_00 for South Africa, with the relative confidence interval at 95%:

summary(dataset$psa_00)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2556  0.3017  0.3113  0.3712  0.5616
dataset %>%
  ggplot(aes(psa_00))+
  geom_histogram(bins = 50, colour="red")

inference(y = psa_00, data = dataset, statistic = "mean", type = "ci", method = "theoretical")
## Single numerical variable
## n = 2822, y-bar = 0.3113, s = 0.0766
## 95% CI: (0.3084 , 0.3141)

By visualizing the correlations between the target variable and the other numeric variables of the dataset, it is possible to trace the correlogram of the most correlated variables:

print(cor(dataset$target,dataset[,2:59]))
## Warning in cor(dataset$target, dataset[, 2:59]): la deviazione standard è zero
##      total_households total_individuals target      dw_00     dw_01      dw_02
## [1,]       -0.3748327        -0.2938275      1 0.01846126 0.4582063 -0.2327265
##           dw_03      dw_04      dw_05     dw_06      dw_07      dw_08
## [1,] -0.3381375 -0.3855334 -0.1463565 -0.248573 -0.1777628 -0.1866269
##           dw_09    dw_10      dw_11 dw_12 dw_13    psa_00     psa_01
## [1,] -0.2353309 -0.12854 -0.1094896    NA    NA 0.7824722 -0.7075063
##           psa_02     psa_03    psa_04     stv_00    stv_01     car_00    car_01
## [1,] -0.09761256 -0.3015718 0.2039986 -0.6641815 0.6641815 -0.7028313 0.7028313
##          lln_00    lln_01     lan_00     lan_01      lan_02     lan_03
## [1,] -0.6378347 0.6378347 -0.5079421 -0.4387036 -0.02016267 -0.1426849
##         lan_04    lan_05      lan_06      lan_07       lan_08     lan_09
## [1,] 0.2322655 0.2752633 -0.03856322 -0.04696761 -0.003929984 0.06623029
##        lan_10    lan_11     lan_12 lan_13     lan_14     pg_00      pg_01
## [1,] 0.106548 0.2105966 -0.3253671     NA -0.2870036 0.6133458 -0.2327924
##           pg_02      pg_03      pg_04     lgt_00      pw_00     pw_01     pw_02
## [1,] -0.2240053 -0.5839081 -0.2138123 -0.2602861 -0.7545358 0.1136461 0.4424407
##          pw_03     pw_04     pw_05     pw_06 pw_07 pw_08
## [1,] 0.4409413 0.3894672 0.3496534 0.4706758    NA    NA
corrplot(cor(dataset[,c(4,6,19,20,24,25,26,27,28,29,30,31,45,48,51,53,54)]), order = "hclust")

Analyzing the correlation between target and psa_00 it can be seen that R = 78.24% therefore it is a very strong correlation, so as the percentage of people attending school increases, the percentage of poor women head households increases:

“target” = “Percentage of women head households with income under R19.6k out of total number of households” “psa_00” = “Percentage listing present school attendance as: Yes”

cor(dataset$target,dataset$psa_00)
## [1] 0.7824722
dataset %>%
    ggplot(aes(target,psa_00))+
    geom_point()+
    ggtitle("correlation between target and psa_00")

The correlation between target and pg_00 is R = 61.33%, while the correlation between target and pg_03 is R = -58.39% therefore poverty increases among Black Africans and decreases among Whites.

“target” = “Percentage of women head households with income under R19.6k out of total number of households” “pg_00” = “Percentage in population group: Black African” “pg_03” = “Percentage in population group: White”

cor(dataset$target,dataset$pg_00)
## [1] 0.6133458
dataset %>%
    ggplot(aes(target,pg_00))+
    geom_point()+
    ggtitle("Positive correlation between target and pg_00")

cor(dataset$target,dataset$pg_03)
## [1] -0.5839081
dataset %>%
    ggplot(aes(target,pg_03))+
    geom_point()+
    ggtitle("Negative correlation between target and pg_03")

As black Africans grow, the percentage of those who attend school increases being R = 47.67%, while as whites increase the percentage of those who go to school decreases, being R = -39.28%, but going to school increases the level of poverty as seen before.

“psa_00” = “Percentage listing present school attendance as: Yes” “pg_00” = “Percentage in population group: Black African” “pg_03” = “Percentage in population group: White”

cor(dataset$psa_00,dataset$pg_00)
## [1] 0.4767752
dataset %>%
    ggplot(aes(psa_00,pg_00))+
    geom_point()+
    ggtitle("Positive correlation between psa_00 and pg_00")

cor(dataset$psa_00,dataset$pg_03)
## [1] -0.3928991
dataset %>%
    ggplot(aes(psa_00,pg_03))+
    geom_point()+
    ggtitle("Negative correlation between psa_00 and pg_03")

As poverty increases, the percentage of households with a car decreases

“car_01” = “Percentage of households with a car: No”

cor(dataset$target,dataset$car_01)
## [1] 0.7028313
dataset %>%
    ggplot(aes(target,car_01))+
    geom_point()+
    ggtitle("Positive correlation between target and car_01")

As poverty increases, the percentage of households with Satellite TV decreases

“stv_01”, “Percentage of households with Satellite TV: No”

cor(dataset$target,dataset$stv_01)
## [1] 0.6641815
dataset %>%
    ggplot(aes(target,stv_01))+
    geom_point()+
    ggtitle("Positive correlation between target and stv_01")

As poverty increases, Percentage listing landline ownership decreases

“lln_01”, “Percentage listing landline ownership as: No” ,,

cor(dataset$target,dataset$lln_01)
## [1] 0.6378347
dataset %>%
    ggplot(aes(target,lln_01))+
    geom_point()+
    ggtitle("Positive correlation between target and lln_01")

As poverty increases, the percentage with piped water access decreases: Piped (tap) water inside dwelling / institution

“pw_00” = “Percentage with piped water access: Piped (tap) water inside dwelling / institution” ,,

cor(dataset$target,dataset$pw_00)
## [1] -0.7545358
dataset %>%
    ggplot(aes(target,pw_00))+
    geom_point()+
    ggtitle("Negative correlation between target and pw_00")

Part 3: Inference

If you create a categorical variable that is School_Yes when percentage listing present school attendance as Yes is greater than mean, you can see that the average percentage of women head households with income under R19.6k out of total number of households grows both in the sample and in South Africa, in fact, p_value <0.05 implies that mu_School_No <mu_School_Yes, being mu the average in South Africa.

dataset$go_school <- as.factor(ifelse(dataset$psa_00>mean(dataset$psa_00),"School_Yes","School_No"))

dataset %>%
  ggplot(aes(go_school,target, fill=go_school))+
  geom_boxplot()+
  ggtitle("target go_school Boxplot")

inference(y = target, x=go_school, data = dataset, statistic = "mean", type = "ht", alternative = "less", method = "theoretical")
## Warning: Missing null value, set to 0
## Response variable: numerical
## Explanatory variable: categorical (2 levels) 
## n_School_No = 1524, y_bar_School_No = 17.8749, s_School_No = 7.0582
## n_School_Yes = 1298, y_bar_School_Yes = 32.2951, s_School_Yes = 7.721
## H0: mu_School_No =  mu_School_Yes
## HA: mu_School_No < mu_School_Yes
## t = -51.4295, df = 1297
## p_value = < 0.0001