Load packages

library(dplyr)
library(ggplot2)

Part 1 : Data

Data from : https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/

Load data

train_set <- read.csv("train_values.csv", stringsAsFactors = TRUE)
train_set <- train_set[,-20]
label <- read.csv("train_label.csv", stringsAsFactors = TRUE)
train_set <- cbind(train_set, status_group=label$status_group)

Part 2: Research question

Analyzing the sample under examination, we want to observe which variables are related to the fact that the wells are more or less functional. First we look at some numeric variables to status_group (functional, functional needs repair, non functional) and then the categorical variables to status_group.

Part 3: Exploratory data analysis

train_set$date_recorded <- as.Date(train_set$date_recorded)

range(train_set$date_recorded)
## [1] "2002-10-14" "2013-12-03"
summary(train_set)
##        id          amount_tsh       date_recorded       
##  Min.   :    0   Min.   :     0.0   Min.   :2002-10-14  
##  1st Qu.:18520   1st Qu.:     0.0   1st Qu.:2011-04-01  
##  Median :37062   Median :     0.0   Median :2012-10-10  
##  Mean   :37115   Mean   :   317.7   Mean   :2012-03-29  
##  3rd Qu.:55656   3rd Qu.:    20.0   3rd Qu.:2013-02-09  
##  Max.   :74247   Max.   :350000.0   Max.   :2013-12-03  
##                                                         
##                     funder        gps_height          installer    
##  Government Of Tanzania: 9084   Min.   : -90.0   DWE       :17402  
##                        : 3635   1st Qu.:   0.0             : 3655  
##  Danida                : 3114   Median : 369.0   Government: 1825  
##  Hesawa                : 2202   Mean   : 668.3   RWE       : 1206  
##  Rwssp                 : 1374   3rd Qu.:1319.2   Commu     : 1060  
##  World Bank            : 1349   Max.   :2770.0   DANIDA    : 1050  
##  (Other)               :38642                    (Other)   :33202  
##    longitude        latitude            wpt_name      num_private       
##  Min.   : 0.00   Min.   :-11.649   none     : 3563   Min.   :   0.0000  
##  1st Qu.:33.09   1st Qu.: -8.541   Shuleni  : 1748   1st Qu.:   0.0000  
##  Median :34.91   Median : -5.022   Zahanati :  830   Median :   0.0000  
##  Mean   :34.08   Mean   : -5.706   Msikitini:  535   Mean   :   0.4741  
##  3rd Qu.:37.18   3rd Qu.: -3.326   Kanisani :  323   3rd Qu.:   0.0000  
##  Max.   :40.35   Max.   :  0.000   Bombani  :  271   Max.   :1776.0000  
##                                    (Other)  :52130                      
##              basin          subvillage            region       region_code  
##  Lake Victoria  :10248   Madukani:  508   Iringa     : 5294   Min.   : 1.0  
##  Pangani        : 8940   Shuleni :  506   Shinyanga  : 4982   1st Qu.: 5.0  
##  Rufiji         : 7976   Majengo :  502   Mbeya      : 4639   Median :12.0  
##  Internal       : 7785   Kati    :  373   Kilimanjaro: 4379   Mean   :15.3  
##  Lake Tanganyika: 6432           :  371   Morogoro   : 4006   3rd Qu.:17.0  
##  Wami / Ruvu    : 5987   Mtakuja :  262   Arusha     : 3350   Max.   :99.0  
##  (Other)        :12032   (Other) :56878   (Other)    :32750                 
##  district_code             lga               ward         population     
##  Min.   : 0.00   Njombe      : 2503   Igosi    :  307   Min.   :    0.0  
##  1st Qu.: 2.00   Arusha Rural: 1252   Imalinyi :  252   1st Qu.:    0.0  
##  Median : 3.00   Moshi Rural : 1251   Siha Kati:  232   Median :   25.0  
##  Mean   : 5.63   Bariadi     : 1177   Mdandu   :  231   Mean   :  179.9  
##  3rd Qu.: 5.00   Rungwe      : 1106   Nduruma  :  217   3rd Qu.:  215.0  
##  Max.   :80.00   Kilosa      : 1094   Kitunda  :  203   Max.   :30500.0  
##                  (Other)     :51017   (Other)  :57958                    
##  public_meeting       scheme_management        scheme_name      permit     
##       : 3334    VWC            :36793                :28166        : 3056  
##  False: 5055    WUG            : 5206   K            :  682   False:17492  
##  True :51011                   : 3877   None         :  644   True :38852  
##                 Water authority: 3153   Borehole     :  546                
##                 WUA            : 2883   Chalinze wate:  405                
##                 Water Board    : 2748   M            :  400                
##                 (Other)        : 4740   (Other)      :28557                
##  construction_year    extraction_type  extraction_type_group
##  Min.   :   0      gravity    :26780   gravity    :26780    
##  1st Qu.:   0      nira/tanira: 8154   nira/tanira: 8154    
##  Median :1986      other      : 6430   other      : 6430    
##  Mean   :1301      submersible: 4764   submersible: 6179    
##  3rd Qu.:2004      swn 80     : 3670   swn 80     : 3670    
##  Max.   :2013      mono       : 2865   mono       : 2865    
##                    (Other)    : 6737   (Other)    : 5322    
##   extraction_type_class            management      management_group
##  gravity     :26780     vwc             :40507   commercial: 3638  
##  handpump    :16456     wug             : 6515   other     :  943  
##  motorpump   : 2987     water board     : 2933   parastatal: 1768  
##  other       : 6430     wua             : 2535   unknown   :  561  
##  rope pump   :  451     private operator: 1971   user-group:52490  
##  submersible : 6179     parastatal      : 1768                     
##  wind-powered:  117     (Other)         : 3171                     
##                   payment          payment_type           water_quality  
##  never pay            :25348   annually  : 3642   soft           :50818  
##  other                : 1054   monthly   : 8300   salty          : 4856  
##  pay annually         : 3642   never pay :25348   unknown        : 1876  
##  pay monthly          : 8300   on failure: 3914   milky          :  804  
##  pay per bucket       : 8985   other     : 1054   coloured       :  490  
##  pay when scheme fails: 3914   per bucket: 8985   salty abandoned:  339  
##  unknown              : 8157   unknown   : 8157   (Other)        :  217  
##   quality_group           quantity          quantity_group 
##  colored :  490   dry         : 6246   dry         : 6246  
##  fluoride:  217   enough      :33186   enough      :33186  
##  good    :50818   insufficient:15129   insufficient:15129  
##  milky   :  804   seasonal    : 4050   seasonal    : 4050  
##  salty   : 5195   unknown     :  789   unknown     :  789  
##  unknown : 1876                                            
##                                                            
##                   source                    source_type         source_class  
##  spring              :17021   borehole            :11949   groundwater:45794  
##  shallow well        :16824   dam                 :  656   surface    :13328  
##  machine dbh         :11075   other               :  278   unknown    :  278  
##  river               : 9612   rainwater harvesting: 2295                      
##  rainwater harvesting: 2295   river/lake          :10377                      
##  hand dtw            :  874   shallow well        :16824                      
##  (Other)             : 1699   spring              :17021                      
##                     waterpoint_type         waterpoint_type_group
##  cattle trough              :  116   cattle trough     :  116    
##  communal standpipe         :28522   communal standpipe:34625    
##  communal standpipe multiple: 6103   dam               :    7    
##  dam                        :    7   hand pump         :17488    
##  hand pump                  :17488   improved spring   :  784    
##  improved spring            :  784   other             : 6380    
##  other                      : 6380                               
##                   status_group  
##  functional             :32259  
##  functional needs repair: 4317  
##  non functional         :22824  
##                                 
##                                 
##                                 
## 
colSums(is.na(train_set))
##                    id            amount_tsh         date_recorded 
##                     0                     0                     0 
##                funder            gps_height             installer 
##                     0                     0                     0 
##             longitude              latitude              wpt_name 
##                     0                     0                     0 
##           num_private                 basin            subvillage 
##                     0                     0                     0 
##                region           region_code         district_code 
##                     0                     0                     0 
##                   lga                  ward            population 
##                     0                     0                     0 
##        public_meeting     scheme_management           scheme_name 
##                     0                     0                     0 
##                permit     construction_year       extraction_type 
##                     0                     0                     0 
## extraction_type_group extraction_type_class            management 
##                     0                     0                     0 
##      management_group               payment          payment_type 
##                     0                     0                     0 
##         water_quality         quality_group              quantity 
##                     0                     0                     0 
##        quantity_group                source           source_type 
##                     0                     0                     0 
##          source_class       waterpoint_type waterpoint_type_group 
##                     0                     0                     0 
##          status_group 
##                     0
train_set %>%
  group_by(status_group) %>%
  summarise(total=n(), .groups = "drop_last") %>%
  mutate(status_group=reorder(status_group,total)) %>%
  ggplot(aes(status_group,total, fill=status_group)) +
  geom_bar(stat = "identity" )+
  geom_text(aes(label=total), vjust=0)+
  ylim(0,34000)+
  theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
  ggtitle("Total number of wells by status_group")

train_set %>%
  group_by(region) %>%
  summarise(total=n(), .groups = "drop_last") %>%
  mutate(region=reorder(region,total)) %>%
  ggplot(aes(region,total)) +
  geom_bar(stat = "identity",colour="red")+
  geom_text(aes(label=total), hjust=0, size=3)+
  coord_flip()+
  ylim(0,6000)+
  ggtitle("Total number of wells by region")

df<-train_set %>%
  group_by(region,status_group) %>%
  summarise(total=n(), .groups = "drop_last") 

df %>%
  ggplot(aes(region,total, fill=status_group)) +
  geom_bar(stat = "identity")+
  theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
  ggtitle("Total number of wells by region and status_group")

train_set %>%
  group_by(extraction_type_class) %>%
  summarise(total=n(), .groups = "drop_last") %>%
  mutate(extraction_type_class=reorder(extraction_type_class,total)) %>%
  ggplot(aes(extraction_type_class,total)) +
  geom_bar(stat = "identity", colour="green")+
  guides(scale="none")+
  theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
  ggtitle("Total number of wells by  extraction_type_class")

df<-train_set %>%
  group_by(extraction_type_class,status_group) %>%
  summarise(total=n(), .groups = "drop_last") 

df %>%
  ggplot(aes(extraction_type_class,total, fill=status_group)) +
  geom_bar(stat = "identity")+
  theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
  ggtitle("Total number of wells by extraction_type_class and status_group")

Variables amount_tsh and gps_height

Analyzing the following variables:
  • amount_tsh - Total static head (amount water available to waterpoint)
  • gps_height - Altitude of the well

to the variable status_group it can be seen that in the sample the wells with on average gps_height and amount_tsh greater are more functional:

train_set %>%
  ggplot(aes(status_group,gps_height, fill=status_group))+
  geom_boxplot()+
  theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
  ggtitle("Gps_heigh status_group boxplot")

Mean of gps_height for status_group:

train_set %>%
  group_by(status_group) %>%
  summarise(mean=mean(gps_height),median=median(gps_height))
## # A tibble: 3 × 3
##   status_group             mean median
##   <fct>                   <dbl>  <dbl>
## 1 functional               740.    550
## 2 functional needs repair  628.    385
## 3 non functional           574.    293

Mean of amount_tsh for status_group:

train_set %>%
  group_by(status_group) %>%
  summarise(mean=mean(amount_tsh))
## # A tibble: 3 × 2
##   status_group             mean
##   <fct>                   <dbl>
## 1 functional               462.
## 2 functional needs repair  267.
## 3 non functional           123.

Characteristics of non functional wells

Analyzing the categorical variables of the sample with respect to the variable status_group (functional, functional needs repair, non functional) we want to highlight the values for which the wells are not functional. For example, it is noted that in the Lindi, Mara, Mtwara, Rukwa and Tabora regions of Tanzania the wells are more non-functional:

for (v in names(train_set[,-40])) {
  if (is.factor(train_set[, v])) {
    df <- train_set %>%
      group_by(train_set[, v], status_group) %>%
      summarise(total = n(), .groups = "drop_last")
    
    names(df)[1] <- v
    
    fu <- df %>%
      filter(status_group == "functional") %>%
      select( v, total)
    
    non_fu <- df %>%
      filter(status_group == "non functional") %>%
      select(v, total)
    
    
    if(nrow(fu)!=nrow(non_fu)) next
    
    vet <-as.character(as.matrix(fu[which(fu$total - non_fu$total < 0),1]))
    
    if(length(vet)==0) next
    
   
    print(df[which(as.character(as.matrix(df[,1]))  %in% vet),])
    cat("\n\n")    
   
    
  }
}
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(v)` instead of `v` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## # A tibble: 6 × 3
## # Groups:   basin [2]
##   basin                   status_group            total
##   <fct>                   <fct>                   <int>
## 1 Lake Rukwa              functional               1000
## 2 Lake Rukwa              functional needs repair   270
## 3 Lake Rukwa              non functional           1184
## 4 Ruvuma / Southern Coast functional               1670
## 5 Ruvuma / Southern Coast functional needs repair   326
## 6 Ruvuma / Southern Coast non functional           2497
## 
## 
## # A tibble: 15 × 3
## # Groups:   region [5]
##    region status_group            total
##    <fct>  <fct>                   <int>
##  1 Lindi  functional                460
##  2 Lindi  functional needs repair    93
##  3 Lindi  non functional            993
##  4 Mara   functional                886
##  5 Mara   functional needs repair    60
##  6 Mara   non functional           1023
##  7 Mtwara functional                524
##  8 Mtwara functional needs repair   126
##  9 Mtwara non functional           1080
## 10 Rukwa  functional                707
## 11 Rukwa  functional needs repair   135
## 12 Rukwa  non functional            966
## 13 Tabora functional                848
## 14 Tabora functional needs repair    45
## 15 Tabora non functional           1066
## 
## 
## # A tibble: 3 × 3
## # Groups:   public_meeting [1]
##   public_meeting status_group            total
##   <fct>          <fct>                   <int>
## 1 False          functional               2173
## 2 False          functional needs repair   442
## 3 False          non functional           2440
## 
## 
## # A tibble: 15 × 3
## # Groups:   extraction_type_group [5]
##    extraction_type_group status_group            total
##    <fct>                 <fct>                   <int>
##  1 india mark iii        functional                 44
##  2 india mark iii        functional needs repair     1
##  3 india mark iii        non functional             53
##  4 mono                  functional               1082
##  5 mono                  functional needs repair   129
##  6 mono                  non functional           1654
##  7 other                 functional               1029
##  8 other                 functional needs repair   206
##  9 other                 non functional           5195
## 10 other motorpump       functional                 53
## 11 other motorpump       functional needs repair     9
## 12 other motorpump       non functional             60
## 13 wind-powered          functional                 50
## 14 wind-powered          functional needs repair     7
## 15 wind-powered          non functional             60
## 
## 
## # A tibble: 9 × 3
## # Groups:   extraction_type_class [3]
##   extraction_type_class status_group            total
##   <fct>                 <fct>                   <int>
## 1 motorpump             functional               1135
## 2 motorpump             functional needs repair   138
## 3 motorpump             non functional           1714
## 4 other                 functional               1029
## 5 other                 functional needs repair   206
## 6 other                 non functional           5195
## 7 wind-powered          functional                 50
## 8 wind-powered          functional needs repair     7
## 9 wind-powered          non functional             60
## 
## 
## # A tibble: 9 × 3
## # Groups:   management [3]
##   management     status_group            total
##   <fct>          <fct>                   <int>
## 1 company        functional                267
## 2 company        functional needs repair    15
## 3 company        non functional            403
## 4 other - school functional                 23
## 5 other - school functional needs repair     1
## 6 other - school non functional             75
## 7 unknown        functional                224
## 8 unknown        functional needs repair    27
## 9 unknown        non functional            310
## 
## 
## # A tibble: 3 × 3
## # Groups:   management_group [1]
##   management_group status_group            total
##   <fct>            <fct>                   <int>
## 1 unknown          functional                224
## 2 unknown          functional needs repair    27
## 3 unknown          non functional            310
## 
## 
## # A tibble: 6 × 3
## # Groups:   payment [2]
##   payment   status_group            total
##   <fct>     <fct>                   <int>
## 1 never pay functional              11379
## 2 never pay functional needs repair  1907
## 3 never pay non functional          12062
## 4 unknown   functional               3528
## 5 unknown   functional needs repair   432
## 6 unknown   non functional           4197
## 
## 
## # A tibble: 6 × 3
## # Groups:   payment_type [2]
##   payment_type status_group            total
##   <fct>        <fct>                   <int>
## 1 never pay    functional              11379
## 2 never pay    functional needs repair  1907
## 3 never pay    non functional          12062
## 4 unknown      functional               3528
## 5 unknown      functional needs repair   432
## 6 unknown      non functional           4197
## 
## 
## # A tibble: 8 × 3
## # Groups:   water_quality [3]
##   water_quality      status_group            total
##   <fct>              <fct>                   <int>
## 1 fluoride abandoned functional                  6
## 2 fluoride abandoned non functional             11
## 3 salty              functional               2220
## 4 salty              functional needs repair   225
## 5 salty              non functional           2411
## 6 unknown            functional                264
## 7 unknown            functional needs repair    35
## 8 unknown            non functional           1577
## 
## 
## # A tibble: 6 × 3
## # Groups:   quality_group [2]
##   quality_group status_group            total
##   <fct>         <fct>                   <int>
## 1 salty         functional               2394
## 2 salty         functional needs repair   297
## 3 salty         non functional           2504
## 4 unknown       functional                264
## 5 unknown       functional needs repair    35
## 6 unknown       non functional           1577
## 
## 
## # A tibble: 6 × 3
## # Groups:   quantity [2]
##   quantity status_group            total
##   <fct>    <fct>                   <int>
## 1 dry      functional                157
## 2 dry      functional needs repair    37
## 3 dry      non functional           6052
## 4 unknown  functional                213
## 5 unknown  functional needs repair    14
## 6 unknown  non functional            562
## 
## 
## # A tibble: 6 × 3
## # Groups:   quantity_group [2]
##   quantity_group status_group            total
##   <fct>          <fct>                   <int>
## 1 dry            functional                157
## 2 dry            functional needs repair    37
## 3 dry            non functional           6052
## 4 unknown        functional                213
## 5 unknown        functional needs repair    14
## 6 unknown        non functional            562
## 
## 
## # A tibble: 6 × 3
## # Groups:   source [2]
##   source status_group            total
##   <fct>  <fct>                   <int>
## 1 dam    functional                253
## 2 dam    functional needs repair    24
## 3 dam    non functional            379
## 4 lake   functional                162
## 5 lake   functional needs repair    12
## 6 lake   non functional            591
## 
## 
## # A tibble: 3 × 3
## # Groups:   source_type [1]
##   source_type status_group            total
##   <fct>       <fct>                   <int>
## 1 dam         functional                253
## 2 dam         functional needs repair    24
## 3 dam         non functional            379
## 
## 
## # A tibble: 6 × 3
## # Groups:   waterpoint_type [2]
##   waterpoint_type             status_group            total
##   <fct>                       <fct>                   <int>
## 1 communal standpipe multiple functional               2235
## 2 communal standpipe multiple functional needs repair   648
## 3 communal standpipe multiple non functional           3220
## 4 other                       functional                840
## 5 other                       functional needs repair   293
## 6 other                       non functional           5247
## 
## 
## # A tibble: 3 × 3
## # Groups:   waterpoint_type_group [1]
##   waterpoint_type_group status_group            total
##   <fct>                 <fct>                   <int>
## 1 other                 functional                840
## 2 other                 functional needs repair   293
## 3 other                 non functional           5247