library(dplyr)
library(ggplot2)
Data from : https://www.drivendata.org/competitions/7/pump-it-up-data-mining-the-water-table/
train_set <- read.csv("train_values.csv", stringsAsFactors = TRUE)
train_set <- train_set[,-20]
label <- read.csv("train_label.csv", stringsAsFactors = TRUE)
train_set <- cbind(train_set, status_group=label$status_group)
Analyzing the sample under examination, we want to observe which variables are related to the fact that the wells are more or less functional. First we look at some numeric variables to status_group (functional, functional needs repair, non functional) and then the categorical variables to status_group.
train_set$date_recorded <- as.Date(train_set$date_recorded)
range(train_set$date_recorded)
## [1] "2002-10-14" "2013-12-03"
summary(train_set)
## id amount_tsh date_recorded
## Min. : 0 Min. : 0.0 Min. :2002-10-14
## 1st Qu.:18520 1st Qu.: 0.0 1st Qu.:2011-04-01
## Median :37062 Median : 0.0 Median :2012-10-10
## Mean :37115 Mean : 317.7 Mean :2012-03-29
## 3rd Qu.:55656 3rd Qu.: 20.0 3rd Qu.:2013-02-09
## Max. :74247 Max. :350000.0 Max. :2013-12-03
##
## funder gps_height installer
## Government Of Tanzania: 9084 Min. : -90.0 DWE :17402
## : 3635 1st Qu.: 0.0 : 3655
## Danida : 3114 Median : 369.0 Government: 1825
## Hesawa : 2202 Mean : 668.3 RWE : 1206
## Rwssp : 1374 3rd Qu.:1319.2 Commu : 1060
## World Bank : 1349 Max. :2770.0 DANIDA : 1050
## (Other) :38642 (Other) :33202
## longitude latitude wpt_name num_private
## Min. : 0.00 Min. :-11.649 none : 3563 Min. : 0.0000
## 1st Qu.:33.09 1st Qu.: -8.541 Shuleni : 1748 1st Qu.: 0.0000
## Median :34.91 Median : -5.022 Zahanati : 830 Median : 0.0000
## Mean :34.08 Mean : -5.706 Msikitini: 535 Mean : 0.4741
## 3rd Qu.:37.18 3rd Qu.: -3.326 Kanisani : 323 3rd Qu.: 0.0000
## Max. :40.35 Max. : 0.000 Bombani : 271 Max. :1776.0000
## (Other) :52130
## basin subvillage region region_code
## Lake Victoria :10248 Madukani: 508 Iringa : 5294 Min. : 1.0
## Pangani : 8940 Shuleni : 506 Shinyanga : 4982 1st Qu.: 5.0
## Rufiji : 7976 Majengo : 502 Mbeya : 4639 Median :12.0
## Internal : 7785 Kati : 373 Kilimanjaro: 4379 Mean :15.3
## Lake Tanganyika: 6432 : 371 Morogoro : 4006 3rd Qu.:17.0
## Wami / Ruvu : 5987 Mtakuja : 262 Arusha : 3350 Max. :99.0
## (Other) :12032 (Other) :56878 (Other) :32750
## district_code lga ward population
## Min. : 0.00 Njombe : 2503 Igosi : 307 Min. : 0.0
## 1st Qu.: 2.00 Arusha Rural: 1252 Imalinyi : 252 1st Qu.: 0.0
## Median : 3.00 Moshi Rural : 1251 Siha Kati: 232 Median : 25.0
## Mean : 5.63 Bariadi : 1177 Mdandu : 231 Mean : 179.9
## 3rd Qu.: 5.00 Rungwe : 1106 Nduruma : 217 3rd Qu.: 215.0
## Max. :80.00 Kilosa : 1094 Kitunda : 203 Max. :30500.0
## (Other) :51017 (Other) :57958
## public_meeting scheme_management scheme_name permit
## : 3334 VWC :36793 :28166 : 3056
## False: 5055 WUG : 5206 K : 682 False:17492
## True :51011 : 3877 None : 644 True :38852
## Water authority: 3153 Borehole : 546
## WUA : 2883 Chalinze wate: 405
## Water Board : 2748 M : 400
## (Other) : 4740 (Other) :28557
## construction_year extraction_type extraction_type_group
## Min. : 0 gravity :26780 gravity :26780
## 1st Qu.: 0 nira/tanira: 8154 nira/tanira: 8154
## Median :1986 other : 6430 other : 6430
## Mean :1301 submersible: 4764 submersible: 6179
## 3rd Qu.:2004 swn 80 : 3670 swn 80 : 3670
## Max. :2013 mono : 2865 mono : 2865
## (Other) : 6737 (Other) : 5322
## extraction_type_class management management_group
## gravity :26780 vwc :40507 commercial: 3638
## handpump :16456 wug : 6515 other : 943
## motorpump : 2987 water board : 2933 parastatal: 1768
## other : 6430 wua : 2535 unknown : 561
## rope pump : 451 private operator: 1971 user-group:52490
## submersible : 6179 parastatal : 1768
## wind-powered: 117 (Other) : 3171
## payment payment_type water_quality
## never pay :25348 annually : 3642 soft :50818
## other : 1054 monthly : 8300 salty : 4856
## pay annually : 3642 never pay :25348 unknown : 1876
## pay monthly : 8300 on failure: 3914 milky : 804
## pay per bucket : 8985 other : 1054 coloured : 490
## pay when scheme fails: 3914 per bucket: 8985 salty abandoned: 339
## unknown : 8157 unknown : 8157 (Other) : 217
## quality_group quantity quantity_group
## colored : 490 dry : 6246 dry : 6246
## fluoride: 217 enough :33186 enough :33186
## good :50818 insufficient:15129 insufficient:15129
## milky : 804 seasonal : 4050 seasonal : 4050
## salty : 5195 unknown : 789 unknown : 789
## unknown : 1876
##
## source source_type source_class
## spring :17021 borehole :11949 groundwater:45794
## shallow well :16824 dam : 656 surface :13328
## machine dbh :11075 other : 278 unknown : 278
## river : 9612 rainwater harvesting: 2295
## rainwater harvesting: 2295 river/lake :10377
## hand dtw : 874 shallow well :16824
## (Other) : 1699 spring :17021
## waterpoint_type waterpoint_type_group
## cattle trough : 116 cattle trough : 116
## communal standpipe :28522 communal standpipe:34625
## communal standpipe multiple: 6103 dam : 7
## dam : 7 hand pump :17488
## hand pump :17488 improved spring : 784
## improved spring : 784 other : 6380
## other : 6380
## status_group
## functional :32259
## functional needs repair: 4317
## non functional :22824
##
##
##
##
colSums(is.na(train_set))
## id amount_tsh date_recorded
## 0 0 0
## funder gps_height installer
## 0 0 0
## longitude latitude wpt_name
## 0 0 0
## num_private basin subvillage
## 0 0 0
## region region_code district_code
## 0 0 0
## lga ward population
## 0 0 0
## public_meeting scheme_management scheme_name
## 0 0 0
## permit construction_year extraction_type
## 0 0 0
## extraction_type_group extraction_type_class management
## 0 0 0
## management_group payment payment_type
## 0 0 0
## water_quality quality_group quantity
## 0 0 0
## quantity_group source source_type
## 0 0 0
## source_class waterpoint_type waterpoint_type_group
## 0 0 0
## status_group
## 0
train_set %>%
group_by(status_group) %>%
summarise(total=n(), .groups = "drop_last") %>%
mutate(status_group=reorder(status_group,total)) %>%
ggplot(aes(status_group,total, fill=status_group)) +
geom_bar(stat = "identity" )+
geom_text(aes(label=total), vjust=0)+
ylim(0,34000)+
theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
ggtitle("Total number of wells by status_group")
train_set %>%
group_by(region) %>%
summarise(total=n(), .groups = "drop_last") %>%
mutate(region=reorder(region,total)) %>%
ggplot(aes(region,total)) +
geom_bar(stat = "identity",colour="red")+
geom_text(aes(label=total), hjust=0, size=3)+
coord_flip()+
ylim(0,6000)+
ggtitle("Total number of wells by region")
df<-train_set %>%
group_by(region,status_group) %>%
summarise(total=n(), .groups = "drop_last")
df %>%
ggplot(aes(region,total, fill=status_group)) +
geom_bar(stat = "identity")+
theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
ggtitle("Total number of wells by region and status_group")
train_set %>%
group_by(extraction_type_class) %>%
summarise(total=n(), .groups = "drop_last") %>%
mutate(extraction_type_class=reorder(extraction_type_class,total)) %>%
ggplot(aes(extraction_type_class,total)) +
geom_bar(stat = "identity", colour="green")+
guides(scale="none")+
theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
ggtitle("Total number of wells by extraction_type_class")
df<-train_set %>%
group_by(extraction_type_class,status_group) %>%
summarise(total=n(), .groups = "drop_last")
df %>%
ggplot(aes(extraction_type_class,total, fill=status_group)) +
geom_bar(stat = "identity")+
theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
ggtitle("Total number of wells by extraction_type_class and status_group")
to the variable status_group it can be seen that in the sample the wells with on average gps_height and amount_tsh greater are more functional:
train_set %>%
ggplot(aes(status_group,gps_height, fill=status_group))+
geom_boxplot()+
theme(axis.text.x=element_text(angle=-45, vjust=0.5,hjust=0))+
ggtitle("Gps_heigh status_group boxplot")
Mean of gps_height for status_group:
train_set %>%
group_by(status_group) %>%
summarise(mean=mean(gps_height),median=median(gps_height))
## # A tibble: 3 × 3
## status_group mean median
## <fct> <dbl> <dbl>
## 1 functional 740. 550
## 2 functional needs repair 628. 385
## 3 non functional 574. 293
Mean of amount_tsh for status_group:
train_set %>%
group_by(status_group) %>%
summarise(mean=mean(amount_tsh))
## # A tibble: 3 × 2
## status_group mean
## <fct> <dbl>
## 1 functional 462.
## 2 functional needs repair 267.
## 3 non functional 123.
Analyzing the categorical variables of the sample with respect to the variable status_group (functional, functional needs repair, non functional) we want to highlight the values for which the wells are not functional. For example, it is noted that in the Lindi, Mara, Mtwara, Rukwa and Tabora regions of Tanzania the wells are more non-functional:
for (v in names(train_set[,-40])) {
if (is.factor(train_set[, v])) {
df <- train_set %>%
group_by(train_set[, v], status_group) %>%
summarise(total = n(), .groups = "drop_last")
names(df)[1] <- v
fu <- df %>%
filter(status_group == "functional") %>%
select( v, total)
non_fu <- df %>%
filter(status_group == "non functional") %>%
select(v, total)
if(nrow(fu)!=nrow(non_fu)) next
vet <-as.character(as.matrix(fu[which(fu$total - non_fu$total < 0),1]))
if(length(vet)==0) next
print(df[which(as.character(as.matrix(df[,1])) %in% vet),])
cat("\n\n")
}
}
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(v)` instead of `v` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## # A tibble: 6 × 3
## # Groups: basin [2]
## basin status_group total
## <fct> <fct> <int>
## 1 Lake Rukwa functional 1000
## 2 Lake Rukwa functional needs repair 270
## 3 Lake Rukwa non functional 1184
## 4 Ruvuma / Southern Coast functional 1670
## 5 Ruvuma / Southern Coast functional needs repair 326
## 6 Ruvuma / Southern Coast non functional 2497
##
##
## # A tibble: 15 × 3
## # Groups: region [5]
## region status_group total
## <fct> <fct> <int>
## 1 Lindi functional 460
## 2 Lindi functional needs repair 93
## 3 Lindi non functional 993
## 4 Mara functional 886
## 5 Mara functional needs repair 60
## 6 Mara non functional 1023
## 7 Mtwara functional 524
## 8 Mtwara functional needs repair 126
## 9 Mtwara non functional 1080
## 10 Rukwa functional 707
## 11 Rukwa functional needs repair 135
## 12 Rukwa non functional 966
## 13 Tabora functional 848
## 14 Tabora functional needs repair 45
## 15 Tabora non functional 1066
##
##
## # A tibble: 3 × 3
## # Groups: public_meeting [1]
## public_meeting status_group total
## <fct> <fct> <int>
## 1 False functional 2173
## 2 False functional needs repair 442
## 3 False non functional 2440
##
##
## # A tibble: 15 × 3
## # Groups: extraction_type_group [5]
## extraction_type_group status_group total
## <fct> <fct> <int>
## 1 india mark iii functional 44
## 2 india mark iii functional needs repair 1
## 3 india mark iii non functional 53
## 4 mono functional 1082
## 5 mono functional needs repair 129
## 6 mono non functional 1654
## 7 other functional 1029
## 8 other functional needs repair 206
## 9 other non functional 5195
## 10 other motorpump functional 53
## 11 other motorpump functional needs repair 9
## 12 other motorpump non functional 60
## 13 wind-powered functional 50
## 14 wind-powered functional needs repair 7
## 15 wind-powered non functional 60
##
##
## # A tibble: 9 × 3
## # Groups: extraction_type_class [3]
## extraction_type_class status_group total
## <fct> <fct> <int>
## 1 motorpump functional 1135
## 2 motorpump functional needs repair 138
## 3 motorpump non functional 1714
## 4 other functional 1029
## 5 other functional needs repair 206
## 6 other non functional 5195
## 7 wind-powered functional 50
## 8 wind-powered functional needs repair 7
## 9 wind-powered non functional 60
##
##
## # A tibble: 9 × 3
## # Groups: management [3]
## management status_group total
## <fct> <fct> <int>
## 1 company functional 267
## 2 company functional needs repair 15
## 3 company non functional 403
## 4 other - school functional 23
## 5 other - school functional needs repair 1
## 6 other - school non functional 75
## 7 unknown functional 224
## 8 unknown functional needs repair 27
## 9 unknown non functional 310
##
##
## # A tibble: 3 × 3
## # Groups: management_group [1]
## management_group status_group total
## <fct> <fct> <int>
## 1 unknown functional 224
## 2 unknown functional needs repair 27
## 3 unknown non functional 310
##
##
## # A tibble: 6 × 3
## # Groups: payment [2]
## payment status_group total
## <fct> <fct> <int>
## 1 never pay functional 11379
## 2 never pay functional needs repair 1907
## 3 never pay non functional 12062
## 4 unknown functional 3528
## 5 unknown functional needs repair 432
## 6 unknown non functional 4197
##
##
## # A tibble: 6 × 3
## # Groups: payment_type [2]
## payment_type status_group total
## <fct> <fct> <int>
## 1 never pay functional 11379
## 2 never pay functional needs repair 1907
## 3 never pay non functional 12062
## 4 unknown functional 3528
## 5 unknown functional needs repair 432
## 6 unknown non functional 4197
##
##
## # A tibble: 8 × 3
## # Groups: water_quality [3]
## water_quality status_group total
## <fct> <fct> <int>
## 1 fluoride abandoned functional 6
## 2 fluoride abandoned non functional 11
## 3 salty functional 2220
## 4 salty functional needs repair 225
## 5 salty non functional 2411
## 6 unknown functional 264
## 7 unknown functional needs repair 35
## 8 unknown non functional 1577
##
##
## # A tibble: 6 × 3
## # Groups: quality_group [2]
## quality_group status_group total
## <fct> <fct> <int>
## 1 salty functional 2394
## 2 salty functional needs repair 297
## 3 salty non functional 2504
## 4 unknown functional 264
## 5 unknown functional needs repair 35
## 6 unknown non functional 1577
##
##
## # A tibble: 6 × 3
## # Groups: quantity [2]
## quantity status_group total
## <fct> <fct> <int>
## 1 dry functional 157
## 2 dry functional needs repair 37
## 3 dry non functional 6052
## 4 unknown functional 213
## 5 unknown functional needs repair 14
## 6 unknown non functional 562
##
##
## # A tibble: 6 × 3
## # Groups: quantity_group [2]
## quantity_group status_group total
## <fct> <fct> <int>
## 1 dry functional 157
## 2 dry functional needs repair 37
## 3 dry non functional 6052
## 4 unknown functional 213
## 5 unknown functional needs repair 14
## 6 unknown non functional 562
##
##
## # A tibble: 6 × 3
## # Groups: source [2]
## source status_group total
## <fct> <fct> <int>
## 1 dam functional 253
## 2 dam functional needs repair 24
## 3 dam non functional 379
## 4 lake functional 162
## 5 lake functional needs repair 12
## 6 lake non functional 591
##
##
## # A tibble: 3 × 3
## # Groups: source_type [1]
## source_type status_group total
## <fct> <fct> <int>
## 1 dam functional 253
## 2 dam functional needs repair 24
## 3 dam non functional 379
##
##
## # A tibble: 6 × 3
## # Groups: waterpoint_type [2]
## waterpoint_type status_group total
## <fct> <fct> <int>
## 1 communal standpipe multiple functional 2235
## 2 communal standpipe multiple functional needs repair 648
## 3 communal standpipe multiple non functional 3220
## 4 other functional 840
## 5 other functional needs repair 293
## 6 other non functional 5247
##
##
## # A tibble: 3 × 3
## # Groups: waterpoint_type_group [1]
## waterpoint_type_group status_group total
## <fct> <fct> <int>
## 1 other functional 840
## 2 other functional needs repair 293
## 3 other non functional 5247