Import data
# excel file
data <- read_excel("../00_data/myWaterData.xlsx")
data
## # A tibble: 473,315 × 13
## row_id lat_deg lon_deg report_date status_id water_source water_tech
## <dbl> <dbl> <dbl> <dttm> <chr> <chr> <chr>
## 1 3957 8.07 38.6 2017-04-06 00:00:00 y NA NA
## 2 33512 7.37 40.5 2020-08-04 00:00:00 y Protected Sp… NA
## 3 35125 0.773 34.9 2015-03-18 00:00:00 y Protected Sh… NA
## 4 37760 0.781 35.0 2015-03-18 00:00:00 y Borehole NA
## 5 38118 0.779 35.0 2015-03-18 00:00:00 y Protected Sh… NA
## 6 38501 0.308 34.1 2015-03-19 00:00:00 y Borehole NA
## 7 46357 0.419 34.3 2015-05-19 00:00:00 y Unprotected … NA
## 8 46535 0.444 34.3 2015-05-19 00:00:00 y Protected Sh… NA
## 9 46560 0.456 34.3 2015-05-19 00:00:00 y Protected Sh… NA
## 10 46782 0.467 34.3 2015-05-20 00:00:00 y Protected Sh… NA
## # ℹ 473,305 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## # install_year <chr>, installer <chr>, pay <chr>, status <chr>
Filter rows
filter(data, install_year == 2020, water_source == "Borehole")
## # A tibble: 2,715 × 13
## row_id lat_deg lon_deg report_date status_id water_source water_tech
## <dbl> <dbl> <dbl> <dttm> <chr> <chr> <chr>
## 1 4023 10.5 -4.77 2020-10-28 00:00:00 y Borehole Hand Pump …
## 2 4027 10.5 -4.81 2020-10-28 00:00:00 y Borehole Hand Pump …
## 3 4024 10.5 -4.77 2020-10-28 00:00:00 y Borehole Hand Pump …
## 4 4022 10.7 -4.88 2020-10-28 00:00:00 y Borehole Hand Pump …
## 5 4028 10.4 -4.81 2020-10-28 00:00:00 y Borehole Hand Pump …
## 6 4029 10.5 -4.82 2020-10-28 00:00:00 y Borehole Hand Pump …
## 7 4021 10.5 -4.73 2020-10-28 00:00:00 y Borehole Hand Pump …
## 8 4026 10.6 -4.86 2020-10-28 00:00:00 y Borehole Hand Pump …
## 9 4025 10.6 -4.86 2020-10-28 00:00:00 y Borehole Hand Pump …
## 10 33560 11.7 8.14 2020-03-31 00:00:00 y Borehole Mechanized…
## # ℹ 2,705 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## # install_year <chr>, installer <chr>, pay <chr>, status <chr>
Arrange rows
arrange(data, desc(install_year), desc(water_tech))
## # A tibble: 473,315 × 13
## row_id lat_deg lon_deg report_date status_id water_source water_tech
## <dbl> <dbl> <dbl> <dttm> <chr> <chr> <chr>
## 1 7878 2.46 30.9 2010-07-15 00:00:00 y NA Tapstand
## 2 7 2.87 31.0 2010-04-22 00:00:00 y NA Tapstand
## 3 843 3.35 31.0 2010-04-15 00:00:00 y NA Tapstand
## 4 5274 3.65 31.7 2010-04-21 00:00:00 y NA Tapstand
## 5 7844 2.46 31.0 2010-09-13 00:00:00 y NA Tapstand
## 6 5 2.87 31.0 2010-04-22 00:00:00 y NA Tapstand
## 7 808 3.37 31.0 2010-04-15 00:00:00 n NA Tapstand
## 8 16 2.87 31.0 2010-04-22 00:00:00 y NA Tapstand
## 9 804 3.37 31.0 2010-04-16 00:00:00 y NA Tapstand
## 10 7813 2.47 30.9 2010-09-13 00:00:00 y NA Tapstand
## # ℹ 473,305 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## # install_year <chr>, installer <chr>, pay <chr>, status <chr>
Select columns
select(data, install_year, country_name, water_source, water_tech)
## # A tibble: 473,315 × 4
## install_year country_name water_source water_tech
## <chr> <chr> <chr> <chr>
## 1 NA Ethiopia NA NA
## 2 2019 Ethiopia Protected Spring NA
## 3 NA Kenya Protected Shallow Well NA
## 4 NA Kenya Borehole NA
## 5 NA Kenya Protected Shallow Well NA
## 6 NA Kenya Borehole NA
## 7 NA Kenya Unprotected Spring NA
## 8 NA Kenya Protected Shallow Well NA
## 9 NA Kenya Protected Shallow Well NA
## 10 NA Kenya Protected Shallow Well NA
## # ℹ 473,305 more rows
Add columns
mutate(data,
row_ch = row_id + 1) %>%
select(row_id, row_ch)
## # A tibble: 473,315 × 2
## row_id row_ch
## <dbl> <dbl>
## 1 3957 3958
## 2 33512 33513
## 3 35125 35126
## 4 37760 37761
## 5 38118 38119
## 6 38501 38502
## 7 46357 46358
## 8 46535 46536
## 9 46560 46561
## 10 46782 46783
## # ℹ 473,305 more rows
- Adds numeric 1 digit to all row_id numbers.
Summarize by groups
summarise(data, avg = mean(row_id))
## # A tibble: 1 × 1
## avg
## <dbl>
## 1 NA
- Data does not have much numeric data to calculate numerics.