Import data

# excel file
data <- read_excel("../00_data/myWaterData.xlsx")
data
## # A tibble: 473,315 × 13
##    row_id lat_deg lon_deg report_date         status_id water_source  water_tech
##     <dbl>   <dbl>   <dbl> <dttm>              <chr>     <chr>         <chr>     
##  1   3957   8.07     38.6 2017-04-06 00:00:00 y         NA            NA        
##  2  33512   7.37     40.5 2020-08-04 00:00:00 y         Protected Sp… NA        
##  3  35125   0.773    34.9 2015-03-18 00:00:00 y         Protected Sh… NA        
##  4  37760   0.781    35.0 2015-03-18 00:00:00 y         Borehole      NA        
##  5  38118   0.779    35.0 2015-03-18 00:00:00 y         Protected Sh… NA        
##  6  38501   0.308    34.1 2015-03-19 00:00:00 y         Borehole      NA        
##  7  46357   0.419    34.3 2015-05-19 00:00:00 y         Unprotected … NA        
##  8  46535   0.444    34.3 2015-05-19 00:00:00 y         Protected Sh… NA        
##  9  46560   0.456    34.3 2015-05-19 00:00:00 y         Protected Sh… NA        
## 10  46782   0.467    34.3 2015-05-20 00:00:00 y         Protected Sh… NA        
## # ℹ 473,305 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## #   install_year <chr>, installer <chr>, pay <chr>, status <chr>

Filter rows

filter(data, install_year == 2020, water_source == "Borehole")
## # A tibble: 2,715 × 13
##    row_id lat_deg lon_deg report_date         status_id water_source water_tech 
##     <dbl>   <dbl>   <dbl> <dttm>              <chr>     <chr>        <chr>      
##  1   4023    10.5   -4.77 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  2   4027    10.5   -4.81 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  3   4024    10.5   -4.77 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  4   4022    10.7   -4.88 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  5   4028    10.4   -4.81 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  6   4029    10.5   -4.82 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  7   4021    10.5   -4.73 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  8   4026    10.6   -4.86 2020-10-28 00:00:00 y         Borehole     Hand Pump …
##  9   4025    10.6   -4.86 2020-10-28 00:00:00 y         Borehole     Hand Pump …
## 10  33560    11.7    8.14 2020-03-31 00:00:00 y         Borehole     Mechanized…
## # ℹ 2,705 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## #   install_year <chr>, installer <chr>, pay <chr>, status <chr>

Arrange rows

arrange(data, desc(install_year), desc(water_tech))
## # A tibble: 473,315 × 13
##    row_id lat_deg lon_deg report_date         status_id water_source water_tech
##     <dbl>   <dbl>   <dbl> <dttm>              <chr>     <chr>        <chr>     
##  1   7878    2.46    30.9 2010-07-15 00:00:00 y         NA           Tapstand  
##  2      7    2.87    31.0 2010-04-22 00:00:00 y         NA           Tapstand  
##  3    843    3.35    31.0 2010-04-15 00:00:00 y         NA           Tapstand  
##  4   5274    3.65    31.7 2010-04-21 00:00:00 y         NA           Tapstand  
##  5   7844    2.46    31.0 2010-09-13 00:00:00 y         NA           Tapstand  
##  6      5    2.87    31.0 2010-04-22 00:00:00 y         NA           Tapstand  
##  7    808    3.37    31.0 2010-04-15 00:00:00 n         NA           Tapstand  
##  8     16    2.87    31.0 2010-04-22 00:00:00 y         NA           Tapstand  
##  9    804    3.37    31.0 2010-04-16 00:00:00 y         NA           Tapstand  
## 10   7813    2.47    30.9 2010-09-13 00:00:00 y         NA           Tapstand  
## # ℹ 473,305 more rows
## # ℹ 6 more variables: facility_type <chr>, country_name <chr>,
## #   install_year <chr>, installer <chr>, pay <chr>, status <chr>

Select columns

select(data, install_year, country_name, water_source, water_tech)
## # A tibble: 473,315 × 4
##    install_year country_name water_source           water_tech
##    <chr>        <chr>        <chr>                  <chr>     
##  1 NA           Ethiopia     NA                     NA        
##  2 2019         Ethiopia     Protected Spring       NA        
##  3 NA           Kenya        Protected Shallow Well NA        
##  4 NA           Kenya        Borehole               NA        
##  5 NA           Kenya        Protected Shallow Well NA        
##  6 NA           Kenya        Borehole               NA        
##  7 NA           Kenya        Unprotected Spring     NA        
##  8 NA           Kenya        Protected Shallow Well NA        
##  9 NA           Kenya        Protected Shallow Well NA        
## 10 NA           Kenya        Protected Shallow Well NA        
## # ℹ 473,305 more rows

Add columns

mutate(data,
       row_ch = row_id + 1) %>%
    
    select(row_id, row_ch)
## # A tibble: 473,315 × 2
##    row_id row_ch
##     <dbl>  <dbl>
##  1   3957   3958
##  2  33512  33513
##  3  35125  35126
##  4  37760  37761
##  5  38118  38119
##  6  38501  38502
##  7  46357  46358
##  8  46535  46536
##  9  46560  46561
## 10  46782  46783
## # ℹ 473,305 more rows
  • Adds numeric 1 digit to all row_id numbers.

Summarize by groups

summarise(data, avg = mean(row_id))
## # A tibble: 1 × 1
##     avg
##   <dbl>
## 1    NA
  • Data does not have much numeric data to calculate numerics.