Exploring Subsamples of WASH Data

# Load the WASH data export (CSV) into a data frame
data <- read.csv(
  file = "C:\\Users\\am790\\Downloads\\washdash-download (1).csv"
)
# Show per-column summary statistics for the full dataset
summary(data)

##      Type              Region          Residence.Type     Service.Type      
##  Length:3367        Length:3367        Length:3367        Length:3367       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population        Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :0.000e+00   Length:3367       
##  1st Qu.:2013   1st Qu.:  2.486   1st Qu.:4.366e+06   Class :character  
##  Median :2016   Median : 12.110   Median :3.306e+07   Mode  :character  
##  Mean   :2016   Mean   : 22.447   Mean   :1.497e+08                     
##  3rd Qu.:2019   3rd Qu.: 34.190   3rd Qu.:1.755e+08                     
##  Max.   :2022   Max.   :100.000   Max.   :2.173e+09

# Fix the random number generator seed so the random draws are repeatable
set.seed(seed = 123)
# Use a large scipen penalty so numbers print in fixed rather than
# scientific notation
options(scipen = 999)
# Each subsample has about half as many rows as the full dataset
subsample_size <- round(nrow(data) / 2)

# Draw one random subsample: pick `subsample_size` row indices from `data`
# with replacement (so the same row can be picked more than once) and return
# the corresponding rows.
draw_subsample <- function() {
  picked_rows <- sample(nrow(data), subsample_size, replace = TRUE)
  data[picked_rows, ]
}

# Build 5 random subsamples, drawn one after another from the seeded RNG
subsample_1 <- draw_subsample()
subsample_2 <- draw_subsample()
subsample_3 <- draw_subsample()
subsample_4 <- draw_subsample()
subsample_5 <- draw_subsample()

# Compute per-column summary statistics for every subsample so that their
# characteristics can be compared side by side
summary_subsample_1 <- summary(object = subsample_1)
summary_subsample_2 <- summary(object = subsample_2)
summary_subsample_3 <- summary(object = subsample_3)
summary_subsample_4 <- summary(object = subsample_4)
summary_subsample_5 <- summary(object = subsample_5)

# Display summary statistics for each subsample, starting with subsample 1
print(summary_subsample_1)

##      Type              Region          Residence.Type     Service.Type      
##  Length:1684        Length:1684        Length:1684        Length:1684       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population         Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :         0   Length:1684       
##  1st Qu.:2013   1st Qu.:  2.476   1st Qu.:   4154243   Class :character  
##  Median :2016   Median : 12.313   Median :  34402660   Mode  :character  
##  Mean   :2016   Mean   : 22.714   Mean   : 157647595                     
##  3rd Qu.:2019   3rd Qu.: 34.640   3rd Qu.: 181490195                     
##  Max.   :2022   Max.   :100.000   Max.   :2173068885

# Display summary statistics for subsample 2
print(summary_subsample_2)

##      Type              Region          Residence.Type     Service.Type      
##  Length:1684        Length:1684        Length:1684        Length:1684       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population         Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :         0   Length:1684       
##  1st Qu.:2013   1st Qu.:  2.724   1st Qu.:   5175632   Class :character  
##  Median :2016   Median : 12.793   Median :  34759598   Mode  :character  
##  Mean   :2016   Mean   : 22.597   Mean   : 146462007                     
##  3rd Qu.:2019   3rd Qu.: 33.992   3rd Qu.: 174690481                     
##  Max.   :2022   Max.   :100.000   Max.   :2165634558

# Display summary statistics for subsample 3
print(summary_subsample_3)

##      Type              Region          Residence.Type     Service.Type      
##  Length:1684        Length:1684        Length:1684        Length:1684       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population         Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :         0   Length:1684       
##  1st Qu.:2013   1st Qu.:  2.493   1st Qu.:   5009428   Class :character  
##  Median :2016   Median : 12.460   Median :  39553047   Mode  :character  
##  Mean   :2016   Mean   : 22.354   Mean   : 155646582                     
##  3rd Qu.:2019   3rd Qu.: 33.734   3rd Qu.: 176074957                     
##  Max.   :2022   Max.   :100.000   Max.   :2173068885

# Display summary statistics for subsample 4
print(summary_subsample_4)

##      Type              Region          Residence.Type     Service.Type      
##  Length:1684        Length:1684        Length:1684        Length:1684       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population         Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :         0   Length:1684       
##  1st Qu.:2013   1st Qu.:  1.995   1st Qu.:   3897266   Class :character  
##  Median :2016   Median :  9.267   Median :  25851633   Mode  :character  
##  Mean   :2016   Mean   : 20.472   Mean   : 136173254                     
##  3rd Qu.:2019   3rd Qu.: 31.066   3rd Qu.: 137711310                     
##  Max.   :2022   Max.   :100.000   Max.   :2122935806

# Display summary statistics for subsample 5
print(summary_subsample_5)

##      Type              Region          Residence.Type     Service.Type      
##  Length:1684        Length:1684        Length:1684        Length:1684       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population         Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :         0   Length:1684       
##  1st Qu.:2013   1st Qu.:  2.476   1st Qu.:   4134665   Class :character  
##  Median :2016   Median : 11.075   Median :  30676280   Mode  :character  
##  Mean   :2016   Mean   : 21.873   Mean   : 142638708                     
##  3rd Qu.:2019   3rd Qu.: 33.293   3rd Qu.: 165223167                     
##  Max.   :2022   Max.   :100.000   Max.   :2173068885

# Calculate the mean population for each combination of Region and Residence.Type in each subsample
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Mean Population per (Region, Residence.Type) group in subsample 1;
# missing Population values are ignored
mean_population_subsample_1 <- summarise(
  group_by(subsample_1, Region, Residence.Type),
  mean_population = mean(Population, na.rm = TRUE)
)

## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.

# Mean Population per (Region, Residence.Type) group in subsample 2;
# missing Population values are ignored
mean_population_subsample_2 <- summarise(
  group_by(subsample_2, Region, Residence.Type),
  mean_population = mean(Population, na.rm = TRUE)
)

## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.

# Mean Population per (Region, Residence.Type) group in subsample 3;
# missing Population values are ignored
mean_population_subsample_3 <- summarise(
  group_by(subsample_3, Region, Residence.Type),
  mean_population = mean(Population, na.rm = TRUE)
)

## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.

# Mean Population per (Region, Residence.Type) group in subsample 4;
# missing Population values are ignored
mean_population_subsample_4 <- summarise(
  group_by(subsample_4, Region, Residence.Type),
  mean_population = mean(Population, na.rm = TRUE)
)

## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.

# Mean Population per (Region, Residence.Type) group in subsample 5;
# missing Population values are ignored
mean_population_subsample_5 <- summarise(
  group_by(subsample_5, Region, Residence.Type),
  mean_population = mean(Population, na.rm = TRUE)
)

## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.

# Display the mean population for each combination of Region and
# Residence.Type, starting with subsample 1
print(mean_population_subsample_1)

## # A tibble: 24 × 3
## # Groups:   Region [8]
##    Region                         Residence.Type mean_population
##    <chr>                          <chr>                    <dbl>
##  1 Australia and New Zealand      rural                  552201.
##  2 Australia and New Zealand      total                 5699598.
##  3 Australia and New Zealand      urban                 6630438.
##  4 Central and Southern Asia      rural               290875621.
##  5 Central and Southern Asia      total               551402921.
##  6 Central and Southern Asia      urban               174334992.
##  7 Eastern and South-Eastern Asia rural               185809073.
##  8 Eastern and South-Eastern Asia total               541939177.
##  9 Eastern and South-Eastern Asia urban               255742312.
## 10 Europe and Northern America    rural                60843089.
## # ℹ 14 more rows

# Display the mean population per Region and Residence.Type for subsample 2
print(mean_population_subsample_2)

## # A tibble: 24 × 3
## # Groups:   Region [8]
##    Region                         Residence.Type mean_population
##    <chr>                          <chr>                    <dbl>
##  1 Australia and New Zealand      rural                 1368316.
##  2 Australia and New Zealand      total                 8632325.
##  3 Australia and New Zealand      urban                 6046107.
##  4 Central and Southern Asia      rural               308488678.
##  5 Central and Southern Asia      total               440200014.
##  6 Central and Southern Asia      urban               203248365.
##  7 Eastern and South-Eastern Asia rural               172907559.
##  8 Eastern and South-Eastern Asia total               414326521.
##  9 Eastern and South-Eastern Asia urban               253826928.
## 10 Europe and Northern America    rural                45032284.
## # ℹ 14 more rows

# Display the mean population per Region and Residence.Type for subsample 3
print(mean_population_subsample_3)

## # A tibble: 24 × 3
## # Groups:   Region [8]
##    Region                         Residence.Type mean_population
##    <chr>                          <chr>                    <dbl>
##  1 Australia and New Zealand      rural                 1266910.
##  2 Australia and New Zealand      total                 7251198.
##  3 Australia and New Zealand      urban                 4052628.
##  4 Central and Southern Asia      rural               206851144.
##  5 Central and Southern Asia      total               433677903.
##  6 Central and Southern Asia      urban               179174357.
##  7 Eastern and South-Eastern Asia rural               242079099.
##  8 Eastern and South-Eastern Asia total               602445506.
##  9 Eastern and South-Eastern Asia urban               227816176.
## 10 Europe and Northern America    rural                45564835.
## # ℹ 14 more rows

# Display the mean population per Region and Residence.Type for subsample 4
print(mean_population_subsample_4)

## # A tibble: 24 × 3
## # Groups:   Region [8]
##    Region                         Residence.Type mean_population
##    <chr>                          <chr>                    <dbl>
##  1 Australia and New Zealand      rural                  896972.
##  2 Australia and New Zealand      total                 4616919.
##  3 Australia and New Zealand      urban                 2737720.
##  4 Central and Southern Asia      rural               238224639.
##  5 Central and Southern Asia      total               390874890.
##  6 Central and Southern Asia      urban               169680439.
##  7 Eastern and South-Eastern Asia rural               152135302.
##  8 Eastern and South-Eastern Asia total               447944252.
##  9 Eastern and South-Eastern Asia urban               233357763.
## 10 Europe and Northern America    rural                48765644.
## # ℹ 14 more rows

# Display the mean population per Region and Residence.Type for subsample 5
print(mean_population_subsample_5)

## # A tibble: 24 × 3
## # Groups:   Region [8]
##    Region                         Residence.Type mean_population
##    <chr>                          <chr>                    <dbl>
##  1 Australia and New Zealand      rural                  621285.
##  2 Australia and New Zealand      total                 7995427.
##  3 Australia and New Zealand      urban                 6280446.
##  4 Central and Southern Asia      rural               251934330.
##  5 Central and Southern Asia      total               416112248.
##  6 Central and Southern Asia      urban               157086007.
##  7 Eastern and South-Eastern Asia rural               213714405.
##  8 Eastern and South-Eastern Asia total               447537617.
##  9 Eastern and South-Eastern Asia urban               269456967.
## 10 Europe and Northern America    rural                48030766.
## # ℹ 14 more rows

Visualization of the mean population for each combination of Region and Residence.Type: