Introduction:
# Path of the CSV file to analyse.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running.
data_path <- "C:\\Users\\am790\\Downloads\\washdash-download (1).csv"

# Fail early with a readable message rather than a low-level connection error
# from read.csv() when the file is not where we expect it.
if (!file.exists(data_path)) {
  stop("Input CSV not found: ", data_path, call. = FALSE)
}

# Read the CSV file
data <- read.csv(data_path)

# View summary of the data
summary(data)
## Type Region Residence.Type Service.Type ## Length:3367 Length:3367 Length:3367 Length:3367 ## Class :character Class :character Class :character Class :character ## Mode :character Mode :character Mode :character Mode :character ## ## ## ## Year Coverage Population Service.level ## Min. :2010 Min. : 0.000 Min. :0.000e+00 Length:3367 ## 1st Qu.:2013 1st Qu.: 2.486 1st Qu.:4.366e+06 Class :character ## Median :2016 Median : 12.110 Median :3.306e+07 Mode :character ## Mean :2016 Mean : 22.447 Mean :1.497e+08 ## 3rd Qu.:2019 3rd Qu.: 34.190 3rd Qu.:1.755e+08 ## Max. :2022 Max. :100.000 Max. :2.173e+09
# Fix the RNG state so the same rows are drawn on every run
set.seed(123)

# Print large numbers in full instead of scientific notation
options(scipen = 999)

# Each subsample holds roughly 50% as many rows as the original data
subsample_size <- round(nrow(data) * 0.5)

# Draw one random subsample of rows from `data`.
# replace = TRUE means rows are drawn WITH replacement, so the same row can
# appear more than once within a subsample.
draw_subsample <- function() {
  picked_rows <- sample(nrow(data), subsample_size, replace = TRUE)
  data[picked_rows, ]
}

# Create 5 random subsamples (drawn in this order, so the seed above
# reproduces them exactly)
subsample_1 <- draw_subsample()
subsample_2 <- draw_subsample()
subsample_3 <- draw_subsample()
subsample_4 <- draw_subsample()
subsample_5 <- draw_subsample()

# Column-wise summary statistics of each subsample, for comparison
summary_subsample_1 <- summary(subsample_1)
summary_subsample_2 <- summary(subsample_2)
summary_subsample_3 <- summary(subsample_3)
summary_subsample_4 <- summary(subsample_4)
summary_subsample_5 <- summary(subsample_5)

# Display summary statistics for subsample 1
print(summary_subsample_1)
## Type Region Residence.Type Service.Type ## Length:1684 Length:1684 Length:1684 Length:1684 ## Class :character Class :character Class :character Class :character ## Mode :character Mode :character Mode :character Mode :character ## ## ## ## Year Coverage Population Service.level ## Min. :2010 Min. : 0.000 Min. : 0 Length:1684 ## 1st Qu.:2013 1st Qu.: 2.476 1st Qu.: 4154243 Class :character ## Median :2016 Median : 12.313 Median : 34402660 Mode :character ## Mean :2016 Mean : 22.714 Mean : 157647595 ## 3rd Qu.:2019 3rd Qu.: 34.640 3rd Qu.: 181490195 ## Max. :2022 Max. :100.000 Max. :2173068885
# Display summary statistics for subsample 2
print(summary_subsample_2)
## Type Region Residence.Type Service.Type ## Length:1684 Length:1684 Length:1684 Length:1684 ## Class :character Class :character Class :character Class :character ## Mode :character Mode :character Mode :character Mode :character ## ## ## ## Year Coverage Population Service.level ## Min. :2010 Min. : 0.000 Min. : 0 Length:1684 ## 1st Qu.:2013 1st Qu.: 2.724 1st Qu.: 5175632 Class :character ## Median :2016 Median : 12.793 Median : 34759598 Mode :character ## Mean :2016 Mean : 22.597 Mean : 146462007 ## 3rd Qu.:2019 3rd Qu.: 33.992 3rd Qu.: 174690481 ## Max. :2022 Max. :100.000 Max. :2165634558
# Display summary statistics for subsample 3
print(summary_subsample_3)
## Type Region Residence.Type Service.Type ## Length:1684 Length:1684 Length:1684 Length:1684 ## Class :character Class :character Class :character Class :character ## Mode :character Mode :character Mode :character Mode :character ## ## ## ## Year Coverage Population Service.level ## Min. :2010 Min. : 0.000 Min. : 0 Length:1684 ## 1st Qu.:2013 1st Qu.: 2.493 1st Qu.: 5009428 Class :character ## Median :2016 Median : 12.460 Median : 39553047 Mode :character ## Mean :2016 Mean : 22.354 Mean : 155646582 ## 3rd Qu.:2019 3rd Qu.: 33.734 3rd Qu.: 176074957 ## Max. :2022 Max. :100.000 Max. :2173068885
# Display summary statistics for subsample 4
print(summary_subsample_4)
## Type Region Residence.Type Service.Type ## Length:1684 Length:1684 Length:1684 Length:1684 ## Class :character Class :character Class :character Class :character ## Mode :character Mode :character Mode :character Mode :character ## ## ## ## Year Coverage Population Service.level ## Min. :2010 Min. : 0.000 Min. : 0 Length:1684 ## 1st Qu.:2013 1st Qu.: 1.995 1st Qu.: 3897266 Class :character ## Median :2016 Median : 9.267 Median : 25851633 Mode :character ## Mean :2016 Mean : 20.472 Mean : 136173254 ## 3rd Qu.:2019 3rd Qu.: 31.066 3rd Qu.: 137711310 ## Max. :2022 Max. :100.000 Max. :2122935806
# Display summary statistics for subsample 5
print(summary_subsample_5)
## Type Region Residence.Type Service.Type ## Length:1684 Length:1684 Length:1684 Length:1684 ## Class :character Class :character Class :character Class :character ## Mode :character Mode :character Mode :character Mode :character ## ## ## ## Year Coverage Population Service.level ## Min. :2010 Min. : 0.000 Min. : 0 Length:1684 ## 1st Qu.:2013 1st Qu.: 2.476 1st Qu.: 4134665 Class :character ## Median :2016 Median : 11.075 Median : 30676280 Mode :character ## Mean :2016 Mean : 21.873 Mean : 142638708 ## 3rd Qu.:2019 3rd Qu.: 33.293 3rd Qu.: 165223167 ## Max. :2022 Max. :100.000 Max. :2173068885
# calculate the mean population for each combination of Region and Residence.Type in each subsample library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
# Mean Population for every Region x Residence.Type combination of one
# subsample.
#
# subsample: data frame with Region, Residence.Type and Population columns.
# Returns a tibble with one row per combination and a `mean_population`
# column (missing Population values are ignored).
#
# `.groups = "drop_last"` spells out what summarise() does by default for this
# two-variable grouping: the result stays grouped by Region, exactly as
# before, but no longer depends on the implicit default.
summarise_mean_population <- function(subsample) {
  subsample %>%
    group_by(Region, Residence.Type) %>%
    summarise(
      mean_population = mean(Population, na.rm = TRUE),
      .groups = "drop_last"
    )
}

mean_population_subsample_1 <- summarise_mean_population(subsample_1)
mean_population_subsample_2 <- summarise_mean_population(subsample_2)
mean_population_subsample_3 <- summarise_mean_population(subsample_3)
mean_population_subsample_4 <- summarise_mean_population(subsample_4)
mean_population_subsample_5 <- summarise_mean_population(subsample_5)
# Display mean population for each combination of Region and Residence.Type
# (subsample 1)
print(mean_population_subsample_1)
## # A tibble: 24 × 3 ## # Groups: Region [8] ## Region Residence.Type mean_population ## <chr> <chr> <dbl> ## 1 Australia and New Zealand rural 552201. ## 2 Australia and New Zealand total 5699598. ## 3 Australia and New Zealand urban 6630438. ## 4 Central and Southern Asia rural 290875621. ## 5 Central and Southern Asia total 551402921. ## 6 Central and Southern Asia urban 174334992. ## 7 Eastern and South-Eastern Asia rural 185809073. ## 8 Eastern and South-Eastern Asia total 541939177. ## 9 Eastern and South-Eastern Asia urban 255742312. ## 10 Europe and Northern America rural 60843089. ## # ℹ 14 more rows
# Display mean population by Region and Residence.Type for subsample 2
print(mean_population_subsample_2)
## # A tibble: 24 × 3 ## # Groups: Region [8] ## Region Residence.Type mean_population ## <chr> <chr> <dbl> ## 1 Australia and New Zealand rural 1368316. ## 2 Australia and New Zealand total 8632325. ## 3 Australia and New Zealand urban 6046107. ## 4 Central and Southern Asia rural 308488678. ## 5 Central and Southern Asia total 440200014. ## 6 Central and Southern Asia urban 203248365. ## 7 Eastern and South-Eastern Asia rural 172907559. ## 8 Eastern and South-Eastern Asia total 414326521. ## 9 Eastern and South-Eastern Asia urban 253826928. ## 10 Europe and Northern America rural 45032284. ## # ℹ 14 more rows
# Display mean population by Region and Residence.Type for subsample 3
print(mean_population_subsample_3)
## # A tibble: 24 × 3 ## # Groups: Region [8] ## Region Residence.Type mean_population ## <chr> <chr> <dbl> ## 1 Australia and New Zealand rural 1266910. ## 2 Australia and New Zealand total 7251198. ## 3 Australia and New Zealand urban 4052628. ## 4 Central and Southern Asia rural 206851144. ## 5 Central and Southern Asia total 433677903. ## 6 Central and Southern Asia urban 179174357. ## 7 Eastern and South-Eastern Asia rural 242079099. ## 8 Eastern and South-Eastern Asia total 602445506. ## 9 Eastern and South-Eastern Asia urban 227816176. ## 10 Europe and Northern America rural 45564835. ## # ℹ 14 more rows
# Display mean population by Region and Residence.Type for subsample 4
print(mean_population_subsample_4)
## # A tibble: 24 × 3 ## # Groups: Region [8] ## Region Residence.Type mean_population ## <chr> <chr> <dbl> ## 1 Australia and New Zealand rural 896972. ## 2 Australia and New Zealand total 4616919. ## 3 Australia and New Zealand urban 2737720. ## 4 Central and Southern Asia rural 238224639. ## 5 Central and Southern Asia total 390874890. ## 6 Central and Southern Asia urban 169680439. ## 7 Eastern and South-Eastern Asia rural 152135302. ## 8 Eastern and South-Eastern Asia total 447944252. ## 9 Eastern and South-Eastern Asia urban 233357763. ## 10 Europe and Northern America rural 48765644. ## # ℹ 14 more rows
# Display mean population by Region and Residence.Type for subsample 5
print(mean_population_subsample_5)
## # A tibble: 24 × 3 ## # Groups: Region [8] ## Region Residence.Type mean_population ## <chr> <chr> <dbl> ## 1 Australia and New Zealand rural 621285. ## 2 Australia and New Zealand total 7995427. ## 3 Australia and New Zealand urban 6280446. ## 4 Central and Southern Asia rural 251934330. ## 5 Central and Southern Asia total 416112248. ## 6 Central and Southern Asia urban 157086007. ## 7 Eastern and South-Eastern Asia rural 213714405. ## 8 Eastern and South-Eastern Asia total 447537617. ## 9 Eastern and South-Eastern Asia urban 269456967. ## 10 Europe and Northern America rural 48030766. ## # ℹ 14 more rows
Visualization of the mean population for each combination of Region and Residence.Type:
# Load the ggplot2 library for visualization library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Bar chart of the mean population for each combination of Region and
# Residence.Type.
#
# mean_population_data: data frame with Region, Residence.Type and
#   mean_population columns.
# subsample_number: value appended to the plot title to identify the subsample.
# Returns the ggplot object; calling the function at top level displays it.
plot_mean_population <- function(mean_population_data, subsample_number) {
  plot_title <- paste("Mean Population for Subsample", subsample_number)
  bar_mapping <- aes(x = Region, y = mean_population, fill = Residence.Type)

  ggplot(mean_population_data, bar_mapping) +
    # stat = "identity": bar length is the mean_population value itself;
    # position = "dodge": residence types sit side by side within a region
    geom_bar(stat = "identity", position = "dodge") +
    labs(
      title = plot_title,
      x = "Region",
      y = "Mean Population",
      fill = "Residence Type"
    ) +
    theme_minimal() +
    theme(legend.position = "top") + # Put the legend above the plot
    coord_flip() # Flip the coordinates for better readability
}

# Plot for subsample 1
plot_mean_population(mean_population_subsample_1, 1)
# Mean-population chart for subsample 2
plot_mean_population(mean_population_subsample_2, 2)

# Mean-population chart for subsample 3
plot_mean_population(mean_population_subsample_3, 3)

# Mean-population chart for subsample 4
plot_mean_population(mean_population_subsample_4, 4)

# Mean-population chart for subsample 5
plot_mean_population(mean_population_subsample_5, 5)