Introduction:
In this analysis, I explore the variability of Water, Sanitation, and Hygiene (WASH) data by grouping it in three different ways based on its categorical columns and summarizing several variables within each grouping. Specifically, I focus on combinations of region and service type, year and residence type, and type and coverage. My goal is to understand why certain groups are less common than others and to draw conclusions about the implications of these findings.

# Read the CSV file
data <- read.csv("C:\\Users\\am790\\Downloads\\washdash-download (1).csv")
# View summary of the data
summary(data)
##      Type              Region          Residence.Type     Service.Type
##  Length:3367        Length:3367        Length:3367        Length:3367
##  Class :character   Class :character   Class :character   Class :character
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character
##
##
##
##       Year         Coverage         Population        Service.level
##  Min.   :2010   Min.   :  0.000   Min.   :0.000e+00   Length:3367
##  1st Qu.:2013   1st Qu.:  2.486   1st Qu.:4.366e+06   Class :character
##  Median :2016   Median : 12.110   Median :3.306e+07   Mode  :character
##  Mean   :2016   Mean   : 22.447   Mean   :1.497e+08
##  3rd Qu.:2019   3rd Qu.: 34.190   3rd Qu.:1.755e+08
##  Max.   :2022   Max.   :100.000   Max.   :2.173e+09
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
# Grouping 1: Region and Service Type ----
# One row per Region x Service.Type combination with its mean Coverage.
grouped_data_1 <- data %>%
  group_by(Region, Service.Type) %>%
  summarise(Average_Coverage = mean(Coverage), .groups = "drop")

# Identify lowest probability group
# filter() keeps every row tied for the minimum, so this table may hold
# more than one (Region, Service.Type) pair.
lowest_prob_group_1 <- grouped_data_1 %>%
  filter(Average_Coverage == min(Average_Coverage))

# Add special tag to original data
# Match rows on the (Region, Service.Type) *pair* through a key join.
# Comparing with `==` against the columns of lowest_prob_group_1 recycles
# them element-wise as soon as that table has more than one row, and fails
# outright when it has none; the join is correct for 0, 1 or many rows.
lowest_keys_1 <- lowest_prob_group_1 %>%
  distinct(Region, Service.Type) %>%
  mutate(Is_Lowest = TRUE)

is_lowest_1 <- data %>%
  left_join(lowest_keys_1, by = c("Region", "Service.Type")) %>%
  pull(Is_Lowest)

# Unmatched rows come back from the join as NA.
data$Special_Tag <- ifelse(is.na(is_lowest_1), "", "Lowest Probability")

# Load required library
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Create a bar plot using 'grouped_data_1' and 'lowest_prob_group_1' data.
# The label is attached to the full summary table (empty string for every
# bar except the flagged one) so that geom_text() can be dodged with the
# same groups as the bars. A text layer built from lowest_prob_group_1 alone
# has a single group and is therefore drawn at the centre of the Region
# slot instead of on the bar it refers to.
plot_data_1 <- grouped_data_1 %>%
  left_join(
    lowest_prob_group_1 %>%
      distinct(Region, Service.Type) %>%
      mutate(Label = "Lowest Probability"),
    by = c("Region", "Service.Type")
  ) %>%
  mutate(Label = ifelse(is.na(Label), "", Label))

ggplot(
  plot_data_1,
  aes(x = Region, y = Average_Coverage, fill = Service.Type)
) +
  geom_col(position = "dodge") +
  # Add text for lowest probability group.
  # width = 0.9 matches the default bar width so text and bars line up;
  # hjust < 0 starts the text just past the end of the bar (the flagged bar
  # is the shortest one, so text placed before its end would fall off the
  # panel).
  geom_text(
    aes(label = Label, group = Service.Type),
    position = position_dodge(width = 0.9),
    hjust = -0.1,
    color = "red",
    size = 4
  ) +
  labs(
    title = "Average Coverage by Region and Service Type",
    x = "Region",
    y = "Average Coverage",
    fill = "Service Type"
  ) +
  theme_minimal() +
  theme(legend.position = "top") +
  coord_flip()
# Grouping 2: Group data by Year and Residence Type. ----
# One row per Year x Residence.Type combination with its summed Population.
# .groups = "drop" returns an ungrouped table; by default summarise() would
# leave it grouped by Year, and the min() below would then be evaluated
# separately inside every Year instead of over the whole table.
grouped_data_2 <- data %>%
  group_by(Year, Residence.Type) %>%
  summarise(Total_Population = sum(Population), .groups = "drop")

# Identify lowest probability group
# Global minimum; ties are all kept, so more than one row is possible.
lowest_prob_group_2 <- grouped_data_2 %>%
  filter(Total_Population == min(Total_Population))

# Add special tag to original data
# Match rows on the (Year, Residence.Type) *pair* through a key join rather
# than `==`, which silently recycles a multi-row lookup table against the
# rows of `data`.
# NOTE: this overwrites any Special_Tag column already present on `data`.
lowest_keys_2 <- lowest_prob_group_2 %>%
  distinct(Year, Residence.Type) %>%
  mutate(Is_Lowest = TRUE)

is_lowest_2 <- data %>%
  left_join(lowest_keys_2, by = c("Year", "Residence.Type")) %>%
  pull(Is_Lowest)

# Unmatched rows come back from the join as NA.
data$Special_Tag <- ifelse(is.na(is_lowest_2), "", "Lowest Probability")

# Create a grouped bar plot.
ggplot(
  grouped_data_2,
  aes(x = Residence.Type, y = Total_Population, fill = factor(Year))
) +
  geom_col(position = "dodge") +
  # Format y-axis labels as full digits with thousands separators
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Total Population by Year and Residence Type",
    x = "Residence Type",
    y = "Total Population",
    fill = "Year"
  ) +
  theme_minimal() +
  theme(legend.position = "top")
# Grouping 3: Type and Coverage ----
# Number of rows for every distinct Type x Coverage combination.
# .groups = "drop" returns an ungrouped table; by default summarise() would
# leave it grouped by Type, and the min() below would then be evaluated
# separately inside every Type instead of over the whole table.
grouped_data_3 <- data %>%
  group_by(Type, Coverage) %>%
  summarise(Count = n(), .groups = "drop")

# Identify lowest probability group
# Global minimum count; every tied (Type, Coverage) pair is kept.
lowest_prob_group_3 <- grouped_data_3 %>%
  filter(Count == min(Count))

# Add special tag to original data
# Match rows on the (Type, Coverage) *pair* through a key join. Testing
# `Type %in% ...` and `Coverage %in% ...` independently would also tag rows
# whose Type comes from one flagged pair and whose Coverage comes from
# another, i.e. combinations that are not in lowest_prob_group_3 at all.
# NOTE: this overwrites any Special_Tag column already present on `data`.
lowest_keys_3 <- lowest_prob_group_3 %>%
  distinct(Type, Coverage) %>%
  mutate(Is_Lowest = TRUE)

is_lowest_3 <- data %>%
  left_join(lowest_keys_3, by = c("Type", "Coverage")) %>%
  pull(Is_Lowest)

# Unmatched rows come back from the join as NA.
data$Special_Tag <- ifelse(is.na(is_lowest_3), "", "Lowest Probability")

# Create a stacked bar plot
ggplot(grouped_data_3, aes(x = Type, y = Count, fill = Coverage)) +
  geom_col() +
  labs(
    title = "Count of Occurrences by Type and Coverage",
    x = "Type",
    y = "Count",
    fill = "Coverage"
  ) +
  theme_minimal() +
  theme(legend.position = "top")