# Exploratory Analysis by Grouping Approach

# Load the downloaded CSV export into the data frame `data`.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file exists.
data <- read.csv(
  file = "C:\\Users\\am790\\Downloads\\washdash-download (1).csv",
  header = TRUE
)
# Show per-column summary statistics for a first look at the data.
summary(data)

##      Type              Region          Residence.Type     Service.Type      
##  Length:3367        Length:3367        Length:3367        Length:3367       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population        Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :0.000e+00   Length:3367       
##  1st Qu.:2013   1st Qu.:  2.486   1st Qu.:4.366e+06   Class :character  
##  Median :2016   Median : 12.110   Median :3.306e+07   Mode  :character  
##  Mean   :2016   Mean   : 22.447   Mean   :1.497e+08                     
##  3rd Qu.:2019   3rd Qu.: 34.190   3rd Qu.:1.755e+08                     
##  Max.   :2022   Max.   :100.000   Max.   :2.173e+09

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Grouping 1: Region and Service Type
# Mean Coverage for every Region x Service.Type combination. `.groups = "drop"`
# returns an ungrouped table, so the min() below is taken over all groups.
grouped_data_1 <- data %>%
  group_by(Region, Service.Type) %>%
  summarise(Average_Coverage = mean(Coverage), .groups = "drop")

# Identify the lowest probability group(s): the combination(s) with the
# smallest average coverage. Ties are kept, so this can hold several rows.
lowest_prob_group_1 <- grouped_data_1 %>%
  filter(Average_Coverage == min(Average_Coverage))

# Add special tag to original data.
# Rows are matched on the (Region, Service.Type) pair through a join rather
# than with `==`: comparing a column of `data` against a multi-row column of
# `lowest_prob_group_1` would silently recycle the shorter vector and tag the
# wrong rows whenever several groups tie for the minimum.
# Any existing Special_Tag column is dropped first so the join cannot produce
# suffixed duplicates when this section is re-run.
# NOTE(review): the later grouping sections write to this same column, so only
# the last tag assigned survives in `data`.
data <- data %>%
  select(-any_of("Special_Tag")) %>%
  left_join(
    lowest_prob_group_1 %>%
      distinct(Region, Service.Type) %>%
      mutate(Special_Tag = "Lowest Probability"),
    by = c("Region", "Service.Type")
  ) %>%
  mutate(Special_Tag = coalesce(Special_Tag, ""))
# Load required library
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

# Create a dodged bar plot from 'grouped_data_1' and label the bar(s) listed
# in 'lowest_prob_group_1'.
# The label is carried as a column of the plotted data (empty string for all
# other bars) so the text layer can be dodged with the same width and grouping
# as the bars; a separate one-row text layer is not dodged and ends up at the
# centre of the Region instead of on the flagged bar.
ggplot(
  grouped_data_1 %>%
    left_join(
      lowest_prob_group_1 %>%
        distinct(Region, Service.Type) %>%
        mutate(Label = "Lowest Probability"),
      by = c("Region", "Service.Type")
    ) %>%
    mutate(Label = coalesce(Label, "")),
  aes(x = Region, y = Average_Coverage, fill = Service.Type)
) +
  geom_col(position = position_dodge(width = 0.9)) +
  # group = Service.Type keeps the dodge slots identical to the bars' slots
  # (otherwise the discrete Label column would also split the groups).
  # hjust < 0 places the text just past the bar end; the flagged bar is the
  # shortest one, so a label drawn before the bar end would fall off the panel.
  geom_text(aes(label = Label, group = Service.Type),
            position = position_dodge(width = 0.9),
            hjust = -0.1, color = "red", size = 4,
            show.legend = FALSE) +
  labs(title = "Average Coverage by Region and Service Type",
       x = "Region", y = "Average Coverage", fill = "Service Type") +
  theme_minimal() +
  theme(legend.position = "top") +
  # Flip so the Region names read horizontally along the vertical axis.
  coord_flip()

# Grouping 2: Group data by Year and Residence Type.
# Total Population for every Year x Residence.Type combination.
# `.groups = "drop"` matters here: without it the result stays grouped by
# Year, and the min() in the next step is evaluated separately for each Year
# instead of across the whole table.
grouped_data_2 <- data %>%
  group_by(Year, Residence.Type) %>%
  summarise(Total_Population = sum(Population), .groups = "drop")

# Identify the lowest probability group(s): the combination(s) with the
# smallest total population overall. Ties are kept.
lowest_prob_group_2 <- grouped_data_2 %>%
  filter(Total_Population == min(Total_Population))

# Add special tag to original data.
# Rows are matched on the (Year, Residence.Type) pair through a join rather
# than with `==`, which would silently recycle a multi-row
# `lowest_prob_group_2` across the rows of `data` and tag the wrong rows.
# Any existing Special_Tag column is dropped first so the join cannot produce
# suffixed duplicates.
# NOTE(review): this replaces whatever an earlier section stored in
# Special_Tag, and a later section replaces it again.
data <- data %>%
  select(-any_of("Special_Tag")) %>%
  left_join(
    lowest_prob_group_2 %>%
      distinct(Year, Residence.Type) %>%
      mutate(Special_Tag = "Lowest Probability"),
    by = c("Year", "Residence.Type")
  ) %>%
  mutate(Special_Tag = coalesce(Special_Tag, ""))


# Draw total population per residence type, one dodged bar per year.
ggplot(
  grouped_data_2,
  aes(x = Residence.Type, y = Total_Population, fill = factor(Year))
) +
  # geom_col() plots the precomputed totals as bar heights.
  geom_col(position = "dodge") +
  # Print y-axis values with thousands separators instead of scientific notation.
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Total Population by Year and Residence Type",
    x = "Residence Type",
    y = "Total Population",
    fill = "Year"
  ) +
  theme_minimal() +
  theme(legend.position = "top")

# Grouping 3: Type and Coverage
# Number of rows for every Type x Coverage combination. count() returns an
# ungrouped table, so the min() below is taken over all combinations rather
# than separately within each Type.
grouped_data_3 <- data %>%
  count(Type, Coverage, name = "Count")

# Identify the lowest probability group(s): the combination(s) that occur
# least often. Ties are kept, so this can hold many rows.
lowest_prob_group_3 <- grouped_data_3 %>%
  filter(Count == min(Count))

# Add special tag to original data.
# Rows are matched on the (Type, Coverage) pair through a join. Testing
# `Type %in% ...` and `Coverage %in% ...` independently would also tag rows
# whose Type and Coverage each appear somewhere in the lowest set but never
# together as one of the lowest groups.
# Any existing Special_Tag column is dropped first so the join cannot produce
# suffixed duplicates.
# NOTE(review): this replaces whatever the earlier sections stored in
# Special_Tag.
data <- data %>%
  select(-any_of("Special_Tag")) %>%
  left_join(
    lowest_prob_group_3 %>%
      distinct(Type, Coverage) %>%
      mutate(Special_Tag = "Lowest Probability"),
    by = c("Type", "Coverage")
  ) %>%
  mutate(Special_Tag = coalesce(Special_Tag, ""))


# Draw one stacked bar per Type; each segment is a Type x Coverage count and
# is coloured by its Coverage value.
ggplot(
  grouped_data_3,
  aes(x = Type, y = Count, fill = Coverage)
) +
  # geom_col() stacks the precomputed counts by default.
  geom_col() +
  labs(
    title = "Count of Occurrences by Type and Coverage",
    x = "Type",
    y = "Count",
    fill = "Coverage"
  ) +
  theme_minimal() +
  theme(legend.position = "top")