An Investigation Using ANOVA and Linear Regression Models

# Import the raw CSV export into a data frame named `data`.
# NOTE(review): the path is absolute and tied to one Windows user profile, so
# read.csv() will error wherever this exact file does not exist -- consider a
# project-relative path.
data <- read.csv(
  file = "C:\\Users\\am790\\Downloads\\washdash-download (1).csv"
)

# Per-column overview of the freshly loaded data frame
summary(object = data)

##      Type              Region          Residence.Type     Service.Type      
##  Length:3367        Length:3367        Length:3367        Length:3367       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##       Year         Coverage         Population        Service.level     
##  Min.   :2010   Min.   :  0.000   Min.   :0.000e+00   Length:3367       
##  1st Qu.:2013   1st Qu.:  2.486   1st Qu.:4.366e+06   Class :character  
##  Median :2016   Median : 12.110   Median :3.306e+07   Mode  :character  
##  Mean   :2016   Mean   : 22.447   Mean   :1.497e+08                     
##  3rd Qu.:2019   3rd Qu.: 34.190   3rd Qu.:1.755e+08                     
##  Max.   :2022   Max.   :100.000   Max.   :2.173e+09

# Penalise scientific notation so numbers print in fixed notation.
# This changes a session-wide option and is not restored afterwards.
options(scipen = 999)

# Store the service type as a factor so it acts as a grouping variable
data[["Service.Type"]] <- factor(data[["Service.Type"]])

# One-way ANOVA: Coverage as the response, Service.Type as the only term
anova_result <- aov(formula = Coverage ~ Service.Type, data = data)

# Print the ANOVA table for the fitted model
summary(object = anova_result)

##                Df  Sum Sq Mean Sq F value              Pr(>F)    
## Service.Type    2   59612   29806   45.77 <0.0000000000000002 ***
## Residuals    3364 2190520     651                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

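# Consolidate three regions (Australia and New Zealand, Europe, Northern
# America) into one "High-Income Regions" category. The merge below is
# unconditional: no check on the number of Region categories is performed.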
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Relabel the three listed regions as "High-Income Regions"; every other
# Region value is kept exactly as it was.
data <- mutate(
  data,
  Region = ifelse(
    test = Region %in% c("Australia and New Zealand", "Europe", "Northern America"),
    yes = "High-Income Regions",
    no = Region
  )
)

# One-way ANOVA: Coverage as the response, the consolidated Region as the
# only term. This reuses (and therefore replaces) the `anova_result` name.
anova_result <- aov(formula = Coverage ~ Region, data = data)

# Print the ANOVA table for the fitted model
summary(object = anova_result)

##               Df  Sum Sq Mean Sq F value Pr(>F)
## Region         7    7148  1021.1   1.529  0.152
## Residuals   3359 2242984   667.8

# Simple linear regression with Coverage as the response and Population as
# the single predictor, fitted with lm().
lm_model <- lm(formula = Coverage ~ Population, data = data)

# Print the fitted model's summary (call, residuals, coefficients, fit stats)
summary(object = lm_model)

## 
## Call:
## lm(formula = Coverage ~ Population, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -48.550 -13.156  -8.551   6.778  86.106 
## 
## Coefficients:
##                    Estimate      Std. Error t value            Pr(>|t|)    
## (Intercept) 13.659504476453  0.401306997071   34.04 <0.0000000000000002 ***
## Population   0.000000058712  0.000000001296   45.29 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.38 on 3365 degrees of freedom
## Multiple R-squared:  0.3787, Adjusted R-squared:  0.3785 
## F-statistic:  2051 on 1 and 3365 DF,  p-value: < 0.00000000000000022

# Scatter plot
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

ggplot(data = data, mapping = aes(x = Year, y = Coverage)) +
  # One point per row of `data`
  geom_point() +
  # Overlay an lm-based fitted line; se = FALSE omits the confidence band
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between Year and Coverage",
    x = "Year",
    y = "Coverage"
  )

## `geom_smooth()` using formula = 'y ~ x'

# Load necessary libraries
library(ggplot2)

# Coverage plotted against Population, with an lm-based fitted line on top
ggplot(data = data, mapping = aes(x = Population, y = Coverage)) +
  # One point per row of `data`
  geom_point() +
  # Fitted line from method = "lm"; se = FALSE omits the confidence band
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Linear Regression Model Visualization",
    x = "Population",
    y = "Coverage"
  )

## `geom_smooth()` using formula = 'y ~ x'