# Load the downloaded CSV export into a data frame.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs as-is where this exact file exists.
data <- utils::read.csv(
  file = "C:\\Users\\am790\\Downloads\\washdash-download (1).csv"
)
# Print per-column summary statistics for a first look at the data.
summary(data)
## Type Region Residence.Type Service.Type
## Length:3367 Length:3367 Length:3367 Length:3367
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Year Coverage Population Service.level
## Min. :2010 Min. : 0.000 Min. :0.000e+00 Length:3367
## 1st Qu.:2013 1st Qu.: 2.486 1st Qu.:4.366e+06 Class :character
## Median :2016 Median : 12.110 Median :3.306e+07 Mode :character
## Mean :2016 Mean : 22.447 Mean :1.497e+08
## 3rd Qu.:2019 3rd Qu.: 34.190 3rd Qu.:1.755e+08
## Max. :2022 Max. :100.000 Max. :2.173e+09
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Encode 'Service.Type' as a factor so it is treated as a grouping variable.
data[["Service.Type"]] <- factor(data[["Service.Type"]])
# One-way ANOVA of Coverage across the levels of Service.Type.
anova_result <- aov(
  formula = Coverage ~ Service.Type,
  data = data
)
# Print the ANOVA table for the fitted model.
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Service.Type 2 59612 29806 45.77 <0.0000000000000002 ***
## Residuals 3364 2190520 651
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Merge three regions into a single "High-Income Regions" category (applied unconditionally)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data <- data %>%
  mutate(
    # Relabel the three listed regions; every other region keeps its name.
    Region = if_else(
      condition = Region %in% c("Australia and New Zealand", "Europe", "Northern America"),
      true = "High-Income Regions",
      false = Region
    )
  )
# One-way ANOVA of Coverage across Region.
# This overwrites the previous `anova_result` object.
anova_result <- aov(
  formula = Coverage ~ Region,
  data = data
)
# Print the ANOVA table for the Region model.
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Region 7 7148 1021.1 1.529 0.152
## Residuals 3359 2242984 667.8
# Fit a simple linear regression of Coverage on Population.
lm_model <- lm(
  formula = Coverage ~ Population,
  data = data
)
# Print coefficients, residual summary and goodness-of-fit statistics.
summary(lm_model)
##
## Call:
## lm(formula = Coverage ~ Population, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.550 -13.156 -8.551 6.778 86.106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.659504476453 0.401306997071 34.04 <0.0000000000000002 ***
## Population 0.000000058712 0.000000001296 45.29 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.38 on 3365 degrees of freedom
## Multiple R-squared: 0.3787, Adjusted R-squared: 0.3785
## F-statistic: 2051 on 1 and 3365 DF, p-value: < 0.00000000000000022
# Scatter plot
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Coverage against Year, with a fitted straight line and no confidence band.
ggplot(data = data, mapping = aes(x = Year, y = Coverage)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Relationship between Year and Coverage",
    x = "Year",
    y = "Coverage"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Load necessary libraries
library(ggplot2)
# Scatter plot with regression line
# Coverage against Population: raw observations plus the least-squares line
# (method = "lm"), drawn without a confidence band (se = FALSE).
ggplot(data = data, mapping = aes(x = Population, y = Coverage)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Linear Regression Model Visualization",
    x = "Population",
    y = "Coverage"
  )
## `geom_smooth()` using formula = 'y ~ x'