Healthcare deprivation has long been influenced by factors such as geographical access and living conditions. Despite prior research that investigated such factors, there is still a dearth of studies that support how economic deprivation influences geographical access to such services. Our analyses reveal that while economic deprivation somewhat decreases travel time, this relationship is significantly mediated by housing costs and urban-rural classification. The findings, which also reflect on model assumptions and are supported by external literature, highlight the complexities of healthcare access and underscore the need for inclusive approaches in future research to capture the multifaceted nature of economic influences.
# Libraries used
library(tidyverse)
library(haven)
library(dplyr)
library(readxl)
library(GGally)
library(forcats)
library(stargazer)
library(lme4)
library(car)
library(rstudioapi)
library(knitr)
library(kableExtra)
This chunk loads the necessary libraries for data manipulation, analysis, and plotting.
# Turning off scientific notation
options(scipen = 9999)
This line sets the option to avoid scientific notation in outputs.
# Bespoke functions used ----
PointPlot <- function(dataset, independent_var, dependent_var, colour_var = NULL, title = NULL, subtitle = NULL, caption = NULL, x_label = "X variable", y_label = "Y variable", point_size = 1, line_intercept = NULL, line_slope = NULL, line_colour = "black", line_linewidth = 1, line_linetype = "solid", add_regression_line = FALSE) {
# Loading required libraries
library(ggplot2)
library(showtext)
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
# Adjust the aesthetic mapping based on the presence of the third variable
if(!is.null(colour_var)) {
plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var, colour = colour_var))
} else {
plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var))
}
# Base point plot
plot <- plot + geom_point(alpha = 1, shape = 19, size = point_size, position = 'jitter') +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
family = 'Roboto'),
axis.title = element_text(family = 'Roboto'),
axis.text = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
legend.position = ifelse(is.null(colour_var), 'none', 'right'),
legend.text = element_text(family = 'Roboto')) +
labs(title = title,
subtitle = subtitle,
caption = caption,
x = x_label,
y = y_label)
# Add manual line if intercept and slope are provided
if (!is.null(line_intercept) & !is.null(line_slope)) {
plot <- plot + geom_abline(intercept = line_intercept,
slope = line_slope,
colour = line_colour,
linewidth = line_linewidth,
linetype = line_linetype)
}
# Add regression line with confidence interval if requested
if (add_regression_line) {
plot <- plot + geom_smooth(method = "lm", alpha = 0.2)
}
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot to the working directory
ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
return(plot)
}
DistPlot <- function(dataset, variable, formatted_name, subtitle = NULL, caption = NULL, y_label = NULL) {
# Loading required libraries
library(ggplot2)
library(showtext)
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
plot <- ggplot(dataset, aes_string(x = variable)) +
geom_histogram(aes(y = ..density..),
colour = 1, fill = "white") +
geom_density(fill = '#0072C6', colour = '#0072C6', alpha = 0.25) +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto'),
axis.title.x = element_blank(), # This removes the x-axis title
axis.title.y = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
axis.text = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
legend.position = 'none') +
labs(title = paste0("Distribution of ", formatted_name),
subtitle = subtitle,
caption = caption,
y = y_label)
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot to the working directory
ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
plot
}
ConfidencePlot <- function(data, title, subtitle, middle_point = 0, x_label = "Estimated Coefficient", caption = NULL) {
library(ggplot2)
library(showtext)
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
if (!middle_point %in% c(0, 1)) {
stop("middle_point must be 0 or 1")
}
conf_plot <- ggplot(data, aes(x = coefs, y = names)) +
geom_errorbar(aes(xmin = lower, xmax = upper),
col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "forestgreen"),
width = 0.5, size = 1, alpha = 0.5) +
geom_point(col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "black"), size = 1, alpha = 1) +
geom_vline(xintercept = middle_point, col = 'red', size = 0.5, linetype="dashed") +
theme_classic() +
labs(title = title,
subtitle = subtitle,
caption = caption,
x = x_label,
y = "") +
theme(axis.text = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
axis.title = element_text(family = 'Roboto', size = 15.5),
plot.title = element_text(hjust = 0, colour = 'black', size = 17.5, face = 'bold',
family = 'Roboto', margin = margin(b = 10)),
plot.subtitle = element_text(hjust = 0, colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto', margin = margin(b = 5)),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
family = 'Roboto'))
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", conf_plot$labels$title), ".png")
# Save the plot to the working directory
ggsave(filename = file_name, plot = conf_plot, width = 1200/100, height = 740/100, dpi = 100) # Adjust height as necessary
return(conf_plot)
}
PartialRegressionPlot <- function(model, variable, x_var_name = "X1", y_var_name = "Dependent Variable", title = NULL) {
# Close any existing plotting device
graphics.off()
# Construct axis labels using the specified names
xlab <- paste("Residuals of", x_var_name, "Controlling for Other Variables")
ylab <- paste("Residuals of", y_var_name, "(Excluding", x_var_name, ")")
# Create the plot without a main title
avPlots(model, variable, xlab = xlab, ylab = ylab)
# Add a title using the title function
title(main = title)
}
HomoskedasticityPlot <- function(model, title = "Homoskedasticity Check", subtitle = NULL,
x_label = "Fitted Values", y_label = "Standardized Residuals",
point_size = 1.2, add_loess_line = FALSE) {
# Loading required libraries
library(ggplot2)
library(showtext)
library(lmtest) # For Breusch-Pagan test
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
# Calculate standardized residuals
std_residuals <- rstandard(model)
# Fitted values
fitted_values <- fitted(model)
# Create a data frame for plotting
plot_data <- data.frame(FittedValues = fitted_values, StdResiduals = std_residuals)
# Perform Breusch-Pagan test
bp_test <- bptest(model)
# Format the p-value for the caption
if (bp_test$p.value < 0.05) {
p_value_text <- "p < 0.05"
} else if (bp_test$p.value > 0.05) {
p_value_text <- "p > 0.05"
} else {
p_value_text <- "p = 0.05"
}
# Create a caption including the formatted p-value text
caption <- paste("Breusch-Pagan Test:", p_value_text)
# Base point plot with adjusted sizes for readability
plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals)) +
geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 50, face = 'bold', family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 30, face = 'italic', family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 50, face = 'italic', family = 'Roboto'),
axis.title = element_text(size = 30, family = 'Roboto'),
axis.text = element_text(size = 30, family = 'Roboto', colour = 'black'),
legend.position = 'none') +
labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
# Add loess smoothing line if requested
if (add_loess_line) {
plot <- plot + geom_smooth(method = "loess", alpha = 0.2)
}
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot with dimensions that balance size and readability
ggsave(filename = file_name, plot = plot, width = 5.91, height = 4.50, dpi = 300)
return(plot)
}
HomoskedasticityDetailPlot <- function(model, dataframe, categorical_var_name,
title = "Homoskedasticity Check", subtitle = NULL,
x_label = "Fitted Values", y_label = "Standardized Residuals",
point_size = 1.2, add_loess_line = FALSE) {
# Loading required libraries
library(ggplot2)
library(showtext)
library(lmtest) # For Breusch-Pagan test
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
# Calculate standardized residuals
std_residuals <- rstandard(model)
# Fitted values
fitted_values <- fitted(model)
# Ensure the categorical variable exists in the dataframe
if (!categorical_var_name %in% names(dataframe)) {
stop("Specified categorical variable not found in the provided dataframe.")
}
# Create a data frame for plotting
plot_data <- data.frame(FittedValues = fitted_values,
StdResiduals = std_residuals,
Category = dataframe[[categorical_var_name]])
# Perform Breusch-Pagan test
bp_test <- bptest(model)
# Format the p-value for the caption
p_value_text <- ifelse(bp_test$p.value < 0.05, "p < 0.05",
ifelse(bp_test$p.value > 0.05, "p > 0.05", "p = 0.05"))
# Create a caption including the formatted p-value text
caption <- paste("Breusch-Pagan Test:", p_value_text)
# Base point plot with adjusted sizes for readability
plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals, color = Category)) +
geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold', family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic', family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic', family = 'Roboto'),
axis.title = element_text(size = 15.5, family = 'Roboto'),
axis.text = element_text(size = 15.5, family = 'Roboto', colour = 'black'),
legend.position = 'right', legend.text = element_text(size = 12.5, family = 'Roboto')) +
labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
# Add loess smoothing line if requested
if (add_loess_line) {
plot <- plot + geom_smooth(method = "loess", alpha = 0.2, se = FALSE)
}
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot with dimensions that balance size and readability
ggsave(filename = file_name, plot = plot, width = 10, height = 6.17, dpi = 100)
return(plot)
}
QQPlotModel <- function(model, title = 'Normality of Residuals Check', subtitle = 'QQ-Plot', caption = NULL) {
# Loading required libraries
library(ggplot2)
# Extract residuals from the model
model_residuals <- residuals(model)
# Create the QQ-plot
plot <- ggplot() +
stat_qq(aes(sample = model_residuals)) +
stat_qq_line(aes(sample = model_residuals), colour = "blue") +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic'),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic'),
axis.title = element_text(),
axis.text = element_text(size = 15.5, colour = 'black')) +
labs(title = title,
subtitle = subtitle,
caption = caption,
x = "Theoretical Quantiles",
y = "Sample Quantiles")
# Create a filename from the plot title or use a default one
file_name <- if (!is.null(title)) {
paste0(gsub(" ", "_", title), ".png")
} else {
"QQPlot.png"
}
# Save the plot to the working directory
ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
return(plot)
}
AHAH_df <- read.csv("Group_7_AHAH_V3_0.csv") # Access to healthy assets and hazards V3 data
IOD_income_df <- read.csv("Group_7_IoD2019_Income.csv") # Index of Multiple Deprivation Income Domian Indicator Data (measured in 2015)
IOD_health_df <- read.csv("Group_7_IoD2019_Health.csv") # Index of Multiple Deprivation Health Domian Indicator Data
IOD_barriers_df <- read.csv("Group_7_IoD2019_Barriers.csv") # Index of Multiple Deprivation Barriers to Housing and Services Domain Indicator Data
IOD_education_df <- read.csv("Group_7_IoD2019_Education.csv") # Index of Multiple Deprivation Education, Skills and Training Deprivation Domain Indicator Data
IOD_employment_df <- read.csv("Group_7_IoD2019_Employment.csv") # Index of Multiple Deprivation Employment Deprivation Domain Indicator Data (measured in 2015)
ONS_MIDYEAR_2015 <- read.csv("Group_7_ONS_MIDYEAR_2015.csv") # Mid-year population estimates for 2015 in England and Wales (for creating rates from the numerator values given in the income and employment measures)
RUC2011 <- read.csv("Group_7_RUC2011.csv")
The above code reads in the raw data, available to download from the sources provided.
## Renaming columns for consistent joining
AHAH_df <- AHAH_df %>%
rename(LSOA.code..2011. = lsoa11)
ONS_MIDYEAR_2015 <- ONS_MIDYEAR_2015 %>%
rename(LSOA.code..2011. = Area.Codes)
RUC2011 <- RUC2011 %>%
rename(LSOA.code..2011. = LSOA11CD)
## Creating complete combined dataset
complete_combined_data <- IOD_barriers_df %>%
left_join(IOD_education_df, by = "LSOA.code..2011.") %>%
left_join(IOD_employment_df, by = "LSOA.code..2011.") %>%
left_join(IOD_health_df, by = "LSOA.code..2011.") %>%
left_join(IOD_income_df, by = "LSOA.code..2011.") %>%
left_join(AHAH_df, by = "LSOA.code..2011.") %>%
left_join(ONS_MIDYEAR_2015, by = "LSOA.code..2011.") %>%
left_join(RUC2011, by = "LSOA.code..2011.")
## Retaining only columns of interest
combined_data <- complete_combined_data %>%
select(LSOA.code..2011., Local.Authority.District.name..2019.,
Income.Domain.numerator, Years.of.potential.life.lost.indicator,
Housing.affordability.indicator, Staying.on.in.education.post.16.indicator,
Employment.Domain.numerator, ah3gp, ah3hosp, ah3dent,
ah3phar, All.Ages, RUC11)
## Removing commas and converting columns 3:12 to numeric
combined_data <- combined_data %>%
mutate(across(.cols = 3:12, .fns = ~as.numeric(str_remove_all(.x,
","))))
## Creating income and employment deprivation rate
## variables
combined_data$income_deprivation_rate <- (combined_data$Income.Domain.numerator/combined_data$All.Ages) *
100
combined_data$employment_deprivation_rate <- (combined_data$Employment.Domain.numerator/combined_data$All.Ages) *
100
## Creating average distance (in minutes) to a healthcare
## service (including GP practice, hospital, dentist and
## pharmacy)
combined_data$avg_healthcare_distance <- rowMeans(combined_data[,
c("ah3gp", "ah3hosp", "ah3dent", "ah3phar")], na.rm = TRUE)
## Renaming column names for understanding
colnames(combined_data) <- c("lsoa_code", "local_authority_name",
"no._of_income_deprived_individuals_2015", "years_of_lives_lost_2013_2017",
"housing_affordability_score_2016", "proportion_of_above16_individuals_inschool_2010_2012",
"no._of_deprived_employment_2015_2016", "avg_dist_gppractice_in_mins_2022",
"avg_dist_hospital_in_mins_2022", "avg_dist_dentist_in_mins_2022",
"avg_dist_pharmacy_in_mins_2022", "population_estimates_for_allages_2015",
"rural_urban_class_2011", "income_deprivation_rate_2015",
"employment_deprivation_rate_2015", "avg_healthcare_distance_2022")
The above code reads in multiple CSV files containing various indicators of deprivation and population data. It then proceeds to clean and merge these datasets into a single comprehensive dataset by renaming inconsistent column names to facilitate joining. A series of left joins creates our combined dataset which is further refined by selecting only the columns of interest. Subsequent operations include cleaning numeric data by removing commas, calculating rates of income and employment deprivation, and computing the average distance to healthcare services. The final steps involve renaming columns to more descriptive titles and preparing the dataset for further analysis, ensuring there are no duplicate entries.
## Saving as CSV
write.csv(combined_data, file = "Group_7_combined_data.csv",
row.names = FALSE)
write.csv(complete_combined_data, file = "Group_7_complete_combined_data.csv",
row.names = FALSE)
The above code saves our shortened dataframes as csv files.
## Loading combined data
combined_data <- read.csv("Group_7_combined_data.csv")
## Checking for duplicates
There_are_duplicates <- any(duplicated(combined_data))
if (There_are_duplicates) {
cat("There are duplicates in the combined_data.\n")
} else {
cat("There are no duplicates in combined_data.\n")
}
## There are no duplicates in combined_data.
# Calculate missingness percentages for each numerical
# variable (IV and DV)
missingness <- sapply(combined_data, function(x) mean(is.na(x)) *
100)
missingness_summary <- data.frame(variable = names(missingness),
Missingness_percentage = missingness)
missingness_summary
## variable
## lsoa_code lsoa_code
## local_authority_name local_authority_name
## no._of_income_deprived_individuals_2015 no._of_income_deprived_individuals_2015
## years_of_lives_lost_2013_2017 years_of_lives_lost_2013_2017
## housing_affordability_score_2016 housing_affordability_score_2016
## proportion_of_above16_individuals_inschool_2010_2012 proportion_of_above16_individuals_inschool_2010_2012
## no._of_deprived_employment_2015_2016 no._of_deprived_employment_2015_2016
## avg_dist_gppractice_in_mins_2022 avg_dist_gppractice_in_mins_2022
## avg_dist_hospital_in_mins_2022 avg_dist_hospital_in_mins_2022
## avg_dist_dentist_in_mins_2022 avg_dist_dentist_in_mins_2022
## avg_dist_pharmacy_in_mins_2022 avg_dist_pharmacy_in_mins_2022
## population_estimates_for_allages_2015 population_estimates_for_allages_2015
## rural_urban_class_2011 rural_urban_class_2011
## income_deprivation_rate_2015 income_deprivation_rate_2015
## employment_deprivation_rate_2015 employment_deprivation_rate_2015
## avg_healthcare_distance_2022 avg_healthcare_distance_2022
## Missingness_percentage
## lsoa_code 0.00000000
## local_authority_name 0.00000000
## no._of_income_deprived_individuals_2015 0.09438558
## years_of_lives_lost_2013_2017 0.00000000
## housing_affordability_score_2016 0.00000000
## proportion_of_above16_individuals_inschool_2010_2012 0.00000000
## no._of_deprived_employment_2015_2016 0.36231884
## avg_dist_gppractice_in_mins_2022 0.00000000
## avg_dist_hospital_in_mins_2022 0.00000000
## avg_dist_dentist_in_mins_2022 0.00000000
## avg_dist_pharmacy_in_mins_2022 0.00000000
## population_estimates_for_allages_2015 0.00000000
## rural_urban_class_2011 0.00000000
## income_deprivation_rate_2015 0.09438558
## employment_deprivation_rate_2015 0.36231884
## avg_healthcare_distance_2022 0.00000000
# Removing Null Values from data set
combined_data <- na.omit(combined_data)
# First 6 rows of data
head(combined_data)
## lsoa_code local_authority_name no._of_income_deprived_individuals_2015
## 2 E01000002 City of London 40
## 3 E01000003 City of London 115
## 4 E01000005 City of London 235
## 5 E01000006 Barking and Dagenham 235
## 6 E01000007 Barking and Dagenham 435
## 7 E01000008 Barking and Dagenham 420
## years_of_lives_lost_2013_2017 housing_affordability_score_2016
## 2 69.045 1.586
## 3 82.254 3.063
## 4 67.035 3.383
## 5 59.895 3.515
## 6 66.861 3.183
## 7 73.391 2.932
## proportion_of_above16_individuals_inschool_2010_2012
## 2 0.279
## 3 0.201
## 4 0.272
## 5 0.100
## 6 0.113
## 7 0.291
## no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022
## 2 15 3.146895
## 3 70 1.151000
## 4 95 1.462800
## 5 75 1.585000
## 6 150 0.177500
## 7 145 2.641000
## avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022
## 2 1.448895 0.8875
## 3 0.649500 0.0000
## 4 1.224050 0.5650
## 5 1.585000 2.1500
## 6 0.517500 0.6475
## 7 2.861000 3.1110
## avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015
## 2 0.8791789 1156
## 3 0.3475000 1350
## 4 0.9426000 1121
## 5 1.8675000 2040
## 6 1.0775000 2101
## 7 3.2585000 1566
## rural_urban_class_2011 income_deprivation_rate_2015
## 2 Urban major conurbation 3.460208
## 3 Urban major conurbation 8.518519
## 4 Urban major conurbation 20.963426
## 5 Urban major conurbation 11.519608
## 6 Urban major conurbation 20.704426
## 7 Urban major conurbation 26.819923
## employment_deprivation_rate_2015 avg_healthcare_distance_2022
## 2 1.297578 1.590617
## 3 5.185185 0.537000
## 4 8.474576 1.048612
## 5 3.676471 1.796875
## 6 7.139457 0.605000
## 7 9.259259 2.967875
# Last 6 rows of data
tail(combined_data)
## lsoa_code local_authority_name no._of_income_deprived_individuals_2015
## 32839 E01033763 Liverpool 630
## 32840 E01033764 Liverpool 1275
## 32841 E01033765 Liverpool 525
## 32842 E01033766 Liverpool 105
## 32843 E01033767 Liverpool 405
## 32844 E01033768 Liverpool 350
## years_of_lives_lost_2013_2017 housing_affordability_score_2016
## 32839 101.024 1.262
## 32840 86.855 1.415
## 32841 75.078 1.396
## 32842 75.244 -1.862
## 32843 83.476 1.752
## 32844 91.496 1.505
## proportion_of_above16_individuals_inschool_2010_2012
## 32839 0.329
## 32840 0.241
## 32841 0.207
## 32842 0.142
## 32843 0.175
## 32844 0.175
## no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022
## 32839 305 1.19380
## 32840 555 1.54425
## 32841 255 1.77000
## 32842 55 1.07000
## 32843 190 3.26270
## 32844 245 0.85200
## avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022
## 32839 0.82660 2.3109
## 32840 2.07095 3.1923
## 32841 1.43750 1.9050
## 32842 1.07000 2.7176
## 32843 1.16050 2.6112
## 32844 0.85200 1.0795
## avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015
## 32839 0.9740 1679
## 32840 1.6249 2712
## 32841 1.9050 1445
## 32842 1.0700 1054
## 32843 1.6750 1019
## 32844 1.1720 1281
## rural_urban_class_2011 income_deprivation_rate_2015
## 32839 Urban major conurbation 37.522335
## 32840 Urban major conurbation 47.013274
## 32841 Urban major conurbation 36.332180
## 32842 Urban major conurbation 9.962049
## 32843 Urban major conurbation 39.744848
## 32844 Urban major conurbation 27.322404
## employment_deprivation_rate_2015 avg_healthcare_distance_2022
## 32839 18.165575 1.326325
## 32840 20.464602 2.108100
## 32841 17.647059 1.754375
## 32842 5.218216 1.481900
## 32843 18.645731 2.177350
## 32844 19.125683 0.988875
After loading the dataset, the above code checks for any duplicate rows, and outputs a message indicating whether duplicates are found. The script then calculates the percentage of missing values (NA values) for each variable in the dataset, summarizing the results in a data frame that lists variables alongside their respective percentages of missingness. Subsequently, it removes all rows with any missing values from the dataset. Finally, the script displays the first and last six rows of the cleaned dataset, which serves as a quick check of the data after the aforementioned operations, giving a glimpse of the data structure post-cleaning. # Initial summary statistics
summary(combined_data)
lsoa_code local_authority_name Length:32715 Length:32715
Class :character Class :character
Mode :character Mode :character
no._of_income_deprived_individuals_2015 years_of_lives_lost_2013_2017
Min. : 10.0 Min. : 18.43
1st Qu.: 90.0 1st Qu.: 47.36
Median : 165.0 Median : 54.78
Mean : 214.3 Mean : 57.29
3rd Qu.: 300.0 3rd Qu.: 64.53
Max. :1435.0 Max. :184.73
housing_affordability_score_2016 Min. :-6.658000
1st Qu.:-1.277000
Median :-0.013000
Mean : 0.000486
3rd Qu.: 1.297000
Max. : 8.019000
proportion_of_above16_individuals_inschool_2010_2012 Min. :0.0380
1st Qu.:0.1360
Median :0.1780
Mean :0.1855
3rd Qu.:0.2280
Max. :0.5500
no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022
Min. : 10.0 Min. : 0.000
1st Qu.: 45.0 1st Qu.: 1.803
Median : 75.0 Median : 3.022
Mean : 96.5 Mean : 4.660
3rd Qu.:130.0 3rd Qu.: 5.251
Max. :670.0 Max. :50.528
avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022 Min. :
0.000 Min. : 0.000
1st Qu.: 1.561 1st Qu.: 1.725
Median : 2.614 Median : 2.936
Mean : 4.301 Mean : 4.857
3rd Qu.: 4.598 3rd Qu.: 5.187
Max. :53.912 Max. :94.747
avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015
Min. : 0.000 Min. : 614
1st Qu.: 1.502 1st Qu.:1448
Median : 2.383 Median :1599
Mean : 3.953 Mean :1669
3rd Qu.: 3.908 3rd Qu.:1802
Max. :64.166 Max. :9551
rural_urban_class_2011 income_deprivation_rate_2015 Length:32715 Min. :
0.1912
Class :character 1st Qu.: 5.5485
Mode :character Median : 9.9291
Mean :12.8100
3rd Qu.:17.7987
Max. :60.9177
employment_deprivation_rate_2015 avg_healthcare_distance_2022 Min. :
0.1912 Min. : 0.000
1st Qu.: 2.7865 1st Qu.: 1.877
Median : 4.6030 Median : 2.898
Mean : 5.8027 Mean : 4.443
3rd Qu.: 7.8489 3rd Qu.: 4.661
Max. :34.7452 Max. :60.310
# Descriptive table for numerical variables
stargazer(combined_data, type = "html", digits = 1, title = "Table 1: Summary Statistics",
out = "table.html")
| Statistic | N | Mean | St. Dev. | Min | Max |
| no._of_income_deprived_individuals_2015 | 32,715 | 214.3 | 164.4 | 10 | 1,435 |
| years_of_lives_lost_2013_2017 | 32,715 | 57.3 | 14.0 | 18.4 | 184.7 |
| housing_affordability_score_2016 | 32,715 | 0.000 | 1.9 | -6.7 | 8.0 |
| proportion_of_above16_individuals_inschool_2010_2012 | 32,715 | 0.2 | 0.1 | 0.04 | 0.6 |
| no._of_deprived_employment_2015_2016 | 32,715 | 96.5 | 68.7 | 10 | 670 |
| avg_dist_gppractice_in_mins_2022 | 32,715 | 4.7 | 4.9 | 0.0 | 50.5 |
| avg_dist_hospital_in_mins_2022 | 32,715 | 4.3 | 5.0 | 0.0 | 53.9 |
| avg_dist_dentist_in_mins_2022 | 32,715 | 4.9 | 5.8 | 0.0 | 94.7 |
| avg_dist_pharmacy_in_mins_2022 | 32,715 | 4.0 | 5.1 | 0.0 | 64.2 |
| population_estimates_for_allages_2015 | 32,715 | 1,668.6 | 364.9 | 614 | 9,551 |
| income_deprivation_rate_2015 | 32,715 | 12.8 | 9.4 | 0.2 | 60.9 |
| employment_deprivation_rate_2015 | 32,715 | 5.8 | 4.0 | 0.2 | 34.7 |
| avg_healthcare_distance_2022 | 32,715 | 4.4 | 4.8 | 0.0 | 60.3 |
# Use rstudioapi to view the table
rstudioapi::viewer("table.html")
NULL
## SD of numerical variable
sd(combined_data$lsoa_code)
[1] NA
sd(combined_data$no._of_income_deprived_individuals_2015)
[1] 164.3886
sd(combined_data$years_of_lives_lost_2013_2017)
[1] 14.01493
sd(combined_data$housing_affordability_score_2016)
[1] 1.904849
sd(combined_data$proportion_of_above16_individuals_inschool_2010_2012)
[1] 0.06606487
sd(combined_data$no._of_deprived_employment_2015_2016)
[1] 68.72312
sd(combined_data$avg_dist_gppractice_in_mins_2022)
[1] 4.932105
sd(combined_data$avg_dist_hospital_in_mins_2022)
[1] 4.991137
sd(combined_data$avg_dist_dentist_in_mins_2022)
[1] 5.822524
sd(combined_data$avg_dist_pharmacy_in_mins_2022)
[1] 5.062794
sd(combined_data$population_estimates_for_allages_2015)
[1] 364.8729
sd(combined_data$income_deprivation_rate_2015)
[1] 9.412047
sd(combined_data$employment_deprivation_rate_2015)
[1] 4.007918
sd(combined_data$avg_healthcare_distance_2022)
[1] 4.752529 The above code provides a summary of the combined_data dataset, first by generating a descriptive statistics table, which includes mean, median, minimum, maximum, and quartile values for each numerical variable in the dataset. This table is created using the stargazer package, formatted as HTML, titled “Table 1: Summary Statistics”, and saved to a file named “table.html”. It then uses rstudioapi::viewer to display this HTML table within RStudio’s viewer pane for easy inspection. Following that, the code calculates the standard deviation (SD) for each numerical variable within combined_data individually, which quantifies the amount of variation or dispersion of a set of values.
# The number of unique names in the rural_urban_class
# column
unique_names <- table(combined_data$rural_urban_class_2011)
print(unique_names)
##
## Rural town and fringe
## 2930
## Rural town and fringe in a sparse setting
## 119
## Rural village and dispersed
## 2356
## Rural village and dispersed in a sparse setting
## 181
## Urban city and town
## 14399
## Urban city and town in a sparse setting
## 59
## Urban major conurbation
## 11465
## Urban minor conurbation
## 1206
rural_urban_class_2011_counts <- as.numeric(unique_names)
print(rural_urban_class_2011_counts)
## [1] 2930 119 2356 181 14399 59 11465 1206
## Summary statistics for Rural_urban_class_2011
max(rural_urban_class_2011_counts)
## [1] 14399
min(rural_urban_class_2011_counts)
## [1] 59
median(rural_urban_class_2011_counts)
## [1] 1781
IQR(rural_urban_class_2011_counts)
## [1] 4898.25
# The number of unique names in the local_authority_name
# column
unique_names_1 <- table(combined_data$local_authority_name)
print(unique_names_1)
##
## Adur Allerdale
## 42 60
## Amber Valley Arun
## 78 94
## Ashfield Ashford
## 74 78
## Aylesbury Vale Babergh
## 114 54
## Barking and Dagenham Barnet
## 110 209
## Barnsley Barrow-in-Furness
## 147 49
## Basildon Basingstoke and Deane
## 109 109
## Bassetlaw Bath and North East Somerset
## 70 114
## Bedford Bexley
## 103 146
## Birmingham Blaby
## 638 60
## Blackburn with Darwen Blackpool
## 91 94
## Bolsover Bolton
## 48 177
## Boston Bournemouth, Christchurch and Poole
## 36 233
## Bracknell Forest Bradford
## 75 308
## Braintree Breckland
## 87 78
## Brent Brentwood
## 173 45
## Brighton and Hove Bristol, City of
## 165 262
## Broadland Bromley
## 84 195
## Bromsgrove Broxbourne
## 58 56
## Broxtowe Burnley
## 71 60
## Bury Calderdale
## 120 128
## Cambridge Camden
## 67 132
## Cannock Chase Canterbury
## 60 90
## Carlisle Castle Point
## 68 57
## Central Bedfordshire Charnwood
## 157 99
## Chelmsford Cheltenham
## 107 75
## Cherwell Cheshire East
## 93 232
## Cheshire West and Chester Chesterfield
## 212 69
## Chichester Chiltern
## 71 54
## Chorley City of London
## 66 4
## Colchester Copeland
## 105 49
## Corby Cornwall
## 41 326
## Cotswold County Durham
## 51 324
## Coventry Craven
## 195 32
## Crawley Croydon
## 66 220
## Dacorum Darlington
## 91 65
## Dartford Daventry
## 58 44
## Derby Derbyshire Dales
## 151 43
## Doncaster Dorset
## 194 218
## Dover Dudley
## 67 201
## Ealing East Cambridgeshire
## 196 50
## East Devon East Hampshire
## 81 71
## East Hertfordshire East Lindsey
## 84 81
## East Northamptonshire East Riding of Yorkshire
## 49 210
## East Staffordshire East Suffolk
## 72 146
## Eastbourne Eastleigh
## 61 77
## Eden Elmbridge
## 36 76
## Enfield Epping Forest
## 183 78
## Epsom and Ewell Erewash
## 44 73
## Exeter Fareham
## 74 73
## Fenland Folkestone and Hythe
## 55 67
## Forest of Dean Fylde
## 50 51
## Gateshead Gedling
## 126 77
## Gloucester Gosport
## 78 53
## Gravesham Great Yarmouth
## 64 61
## Greenwich Guildford
## 150 82
## Hackney Halton
## 144 79
## Hambleton Hammersmith and Fulham
## 52 113
## Harborough Haringey
## 47 145
## Harlow Harrogate
## 54 100
## Harrow Hart
## 137 56
## Hartlepool Hastings
## 58 53
## Havant Havering
## 78 150
## Herefordshire, County of Hertsmere
## 116 61
## High Peak Hillingdon
## 59 161
## Hinckley and Bosworth Horsham
## 66 81
## Hounslow Huntingdonshire
## 142 105
## Hyndburn Ipswich
## 52 85
## Isle of Wight Isles of Scilly
## 89 1
## Islington Kensington and Chelsea
## 123 90
## Kettering King's Lynn and West Norfolk
## 57 89
## Kingston upon Hull, City of Kingston upon Thames
## 166 98
## Kirklees Knowsley
## 259 98
## Lambeth Lancaster
## 178 89
## Leeds Leicester
## 480 192
## Lewes Lewisham
## 62 169
## Lichfield Lincoln
## 58 57
## Liverpool Luton
## 297 121
## Maidstone Maldon
## 94 40
## Malvern Hills Manchester
## 45 278
## Mansfield Medway
## 67 163
## Melton Mendip
## 30 66
## Merton Mid Devon
## 122 43
## Mid Suffolk Mid Sussex
## 56 86
## Middlesbrough Milton Keynes
## 86 152
## Mole Valley New Forest
## 52 113
## Newark and Sherwood Newcastle-under-Lyme
## 70 80
## Newcastle upon Tyne Newham
## 174 162
## North Devon North East Derbyshire
## 58 63
## North East Lincolnshire North Hertfordshire
## 106 82
## North Kesteven North Lincolnshire
## 64 101
## North Norfolk North Somerset
## 62 135
## North Tyneside North Warwickshire
## 131 38
## North West Leicestershire Northampton
## 58 133
## Northumberland Norwich
## 197 83
## Nottingham Nuneaton and Bedworth
## 182 81
## Oadby and Wigston Oldham
## 36 141
## Oxford Pendle
## 79 57
## Peterborough Plymouth
## 112 161
## Portsmouth Preston
## 124 86
## Reading Redbridge
## 97 161
## Redcar and Cleveland Redditch
## 88 55
## Reigate and Banstead Ribble Valley
## 83 39
## Richmond upon Thames Richmondshire
## 115 34
## Rochdale Rochford
## 134 53
## Rossendale Rother
## 43 58
## Rotherham Rugby
## 167 61
## Runnymede Rushcliffe
## 51 68
## Rushmoor Rutland
## 58 23
## Ryedale Salford
## 30 149
## Sandwell Scarborough
## 186 71
## Sedgemoor Sefton
## 70 189
## Selby Sevenoaks
## 50 73
## Sheffield Shropshire
## 343 193
## Slough Solihull
## 80 132
## Somerset West and Taunton South Bucks
## 88 37
## South Cambridgeshire South Derbyshire
## 96 58
## South Gloucestershire South Hams
## 165 49
## South Holland South Kesteven
## 49 81
## South Lakeland South Norfolk
## 59 81
## South Northamptonshire South Oxfordshire
## 51 87
## South Ribble South Somerset
## 70 103
## South Staffordshire South Tyneside
## 68 102
## Southampton Southend-on-Sea
## 148 107
## Southwark Spelthorne
## 162 60
## St Albans St. Helens
## 83 119
## Stafford Staffordshire Moorlands
## 80 59
## Stevenage Stockport
## 52 190
## Stockton-on-Tees Stoke-on-Trent
## 120 159
## Stratford-on-Avon Stroud
## 73 69
## Sunderland Surrey Heath
## 185 53
## Sutton Swale
## 121 85
## Swindon Tameside
## 132 141
## Tamworth Tandridge
## 51 49
## Teignbridge Telford and Wrekin
## 84 108
## Tendring Test Valley
## 89 71
## Tewkesbury Thanet
## 50 84
## Three Rivers Thurrock
## 49 98
## Tonbridge and Malling Torbay
## 71 89
## Torridge Tower Hamlets
## 37 144
## Trafford Tunbridge Wells
## 138 66
## Uttlesford Vale of White Horse
## 45 76
## Wakefield Walsall
## 209 167
## Waltham Forest Wandsworth
## 144 178
## Warrington Warwick
## 127 85
## Watford Waverley
## 53 81
## Wealden Wellingborough
## 95 47
## Welwyn Hatfield West Berkshire
## 65 96
## West Devon West Lancashire
## 31 73
## West Lindsey West Oxfordshire
## 52 66
## West Suffolk Westminster
## 100 120
## Wigan Wiltshire
## 200 285
## Winchester Windsor and Maidenhead
## 69 88
## Wirral Woking
## 206 59
## Wokingham Wolverhampton
## 99 158
## Worcester Worthing
## 63 65
## Wychavon Wycombe
## 78 106
## Wyre Wyre Forest
## 69 65
## York
## 120
local_authority_name_counts <- as.numeric(unique_names_1)
print(local_authority_name_counts)
## [1] 42 60 78 94 74 78 114 54 110 209 147 49 109 109 70 114 103 146
## [19] 638 60 91 94 48 177 36 233 75 308 87 78 173 45 165 262 84 195
## [37] 58 56 71 60 120 128 67 132 60 90 68 57 157 99 107 75 93 232
## [55] 212 69 71 54 66 4 105 49 41 326 51 324 195 32 66 220 91 65
## [73] 58 44 151 43 194 218 67 201 196 50 81 71 84 81 49 210 72 146
## [91] 61 77 36 76 183 78 44 73 74 73 55 67 50 51 126 77 78 53
## [109] 64 61 150 82 144 79 52 113 47 145 54 100 137 56 58 53 78 150
## [127] 116 61 59 161 66 81 142 105 52 85 89 1 123 90 57 89 166 98
## [145] 259 98 178 89 480 192 62 169 58 57 297 121 94 40 45 278 67 163
## [163] 30 66 122 43 56 86 86 152 52 113 70 80 174 162 58 63 106 82
## [181] 64 101 62 135 131 38 58 133 197 83 182 81 36 141 79 57 112 161
## [199] 124 86 97 161 88 55 83 39 115 34 134 53 43 58 167 61 51 68
## [217] 58 23 30 149 186 71 70 189 50 73 343 193 80 132 88 37 96 58
## [235] 165 49 49 81 59 81 51 87 70 103 68 102 148 107 162 60 83 119
## [253] 80 59 52 190 120 159 73 69 185 53 121 85 132 141 51 49 84 108
## [271] 89 71 50 84 49 98 71 89 37 144 138 66 45 76 209 167 144 178
## [289] 127 85 53 81 95 47 65 96 31 73 52 66 100 120 200 285 69 88
## [307] 206 59 99 158 63 65 78 106 69 65 120
## Summary statistics for local_authority_name
max(local_authority_name_counts)
## [1] 638
min(local_authority_name_counts)
## [1] 1
median(local_authority_name_counts)
## [1] 81
IQR(local_authority_name_counts)
## [1] 72
## T-test Four sample independent t-test is used to compare
## the significance level in combined_data data set.
## Student’s t-test assumes both groups of data are
## normally distributed and the variances of the two
## distributions are the same.
x1 <- combined_data$income_deprivation_rate_2015
x2 <- combined_data$employment_deprivation_rate_2015
x3 <- combined_data$housing_affordability_score_2016
y <- combined_data$avg_healthcare_distance_2022
# Plot a 2x2 histogram grid
par(mfrow = c(2, 2))
hist(x1)
hist(x2)
hist(x3)
hist(y)
# Based on histogram plots, we have to perform a Welch's
# t-test for x1, x2, both of which are skewed, and for x3
# as well since DV is highly skewed despite x3 being
# normally distributed.
t_test1 <- t.test(x1, y)
t_test2 <- t.test(x2, y)
t_test3 <- t.test(x3, y)
t_test1
##
## Welch Two Sample t-test
##
## data: x1 and y
## t = 143.53, df = 48378, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 8.252976 8.481491
## sample estimates:
## mean of x mean of y
## 12.809981 4.442748
t_test2
##
## Welch Two Sample t-test
##
## data: x2 and y
## t = 39.567, df = 63616, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.292606 1.427343
## sample estimates:
## mean of x mean of y
## 5.802723 4.442748
t_test3
##
## Welch Two Sample t-test
##
## data: x3 and y
## t = -156.93, df = 42960, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -4.497746 -4.386779
## sample estimates:
## mean of x mean of y
## 0.0004856182 4.4427479057
## Covariance values of our primary and secondary IV
## against primary DV
cov(x1, y)
## [1] -12.79764
cov(x2, y)
## [1] -5.549447
cov(x3, y)
## [1] -2.716934
## Correlation values of our primary and secondary IV
## against primary DV As x1, x2 are not normally
## distributed, we will use 'spearman' method. We will use
## 'spearman' method for x3 despite it being normally
## distributed as DV is highly skewed.
cor(x1, y, method = "spearman")
## [1] -0.3929755
cor(x2, y, method = "spearman")
## [1] -0.3911658
cor(x3, y, method = "spearman")
## [1] -0.4408718
The above code performs frequency analysis and summary statistics for unique categories within two categorical columns of the combined_data dataset, constructs histograms for four numerical variables to assess their distribution, conducts Welch’s t-tests to compare means between independent and dependent variables given non-normal distributions, calculates covariance to measure joint variability, and computes Spearman correlation coefficients to evaluate the strength and direction of association, accounting for non-parametric data.
DistPlot(combined_data, "income_deprivation_rate_2015", "Income Deprivation Rate Across LSOAs in England (2015)",
subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
DistPlot(combined_data, "avg_healthcare_distance_2022", "Average Minutes to Healthcare Services Across LSOAs in England (2022)",
subtitle = "Mean Access Time to Healthcare Services per LSOA",
caption = "Data source: Consumer Data Reasearch Centre (2022)")
DistPlot(combined_data, "housing_affordability_score_2016", "Housing Affordability Indicator Across LSOAs in England (2016)",
subtitle = "Higher Scores Indicate Increased Deprivation in Entering Ownership or Private Rental Market",
caption = "Data source: Consumer Data Reasearch Centre (2022)")
DistPlot(combined_data, "employment_deprivation_rate_2015", "Employment Deprivation Rate in England (2015)",
subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
# Both the main explanatory variable and outcome variable
# have a very strong right skew. Thus, a log transformation
# in both of these is examined.
The above code generates distribution plots for four different variables within the combined_data dataset, each with its own title, subtitle, and source caption. It visualizes the distribution of income deprivation rate for 2015, the average distance to healthcare services for 2022, the housing affordability score for 2016, and the employment deprivation rate for 2015 across Lower Super Output Areas (LSOAs) in England. The code also notes a significant right skew in the main explanatory and outcome variables, suggesting that a logarithmic transformation may be examined for these variables to address the skewness.
## Adding log transformed variables to dataframe:
combined_data$log_avg_healthcare_distance_2022 <- log(combined_data$avg_healthcare_distance_2022)
combined_data$log_income_deprivation_rate_2015 <- log(combined_data$income_deprivation_rate_2015)
combined_data$log_employment_deprivation_rate_2015 <- log(combined_data$employment_deprivation_rate_2015)
## Checking distributions:
DistPlot(combined_data, "log_income_deprivation_rate_2015", "log(Income Deprivation Rate) Across LSOAs in England (2015)",
subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
DistPlot(combined_data, "log_avg_healthcare_distance_2022", "log(Average Minutes to Healthcare Services) Across LSOAs in England (2022)",
subtitle = "Logarithm of Mean Access Time to Healthcare Services per LSOA",
caption = "Data source: Consumer Data Reasearch Centre (2022)")
DistPlot(combined_data, "log_employment_deprivation_rate_2015",
"log(Employment Deprivation Rate) Across LSOAs in England (2015)",
subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
## Filtering out rows with infinite values and creating a
## complete case sample:
complete_case_sample <- combined_data %>%
select(log_income_deprivation_rate_2015, log_avg_healthcare_distance_2022,
housing_affordability_score_2016, rural_urban_class_2011) %>%
filter(!is.infinite(log_avg_healthcare_distance_2022), !is.infinite(log_income_deprivation_rate_2015)) %>%
na.omit()
## Saving as CSV
write.csv(complete_case_sample, file = "Group_7_complete_case.csv",
row.names = FALSE)
# Checking linearity of relationships between x variables
ggpairs(complete_case_sample[, c("log_income_deprivation_rate_2015",
"housing_affordability_score_2016", "log_avg_healthcare_distance_2022")])
PointPlot(combined_data, "log_employment_deprivation_rate_2015",
"log_income_deprivation_rate_2015", title = "Income Deprivation Rate (2015) as a Function of Employment Deprivation Rate (2015)",
subtitle = "For Lower layer Super Output Areas Across England",
x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Income Deprivation Rate) (2015)",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021",
add_regression_line = TRUE)
result <- cor.test(combined_data$log_employment_deprivation_rate_2015,
combined_data$income_deprivation_rate_2015, method = "pearson")
# creating a df with correlation test results
model_summary <- data.frame(Estimate = round(result$estimate,
4), t.value = round(result$statistic, 2), p = ifelse(result$p.value <
0.05, "< 0.05", round(result$p.value, 10)))
# pretty table using stargazer
stargazer(model_summary, type = "html", title = "Results of Pearson's Correlation Test",
summary = FALSE, omit.stat = c("all"), single.row = TRUE,
rownames = FALSE, out = "x1x2.html")
| Estimate | t.value | p |
| 0.897 | 367 | < 0.05 |
rstudioapi::viewer("x1x2.html")
NULL
# As employment deprivation rate and income deprivation
# rate are highly correlated, employment deprivation rate
# will be excluded from analyses (to avoid issues of
# multicollinearity).
# Examining linearity between xi and y
PointPlot(complete_case_sample, "log_income_deprivation_rate_2015",
"log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Income Deprivation Rate (2015)",
subtitle = "For Lower layer Super Output Areas Across England",
x_label = "log(Income Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
add_regression_line = TRUE)
PointPlot(complete_case_sample, "housing_affordability_score_2016",
"log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Housing Affordability Score (2016)",
subtitle = "For Lower layer Super Output Areas Across England",
x_label = "Housing Affordability Score (2016)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
add_regression_line = TRUE)
PointPlot(combined_data, "log_employment_deprivation_rate_2015",
"log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Employment Deprivation Rate (2015)",
subtitle = "For Lower layer Super Output Areas Across England",
x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
add_regression_line = TRUE)
The above code performs data transformation, cleaning, visualization,
analysis, and reporting tasks on a dataset. It applies logarithmic
transformations to skewed variables related to healthcare distance and
deprivation rates to normalize their distributions, which are then
visualized using custom DistPlot functions. A cleaning step filters out
infinite and missing values, resulting in a complete_case_sample dataset
that is saved as a CSV file. The ggpairs and PointPlot functions are
used to check for linearity and visualize relationships between
variables. A Pearson correlation test is conducted between the
log-transformed employment deprivation rate and income deprivation rate,
with results formatted into an HTML table using stargazer. Based on the
high correlation observed, the employment deprivation rate is considered
for exclusion from further analysis to prevent multicollinearity. The
PointPlot visualizations with regression lines further explore the
relationships between the remaining variables, assessing the linearity
between independent variables and the dependent variable.
### PLotting of table for slide 11
combined_data_sum <- combined_data[c("local_authority_name",
"housing_affordability_score_2016", "income_deprivation_rate_2015",
"employment_deprivation_rate_2015", "avg_healthcare_distance_2022",
"rural_urban_class_2011")]
html_table <- stargazer(combined_data_sum, type = "html", title = "Table 1: Summary Statistics",
digits = 1, column.labels = c("Housing Affordability Score (2016)",
"Income Deprivation Rate (2015)", "Employment Deprivation Rate (2015)",
"Average Healthcare Distance (2022)"), out = "table.html")
| Statistic | N | Mean | St. Dev. | Min | Max |
| housing_affordability_score_2016 | 32,715 | 0.000 | 1.9 | -6.7 | 8.0 |
| income_deprivation_rate_2015 | 32,715 | 12.8 | 9.4 | 0.2 | 60.9 |
| employment_deprivation_rate_2015 | 32,715 | 5.8 | 4.0 | 0.2 | 34.7 |
| avg_healthcare_distance_2022 | 32,715 | 4.4 | 4.8 | 0.0 | 60.3 |
# See HTML table in R Studio
rstudioapi::viewer("table.html")
NULL
### Calculate missingness percentages for each variable for
### slide 11
missingness <- sapply(combined_data_sum, function(x) mean(is.na(x)) *
100)
# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 2: Missingness Summary/%",
summary = FALSE, flip = TRUE)
| local_authority_name | 0 |
| housing_affordability_score_2016 | 0 |
| income_deprivation_rate_2015 | 0 |
| employment_deprivation_rate_2015 | 0 |
| avg_healthcare_distance_2022 | 0 |
| rural_urban_class_2011 | 0 |
html_file <- "missingness_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL
### count of unique names for rural/urban for slide 12
unique_class2011 <- table(combined_data_sum$rural_urban_class_2011)
unique_class2011.1 <- as.data.frame(unique_class2011)
# Rename the columns
colnames(unique_class2011.1) <- c("Rural/Urban Class (2011)",
"Count")
# Create HTML table using stargazer
html_table1 <- stargazer(unique_class2011.1, type = "html", title = "Unique Names and its Respective Counts",
summary = FALSE)
| Rural/Urban Class (2011) | Count | |
| 1 | Rural town and fringe | 2,930 |
| 2 | Rural town and fringe in a sparse setting | 119 |
| 3 | Rural village and dispersed | 2,356 |
| 4 | Rural village and dispersed in a sparse setting | 181 |
| 5 | Urban city and town | 14,399 |
| 6 | Urban city and town in a sparse setting | 59 |
| 7 | Urban major conurbation | 11,465 |
| 8 | Urban minor conurbation | 1,206 |
cat(html_table1)
| Rural/Urban Class (2011) | Count | |
| 1 | Rural town and fringe | 2,930 |
| 2 | Rural town and fringe in a sparse setting | 119 |
| 3 | Rural village and dispersed | 2,356 |
| 4 | Rural village and dispersed in a sparse setting | 181 |
| 5 | Urban city and town | 14,399 |
| 6 | Urban city and town in a sparse setting | 59 |
| 7 | Urban major conurbation | 11,465 |
| 8 | Urban minor conurbation | 1,206 |
writeLines(html_table1, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL
# Calculate summary statistics
summary_ruralurban <- c(Max = max(rural_urban_class_2011_counts),
Min = min(rural_urban_class_2011_counts), Med = median(rural_urban_class_2011_counts),
IQR = IQR(rural_urban_class_2011_counts))
# Create data frame for summary statistics
summary_df <- data.frame(statistic = names(summary_ruralurban),
value = summary_ruralurban)
# Create HTML table using stargazer
html_table2 <- stargazer(summary_df, type = "html", title = "Summary Statistics",
summary = FALSE, rownames = FALSE)
| statistic | value |
| Max | 14,399 |
| Min | 59 |
| Med | 1,781 |
| IQR | 4,898.250 |
cat(html_table2)
| statistic | value |
| Max | 14,399 |
| Min | 59 |
| Med | 1,781 |
| IQR | 4,898.250 |
writeLines(html_table2, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL
## count of unique names for local authority names for
## slide 12
unique_localauthority <- table(combined_data_sum$local_authority_name)
# Convert the result to a data frame
unique_localauthority1 <- as.data.frame(unique_localauthority)
colnames(unique_localauthority1) <- c("Local Authority Names",
"Count")
# Create HTML table using stargazer
html_table3 <- stargazer(unique_localauthority1, type = "html",
title = "Unique Names and its Respective Counts", summary = FALSE)
| Local Authority Names | Count | |
| 1 | Adur | 42 |
| 2 | Allerdale | 60 |
| 3 | Amber Valley | 78 |
| 4 | Arun | 94 |
| 5 | Ashfield | 74 |
| 6 | Ashford | 78 |
| 7 | Aylesbury Vale | 114 |
| 8 | Babergh | 54 |
| 9 | Barking and Dagenham | 110 |
| 10 | Barnet | 209 |
| 11 | Barnsley | 147 |
| 12 | Barrow-in-Furness | 49 |
| 13 | Basildon | 109 |
| 14 | Basingstoke and Deane | 109 |
| 15 | Bassetlaw | 70 |
| 16 | Bath and North East Somerset | 114 |
| 17 | Bedford | 103 |
| 18 | Bexley | 146 |
| 19 | Birmingham | 638 |
| 20 | Blaby | 60 |
| 21 | Blackburn with Darwen | 91 |
| 22 | Blackpool | 94 |
| 23 | Bolsover | 48 |
| 24 | Bolton | 177 |
| 25 | Boston | 36 |
| 26 | Bournemouth, Christchurch and Poole | 233 |
| 27 | Bracknell Forest | 75 |
| 28 | Bradford | 308 |
| 29 | Braintree | 87 |
| 30 | Breckland | 78 |
| 31 | Brent | 173 |
| 32 | Brentwood | 45 |
| 33 | Brighton and Hove | 165 |
| 34 | Bristol, City of | 262 |
| 35 | Broadland | 84 |
| 36 | Bromley | 195 |
| 37 | Bromsgrove | 58 |
| 38 | Broxbourne | 56 |
| 39 | Broxtowe | 71 |
| 40 | Burnley | 60 |
| 41 | Bury | 120 |
| 42 | Calderdale | 128 |
| 43 | Cambridge | 67 |
| 44 | Camden | 132 |
| 45 | Cannock Chase | 60 |
| 46 | Canterbury | 90 |
| 47 | Carlisle | 68 |
| 48 | Castle Point | 57 |
| 49 | Central Bedfordshire | 157 |
| 50 | Charnwood | 99 |
| 51 | Chelmsford | 107 |
| 52 | Cheltenham | 75 |
| 53 | Cherwell | 93 |
| 54 | Cheshire East | 232 |
| 55 | Cheshire West and Chester | 212 |
| 56 | Chesterfield | 69 |
| 57 | Chichester | 71 |
| 58 | Chiltern | 54 |
| 59 | Chorley | 66 |
| 60 | City of London | 4 |
| 61 | Colchester | 105 |
| 62 | Copeland | 49 |
| 63 | Corby | 41 |
| 64 | Cornwall | 326 |
| 65 | Cotswold | 51 |
| 66 | County Durham | 324 |
| 67 | Coventry | 195 |
| 68 | Craven | 32 |
| 69 | Crawley | 66 |
| 70 | Croydon | 220 |
| 71 | Dacorum | 91 |
| 72 | Darlington | 65 |
| 73 | Dartford | 58 |
| 74 | Daventry | 44 |
| 75 | Derby | 151 |
| 76 | Derbyshire Dales | 43 |
| 77 | Doncaster | 194 |
| 78 | Dorset | 218 |
| 79 | Dover | 67 |
| 80 | Dudley | 201 |
| 81 | Ealing | 196 |
| 82 | East Cambridgeshire | 50 |
| 83 | East Devon | 81 |
| 84 | East Hampshire | 71 |
| 85 | East Hertfordshire | 84 |
| 86 | East Lindsey | 81 |
| 87 | East Northamptonshire | 49 |
| 88 | East Riding of Yorkshire | 210 |
| 89 | East Staffordshire | 72 |
| 90 | East Suffolk | 146 |
| 91 | Eastbourne | 61 |
| 92 | Eastleigh | 77 |
| 93 | Eden | 36 |
| 94 | Elmbridge | 76 |
| 95 | Enfield | 183 |
| 96 | Epping Forest | 78 |
| 97 | Epsom and Ewell | 44 |
| 98 | Erewash | 73 |
| 99 | Exeter | 74 |
| 100 | Fareham | 73 |
| 101 | Fenland | 55 |
| 102 | Folkestone and Hythe | 67 |
| 103 | Forest of Dean | 50 |
| 104 | Fylde | 51 |
| 105 | Gateshead | 126 |
| 106 | Gedling | 77 |
| 107 | Gloucester | 78 |
| 108 | Gosport | 53 |
| 109 | Gravesham | 64 |
| 110 | Great Yarmouth | 61 |
| 111 | Greenwich | 150 |
| 112 | Guildford | 82 |
| 113 | Hackney | 144 |
| 114 | Halton | 79 |
| 115 | Hambleton | 52 |
| 116 | Hammersmith and Fulham | 113 |
| 117 | Harborough | 47 |
| 118 | Haringey | 145 |
| 119 | Harlow | 54 |
| 120 | Harrogate | 100 |
| 121 | Harrow | 137 |
| 122 | Hart | 56 |
| 123 | Hartlepool | 58 |
| 124 | Hastings | 53 |
| 125 | Havant | 78 |
| 126 | Havering | 150 |
| 127 | Herefordshire, County of | 116 |
| 128 | Hertsmere | 61 |
| 129 | High Peak | 59 |
| 130 | Hillingdon | 161 |
| 131 | Hinckley and Bosworth | 66 |
| 132 | Horsham | 81 |
| 133 | Hounslow | 142 |
| 134 | Huntingdonshire | 105 |
| 135 | Hyndburn | 52 |
| 136 | Ipswich | 85 |
| 137 | Isle of Wight | 89 |
| 138 | Isles of Scilly | 1 |
| 139 | Islington | 123 |
| 140 | Kensington and Chelsea | 90 |
| 141 | Kettering | 57 |
| 142 | King’s Lynn and West Norfolk | 89 |
| 143 | Kingston upon Hull, City of | 166 |
| 144 | Kingston upon Thames | 98 |
| 145 | Kirklees | 259 |
| 146 | Knowsley | 98 |
| 147 | Lambeth | 178 |
| 148 | Lancaster | 89 |
| 149 | Leeds | 480 |
| 150 | Leicester | 192 |
| 151 | Lewes | 62 |
| 152 | Lewisham | 169 |
| 153 | Lichfield | 58 |
| 154 | Lincoln | 57 |
| 155 | Liverpool | 297 |
| 156 | Luton | 121 |
| 157 | Maidstone | 94 |
| 158 | Maldon | 40 |
| 159 | Malvern Hills | 45 |
| 160 | Manchester | 278 |
| 161 | Mansfield | 67 |
| 162 | Medway | 163 |
| 163 | Melton | 30 |
| 164 | Mendip | 66 |
| 165 | Merton | 122 |
| 166 | Mid Devon | 43 |
| 167 | Mid Suffolk | 56 |
| 168 | Mid Sussex | 86 |
| 169 | Middlesbrough | 86 |
| 170 | Milton Keynes | 152 |
| 171 | Mole Valley | 52 |
| 172 | New Forest | 113 |
| 173 | Newark and Sherwood | 70 |
| 174 | Newcastle-under-Lyme | 80 |
| 175 | Newcastle upon Tyne | 174 |
| 176 | Newham | 162 |
| 177 | North Devon | 58 |
| 178 | North East Derbyshire | 63 |
| 179 | North East Lincolnshire | 106 |
| 180 | North Hertfordshire | 82 |
| 181 | North Kesteven | 64 |
| 182 | North Lincolnshire | 101 |
| 183 | North Norfolk | 62 |
| 184 | North Somerset | 135 |
| 185 | North Tyneside | 131 |
| 186 | North Warwickshire | 38 |
| 187 | North West Leicestershire | 58 |
| 188 | Northampton | 133 |
| 189 | Northumberland | 197 |
| 190 | Norwich | 83 |
| 191 | Nottingham | 182 |
| 192 | Nuneaton and Bedworth | 81 |
| 193 | Oadby and Wigston | 36 |
| 194 | Oldham | 141 |
| 195 | Oxford | 79 |
| 196 | Pendle | 57 |
| 197 | Peterborough | 112 |
| 198 | Plymouth | 161 |
| 199 | Portsmouth | 124 |
| 200 | Preston | 86 |
| 201 | Reading | 97 |
| 202 | Redbridge | 161 |
| 203 | Redcar and Cleveland | 88 |
| 204 | Redditch | 55 |
| 205 | Reigate and Banstead | 83 |
| 206 | Ribble Valley | 39 |
| 207 | Richmond upon Thames | 115 |
| 208 | Richmondshire | 34 |
| 209 | Rochdale | 134 |
| 210 | Rochford | 53 |
| 211 | Rossendale | 43 |
| 212 | Rother | 58 |
| 213 | Rotherham | 167 |
| 214 | Rugby | 61 |
| 215 | Runnymede | 51 |
| 216 | Rushcliffe | 68 |
| 217 | Rushmoor | 58 |
| 218 | Rutland | 23 |
| 219 | Ryedale | 30 |
| 220 | Salford | 149 |
| 221 | Sandwell | 186 |
| 222 | Scarborough | 71 |
| 223 | Sedgemoor | 70 |
| 224 | Sefton | 189 |
| 225 | Selby | 50 |
| 226 | Sevenoaks | 73 |
| 227 | Sheffield | 343 |
| 228 | Shropshire | 193 |
| 229 | Slough | 80 |
| 230 | Solihull | 132 |
| 231 | Somerset West and Taunton | 88 |
| 232 | South Bucks | 37 |
| 233 | South Cambridgeshire | 96 |
| 234 | South Derbyshire | 58 |
| 235 | South Gloucestershire | 165 |
| 236 | South Hams | 49 |
| 237 | South Holland | 49 |
| 238 | South Kesteven | 81 |
| 239 | South Lakeland | 59 |
| 240 | South Norfolk | 81 |
| 241 | South Northamptonshire | 51 |
| 242 | South Oxfordshire | 87 |
| 243 | South Ribble | 70 |
| 244 | South Somerset | 103 |
| 245 | South Staffordshire | 68 |
| 246 | South Tyneside | 102 |
| 247 | Southampton | 148 |
| 248 | Southend-on-Sea | 107 |
| 249 | Southwark | 162 |
| 250 | Spelthorne | 60 |
| 251 | St Albans | 83 |
| 252 | St. Helens | 119 |
| 253 | Stafford | 80 |
| 254 | Staffordshire Moorlands | 59 |
| 255 | Stevenage | 52 |
| 256 | Stockport | 190 |
| 257 | Stockton-on-Tees | 120 |
| 258 | Stoke-on-Trent | 159 |
| 259 | Stratford-on-Avon | 73 |
| 260 | Stroud | 69 |
| 261 | Sunderland | 185 |
| 262 | Surrey Heath | 53 |
| 263 | Sutton | 121 |
| 264 | Swale | 85 |
| 265 | Swindon | 132 |
| 266 | Tameside | 141 |
| 267 | Tamworth | 51 |
| 268 | Tandridge | 49 |
| 269 | Teignbridge | 84 |
| 270 | Telford and Wrekin | 108 |
| 271 | Tendring | 89 |
| 272 | Test Valley | 71 |
| 273 | Tewkesbury | 50 |
| 274 | Thanet | 84 |
| 275 | Three Rivers | 49 |
| 276 | Thurrock | 98 |
| 277 | Tonbridge and Malling | 71 |
| 278 | Torbay | 89 |
| 279 | Torridge | 37 |
| 280 | Tower Hamlets | 144 |
| 281 | Trafford | 138 |
| 282 | Tunbridge Wells | 66 |
| 283 | Uttlesford | 45 |
| 284 | Vale of White Horse | 76 |
| 285 | Wakefield | 209 |
| 286 | Walsall | 167 |
| 287 | Waltham Forest | 144 |
| 288 | Wandsworth | 178 |
| 289 | Warrington | 127 |
| 290 | Warwick | 85 |
| 291 | Watford | 53 |
| 292 | Waverley | 81 |
| 293 | Wealden | 95 |
| 294 | Wellingborough | 47 |
| 295 | Welwyn Hatfield | 65 |
| 296 | West Berkshire | 96 |
| 297 | West Devon | 31 |
| 298 | West Lancashire | 73 |
| 299 | West Lindsey | 52 |
| 300 | West Oxfordshire | 66 |
| 301 | West Suffolk | 100 |
| 302 | Westminster | 120 |
| 303 | Wigan | 200 |
| 304 | Wiltshire | 285 |
| 305 | Winchester | 69 |
| 306 | Windsor and Maidenhead | 88 |
| 307 | Wirral | 206 |
| 308 | Woking | 59 |
| 309 | Wokingham | 99 |
| 310 | Wolverhampton | 158 |
| 311 | Worcester | 63 |
| 312 | Worthing | 65 |
| 313 | Wychavon | 78 |
| 314 | Wycombe | 106 |
| 315 | Wyre | 69 |
| 316 | Wyre Forest | 65 |
| 317 | York | 120 |
cat(html_table3)
| Local Authority Names | Count | |
| 1 | Adur | 42 |
| 2 | Allerdale | 60 |
| 3 | Amber Valley | 78 |
| 4 | Arun | 94 |
| 5 | Ashfield | 74 |
| 6 | Ashford | 78 |
| 7 | Aylesbury Vale | 114 |
| 8 | Babergh | 54 |
| 9 | Barking and Dagenham | 110 |
| 10 | Barnet | 209 |
| 11 | Barnsley | 147 |
| 12 | Barrow-in-Furness | 49 |
| 13 | Basildon | 109 |
| 14 | Basingstoke and Deane | 109 |
| 15 | Bassetlaw | 70 |
| 16 | Bath and North East Somerset | 114 |
| 17 | Bedford | 103 |
| 18 | Bexley | 146 |
| 19 | Birmingham | 638 |
| 20 | Blaby | 60 |
| 21 | Blackburn with Darwen | 91 |
| 22 | Blackpool | 94 |
| 23 | Bolsover | 48 |
| 24 | Bolton | 177 |
| 25 | Boston | 36 |
| 26 | Bournemouth, Christchurch and Poole | 233 |
| 27 | Bracknell Forest | 75 |
| 28 | Bradford | 308 |
| 29 | Braintree | 87 |
| 30 | Breckland | 78 |
| 31 | Brent | 173 |
| 32 | Brentwood | 45 |
| 33 | Brighton and Hove | 165 |
| 34 | Bristol, City of | 262 |
| 35 | Broadland | 84 |
| 36 | Bromley | 195 |
| 37 | Bromsgrove | 58 |
| 38 | Broxbourne | 56 |
| 39 | Broxtowe | 71 |
| 40 | Burnley | 60 |
| 41 | Bury | 120 |
| 42 | Calderdale | 128 |
| 43 | Cambridge | 67 |
| 44 | Camden | 132 |
| 45 | Cannock Chase | 60 |
| 46 | Canterbury | 90 |
| 47 | Carlisle | 68 |
| 48 | Castle Point | 57 |
| 49 | Central Bedfordshire | 157 |
| 50 | Charnwood | 99 |
| 51 | Chelmsford | 107 |
| 52 | Cheltenham | 75 |
| 53 | Cherwell | 93 |
| 54 | Cheshire East | 232 |
| 55 | Cheshire West and Chester | 212 |
| 56 | Chesterfield | 69 |
| 57 | Chichester | 71 |
| 58 | Chiltern | 54 |
| 59 | Chorley | 66 |
| 60 | City of London | 4 |
| 61 | Colchester | 105 |
| 62 | Copeland | 49 |
| 63 | Corby | 41 |
| 64 | Cornwall | 326 |
| 65 | Cotswold | 51 |
| 66 | County Durham | 324 |
| 67 | Coventry | 195 |
| 68 | Craven | 32 |
| 69 | Crawley | 66 |
| 70 | Croydon | 220 |
| 71 | Dacorum | 91 |
| 72 | Darlington | 65 |
| 73 | Dartford | 58 |
| 74 | Daventry | 44 |
| 75 | Derby | 151 |
| 76 | Derbyshire Dales | 43 |
| 77 | Doncaster | 194 |
| 78 | Dorset | 218 |
| 79 | Dover | 67 |
| 80 | Dudley | 201 |
| 81 | Ealing | 196 |
| 82 | East Cambridgeshire | 50 |
| 83 | East Devon | 81 |
| 84 | East Hampshire | 71 |
| 85 | East Hertfordshire | 84 |
| 86 | East Lindsey | 81 |
| 87 | East Northamptonshire | 49 |
| 88 | East Riding of Yorkshire | 210 |
| 89 | East Staffordshire | 72 |
| 90 | East Suffolk | 146 |
| 91 | Eastbourne | 61 |
| 92 | Eastleigh | 77 |
| 93 | Eden | 36 |
| 94 | Elmbridge | 76 |
| 95 | Enfield | 183 |
| 96 | Epping Forest | 78 |
| 97 | Epsom and Ewell | 44 |
| 98 | Erewash | 73 |
| 99 | Exeter | 74 |
| 100 | Fareham | 73 |
| 101 | Fenland | 55 |
| 102 | Folkestone and Hythe | 67 |
| 103 | Forest of Dean | 50 |
| 104 | Fylde | 51 |
| 105 | Gateshead | 126 |
| 106 | Gedling | 77 |
| 107 | Gloucester | 78 |
| 108 | Gosport | 53 |
| 109 | Gravesham | 64 |
| 110 | Great Yarmouth | 61 |
| 111 | Greenwich | 150 |
| 112 | Guildford | 82 |
| 113 | Hackney | 144 |
| 114 | Halton | 79 |
| 115 | Hambleton | 52 |
| 116 | Hammersmith and Fulham | 113 |
| 117 | Harborough | 47 |
| 118 | Haringey | 145 |
| 119 | Harlow | 54 |
| 120 | Harrogate | 100 |
| 121 | Harrow | 137 |
| 122 | Hart | 56 |
| 123 | Hartlepool | 58 |
| 124 | Hastings | 53 |
| 125 | Havant | 78 |
| 126 | Havering | 150 |
| 127 | Herefordshire, County of | 116 |
| 128 | Hertsmere | 61 |
| 129 | High Peak | 59 |
| 130 | Hillingdon | 161 |
| 131 | Hinckley and Bosworth | 66 |
| 132 | Horsham | 81 |
| 133 | Hounslow | 142 |
| 134 | Huntingdonshire | 105 |
| 135 | Hyndburn | 52 |
| 136 | Ipswich | 85 |
| 137 | Isle of Wight | 89 |
| 138 | Isles of Scilly | 1 |
| 139 | Islington | 123 |
| 140 | Kensington and Chelsea | 90 |
| 141 | Kettering | 57 |
| 142 | King’s Lynn and West Norfolk | 89 |
| 143 | Kingston upon Hull, City of | 166 |
| 144 | Kingston upon Thames | 98 |
| 145 | Kirklees | 259 |
| 146 | Knowsley | 98 |
| 147 | Lambeth | 178 |
| 148 | Lancaster | 89 |
| 149 | Leeds | 480 |
| 150 | Leicester | 192 |
| 151 | Lewes | 62 |
| 152 | Lewisham | 169 |
| 153 | Lichfield | 58 |
| 154 | Lincoln | 57 |
| 155 | Liverpool | 297 |
| 156 | Luton | 121 |
| 157 | Maidstone | 94 |
| 158 | Maldon | 40 |
| 159 | Malvern Hills | 45 |
| 160 | Manchester | 278 |
| 161 | Mansfield | 67 |
| 162 | Medway | 163 |
| 163 | Melton | 30 |
| 164 | Mendip | 66 |
| 165 | Merton | 122 |
| 166 | Mid Devon | 43 |
| 167 | Mid Suffolk | 56 |
| 168 | Mid Sussex | 86 |
| 169 | Middlesbrough | 86 |
| 170 | Milton Keynes | 152 |
| 171 | Mole Valley | 52 |
| 172 | New Forest | 113 |
| 173 | Newark and Sherwood | 70 |
| 174 | Newcastle-under-Lyme | 80 |
| 175 | Newcastle upon Tyne | 174 |
| 176 | Newham | 162 |
| 177 | North Devon | 58 |
| 178 | North East Derbyshire | 63 |
| 179 | North East Lincolnshire | 106 |
| 180 | North Hertfordshire | 82 |
| 181 | North Kesteven | 64 |
| 182 | North Lincolnshire | 101 |
| 183 | North Norfolk | 62 |
| 184 | North Somerset | 135 |
| 185 | North Tyneside | 131 |
| 186 | North Warwickshire | 38 |
| 187 | North West Leicestershire | 58 |
| 188 | Northampton | 133 |
| 189 | Northumberland | 197 |
| 190 | Norwich | 83 |
| 191 | Nottingham | 182 |
| 192 | Nuneaton and Bedworth | 81 |
| 193 | Oadby and Wigston | 36 |
| 194 | Oldham | 141 |
| 195 | Oxford | 79 |
| 196 | Pendle | 57 |
| 197 | Peterborough | 112 |
| 198 | Plymouth | 161 |
| 199 | Portsmouth | 124 |
| 200 | Preston | 86 |
| 201 | Reading | 97 |
| 202 | Redbridge | 161 |
| 203 | Redcar and Cleveland | 88 |
| 204 | Redditch | 55 |
| 205 | Reigate and Banstead | 83 |
| 206 | Ribble Valley | 39 |
| 207 | Richmond upon Thames | 115 |
| 208 | Richmondshire | 34 |
| 209 | Rochdale | 134 |
| 210 | Rochford | 53 |
| 211 | Rossendale | 43 |
| 212 | Rother | 58 |
| 213 | Rotherham | 167 |
| 214 | Rugby | 61 |
| 215 | Runnymede | 51 |
| 216 | Rushcliffe | 68 |
| 217 | Rushmoor | 58 |
| 218 | Rutland | 23 |
| 219 | Ryedale | 30 |
| 220 | Salford | 149 |
| 221 | Sandwell | 186 |
| 222 | Scarborough | 71 |
| 223 | Sedgemoor | 70 |
| 224 | Sefton | 189 |
| 225 | Selby | 50 |
| 226 | Sevenoaks | 73 |
| 227 | Sheffield | 343 |
| 228 | Shropshire | 193 |
| 229 | Slough | 80 |
| 230 | Solihull | 132 |
| 231 | Somerset West and Taunton | 88 |
| 232 | South Bucks | 37 |
| 233 | South Cambridgeshire | 96 |
| 234 | South Derbyshire | 58 |
| 235 | South Gloucestershire | 165 |
| 236 | South Hams | 49 |
| 237 | South Holland | 49 |
| 238 | South Kesteven | 81 |
| 239 | South Lakeland | 59 |
| 240 | South Norfolk | 81 |
| 241 | South Northamptonshire | 51 |
| 242 | South Oxfordshire | 87 |
| 243 | South Ribble | 70 |
| 244 | South Somerset | 103 |
| 245 | South Staffordshire | 68 |
| 246 | South Tyneside | 102 |
| 247 | Southampton | 148 |
| 248 | Southend-on-Sea | 107 |
| 249 | Southwark | 162 |
| 250 | Spelthorne | 60 |
| 251 | St Albans | 83 |
| 252 | St. Helens | 119 |
| 253 | Stafford | 80 |
| 254 | Staffordshire Moorlands | 59 |
| 255 | Stevenage | 52 |
| 256 | Stockport | 190 |
| 257 | Stockton-on-Tees | 120 |
| 258 | Stoke-on-Trent | 159 |
| 259 | Stratford-on-Avon | 73 |
| 260 | Stroud | 69 |
| 261 | Sunderland | 185 |
| 262 | Surrey Heath | 53 |
| 263 | Sutton | 121 |
| 264 | Swale | 85 |
| 265 | Swindon | 132 |
| 266 | Tameside | 141 |
| 267 | Tamworth | 51 |
| 268 | Tandridge | 49 |
| 269 | Teignbridge | 84 |
| 270 | Telford and Wrekin | 108 |
| 271 | Tendring | 89 |
| 272 | Test Valley | 71 |
| 273 | Tewkesbury | 50 |
| 274 | Thanet | 84 |
| 275 | Three Rivers | 49 |
| 276 | Thurrock | 98 |
| 277 | Tonbridge and Malling | 71 |
| 278 | Torbay | 89 |
| 279 | Torridge | 37 |
| 280 | Tower Hamlets | 144 |
| 281 | Trafford | 138 |
| 282 | Tunbridge Wells | 66 |
| 283 | Uttlesford | 45 |
| 284 | Vale of White Horse | 76 |
| 285 | Wakefield | 209 |
| 286 | Walsall | 167 |
| 287 | Waltham Forest | 144 |
| 288 | Wandsworth | 178 |
| 289 | Warrington | 127 |
| 290 | Warwick | 85 |
| 291 | Watford | 53 |
| 292 | Waverley | 81 |
| 293 | Wealden | 95 |
| 294 | Wellingborough | 47 |
| 295 | Welwyn Hatfield | 65 |
| 296 | West Berkshire | 96 |
| 297 | West Devon | 31 |
| 298 | West Lancashire | 73 |
| 299 | West Lindsey | 52 |
| 300 | West Oxfordshire | 66 |
| 301 | West Suffolk | 100 |
| 302 | Westminster | 120 |
| 303 | Wigan | 200 |
| 304 | Wiltshire | 285 |
| 305 | Winchester | 69 |
| 306 | Windsor and Maidenhead | 88 |
| 307 | Wirral | 206 |
| 308 | Woking | 59 |
| 309 | Wokingham | 99 |
| 310 | Wolverhampton | 158 |
| 311 | Worcester | 63 |
| 312 | Worthing | 65 |
| 313 | Wychavon | 78 |
| 314 | Wycombe | 106 |
| 315 | Wyre | 69 |
| 316 | Wyre Forest | 65 |
| 317 | York | 120 |
writeLines(html_table3, html_file)
rstudioapi::viewer(html_file)
NULL
# Calculate summary statistics
summary_localauthority <- c(Max = max(local_authority_name_counts),
Min = min(local_authority_name_counts), Med = median(local_authority_name_counts),
IQR = IQR(local_authority_name_counts))
summary_df <- data.frame(statistic = names(summary_localauthority),
value = summary_localauthority)
# Create HTML table using stargazer
html_table4 <- stargazer(summary_df, type = "html", title = "Summary Statistics",
summary = FALSE, rownames = FALSE)
| statistic | value |
| Max | 638 |
| Min | 1 |
| Med | 81 |
| IQR | 72 |
cat(html_table4)
| statistic | value |
| Max | 638 |
| Min | 1 |
| Med | 81 |
| IQR | 72 |
writeLines(html_table4, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL
### Repeat for complete_case.csv for slide 16 Read the
### complete_case file
complete_case_sum <- read.csv("Group_7_complete_case.csv")
html_table <- stargazer(complete_case_sum, type = "html", title = "Table 3: Summary Statistics",
digits = 1, column.labels = c("Log Income Deprivation Rate (2015)",
"Log Average Healthcare Distance (2022)", "Housing Affordability Score (2016)",
"Rural/Urban Class (2011)"), out = "table.html")
| Statistic | N | Mean | St. Dev. | Min | Max |
| log_income_deprivation_rate_2015 | 32,714 | 2.3 | 0.8 | -1.7 | 4.1 |
| log_avg_healthcare_distance_2022 | 32,714 | 1.2 | 0.8 | -1.5 | 4.1 |
| housing_affordability_score_2016 | 32,714 | 0.001 | 1.9 | -6.7 | 8.0 |
# See HTML table in R Studio
rstudioapi::viewer("table.html")
NULL
### Calculate missingness percentages for each variable for
### slide 16
missingness <- sapply(complete_case_sum, function(x) mean(is.na(x)) *
100)
# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 4: Missingness Summary/%",
summary = FALSE, flip = TRUE)
| log_income_deprivation_rate_2015 | 0 |
| log_avg_healthcare_distance_2022 | 0 |
| housing_affordability_score_2016 | 0 |
| rural_urban_class_2011 | 0 |
html_file <- "missingness_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL
### covariance of x1,x2,x3 before log transformation for
### slide 17
x1 <- combined_data_sum$income_deprivation_rate_2015
x2 <- combined_data_sum$employment_deprivation_rate_2015
x3 <- combined_data_sum$housing_affordability_score_2016
y <- combined_data_sum$avg_healthcare_distance_2022
cov_x1y <- cov(x1, y)
cov_x2y <- cov(x2, y)
cov_x3y <- cov(x3, y)
# Create a data frame for the covariances
covariances_df <- data.frame(variables = c("x1 and y", "x2 and y",
"x3 and y"), covariance = c(cov_x1y, cov_x2y, cov_x3y))
# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances before log transformation",
summary = FALSE, rownames = FALSE)
| variables | covariance |
| x1 and y | -12.798 |
| x2 and y | -5.549 |
| x3 and y | -2.717 |
html_file <- "covariances_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL
### covariance of x4,x5 after log transformation for slide
### 17
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <- complete_case_sum$log_income_deprivation_rate_2015
x5 <- complete_case_sum$housing_affordability_score_2016
y1 <- complete_case_sum$log_avg_healthcare_distance_2022
cov_x4y1 <- cov(x4, y1)
cov_x5y1 <- cov(x5, y1)
# Create a data frame for the covariances
covariances_df <- data.frame(variables = c("x4 and y1", "x5 and y1"),
covariance = c(cov_x4y1, cov_x5y1))
# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances after log transformation",
summary = FALSE, rownames = FALSE)
| variables | covariance |
| x4 and y1 | -0.219 |
| x5 and y1 | -0.597 |
html_file <- "covariances_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL
## Correlation test for slide 18
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <- complete_case_sum$log_income_deprivation_rate_2015
x5 <- complete_case_sum$housing_affordability_score_2016
y1 <- complete_case_sum$log_avg_healthcare_distance_2022
corx4y1 <- cor.test(x4, y1, method = "pearson")
corx5y1 <- cor.test(x5, y1, method = "pearson")
# Create a data frame to store correlation results
# format.pval function is used to format the p-value to
# make it into a string ie. a range
corr_df <- data.frame(variables = c("x4 and y1", "x5 and y1"),
Pearson_Correlation = c(corx4y1$estimate, corx5y1$estimate),
t_value = c(corx4y1$statistic, corx5y1$statistic), Estimate = c(corx4y1$estimate,
corx5y1$estimate), p_value = ifelse(corx4y1$p.value ==
0, "< 0.001", format.pval(corx4y1$p.value)))
# Create HTML table using stargazer
html_table <- stargazer(corr_df, type = "html", title = "Pearson Correlations After Log Transformation",
summary = FALSE, rownames = FALSE)
| variables | Pearson_Correlation | t_value | Estimate | p_value |
| x4 and y1 | -0.367 | -71.408 | -0.367 | < 0.001 |
| x5 and y1 | -0.408 | -80.848 | -0.408 | < 0.001 |
html_file <- "cor_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
NULL The above code is a sequence of commands for generating HTML tables that display summary statistics, missingness percentages, unique category counts, and measures of covariance and correlation. It begins with extracting and tabulating a subset of the dataset for slide 11, continues with analyzing missing data and category counts for slides 11 and 12, then moves on to slide 16 where it reads a cleaned dataset to produce similar summaries. For slide 17, it calculates covariances pre- and post-log transformation, and for slide 18, it conducts Pearson correlation tests, with results formatted and exported as HTML tables viewable within RStudio.
# Model 1:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate.
mod1 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(mod1)
##
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015,
## data = complete_case_sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.70482 -0.47303 -0.06624 0.38161 2.80484
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.977622 0.012243 161.53
## log_income_deprivation_rate_2015 -0.363913 0.005096 -71.41
## Pr(>|t|)
## (Intercept) <0.0000000000000002 ***
## log_income_deprivation_rate_2015 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7149 on 32712 degrees of freedom
## Multiple R-squared: 0.1349, Adjusted R-squared: 0.1348
## F-statistic: 5099 on 1 and 32712 DF, p-value: < 0.00000000000000022
# Model 1 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate. As both the dependent and independent variables are log transformed, the coefficients may be interpreted in terms of percentage change. That is, a 1% increase in the income deprivation rate is associated with a 0.35% decrease in the average travel time to a healthcare service. This coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). Thus, according to this model, we may reject the null hypothesis that there is no relationship between income deprivation rate and average healthcare travel distance. Further, the p-value associated with the F-statistic is also significant considering the same alpha, sugegsting at least one variable in the model has some explanatory power regarding average healthcare distance. This statistically is not that useful regarding this model however, as only one explanatory variable is considered. The coefficient of determination for this model indicates that we are able to explain 13.1% of the variation in average travel time to healthcare services with this model.
# Model 2:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate and the housing affordability indicator
mod2 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)
# Exponentiating untransformed variable coefficients to allow easier interpretation
mod2$coefficients["housing_affordability_score_2016"] <- exp(mod2$coefficients["housing_affordability_score_2016"])
# Viewing model results
summary(mod2)
##
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 +
## housing_affordability_score_2016, data = complete_case_sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5103 -0.4599 -0.0744 0.3675 2.7243
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 1.604985 0.014034 114.36
## log_income_deprivation_rate_2015 -0.200002 0.005940 -33.67
## housing_affordability_score_2016 0.887820 0.002419 367.07
## Pr(>|t|)
## (Intercept) <0.0000000000000002 ***
## log_income_deprivation_rate_2015 <0.0000000000000002 ***
## housing_affordability_score_2016 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6899 on 32711 degrees of freedom
## Multiple R-squared: 0.1945, Adjusted R-squared: 0.1944
## F-statistic: 3948 on 2 and 32711 DF, p-value: < 0.00000000000000022
# Model 2 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate and the housing affordability indicator. Here, we can see that a 1% increase in the income deprivation rate is associated with a 0.19% decrease in the average travel time to a healthcare service. Thus, we can see that when we control for a further underlying economic variable, the explanatory power of income deprivation alone decreases. Again, this coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). For interpretation of the housing affordability indicator, the exponential of the coefficient is considered (0.88). Thus, we can see that a 1 unit increase in the housing affordability indicator is associated with a 12% decrease in the average travel time to a healthcare service, again significant considering an alpha of 0.05. Remembering that increases in the housing affordability indicator signify increased inability to afford to enter owner-occupation or the private rental market, this again appears to indicate that more economically deprived areas have greater geographical access to healthcare services. Further, the p-value associated with the F-statistic is also significant at the 5% level, again suggesting at least one variable in the model has some explanatory power regarding average healthcare distance. The adjusted coefficient of determination for this model indicates that we are able to explain 19.3% of the variation in average travel time to healthcare services with this model.
# Model 3:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate, the housing affordability indicator, and the re-leveled Rural-Urban Classification (RUC11) with "Urban city and town" as the reference category
mod3 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, "Urban city and town"), data = complete_case_sample)
# Exponentiated coefficient data
mod3_exp_coeff <- data.frame(exp(coef(mod3)))
# Exponentiating untransformed variable coefficients WITHIN the model to allow easier interpretation
mod3$coefficients[3:10] <- exp(mod3$coefficients[3:10])
# Viewing model results
summary(mod3)
##
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 +
## housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011,
## "Urban city and town"), data = complete_case_sample)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35847 -0.33319 0.02095 0.34803 2.03045
##
## Coefficients:
## Estimate
## (Intercept) 1.309527
## log_income_deprivation_rate_2015 -0.103974
## housing_affordability_score_2016 0.931051
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe 1.763662
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting 1.208809
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed 4.439979
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting 6.326992
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting 0.958377
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation 0.781509
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation 0.861055
## Std. Error
## (Intercept) 0.011329
## log_income_deprivation_rate_2015 0.004576
## housing_affordability_score_2016 0.001921
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe 0.010651
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting 0.048121
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed 0.011797
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting 0.039126
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting 0.068209
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation 0.006801
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation 0.015798
## t value
## (Intercept) 115.59
## log_income_deprivation_rate_2015 -22.72
## housing_affordability_score_2016 484.75
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe 165.58
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting 25.12
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed 376.37
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting 161.71
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting 14.05
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation 114.91
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation 54.50
## Pr(>|t|)
## (Intercept) <0.0000000000000002
## log_income_deprivation_rate_2015 <0.0000000000000002
## housing_affordability_score_2016 <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation <0.0000000000000002
##
## (Intercept) ***
## log_income_deprivation_rate_2015 ***
## housing_affordability_score_2016 ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5228 on 32704 degrees of freedom
## Multiple R-squared: 0.5376, Adjusted R-squared: 0.5374
## F-statistic: 4224 on 9 and 32704 DF, p-value: < 0.00000000000000022
# Creating data for confidence interval plot
ConfidencePlotData <- data.frame (
"names" = as.factor (c("Rural town and fringe", "Rural town and fringe in a sparse setting", "Rural village and dispersed", "Rural village and dispersed in a sparse setting", "Urban city and town in a sparse setting", "Urban major conurbation", "Urban minor conurbation")), # Creating predictor names (row names)
"coefs" = mod3_exp_coeff[4:10, 1],
"lower" = confint (mod3) [4:10, 1], # Creating a lower confidence interval [ , 1] list for predictors 2-3 in the model [2:5 , ]
"upper" = confint (mod3) [4:10, 2] # Creating an upper confidence interval [ , 2] list for predictors 2-3 in the model [2:5 , ]
)
ConfidencePlotData$names <- factor(ConfidencePlotData$names, levels = unique(ConfidencePlotData$names))
# Creating confidence interval plot for RUC coefficients
ConfidencePlot(ConfidencePlotData, "Healthcare Access by Rural-Urban Classification in England", "Reference Category: Urban City and Town", x_label = "Exponentiated Coefficient", middle_point = 1, caption = "*95% CI Indicated by Green Error Bars. Red dashed line indicates reference group. Controlling for income deprivation (2015) and housing affordability indicators (2016).")
# Model 3 expands on Model 2 by including the Rural-Urban Classification (RUC11) as a factor variable, allowing us to account for geographic differences in average travel time to healthcare services. The model estimates the logarithm of average travel time to a healthcare service within a Lower Layer Super Output Area (LSOA) as a function of the logarithm of the income deprivation rate, the housing affordability indicator, and the RUC11 categories. We observe that a 1% increase in the income deprivation rate is associated with a 0.10% decrease in average travel time to healthcare services, when holding the other variables constant. This relationship is statistically significant at the 5% level, with a p-value less than 0.05, indicating that income deprivation continues to be an important factor in explaining variations in healthcare access, albeit with a smaller coefficient compared to Model 2. Similarly, a 1 unit increase in the housing affordability indicator, which reflects increased difficulty in entering owner-occupation or the private rental market, is associated with a 7% decrease in average travel time to healthcare services. This relationship is also statistically significant at the 5% level. The new variables introduced, representing the RUC11 categories, show significant variations in healthcare access across different geographic areas. For instance, compared to Urban city and town areas, Rural village and dispersed areas experience a 344% increase in average travel time to healthcare services, indicating that rural areas face substantial barriers in healthcare access. These relationships are statistically significant at the 5% level, with the exception of Urban city and town in a sparse setting category, which does not show a significant difference in healthcare access compared to Urban city and town areas. The adjusted coefficient of determination for this model is 0.537, suggesting that the model explains approximately 53.7% of the variation in average travel time to healthcare services. The inclusion of the RUC11 categories significantly enhances the explanatory power of the model compared to Model 2. The p-value associated with the F-statistic is also significant at the 5% level, indicating that at least one variable in the model has some explanatory power regarding average healthcare distance. In conclusion, Model 3 provides a more nuanced understanding of the factors affecting healthcare access, by accounting for both economic and geographic variables. The significant differences in healthcare access across the RUC11 categories highlight the importance of considering geographic disparities when analyzing healthcare access.
In these three regression models, the dependent variable—logarithm of average healthcare distance—is modeled against various predictors to study their effects on healthcare accessibility within Lower Layer Super Output Areas (LSOAs). Model 1 examines the impact of income deprivation alone, indicating a statistically significant inverse relationship where an increase in deprivation is associated with a decrease in travel time. Model 2 introduces housing affordability, showing a similar inverse relationship with both predictors, and a slightly improved explanatory power. Finally, Model 3 expands the analysis by adding Rural-Urban Classification, revealing significant geographic disparities in healthcare access; rural areas notably have longer travel times to healthcare services. Each model’s coefficient of determination increases progressively, demonstrating an improved ability to explain the variation in healthcare access by including additional socio-economic and geographic factors.
results <- stargazer(mod1, mod2, mod3, type = "html", dep.var.labels = c("log(Mean Healthcare Service Distance [MHSD])"),
covariate.labels = c("log(Income Deprivation Rate 2015)",
"Housing Affordability Score 2016 [HAS]", "Rural town and fringe [RUC 2011]",
"Rural town and fringe in a sparse setting [RUC 2011]",
"Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]",
"Urban city and town in a sparse setting [RUC 2011]",
"Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
notes = "Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.",
ci = TRUE)
| Dependent variable: | |||
| log(Mean Healthcare Service Distance [MHSD]) | |||
| (1) | (2) | (3) | |
| log(Income Deprivation Rate 2015) | -0.364*** | -0.200*** | -0.104*** |
| (-0.374, -0.354) | (-0.212, -0.188) | (-0.113, -0.095) | |
| Housing Affordability Score 2016 [HAS] | 0.888*** | 0.931*** | |
| (0.883, 0.893) | (0.927, 0.935) | ||
| Rural town and fringe [RUC 2011] | 1.764*** | ||
| (1.743, 1.785) | |||
| Rural town and fringe in a sparse setting [RUC 2011] | 1.209*** | ||
| (1.114, 1.303) | |||
| Rural village and dispersed [RUC 2011] | 4.440*** | ||
| (4.417, 4.463) | |||
| Rural village and dispersed in a sparse setting [RUC 2011] | 6.327*** | ||
| (6.250, 6.404) | |||
| Urban city and town in a sparse setting [RUC 2011] | 0.958*** | ||
| (0.825, 1.092) | |||
| Urban major conurbation [RUC 2011] | 0.782*** | ||
| (0.768, 0.795) | |||
| Urban minor conurbation [RUC 2011] | 0.861*** | ||
| (0.830, 0.892) | |||
| Constant | 1.978*** | 1.605*** | 1.310*** |
| (1.954, 2.002) | (1.577, 1.632) | (1.287, 1.332) | |
| Observations | 32,714 | 32,714 | 32,714 |
| R2 | 0.135 | 0.194 | 0.538 |
| Adjusted R2 | 0.135 | 0.194 | 0.537 |
| Residual Std. Error | 0.715 (df = 32712) | 0.690 (df = 32711) | 0.523 (df = 32704) |
| F Statistic | 5,099.128*** (df = 1; 32712) | 3,948.166*** (df = 2; 32711) | 4,224.042*** (df = 9; 32704) |
| Note: | p<0.1; p<0.05; p<0.01 | ||
| Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD. | |||
# Checking assumptions: Model 1 Adding residual values to
# dataframe
complete_case_sample$resids1 <- residuals(mod1)
# Linearity (using partial regression plots)
PartialRegressionPlot(mod1, "log_income_deprivation_rate_2015",
x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR)")
# Constant variance of residuals
HomoskedasticityPlot(mod1, add_loess_line = TRUE, title = "Model 1 Scale-Location")
# Durbin-Watson test
dw1 <- dwtest(mod1)
# Create a data frame with the Durbin-Watson test results
dw_df <- data.frame(Statistic = dw1$statistic, P = format.pval(dw1$p.value,
digits = 16))
# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df, type = "html", title = "Durbin-Watson Test Results",
summary = FALSE, rownames = FALSE, out = "dw_test_results.html")
| Statistic | P |
| 0.831 | < 0.00000000000000022204460492503 |
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results.html")
NULL
# Normality of residuals check
QQPlotModel(mod1, subtitle = "QQ-Plot for Model 1")
# Zero conditional mean of residual values
mean(mod1$residuals)
[1] -0.00000000000000006509453
resid_mod1 <- lm(resids1 ~ log_income_deprivation_rate_2015,
data = complete_case_sample)
summary(resid_mod1)
Call: lm(formula = resids1 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
Residuals: Min 1Q Median 3Q Max -2.70482 -0.47303 -0.06624 0.38161 2.80484
Coefficients: Estimate Std. Error (Intercept) 0.0000000000000019171 0.0122433939839822377 log_income_deprivation_rate_2015 -0.0000000000000004258 0.0050962369734882749 t value Pr(>|t|) (Intercept) 0 1 log_income_deprivation_rate_2015 0 1
Residual standard error: 0.7149 on 32712 degrees of freedom Multiple R-squared: 1.914e-30, Adjusted R-squared: -3.057e-05 F-statistic: 6.261e-26 on 1 and 32712 DF, p-value: 1
# Zero conditional mean regression results
stargazer(resid_mod1, type = "html", dep.var.labels = c("Residuals of Model 1"),
covariate.labels = c("log(Income Deprivation Rate 2015)"),
out = "residmod1.html")
| Dependent variable: | |
| Residuals of Model 1 | |
| log(Income Deprivation Rate 2015) | -0.000 |
| (0.005) | |
| Constant | 0.000 |
| (0.012) | |
| Observations | 32,714 |
| R2 | 0.000 |
| Adjusted R2 | -0.00003 |
| Residual Std. Error | 0.715 (df = 32712) |
| F Statistic | 0.000 (df = 1; 32712) |
| Note: | p<0.1; p<0.05; p<0.01 |
rstudioapi::viewer("residmod1.html")
NULL
# Checking assumptions: Model 2 Adding residual values to
# dataframe
complete_case_sample$resids2 <- residuals(mod2)
# Linearity (using partial regression plots)
PartialRegressionPlot(mod2, "log_income_deprivation_rate_2015",
x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot(mod2, "housing_affordability_score_2016",
x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")
# Multicollinearity (using VIF)
vif_values1 <- vif(mod2)
# storing VIF in a dataframe
vif_df1 <- data.frame(Variable = names(vif_values1), VIF = vif_values1)
vif_df1$Variable[1] <- "log(Income Deprivation Rate 2015)"
vif_df1$Variable[2] <- "Housing Affordability Score 2016"
# creating a pretty table
stargazer(vif_df1, type = "html", title = "VIF Values", summary = FALSE,
rownames = FALSE, out = "mod2vif.html")
| Variable | VIF |
| log(Income Deprivation Rate 2015) | 1.459 |
| Housing Affordability Score 2016 | 1.459 |
# using rstudioapi to view the table
rstudioapi::viewer("mod2vif.html")
NULL
# Constant variance of residuals
HomoskedasticityPlot(mod2, add_loess_line = TRUE, title = "Model 2 Scale-Location")
# Durbin-Watson test
dw2 <- dwtest(mod2)
# Create a data frame with the Durbin-Watson test results
dw_df2 <- data.frame(Statistic = dw2$statistic, P = ifelse(dw2$p.value >
0.05, "> 0.05", ifelse(dw2$p.value < 0.01, "< 0.01", "< 0.05")))
# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df2, type = "html", title = "Durbin-Watson Test Results",
summary = FALSE, rownames = FALSE, out = "dw_test_results2.html")
| Statistic | P |
| 0.885 | < 0.01 |
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results2.html")
NULL
# Normality of residuals check
QQPlotModel(mod2, subtitle = "QQ-Plot for Model 2")
# Zero conditional mean of residual values
mean(mod2$residuals)
[1] -0.000000000000001023337
resid_mod2 <- lm(resids2 ~ log_income_deprivation_rate_2015 +
housing_affordability_score_2016, data = complete_case_sample)
summary(resid_mod2)
Call: lm(formula = resids2 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)
Residuals: Min 1Q Median 3Q Max -2.5103 -0.4599 -0.0744 0.3675 2.7243
Coefficients: Estimate (Intercept) -0.00000000000000123474 log_income_deprivation_rate_2015 0.00000000000000012189 housing_affordability_score_2016 0.00000000000000007123 Std. Error t value Pr(>|t|) (Intercept) 0.01403408525346747093 0 1 log_income_deprivation_rate_2015 0.00594008978139718563 0 1 housing_affordability_score_2016 0.00241867829270426540 0 1
Residual standard error: 0.6899 on 32711 degrees of freedom Multiple R-squared: 2.489e-30, Adjusted R-squared: -6.114e-05 F-statistic: 4.071e-26 on 2 and 32711 DF, p-value: 1
# Zero conditional mean regression results
stargazer(resid_mod2, type = "html", dep.var.labels = c("Residuals of Model 2"),
covariate.labels = c("log(Income Deprivation Rate 2015)",
"Housing Affordability Score 2016 [HAS]"), out = "residmod2.html")
| Dependent variable: | |
| Residuals of Model 2 | |
| log(Income Deprivation Rate 2015) | 0.000 |
| (0.006) | |
| Housing Affordability Score 2016 [HAS] | 0.000 |
| (0.002) | |
| Constant | -0.000 |
| (0.014) | |
| Observations | 32,714 |
| R2 | 0.000 |
| Adjusted R2 | -0.0001 |
| Residual Std. Error | 0.690 (df = 32711) |
| F Statistic | 0.000 (df = 2; 32711) |
| Note: | p<0.1; p<0.05; p<0.01 |
rstudioapi::viewer("residmod2.html")
NULL
# Checking assumptions: Model 3 Adding residual values to
# dataframe
complete_case_sample$resids3 <- residuals(mod3)
# Linearity (using partial regression plots)
PartialRegressionPlot(mod3, "log_income_deprivation_rate_2015",
x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot(mod3, "housing_affordability_score_2016",
x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")
# Multicollinearity (using VIF)
vif_values2 <- data.frame(vif(mod3))
vif_values2$Variable <- c("log(Income Deprivation Rate 2015)",
"Housing Affordability Score 2016", "Rural Urban Classification 2011")
vif_df2 = vif_values2[, c("Variable", "GVIF")]
# creating a pretty table
stargazer(vif_df2, type = "html", title = "GVIF Values", summary = FALSE,
rownames = FALSE, out = "mod3vif.html")
| Variable | GVIF |
| log(Income Deprivation Rate 2015) | 1.508 |
| Housing Affordability Score 2016 | 1.602 |
| Rural Urban Classification 2011 | 1.197 |
# using rstudioapi to view the table
rstudioapi::viewer("mod3vif.html")
NULL
# Constant variance of residuals plot
HomoskedasticityPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location")
# Plot indicating groups
HomoskedasticityDetailPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location",
dataframe = complete_case_sample, categorical_var_name = "rural_urban_class_2011")
# Testing for non-constant variance across groups:
leveneTest(complete_case_sample$resids3, complete_case_sample$rural_urban_class_2011,
center = mean)
Levene’s Test for Homogeneity of Variance (center = mean) Df F value
Pr(>F)
group 7 189.04 < 0.00000000000000022 *** 32706
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05
‘.’ 0.1 ’ ’ 1
# df for levene's test results
levene_results <- data.frame(`Degrees of Freedom (Group)` = 7,
`Degrees of Freedom (Residual)` = 32706, `F Value` = 189.04,
`P Value` = "< 0.01")
names(levene_results) <- c("Degrees of Freedom (Group)", "Degrees of Freedom (Residual)",
"F Value", "P Value")
# Viewing the table with descriptive column names
stargazer(levene_results, type = "html", title = "Levene's Test for Homogeneity of Variance",
out = "levene_test.html", summary = FALSE, rownames = FALSE)
| Degrees of Freedom (Group) | Degrees of Freedom (Residual) | F Value | P Value |
| 7 | 32,706 | 189.040 | < 0.01 |
rstudioapi::viewer("levene_test.html")
NULL
# Durbin-Watson test
dw3 <- dwtest(mod3)
# Create a data frame with the Durbin-Watson test results
dw_df3 <- data.frame(Statistic = dw3$statistic, P = ifelse(dw3$p.value >
0.05, "> 0.05", ifelse(dw3$p.value < 0.01, "< 0.01", "< 0.05")))
# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df3, type = "html", title = "Durbin-Watson Test Results",
summary = FALSE, rownames = FALSE, out = "dw_test_results3.html")
| Statistic | P |
| 1.184 | < 0.01 |
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results3.html")
NULL
# When considering the independence of residuals, we might
# have reason to think that this assumption is not
# satisfied within the model, as it is likely that some
# degree of spatial autocorrelation could be influencing
# our results - that is, that LSOAs closer to each-other
# could theoretically have more similar values of X. This
# idea is supported by the results of the durbin-watson
# test statistics shown - notably, getting closer to 2
# (representing less reduced autocorrelation) when
# including a variable that partially accounts for
# spatiality (the RUC2011 indicator). As such, we do see a
# considerable amount of positive autocorrelation in the
# data (according to a Durbin-Watson statistic of 0.83
# being considerably below 2). This means that residuals
# that follow each other in the data tend to be slightly
# similar in magnitude. Considering the null hypothesis of
# no autocorrelation and considering the p-value being
# significant considering an alpha of 0.05, we fail to
# reject the null hypothesis that there is no
# autocorrelation presented by the variables within our
# model. Thus, we might not be confident that the
# independence of residuals assumption is satisfied.
# Normality of residuals check
QQPlotModel(mod3, subtitle = "QQ-Plot for Model 3")
# Zero conditional mean of residual values
mean(mod3$residuals)
[1] 0.00000000000000001521903
resid_mod3 <- lm(resids3 ~ log_income_deprivation_rate_2015 +
housing_affordability_score_2016 + rural_urban_class_2011,
data = complete_case_sample)
summary(resid_mod3)
Call: lm(formula = resids3 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + rural_urban_class_2011, data = complete_case_sample)
Residuals: Min 1Q Median 3Q Max -2.35847 -0.33319 0.02095 0.34803 2.03045
Coefficients: Estimate (Intercept) -0.00000000000000309640 log_income_deprivation_rate_2015 -0.00000000000000014066 housing_affordability_score_2016 0.00000000000000002189 rural_urban_class_2011Rural town and fringe in a sparse setting 0.00000000000000313182 rural_urban_class_2011Rural village and dispersed 0.00000000000000311998 rural_urban_class_2011Rural village and dispersed in a sparse setting 0.00000000000000369754 rural_urban_class_2011Urban city and town 0.00000000000000390593 rural_urban_class_2011Urban city and town in a sparse setting 0.00000000000000223619 rural_urban_class_2011Urban major conurbation 0.00000000000000350995 rural_urban_class_2011Urban minor conurbation 0.00000000000000338580 Std. Error (Intercept) 0.01398036896695923660 log_income_deprivation_rate_2015 0.00457597976149103861 housing_affordability_score_2016 0.00192069232350872252 rural_urban_class_2011Rural town and fringe in a sparse setting 0.04890182920944522588 rural_urban_class_2011Rural village and dispersed 0.01450616708669712727 rural_urban_class_2011Rural village and dispersed in a sparse setting 0.04004073360934087833 rural_urban_class_2011Urban city and town 0.01065145323950207612 rural_urban_class_2011Urban city and town in a sparse setting 0.06876884655176458694 rural_urban_class_2011Urban major conurbation 0.01120479904872859027 rural_urban_class_2011Urban minor conurbation 0.01802400600454943935 t value (Intercept) 0 log_income_deprivation_rate_2015 0 housing_affordability_score_2016 0 rural_urban_class_2011Rural town and fringe in a sparse setting 0 rural_urban_class_2011Rural village and dispersed 0 rural_urban_class_2011Rural village and dispersed in a sparse setting 0 rural_urban_class_2011Urban city and town 0 rural_urban_class_2011Urban city and town in a sparse setting 0 rural_urban_class_2011Urban major conurbation 0 rural_urban_class_2011Urban minor conurbation 0 Pr(>|t|) (Intercept) 1 log_income_deprivation_rate_2015 1 housing_affordability_score_2016 1 rural_urban_class_2011Rural town and fringe in a sparse setting 1 rural_urban_class_2011Rural village and dispersed 1 rural_urban_class_2011Rural village and dispersed in a sparse setting 1 rural_urban_class_2011Urban city and town 1 rural_urban_class_2011Urban city and town in a sparse setting 1 rural_urban_class_2011Urban major conurbation 1 rural_urban_class_2011Urban minor conurbation 1
Residual standard error: 0.5228 on 32704 degrees of freedom Multiple R-squared: 3.712e-30, Adjusted R-squared: -0.0002752 F-statistic: 1.349e-26 on 9 and 32704 DF, p-value: 1
# Zero conditional mean regression results
stargazer(resid_mod3, type = "html", dep.var.labels = c("Residuals of Model 3"),
covariate.labels = c("log(Income Deprivation Rate 2015)",
"Housing Affordability Score 2016 [HAS]", "Rural town and fringe [RUC 2011]",
"Rural town and fringe in a sparse setting [RUC 2011]",
"Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]",
"Urban city and town in a sparse setting [RUC 2011]",
"Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
out = "residmod3.html")
| Dependent variable: | |
| Residuals of Model 3 | |
| log(Income Deprivation Rate 2015) | -0.000 |
| (0.005) | |
| Housing Affordability Score 2016 [HAS] | 0.000 |
| (0.002) | |
| Rural town and fringe [RUC 2011] | 0.000 |
| (0.049) | |
| Rural town and fringe in a sparse setting [RUC 2011] | 0.000 |
| (0.015) | |
| Rural village and dispersed [RUC 2011] | 0.000 |
| (0.040) | |
| Rural village and dispersed in a sparse setting [RUC 2011] | 0.000 |
| (0.011) | |
| Urban city and town in a sparse setting [RUC 2011] | 0.000 |
| (0.069) | |
| Urban major conurbation [RUC 2011] | 0.000 |
| (0.011) | |
| Urban minor conurbation [RUC 2011] | 0.000 |
| (0.018) | |
| Constant | -0.000 |
| (0.014) | |
| Observations | 32,714 |
| R2 | 0.000 |
| Adjusted R2 | -0.0003 |
| Residual Std. Error | 0.523 (df = 32704) |
| F Statistic | 0.000 (df = 9; 32704) |
| Note: | p<0.1; p<0.05; p<0.01 |
rstudioapi::viewer("residmod3.html")
NULL
library(knitr)
knitr::opts_chunk$set(echo = T, warning = FALSE, message = FALSE)
opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=TRUE)
# Libraries used
library(tidyverse)
library(haven)
library(dplyr)
library(readxl)
library(GGally)
library(forcats)
library(stargazer)
library(lme4)
library(car)
library(rstudioapi)
library(knitr)
library(kableExtra)
# Turning off scientific notation
options(scipen = 9999)
# Bespoke functions used ----
PointPlot <- function(dataset, independent_var, dependent_var, colour_var = NULL, title = NULL, subtitle = NULL, caption = NULL, x_label = "X variable", y_label = "Y variable", point_size = 1, line_intercept = NULL, line_slope = NULL, line_colour = "black", line_linewidth = 1, line_linetype = "solid", add_regression_line = FALSE) {
# Loading required libraries
library(ggplot2)
library(showtext)
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
# Adjust the aesthetic mapping based on the presence of the third variable
if(!is.null(colour_var)) {
plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var, colour = colour_var))
} else {
plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var))
}
# Base point plot
plot <- plot + geom_point(alpha = 1, shape = 19, size = point_size, position = 'jitter') +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
family = 'Roboto'),
axis.title = element_text(family = 'Roboto'),
axis.text = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
legend.position = ifelse(is.null(colour_var), 'none', 'right'),
legend.text = element_text(family = 'Roboto')) +
labs(title = title,
subtitle = subtitle,
caption = caption,
x = x_label,
y = y_label)
# Add manual line if intercept and slope are provided
if (!is.null(line_intercept) & !is.null(line_slope)) {
plot <- plot + geom_abline(intercept = line_intercept,
slope = line_slope,
colour = line_colour,
linewidth = line_linewidth,
linetype = line_linetype)
}
# Add regression line with confidence interval if requested
if (add_regression_line) {
plot <- plot + geom_smooth(method = "lm", alpha = 0.2)
}
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot to the working directory
ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
return(plot)
}
DistPlot <- function(dataset, variable, formatted_name, subtitle = NULL, caption = NULL, y_label = NULL) {
# Loading required libraries
library(ggplot2)
library(showtext)
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
plot <- ggplot(dataset, aes_string(x = variable)) +
geom_histogram(aes(y = ..density..),
colour = 1, fill = "white") +
geom_density(fill = '#0072C6', colour = '#0072C6', alpha = 0.25) +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto'),
axis.title.x = element_blank(), # This removes the x-axis title
axis.title.y = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
axis.text = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
legend.position = 'none') +
labs(title = paste0("Distribution of ", formatted_name),
subtitle = subtitle,
caption = caption,
y = y_label)
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot to the working directory
ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
plot
}
ConfidencePlot <- function(data, title, subtitle, middle_point = 0, x_label = "Estimated Coefficient", caption = NULL) {
library(ggplot2)
library(showtext)
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
if (!middle_point %in% c(0, 1)) {
stop("middle_point must be 0 or 1")
}
conf_plot <- ggplot(data, aes(x = coefs, y = names)) +
geom_errorbar(aes(xmin = lower, xmax = upper),
col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "forestgreen"),
width = 0.5, size = 1, alpha = 0.5) +
geom_point(col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "black"), size = 1, alpha = 1) +
geom_vline(xintercept = middle_point, col = 'red', size = 0.5, linetype="dashed") +
theme_classic() +
labs(title = title,
subtitle = subtitle,
caption = caption,
x = x_label,
y = "") +
theme(axis.text = element_text(size = 15.5,
family = 'Roboto',
colour = 'black'),
axis.title = element_text(family = 'Roboto', size = 15.5),
plot.title = element_text(hjust = 0, colour = 'black', size = 17.5, face = 'bold',
family = 'Roboto', margin = margin(b = 10)),
plot.subtitle = element_text(hjust = 0, colour = 'grey30', size = 12.5, face = 'italic',
family = 'Roboto', margin = margin(b = 5)),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
family = 'Roboto'))
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", conf_plot$labels$title), ".png")
# Save the plot to the working directory
ggsave(filename = file_name, plot = conf_plot, width = 1200/100, height = 740/100, dpi = 100) # Adjust height as necessary
return(conf_plot)
}
PartialRegressionPlot <- function(model, variable, x_var_name = "X1", y_var_name = "Dependent Variable", title = NULL) {
# Close any existing plotting device
graphics.off()
# Construct axis labels using the specified names
xlab <- paste("Residuals of", x_var_name, "Controlling for Other Variables")
ylab <- paste("Residuals of", y_var_name, "(Excluding", x_var_name, ")")
# Create the plot without a main title
avPlots(model, variable, xlab = xlab, ylab = ylab)
# Add a title using the title function
title(main = title)
}
HomoskedasticityPlot <- function(model, title = "Homoskedasticity Check", subtitle = NULL,
x_label = "Fitted Values", y_label = "Standardized Residuals",
point_size = 1.2, add_loess_line = FALSE) {
# Loading required libraries
library(ggplot2)
library(showtext)
library(lmtest) # For Breusch-Pagan test
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
# Calculate standardized residuals
std_residuals <- rstandard(model)
# Fitted values
fitted_values <- fitted(model)
# Create a data frame for plotting
plot_data <- data.frame(FittedValues = fitted_values, StdResiduals = std_residuals)
# Perform Breusch-Pagan test
bp_test <- bptest(model)
# Format the p-value for the caption
if (bp_test$p.value < 0.05) {
p_value_text <- "p < 0.05"
} else if (bp_test$p.value > 0.05) {
p_value_text <- "p > 0.05"
} else {
p_value_text <- "p = 0.05"
}
# Create a caption including the formatted p-value text
caption <- paste("Breusch-Pagan Test:", p_value_text)
# Base point plot with adjusted sizes for readability
plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals)) +
geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 50, face = 'bold', family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 30, face = 'italic', family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 50, face = 'italic', family = 'Roboto'),
axis.title = element_text(size = 30, family = 'Roboto'),
axis.text = element_text(size = 30, family = 'Roboto', colour = 'black'),
legend.position = 'none') +
labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
# Add loess smoothing line if requested
if (add_loess_line) {
plot <- plot + geom_smooth(method = "loess", alpha = 0.2)
}
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot with dimensions that balance size and readability
ggsave(filename = file_name, plot = plot, width = 5.91, height = 4.50, dpi = 300)
return(plot)
}
HomoskedasticityDetailPlot <- function(model, dataframe, categorical_var_name,
title = "Homoskedasticity Check", subtitle = NULL,
x_label = "Fitted Values", y_label = "Standardized Residuals",
point_size = 1.2, add_loess_line = FALSE) {
# Loading required libraries
library(ggplot2)
library(showtext)
library(lmtest) # For Breusch-Pagan test
# Enable showtext to render text
showtext_auto()
font_add_google("Roboto")
# Calculate standardized residuals
std_residuals <- rstandard(model)
# Fitted values
fitted_values <- fitted(model)
# Ensure the categorical variable exists in the dataframe
if (!categorical_var_name %in% names(dataframe)) {
stop("Specified categorical variable not found in the provided dataframe.")
}
# Create a data frame for plotting
plot_data <- data.frame(FittedValues = fitted_values,
StdResiduals = std_residuals,
Category = dataframe[[categorical_var_name]])
# Perform Breusch-Pagan test
bp_test <- bptest(model)
# Format the p-value for the caption
p_value_text <- ifelse(bp_test$p.value < 0.05, "p < 0.05",
ifelse(bp_test$p.value > 0.05, "p > 0.05", "p = 0.05"))
# Create a caption including the formatted p-value text
caption <- paste("Breusch-Pagan Test:", p_value_text)
# Base point plot with adjusted sizes for readability
plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals, color = Category)) +
geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold', family = 'Roboto'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic', family = 'Roboto'),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic', family = 'Roboto'),
axis.title = element_text(size = 15.5, family = 'Roboto'),
axis.text = element_text(size = 15.5, family = 'Roboto', colour = 'black'),
legend.position = 'right', legend.text = element_text(size = 12.5, family = 'Roboto')) +
labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
# Add loess smoothing line if requested
if (add_loess_line) {
plot <- plot + geom_smooth(method = "loess", alpha = 0.2, se = FALSE)
}
# Create a filename from the plot title
file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
# Save the plot with dimensions that balance size and readability
ggsave(filename = file_name, plot = plot, width = 10, height = 6.17, dpi = 100)
return(plot)
}
QQPlotModel <- function(model, title = 'Normality of Residuals Check', subtitle = 'QQ-Plot', caption = NULL) {
# Loading required libraries
library(ggplot2)
# Extract residuals from the model
model_residuals <- residuals(model)
# Create the QQ-plot
plot <- ggplot() +
stat_qq(aes(sample = model_residuals)) +
stat_qq_line(aes(sample = model_residuals), colour = "blue") +
theme_classic() +
theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold'),
plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic'),
plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic'),
axis.title = element_text(),
axis.text = element_text(size = 15.5, colour = 'black')) +
labs(title = title,
subtitle = subtitle,
caption = caption,
x = "Theoretical Quantiles",
y = "Sample Quantiles")
# Create a filename from the plot title or use a default one
file_name <- if (!is.null(title)) {
paste0(gsub(" ", "_", title), ".png")
} else {
"QQPlot.png"
}
# Save the plot to the working directory
ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
return(plot)
}
AHAH_df <- read.csv('Group_7_AHAH_V3_0.csv') # Access to healthy assets and hazards V3 data
IOD_income_df <- read.csv('Group_7_IoD2019_Income.csv') # Index of Multiple Deprivation Income Domian Indicator Data (measured in 2015)
IOD_health_df <- read.csv('Group_7_IoD2019_Health.csv') # Index of Multiple Deprivation Health Domian Indicator Data
IOD_barriers_df <- read.csv('Group_7_IoD2019_Barriers.csv') # Index of Multiple Deprivation Barriers to Housing and Services Domain Indicator Data
IOD_education_df <- read.csv('Group_7_IoD2019_Education.csv') # Index of Multiple Deprivation Education, Skills and Training Deprivation Domain Indicator Data
IOD_employment_df <- read.csv('Group_7_IoD2019_Employment.csv') # Index of Multiple Deprivation Employment Deprivation Domain Indicator Data (measured in 2015)
ONS_MIDYEAR_2015 <- read.csv('Group_7_ONS_MIDYEAR_2015.csv') # Mid-year population estimates for 2015 in England and Wales (for creating rates from the numerator values given in the income and employment measures)
RUC2011 <- read.csv('Group_7_RUC2011.csv')
## Renaming columns for consistent joining
AHAH_df <- AHAH_df %>% rename("LSOA.code..2011." = lsoa11)
ONS_MIDYEAR_2015 <- ONS_MIDYEAR_2015 %>% rename("LSOA.code..2011." = Area.Codes)
RUC2011 <- RUC2011 %>% rename("LSOA.code..2011." = LSOA11CD)
## Creating complete combined dataset
complete_combined_data <- IOD_barriers_df %>%
left_join(IOD_education_df, by = "LSOA.code..2011.") %>%
left_join(IOD_employment_df, by = "LSOA.code..2011.") %>%
left_join(IOD_health_df, by = "LSOA.code..2011.") %>%
left_join(IOD_income_df, by = "LSOA.code..2011.") %>%
left_join(AHAH_df, by = "LSOA.code..2011.") %>%
left_join(ONS_MIDYEAR_2015, by = "LSOA.code..2011.") %>%
left_join(RUC2011, by = "LSOA.code..2011.")
## Retaining only columns of interest
combined_data <- complete_combined_data %>%
select(
LSOA.code..2011.,
Local.Authority.District.name..2019.,
Income.Domain.numerator,
Years.of.potential.life.lost.indicator,
Housing.affordability.indicator,
Staying.on.in.education.post.16.indicator,
Employment.Domain.numerator,
ah3gp,
ah3hosp,
ah3dent,
ah3phar,
All.Ages,
RUC11
)
## Removing commas and converting columns 3:12 to numeric
combined_data <- combined_data %>%
mutate(across(.cols = 3:12,
.fns = ~ as.numeric(str_remove_all(.x, ","))))
## Creating income and employment deprivation rate variables
combined_data$income_deprivation_rate <- (combined_data$Income.Domain.numerator/combined_data$All.Ages) * 100
combined_data$employment_deprivation_rate <- (combined_data$Employment.Domain.numerator /combined_data$All.Ages) * 100
## Creating average distance (in minutes) to a healthcare service (including GP practice, hospital, dentist and pharmacy)
combined_data$avg_healthcare_distance <- rowMeans(combined_data[, c("ah3gp", "ah3hosp", "ah3dent", "ah3phar")], na.rm = TRUE)
## Renaming column names for understanding
colnames(combined_data) <- c('lsoa_code', 'local_authority_name', 'no._of_income_deprived_individuals_2015', 'years_of_lives_lost_2013_2017', 'housing_affordability_score_2016', 'proportion_of_above16_individuals_inschool_2010_2012', 'no._of_deprived_employment_2015_2016', 'avg_dist_gppractice_in_mins_2022', 'avg_dist_hospital_in_mins_2022', 'avg_dist_dentist_in_mins_2022', 'avg_dist_pharmacy_in_mins_2022', 'population_estimates_for_allages_2015', 'rural_urban_class_2011','income_deprivation_rate_2015','employment_deprivation_rate_2015','avg_healthcare_distance_2022')
## Saving as CSV
write.csv(combined_data, file = "Group_7_combined_data.csv", row.names = FALSE)
write.csv(complete_combined_data, file = "Group_7_complete_combined_data.csv", row.names = FALSE)
## Loading combined data
combined_data <- read.csv('Group_7_combined_data.csv')
## Checking for duplicates
There_are_duplicates <- any(duplicated(combined_data))
if (There_are_duplicates) {
cat("There are duplicates in the combined_data.\n")
} else {
cat("There are no duplicates in combined_data.\n")
}
# Calculate missingness percentages for each numerical variable (IV and DV)
missingness <- sapply(combined_data, function(x) mean(is.na(x)) * 100)
missingness_summary <- data.frame(variable = names(missingness), Missingness_percentage = missingness)
missingness_summary
# Removing Null Values from data set
combined_data <- na.omit(combined_data)
# First 6 rows of data
head(combined_data)
# Last 6 rows of data
tail(combined_data)
summary(combined_data)
# Descriptive table for numerical variables
stargazer(combined_data, type = "html", digits = 1, title = "Table 1: Summary Statistics", out = "table.html")
# Use rstudioapi to view the table
rstudioapi::viewer("table.html")
## SD of numerical variable
sd(combined_data$lsoa_code)
sd(combined_data$no._of_income_deprived_individuals_2015)
sd(combined_data$years_of_lives_lost_2013_2017)
sd(combined_data$housing_affordability_score_2016)
sd(combined_data$proportion_of_above16_individuals_inschool_2010_2012)
sd(combined_data$no._of_deprived_employment_2015_2016)
sd(combined_data$avg_dist_gppractice_in_mins_2022)
sd(combined_data$avg_dist_hospital_in_mins_2022)
sd(combined_data$avg_dist_dentist_in_mins_2022)
sd(combined_data$avg_dist_pharmacy_in_mins_2022)
sd(combined_data$population_estimates_for_allages_2015)
sd(combined_data$income_deprivation_rate_2015)
sd(combined_data$employment_deprivation_rate_2015)
sd(combined_data$avg_healthcare_distance_2022)
# The number of unique names in the rural_urban_class column
unique_names <- table(combined_data$rural_urban_class_2011)
print(unique_names)
rural_urban_class_2011_counts <- as.numeric(unique_names)
print(rural_urban_class_2011_counts)
## Summary statistics for Rural_urban_class_2011
max(rural_urban_class_2011_counts)
min(rural_urban_class_2011_counts)
median(rural_urban_class_2011_counts)
IQR(rural_urban_class_2011_counts)
# The number of unique names in the local_authority_name column
unique_names_1 <- table(combined_data$local_authority_name)
print(unique_names_1)
local_authority_name_counts <- as.numeric(unique_names_1)
print(local_authority_name_counts)
## Summary statistics for local_authority_name
max(local_authority_name_counts)
min(local_authority_name_counts)
median(local_authority_name_counts)
IQR(local_authority_name_counts)
## T-test
# Four sample independent t-test is used to compare the significance level in combined_data data set. Student’s t-test assumes both groups of data are normally distributed and the variances of the two distributions are the same.
x1 <-combined_data$income_deprivation_rate_2015
x2 <-combined_data$employment_deprivation_rate_2015
x3 <-combined_data$housing_affordability_score_2016
y <-combined_data$avg_healthcare_distance_2022
# Plot a 2x2 histogram grid
par(mfrow=c(2, 2))
hist(x1)
hist(x2)
hist(x3)
hist(y)
#Based on histogram plots, we have to perform a Welch's t-test for x1, x2, both of which are skewed, and for x3 as well since DV is highly skewed despite x3 being normally distributed.
t_test1<- t.test(x1,y)
t_test2<- t.test(x2,y)
t_test3<- t.test(x3,y)
t_test1
t_test2
t_test3
## Covariance values of our primary and secondary IV against primary DV
cov(x1,y)
cov(x2,y)
cov(x3,y)
## Correlation values of our primary and secondary IV against primary DV
# As x1, x2 are not normally distributed, we will use 'spearman' method. We will use 'spearman' method for x3 despite it being normally distributed as DV is highly skewed.
cor(x1,y, method='spearman')
cor(x2,y, method='spearman')
cor(x3,y, method='spearman')
DistPlot(combined_data, "income_deprivation_rate_2015", "Income Deprivation Rate Across LSOAs in England (2015)", subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
DistPlot(combined_data, "avg_healthcare_distance_2022", "Average Minutes to Healthcare Services Across LSOAs in England (2022)", subtitle = "Mean Access Time to Healthcare Services per LSOA", caption = "Data source: Consumer Data Reasearch Centre (2022)")
DistPlot(combined_data, "housing_affordability_score_2016", "Housing Affordability Indicator Across LSOAs in England (2016)", subtitle = "Higher Scores Indicate Increased Deprivation in Entering Ownership or Private Rental Market", caption = "Data source: Consumer Data Reasearch Centre (2022)")
DistPlot(combined_data, "employment_deprivation_rate_2015", "Employment Deprivation Rate in England (2015)", subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
# Both the main explanatory variable and outcome variable have a very strong right skew. Thus, a log transformation in both of these is examined.
## Adding log transformed variables to dataframe:
combined_data$log_avg_healthcare_distance_2022 <- log(combined_data$avg_healthcare_distance_2022)
combined_data$log_income_deprivation_rate_2015 <- log(combined_data$income_deprivation_rate_2015)
combined_data$log_employment_deprivation_rate_2015 <- log(combined_data$employment_deprivation_rate_2015)
## Checking distributions:
DistPlot(combined_data, "log_income_deprivation_rate_2015", 'log(Income Deprivation Rate) Across LSOAs in England (2015)', subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
DistPlot(combined_data, "log_avg_healthcare_distance_2022", "log(Average Minutes to Healthcare Services) Across LSOAs in England (2022)", subtitle = "Logarithm of Mean Access Time to Healthcare Services per LSOA", caption = "Data source: Consumer Data Reasearch Centre (2022)")
DistPlot(combined_data, "log_employment_deprivation_rate_2015", 'log(Employment Deprivation Rate) Across LSOAs in England (2015)', subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")
## Filtering out rows with infinite values and creating a complete case sample:
complete_case_sample <- combined_data %>%
select(log_income_deprivation_rate_2015, log_avg_healthcare_distance_2022,
housing_affordability_score_2016, rural_urban_class_2011) %>%
filter(!is.infinite(log_avg_healthcare_distance_2022),
!is.infinite(log_income_deprivation_rate_2015)) %>%
na.omit()
## Saving as CSV
write.csv(complete_case_sample, file = "Group_7_complete_case.csv", row.names = FALSE)
# Checking linearity of relationships between x variables
ggpairs(complete_case_sample[,c("log_income_deprivation_rate_2015", "housing_affordability_score_2016", "log_avg_healthcare_distance_2022")])
PointPlot(combined_data, "log_employment_deprivation_rate_2015", "log_income_deprivation_rate_2015", title = "Income Deprivation Rate (2015) as a Function of Employment Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Income Deprivation Rate) (2015)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021", add_regression_line = TRUE)
result <- cor.test(combined_data$log_employment_deprivation_rate_2015, combined_data$income_deprivation_rate_2015, method = "pearson")
# creating a df with correlation test results
model_summary <- data.frame(
Estimate = round(result$estimate, 4),
t.value = round(result$statistic, 2),
p = ifelse(result$p.value < 0.05, "< 0.05", round(result$p.value, 10))
)
# pretty table using stargazer
stargazer(model_summary, type = "html",
title = "Results of Pearson's Correlation Test",
summary = FALSE,
omit.stat = c("all"),
single.row = TRUE, rownames = FALSE, out = 'x1x2.html')
rstudioapi::viewer("x1x2.html")
# As employment deprivation rate and income deprivation rate are highly correlated, employment deprivation rate will be excluded from analyses (to avoid issues of multicollinearity).
# Examining linearity between xi and y
PointPlot(complete_case_sample, "log_income_deprivation_rate_2015", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Income Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Income Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)
PointPlot(complete_case_sample, "housing_affordability_score_2016", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Housing Affordability Score (2016)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "Housing Affordability Score (2016)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)
PointPlot(combined_data, "log_employment_deprivation_rate_2015", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Employment Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)
### PLotting of table for slide 11
combined_data_sum <- combined_data[c("local_authority_name","housing_affordability_score_2016", "income_deprivation_rate_2015", "employment_deprivation_rate_2015","avg_healthcare_distance_2022", "rural_urban_class_2011")]
html_table <- stargazer(
combined_data_sum,
type = "html",
title = "Table 1: Summary Statistics",
digits = 1,
column.labels = c(
"Housing Affordability Score (2016)",
"Income Deprivation Rate (2015)",
"Employment Deprivation Rate (2015)",
"Average Healthcare Distance (2022)"
),
out = "table.html"
)
# See HTML table in R Studio
rstudioapi::viewer("table.html")
### Calculate missingness percentages for each variable for slide 11
missingness <- sapply(combined_data_sum, function(x) mean(is.na(x)) * 100)
# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 2: Missingness Summary/%", summary = FALSE, flip=TRUE)
html_file <- "missingness_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
### count of unique names for rural/urban for slide 12
unique_class2011 <- table(combined_data_sum$rural_urban_class_2011)
unique_class2011.1 <- as.data.frame(unique_class2011)
# Rename the columns
colnames(unique_class2011.1) <- c("Rural/Urban Class (2011)", "Count")
# Create HTML table using stargazer
html_table1 <- stargazer(unique_class2011.1, type = "html", title = "Unique Names and its Respective Counts", summary = FALSE)
cat(html_table1)
writeLines(html_table1, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
# Calculate summary statistics
summary_ruralurban <- c(
Max = max(rural_urban_class_2011_counts),
Min = min(rural_urban_class_2011_counts),
Med = median(rural_urban_class_2011_counts),
IQR = IQR(rural_urban_class_2011_counts)
)
# Create data frame for summary statistics
summary_df <- data.frame(statistic = names(summary_ruralurban), value = summary_ruralurban)
# Create HTML table using stargazer
html_table2 <- stargazer(summary_df, type = "html", title = "Summary Statistics", summary = FALSE, rownames = FALSE)
cat(html_table2)
writeLines(html_table2, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
## count of unique names for local authority names for slide 12
unique_localauthority <- table(combined_data_sum$local_authority_name)
# Convert the result to a data frame
unique_localauthority1 <- as.data.frame(unique_localauthority)
colnames(unique_localauthority1) <- c("Local Authority Names", "Count")
# Create HTML table using stargazer
html_table3 <- stargazer(unique_localauthority1, type = "html", title = "Unique Names and its Respective Counts", summary = FALSE)
cat(html_table3)
writeLines(html_table3, html_file)
rstudioapi::viewer(html_file)
# Calculate summary statistics
summary_localauthority <- c(
Max = max(local_authority_name_counts),
Min = min(local_authority_name_counts),
Med = median(local_authority_name_counts),
IQR = IQR(local_authority_name_counts)
)
summary_df <- data.frame(statistic = names(summary_localauthority ), value = summary_localauthority )
# Create HTML table using stargazer
html_table4 <- stargazer(summary_df, type = "html", title = "Summary Statistics", summary = FALSE, rownames = FALSE)
cat(html_table4)
writeLines(html_table4, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
### Repeat for complete_case.csv for slide 16
## Read the complete_case file
complete_case_sum <- read.csv("Group_7_complete_case.csv")
html_table <- stargazer(
complete_case_sum,
type = "html",
title = "Table 3: Summary Statistics",
digits = 1,
column.labels = c(
"Log Income Deprivation Rate (2015)",
"Log Average Healthcare Distance (2022)",
"Housing Affordability Score (2016)",
"Rural/Urban Class (2011)"
),
out = "table.html"
)
# See HTML table in R Studio
rstudioapi::viewer("table.html")
### Calculate missingness percentages for each variable for slide 16
missingness <- sapply(complete_case_sum, function(x) mean(is.na(x)) * 100)
# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 4: Missingness Summary/%", summary = FALSE, flip=TRUE)
html_file <- "missingness_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
### covariance of x1,x2,x3 before log transformation for slide 17
x1 <-combined_data_sum$income_deprivation_rate_2015
x2 <-combined_data_sum$employment_deprivation_rate_2015
x3 <-combined_data_sum$housing_affordability_score_2016
y <-combined_data_sum$avg_healthcare_distance_2022
cov_x1y <- cov(x1, y)
cov_x2y <- cov(x2, y)
cov_x3y <- cov(x3, y)
# Create a data frame for the covariances
covariances_df <- data.frame(
variables = c("x1 and y", "x2 and y", "x3 and y"),
covariance = c(cov_x1y, cov_x2y, cov_x3y)
)
# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances before log transformation", summary = FALSE, rownames = FALSE)
html_file <- "covariances_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
###covariance of x4,x5 after log transformation for slide 17
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <-complete_case_sum$log_income_deprivation_rate_2015
x5 <-complete_case_sum$housing_affordability_score_2016
y1 <-complete_case_sum$log_avg_healthcare_distance_2022
cov_x4y1 <- cov(x4, y1)
cov_x5y1 <- cov(x5, y1)
# Create a data frame for the covariances
covariances_df <- data.frame(
variables = c("x4 and y1", "x5 and y1"),
covariance = c(cov_x4y1, cov_x5y1)
)
# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances after log transformation", summary = FALSE, rownames = FALSE)
html_file <- "covariances_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
## Correlation test for slide 18
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <-complete_case_sum$log_income_deprivation_rate_2015
x5 <-complete_case_sum$housing_affordability_score_2016
y1 <-complete_case_sum$log_avg_healthcare_distance_2022
corx4y1 <- cor.test(x4, y1, method="pearson")
corx5y1 <- cor.test(x5, y1, method="pearson")
# Create a data frame to store correlation results
# format.pval function is used to format the p-value to make it into a string ie. a range
corr_df <- data.frame(
variables = c("x4 and y1", "x5 and y1"),
Pearson_Correlation = c(corx4y1$estimate, corx5y1$estimate),
t_value = c(corx4y1$statistic, corx5y1$statistic),
Estimate = c(corx4y1$estimate, corx5y1$estimate),
p_value = ifelse(corx4y1$p.value == 0, "< 0.001", format.pval(corx4y1$p.value))
)
# Create HTML table using stargazer
html_table <- stargazer(corr_df, type = "html", title = "Pearson Correlations After Log Transformation", summary = FALSE, rownames = FALSE)
html_file <- "cor_table.html"
writeLines(html_table, html_file)
# See HTML table in R Studio
rstudioapi::viewer(html_file)
# Model 1:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate.
mod1 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(mod1)
# Model 1 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate. As both the dependent and independent variables are log transformed, the coefficients may be interpreted in terms of percentage change. That is, a 1% increase in the income deprivation rate is associated with a 0.35% decrease in the average travel time to a healthcare service. This coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). Thus, according to this model, we may reject the null hypothesis that there is no relationship between income deprivation rate and average healthcare travel distance. Further, the p-value associated with the F-statistic is also significant considering the same alpha, sugegsting at least one variable in the model has some explanatory power regarding average healthcare distance. This statistically is not that useful regarding this model however, as only one explanatory variable is considered. The coefficient of determination for this model indicates that we are able to explain 13.1% of the variation in average travel time to healthcare services with this model.
# Model 2:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate and the housing affordability indicator
mod2 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)
# Exponentiating untransformed variable coefficients to allow easier interpretation
mod2$coefficients["housing_affordability_score_2016"] <- exp(mod2$coefficients["housing_affordability_score_2016"])
# Viewing model results
summary(mod2)
# Model 2 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate and the housing affordability indicator. Here, we can see that a 1% increase in the income deprivation rate is associated with a 0.19% decrease in the average travel time to a healthcare service. Thus, we can see that when we control for a further underlying economic variable, the explanatory power of income deprivation alone decreases. Again, this coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). For interpretation of the housing affordability indicator, the exponential of the coefficient is considered (0.88). Thus, we can see that a 1 unit increase in the housing affordability indicator is associated with a 12% decrease in the average travel time to a healthcare service, again significant considering an alpha of 0.05. Remembering that increases in the housing affordability indicator signify increased inability to afford to enter owner-occupation or the private rental market, this again appears to indicate that more economically deprived areas have greater geographical access to healthcare services. Further, the p-value associated with the F-statistic is also significant at the 5% level, again suggesting at least one variable in the model has some explanatory power regarding average healthcare distance. The adjusted coefficient of determination for this model indicates that we are able to explain 19.3% of the variation in average travel time to healthcare services with this model.
# Model 3:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate, the housing affordability indicator, and the re-leveled Rural-Urban Classification (RUC11) with "Urban city and town" as the reference category
mod3 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, "Urban city and town"), data = complete_case_sample)
# Exponentiated coefficient data
mod3_exp_coeff <- data.frame(exp(coef(mod3)))
# Exponentiating untransformed variable coefficients WITHIN the model to allow easier interpretation
mod3$coefficients[3:10] <- exp(mod3$coefficients[3:10])
# Viewing model results
summary(mod3)
# Creating data for confidence interval plot
ConfidencePlotData <- data.frame (
"names" = as.factor (c("Rural town and fringe", "Rural town and fringe in a sparse setting", "Rural village and dispersed", "Rural village and dispersed in a sparse setting", "Urban city and town in a sparse setting", "Urban major conurbation", "Urban minor conurbation")), # Creating predictor names (row names)
"coefs" = mod3_exp_coeff[4:10, 1],
"lower" = confint (mod3) [4:10, 1], # Creating a lower confidence interval [ , 1] list for predictors 2-3 in the model [2:5 , ]
"upper" = confint (mod3) [4:10, 2] # Creating an upper confidence interval [ , 2] list for predictors 2-3 in the model [2:5 , ]
)
ConfidencePlotData$names <- factor(ConfidencePlotData$names, levels = unique(ConfidencePlotData$names))
# Creating confidence interval plot for RUC coefficients
ConfidencePlot(ConfidencePlotData, "Healthcare Access by Rural-Urban Classification in England", "Reference Category: Urban City and Town", x_label = "Exponentiated Coefficient", middle_point = 1, caption = "*95% CI Indicated by Green Error Bars. Red dashed line indicates reference group. Controlling for income deprivation (2015) and housing affordability indicators (2016).")
# Model 3 expands on Model 2 by including the Rural-Urban Classification (RUC11) as a factor variable, allowing us to account for geographic differences in average travel time to healthcare services. The model estimates the logarithm of average travel time to a healthcare service within a Lower Layer Super Output Area (LSOA) as a function of the logarithm of the income deprivation rate, the housing affordability indicator, and the RUC11 categories. We observe that a 1% increase in the income deprivation rate is associated with a 0.10% decrease in average travel time to healthcare services, when holding the other variables constant. This relationship is statistically significant at the 5% level, with a p-value less than 0.05, indicating that income deprivation continues to be an important factor in explaining variations in healthcare access, albeit with a smaller coefficient compared to Model 2. Similarly, a 1 unit increase in the housing affordability indicator, which reflects increased difficulty in entering owner-occupation or the private rental market, is associated with a 7% decrease in average travel time to healthcare services. This relationship is also statistically significant at the 5% level. The new variables introduced, representing the RUC11 categories, show significant variations in healthcare access across different geographic areas. For instance, compared to Urban city and town areas, Rural village and dispersed areas experience a 344% increase in average travel time to healthcare services, indicating that rural areas face substantial barriers in healthcare access. These relationships are statistically significant at the 5% level, with the exception of Urban city and town in a sparse setting category, which does not show a significant difference in healthcare access compared to Urban city and town areas. The adjusted coefficient of determination for this model is 0.537, suggesting that the model explains approximately 53.7% of the variation in average travel time to healthcare services. The inclusion of the RUC11 categories significantly enhances the explanatory power of the model compared to Model 2. The p-value associated with the F-statistic is also significant at the 5% level, indicating that at least one variable in the model has some explanatory power regarding average healthcare distance. In conclusion, Model 3 provides a more nuanced understanding of the factors affecting healthcare access, by accounting for both economic and geographic variables. The significant differences in healthcare access across the RUC11 categories highlight the importance of considering geographic disparities when analyzing healthcare access.
results <- stargazer(mod1, mod2, mod3, type="html",
dep.var.labels=c("log(Mean Healthcare Service Distance [MHSD])"),
covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]","Rural town and fringe [RUC 2011]","Rural town and fringe in a sparse setting [RUC 2011]","Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]", "Urban city and town in a sparse setting [RUC 2011]", "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
notes = "Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.",
ci = TRUE)
# Checking assumptions: Model 1
# Adding residual values to dataframe
complete_case_sample$resids1 <- residuals(mod1)
# Linearity (using partial regression plots)
PartialRegressionPlot (mod1, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR)")
# Constant variance of residuals
HomoskedasticityPlot(mod1, add_loess_line = TRUE, title = "Model 1 Scale-Location")
# Durbin-Watson test
dw1 <- dwtest(mod1)
# Create a data frame with the Durbin-Watson test results
dw_df <- data.frame(
Statistic = dw1$statistic,
P = format.pval(dw1$p.value, digits = 16)
)
# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df, type = "html", title = "Durbin-Watson Test Results",
summary = FALSE, rownames = FALSE, out = 'dw_test_results.html')
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results.html')
# Normality of residuals check
QQPlotModel(mod1, subtitle = "QQ-Plot for Model 1")
# Zero conditional mean of residual values
mean(mod1$residuals)
resid_mod1 <- lm(resids1 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(resid_mod1)
# Zero conditional mean regression results
stargazer(resid_mod1, type="html",
dep.var.labels=c("Residuals of Model 1"),
covariate.labels=c("log(Income Deprivation Rate 2015)"), out = "residmod1.html")
rstudioapi::viewer("residmod1.html")
# Checking assumptions: Model 2
# Adding residual values to dataframe
complete_case_sample$resids2 <- residuals(mod2)
# Linearity (using partial regression plots)
PartialRegressionPlot (mod2, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot (mod2, 'housing_affordability_score_2016', x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")
# Multicollinearity (using VIF)
vif_values1 <- vif(mod2)
# storing VIF in a dataframe
vif_df1 <- data.frame(Variable = names(vif_values1), VIF = vif_values1)
vif_df1$Variable[1] <- "log(Income Deprivation Rate 2015)"
vif_df1$Variable[2] <- "Housing Affordability Score 2016"
# creating a pretty table
stargazer(vif_df1, type = "html", title = "VIF Values", summary = FALSE,
rownames = FALSE, out = 'mod2vif.html')
# using rstudioapi to view the table
rstudioapi::viewer("mod2vif.html")
# Constant variance of residuals
HomoskedasticityPlot(mod2, add_loess_line = TRUE, title = "Model 2 Scale-Location")
# Durbin-Watson test
dw2 <- dwtest(mod2)
# Create a data frame with the Durbin-Watson test results
dw_df2 <- data.frame(
Statistic = dw2$statistic,
P = ifelse(dw2$p.value > 0.05, '> 0.05',
ifelse(dw2$p.value < 0.01, '< 0.01', '< 0.05'))
)
# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df2, type = "html", title = "Durbin-Watson Test Results",
summary = FALSE, rownames = FALSE, out = 'dw_test_results2.html')
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results2.html')
# Normality of residuals check
QQPlotModel(mod2, subtitle = "QQ-Plot for Model 2")
# Zero conditional mean of residual values
mean(mod2$residuals)
resid_mod2 <- lm(resids2 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)
summary(resid_mod2)
# Zero conditional mean regression results
stargazer(resid_mod2, type="html",
dep.var.labels=c("Residuals of Model 2"),
covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]"), out = "residmod2.html")
rstudioapi::viewer("residmod2.html")
# Checking assumptions: Model 3
# Adding residual values to dataframe
complete_case_sample$resids3 <- residuals(mod3)
# Linearity (using partial regression plots)
PartialRegressionPlot (mod3, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot (mod3, 'housing_affordability_score_2016', x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")
# Multicollinearity (using VIF)
vif_values2 <- data.frame(vif(mod3))
vif_values2$Variable <- c('log(Income Deprivation Rate 2015)', 'Housing Affordability Score 2016', 'Rural Urban Classification 2011')
vif_df2 = vif_values2[, c("Variable", "GVIF")]
# creating a pretty table
stargazer(vif_df2, type = "html", title = "GVIF Values", summary = FALSE,
rownames = FALSE, out = 'mod3vif.html')
# using rstudioapi to view the table
rstudioapi::viewer("mod3vif.html")
# Constant variance of residuals plot
HomoskedasticityPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location")
# Plot indicating groups
HomoskedasticityDetailPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location", dataframe = complete_case_sample, categorical_var_name = "rural_urban_class_2011")
# Testing for non-constant variance across groups:
leveneTest(complete_case_sample$resids3, complete_case_sample$rural_urban_class_2011, center=mean)
# df for levene's test results
levene_results <- data.frame(
`Degrees of Freedom (Group)` = 7,
`Degrees of Freedom (Residual)` = 32706,
`F Value` = 189.04,
`P Value` = '< 0.01'
)
names(levene_results) <- c("Degrees of Freedom (Group)",
"Degrees of Freedom (Residual)",
"F Value",
"P Value")
# Viewing the table with descriptive column names
stargazer(levene_results,
type = "html",
title = "Levene's Test for Homogeneity of Variance",
out = "levene_test.html",
summary = FALSE,
rownames = FALSE)
rstudioapi::viewer('levene_test.html')
# Durbin-Watson test
dw3 <- dwtest(mod3)
# Create a data frame with the Durbin-Watson test results
dw_df3 <- data.frame(
Statistic = dw3$statistic,
P = ifelse(dw3$p.value > 0.05, '> 0.05',
ifelse(dw3$p.value < 0.01, '< 0.01', '< 0.05'))
)
# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df3, type = "html", title = "Durbin-Watson Test Results",
summary = FALSE, rownames = FALSE, out = 'dw_test_results3.html')
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results3.html')
# When considering the independence of residuals, we might have reason to think that this assumption is not satisfied within the model, as it is likely that some degree of spatial autocorrelation could be influencing our results - that is, that LSOAs closer to each-other could theoretically have more similar values of X. This idea is supported by the results of the durbin-watson test statistics shown - notably, getting closer to 2 (representing less reduced autocorrelation) when including a variable that partially accounts for spatiality (the RUC2011 indicator). As such, we do see a considerable amount of positive autocorrelation in the data (according to a Durbin-Watson statistic of 0.83 being considerably below 2). This means that residuals that follow each other in the data tend to be slightly similar in magnitude. Considering the null hypothesis of no autocorrelation and considering the p-value being significant considering an alpha of 0.05, we fail to reject the null hypothesis that there is no autocorrelation presented by the variables within our model. Thus, we might not be confident that the independence of residuals assumption is satisfied.
# Normality of residuals check
QQPlotModel(mod3, subtitle = "QQ-Plot for Model 3")
# Zero conditional mean of residual values
mean(mod3$residuals)
resid_mod3 <- lm(resids3 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + rural_urban_class_2011, data = complete_case_sample)
summary(resid_mod3)
# Zero conditional mean regression results
stargazer(resid_mod3, type="html",
dep.var.labels=c("Residuals of Model 3"),
covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]","Rural town and fringe [RUC 2011]","Rural town and fringe in a sparse setting [RUC 2011]","Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]", "Urban city and town in a sparse setting [RUC 2011]", "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"), out = "residmod3.html")
rstudioapi::viewer("residmod3.html")