Abstract

Healthcare deprivation has long been influenced by factors such as geographical access and living conditions. Despite prior research that investigated such factors, there is still a dearth of studies that support how economic deprivation influences geographical access to such services. Our analyses reveal that while economic deprivation somewhat decreases travel time, this relationship is significantly mediated by housing costs and urban-rural classification. The findings, which also reflect on model assumptions and are supported by external literature, highlight the complexities of healthcare access and underscore the need for inclusive approaches in future research to capture the multifaceted nature of economic influences.

Libraries used

# Libraries used
library(tidyverse)
library(haven)
library(dplyr)
library(readxl)
library(GGally)
library(forcats)
library(stargazer)
library(lme4)
library(car)
library(rstudioapi)
library(knitr)
library(kableExtra)

This chunk loads the necessary libraries for data manipulation, analysis, and plotting.

# Turning off scientific notation
options(scipen = 9999)

This line sets the option to avoid scientific notation in outputs.

Functions created for the purposes of this research

# Bespoke functions used ----
PointPlot <- function(dataset, independent_var, dependent_var, colour_var = NULL, title = NULL, subtitle = NULL, caption = NULL, x_label = "X variable", y_label = "Y variable", point_size = 1, line_intercept = NULL, line_slope = NULL, line_colour = "black", line_linewidth = 1, line_linetype = "solid", add_regression_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Adjust the aesthetic mapping based on the presence of the third variable
  if(!is.null(colour_var)) {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var, colour = colour_var))
  } else {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var))
  }
  
  # Base point plot
  plot <- plot + geom_point(alpha = 1, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'),
          axis.title = element_text(family = 'Roboto'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = ifelse(is.null(colour_var), 'none', 'right'),
          legend.text = element_text(family = 'Roboto')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = y_label)
  
  # Add manual line if intercept and slope are provided
  if (!is.null(line_intercept) & !is.null(line_slope)) {
    plot <- plot + geom_abline(intercept = line_intercept, 
                               slope = line_slope, 
                               colour = line_colour, 
                               linewidth = line_linewidth, 
                               linetype = line_linetype)
  }
  
  # Add regression line with confidence interval if requested
  if (add_regression_line) {
    plot <- plot + geom_smooth(method = "lm", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}

DistPlot <- function(dataset, variable, formatted_name, subtitle = NULL, caption = NULL, y_label = NULL) {
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  plot <- ggplot(dataset, aes_string(x = variable)) +
    geom_histogram(aes(y = ..density..),
                   colour = 1, fill = "white") +
    geom_density(fill = '#0072C6', colour = '#0072C6', alpha = 0.25) +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                      family = 'Roboto'),
          axis.title.x = element_blank(),  # This removes the x-axis title
          axis.title.y = element_text(size = 15.5,
                                      family = 'Roboto',
                                      colour = 'black'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = 'none') +
    labs(title = paste0("Distribution of ", formatted_name),
         subtitle = subtitle,
         caption = caption,
         y = y_label)
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  plot
}

ConfidencePlot <- function(data, title, subtitle, middle_point = 0, x_label = "Estimated Coefficient", caption = NULL) {
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  if (!middle_point %in% c(0, 1)) {
    stop("middle_point must be 0 or 1")
  }
  
  conf_plot <- ggplot(data, aes(x = coefs, y = names)) +
    geom_errorbar(aes(xmin = lower, xmax = upper),
                  col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "forestgreen"),
                  width = 0.5, size = 1, alpha = 0.5) +
    geom_point(col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "black"), size = 1, alpha = 1) +
    geom_vline(xintercept = middle_point, col = 'red', size = 0.5, linetype="dashed") +
    theme_classic() +
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = "") +
    theme(axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          axis.title = element_text(family = 'Roboto', size = 15.5),
          plot.title = element_text(hjust = 0, colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto', margin = margin(b = 10)),
          plot.subtitle = element_text(hjust = 0, colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto', margin = margin(b = 5)),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'))
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", conf_plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = conf_plot, width = 1200/100, height = 740/100, dpi = 100)  # Adjust height as necessary
  
  return(conf_plot)
}

PartialRegressionPlot <- function(model, variable, x_var_name = "X1", y_var_name = "Dependent Variable", title = NULL) {
  # Close any existing plotting device
  graphics.off()
  
  # Construct axis labels using the specified names
  xlab <- paste("Residuals of", x_var_name, "Controlling for Other Variables")
  ylab <- paste("Residuals of", y_var_name, "(Excluding", x_var_name, ")")
  
  # Create the plot without a main title
  avPlots(model, variable, xlab = xlab, ylab = ylab)
  
  # Add a title using the title function
  title(main = title)
}

HomoskedasticityPlot <- function(model, title = "Homoskedasticity Check", subtitle = NULL, 
                                 x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                 point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, StdResiduals = std_residuals)
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  if (bp_test$p.value < 0.05) {
    p_value_text <- "p < 0.05"
  } else if (bp_test$p.value > 0.05) {
    p_value_text <- "p > 0.05"
  } else {
    p_value_text <- "p = 0.05"
  }
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 50, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 30, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 50, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 30, family = 'Roboto'),
          axis.text = element_text(size = 30, family = 'Roboto', colour = 'black'),
          legend.position = 'none') + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 5.91, height = 4.50, dpi = 300)
  
  return(plot)
}

HomoskedasticityDetailPlot <- function(model, dataframe, categorical_var_name, 
                                       title = "Homoskedasticity Check", subtitle = NULL, 
                                       x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                       point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Ensure the categorical variable exists in the dataframe
  if (!categorical_var_name %in% names(dataframe)) {
    stop("Specified categorical variable not found in the provided dataframe.")
  }
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, 
                          StdResiduals = std_residuals, 
                          Category = dataframe[[categorical_var_name]])
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  p_value_text <- ifelse(bp_test$p.value < 0.05, "p < 0.05",
                         ifelse(bp_test$p.value > 0.05, "p > 0.05", "p = 0.05"))
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals, color = Category)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 15.5, family = 'Roboto'),
          axis.text = element_text(size = 15.5, family = 'Roboto', colour = 'black'),
          legend.position = 'right', legend.text = element_text(size = 12.5, family = 'Roboto')) + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2, se = FALSE)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 10, height = 6.17, dpi = 100)
  
  return(plot)
}

QQPlotModel <- function(model, title = 'Normality of Residuals Check', subtitle = 'QQ-Plot', caption = NULL) {
  
  # Loading required libraries
  library(ggplot2)
  
  # Extract residuals from the model
  model_residuals <- residuals(model)
  
  # Create the QQ-plot
  plot <- ggplot() +
    stat_qq(aes(sample = model_residuals)) +
    stat_qq_line(aes(sample = model_residuals), colour = "blue") +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic'),
          axis.title = element_text(),
          axis.text = element_text(size = 15.5, colour = 'black')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = "Theoretical Quantiles",
         y = "Sample Quantiles")
  
  # Create a filename from the plot title or use a default one
  file_name <- if (!is.null(title)) {
    paste0(gsub(" ", "_", title), ".png")
  } else {
    "QQPlot.png"
  }
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}

Cleaning and joining the data

AHAH_df <- read.csv("Group_7_AHAH_V3_0.csv")  # Access to healthy assets and hazards V3 data
IOD_income_df <- read.csv("Group_7_IoD2019_Income.csv")  # Index of Multiple Deprivation Income Domian Indicator Data (measured in 2015)
IOD_health_df <- read.csv("Group_7_IoD2019_Health.csv")  # Index of Multiple Deprivation Health Domian Indicator Data
IOD_barriers_df <- read.csv("Group_7_IoD2019_Barriers.csv")  # Index of Multiple Deprivation Barriers to Housing and Services Domain Indicator Data
IOD_education_df <- read.csv("Group_7_IoD2019_Education.csv")  # Index of Multiple Deprivation Education, Skills and Training Deprivation Domain Indicator Data
IOD_employment_df <- read.csv("Group_7_IoD2019_Employment.csv")  # Index of Multiple Deprivation Employment Deprivation Domain Indicator Data (measured in 2015)
ONS_MIDYEAR_2015 <- read.csv("Group_7_ONS_MIDYEAR_2015.csv")  # Mid-year population estimates for 2015 in England and Wales (for creating rates from the numerator values given in the income and employment measures)
RUC2011 <- read.csv("Group_7_RUC2011.csv")

The above code reads in the raw data, available to download from the sources provided.

## Renaming columns for consistent joining
AHAH_df <- AHAH_df %>%
    rename(LSOA.code..2011. = lsoa11)
ONS_MIDYEAR_2015 <- ONS_MIDYEAR_2015 %>%
    rename(LSOA.code..2011. = Area.Codes)
RUC2011 <- RUC2011 %>%
    rename(LSOA.code..2011. = LSOA11CD)

## Creating complete combined dataset
complete_combined_data <- IOD_barriers_df %>%
    left_join(IOD_education_df, by = "LSOA.code..2011.") %>%
    left_join(IOD_employment_df, by = "LSOA.code..2011.") %>%
    left_join(IOD_health_df, by = "LSOA.code..2011.") %>%
    left_join(IOD_income_df, by = "LSOA.code..2011.") %>%
    left_join(AHAH_df, by = "LSOA.code..2011.") %>%
    left_join(ONS_MIDYEAR_2015, by = "LSOA.code..2011.") %>%
    left_join(RUC2011, by = "LSOA.code..2011.")

## Retaining only columns of interest
combined_data <- complete_combined_data %>%
    select(LSOA.code..2011., Local.Authority.District.name..2019.,
        Income.Domain.numerator, Years.of.potential.life.lost.indicator,
        Housing.affordability.indicator, Staying.on.in.education.post.16.indicator,
        Employment.Domain.numerator, ah3gp, ah3hosp, ah3dent,
        ah3phar, All.Ages, RUC11)

## Removing commas and converting columns 3:12 to numeric
combined_data <- combined_data %>%
    mutate(across(.cols = 3:12, .fns = ~as.numeric(str_remove_all(.x,
        ","))))

## Creating income and employment deprivation rate
## variables
combined_data$income_deprivation_rate <- (combined_data$Income.Domain.numerator/combined_data$All.Ages) *
    100
combined_data$employment_deprivation_rate <- (combined_data$Employment.Domain.numerator/combined_data$All.Ages) *
    100

## Creating average distance (in minutes) to a healthcare
## service (including GP practice, hospital, dentist and
## pharmacy)
combined_data$avg_healthcare_distance <- rowMeans(combined_data[,
    c("ah3gp", "ah3hosp", "ah3dent", "ah3phar")], na.rm = TRUE)

## Renaming column names for understanding
colnames(combined_data) <- c("lsoa_code", "local_authority_name",
    "no._of_income_deprived_individuals_2015", "years_of_lives_lost_2013_2017",
    "housing_affordability_score_2016", "proportion_of_above16_individuals_inschool_2010_2012",
    "no._of_deprived_employment_2015_2016", "avg_dist_gppractice_in_mins_2022",
    "avg_dist_hospital_in_mins_2022", "avg_dist_dentist_in_mins_2022",
    "avg_dist_pharmacy_in_mins_2022", "population_estimates_for_allages_2015",
    "rural_urban_class_2011", "income_deprivation_rate_2015",
    "employment_deprivation_rate_2015", "avg_healthcare_distance_2022")

The above code reads in multiple CSV files containing various indicators of deprivation and population data. It then proceeds to clean and merge these datasets into a single comprehensive dataset by renaming inconsistent column names to facilitate joining. A series of left joins creates our combined dataset which is further refined by selecting only the columns of interest. Subsequent operations include cleaning numeric data by removing commas, calculating rates of income and employment deprivation, and computing the average distance to healthcare services. The final steps involve renaming columns to more descriptive titles and preparing the dataset for further analysis, ensuring there are no duplicate entries.

## Saving as CSV
write.csv(combined_data, file = "Group_7_combined_data.csv",
    row.names = FALSE)
write.csv(complete_combined_data, file = "Group_7_complete_combined_data.csv",
    row.names = FALSE)

The above code saves our shortened dataframes as csv files.

## Loading combined data
combined_data <- read.csv("Group_7_combined_data.csv")

## Checking for duplicates
There_are_duplicates <- any(duplicated(combined_data))
if (There_are_duplicates) {
    cat("There are duplicates in the combined_data.\n")
} else {
    cat("There are no duplicates in combined_data.\n")
}

## There are no duplicates in combined_data.

# Calculate missingness percentages for each numerical
# variable (IV and DV)
missingness <- sapply(combined_data, function(x) mean(is.na(x)) *
    100)
missingness_summary <- data.frame(variable = names(missingness),
    Missingness_percentage = missingness)
missingness_summary

##                                                                                                  variable
## lsoa_code                                                                                       lsoa_code
## local_authority_name                                                                 local_authority_name
## no._of_income_deprived_individuals_2015                           no._of_income_deprived_individuals_2015
## years_of_lives_lost_2013_2017                                               years_of_lives_lost_2013_2017
## housing_affordability_score_2016                                         housing_affordability_score_2016
## proportion_of_above16_individuals_inschool_2010_2012 proportion_of_above16_individuals_inschool_2010_2012
## no._of_deprived_employment_2015_2016                                 no._of_deprived_employment_2015_2016
## avg_dist_gppractice_in_mins_2022                                         avg_dist_gppractice_in_mins_2022
## avg_dist_hospital_in_mins_2022                                             avg_dist_hospital_in_mins_2022
## avg_dist_dentist_in_mins_2022                                               avg_dist_dentist_in_mins_2022
## avg_dist_pharmacy_in_mins_2022                                             avg_dist_pharmacy_in_mins_2022
## population_estimates_for_allages_2015                               population_estimates_for_allages_2015
## rural_urban_class_2011                                                             rural_urban_class_2011
## income_deprivation_rate_2015                                                 income_deprivation_rate_2015
## employment_deprivation_rate_2015                                         employment_deprivation_rate_2015
## avg_healthcare_distance_2022                                                 avg_healthcare_distance_2022
##                                                      Missingness_percentage
## lsoa_code                                                        0.00000000
## local_authority_name                                             0.00000000
## no._of_income_deprived_individuals_2015                          0.09438558
## years_of_lives_lost_2013_2017                                    0.00000000
## housing_affordability_score_2016                                 0.00000000
## proportion_of_above16_individuals_inschool_2010_2012             0.00000000
## no._of_deprived_employment_2015_2016                             0.36231884
## avg_dist_gppractice_in_mins_2022                                 0.00000000
## avg_dist_hospital_in_mins_2022                                   0.00000000
## avg_dist_dentist_in_mins_2022                                    0.00000000
## avg_dist_pharmacy_in_mins_2022                                   0.00000000
## population_estimates_for_allages_2015                            0.00000000
## rural_urban_class_2011                                           0.00000000
## income_deprivation_rate_2015                                     0.09438558
## employment_deprivation_rate_2015                                 0.36231884
## avg_healthcare_distance_2022                                     0.00000000

# Removing Null Values from data set
combined_data <- na.omit(combined_data)

# First 6 rows of data
head(combined_data)

##   lsoa_code local_authority_name no._of_income_deprived_individuals_2015
## 2 E01000002       City of London                                      40
## 3 E01000003       City of London                                     115
## 4 E01000005       City of London                                     235
## 5 E01000006 Barking and Dagenham                                     235
## 6 E01000007 Barking and Dagenham                                     435
## 7 E01000008 Barking and Dagenham                                     420
##   years_of_lives_lost_2013_2017 housing_affordability_score_2016
## 2                        69.045                            1.586
## 3                        82.254                            3.063
## 4                        67.035                            3.383
## 5                        59.895                            3.515
## 6                        66.861                            3.183
## 7                        73.391                            2.932
##   proportion_of_above16_individuals_inschool_2010_2012
## 2                                                0.279
## 3                                                0.201
## 4                                                0.272
## 5                                                0.100
## 6                                                0.113
## 7                                                0.291
##   no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022
## 2                                   15                         3.146895
## 3                                   70                         1.151000
## 4                                   95                         1.462800
## 5                                   75                         1.585000
## 6                                  150                         0.177500
## 7                                  145                         2.641000
##   avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022
## 2                       1.448895                        0.8875
## 3                       0.649500                        0.0000
## 4                       1.224050                        0.5650
## 5                       1.585000                        2.1500
## 6                       0.517500                        0.6475
## 7                       2.861000                        3.1110
##   avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015
## 2                      0.8791789                                  1156
## 3                      0.3475000                                  1350
## 4                      0.9426000                                  1121
## 5                      1.8675000                                  2040
## 6                      1.0775000                                  2101
## 7                      3.2585000                                  1566
##    rural_urban_class_2011 income_deprivation_rate_2015
## 2 Urban major conurbation                     3.460208
## 3 Urban major conurbation                     8.518519
## 4 Urban major conurbation                    20.963426
## 5 Urban major conurbation                    11.519608
## 6 Urban major conurbation                    20.704426
## 7 Urban major conurbation                    26.819923
##   employment_deprivation_rate_2015 avg_healthcare_distance_2022
## 2                         1.297578                     1.590617
## 3                         5.185185                     0.537000
## 4                         8.474576                     1.048612
## 5                         3.676471                     1.796875
## 6                         7.139457                     0.605000
## 7                         9.259259                     2.967875

# Last 6 rows of data
tail(combined_data)

##       lsoa_code local_authority_name no._of_income_deprived_individuals_2015
## 32839 E01033763            Liverpool                                     630
## 32840 E01033764            Liverpool                                    1275
## 32841 E01033765            Liverpool                                     525
## 32842 E01033766            Liverpool                                     105
## 32843 E01033767            Liverpool                                     405
## 32844 E01033768            Liverpool                                     350
##       years_of_lives_lost_2013_2017 housing_affordability_score_2016
## 32839                       101.024                            1.262
## 32840                        86.855                            1.415
## 32841                        75.078                            1.396
## 32842                        75.244                           -1.862
## 32843                        83.476                            1.752
## 32844                        91.496                            1.505
##       proportion_of_above16_individuals_inschool_2010_2012
## 32839                                                0.329
## 32840                                                0.241
## 32841                                                0.207
## 32842                                                0.142
## 32843                                                0.175
## 32844                                                0.175
##       no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022
## 32839                                  305                          1.19380
## 32840                                  555                          1.54425
## 32841                                  255                          1.77000
## 32842                                   55                          1.07000
## 32843                                  190                          3.26270
## 32844                                  245                          0.85200
##       avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022
## 32839                        0.82660                        2.3109
## 32840                        2.07095                        3.1923
## 32841                        1.43750                        1.9050
## 32842                        1.07000                        2.7176
## 32843                        1.16050                        2.6112
## 32844                        0.85200                        1.0795
##       avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015
## 32839                         0.9740                                  1679
## 32840                         1.6249                                  2712
## 32841                         1.9050                                  1445
## 32842                         1.0700                                  1054
## 32843                         1.6750                                  1019
## 32844                         1.1720                                  1281
##        rural_urban_class_2011 income_deprivation_rate_2015
## 32839 Urban major conurbation                    37.522335
## 32840 Urban major conurbation                    47.013274
## 32841 Urban major conurbation                    36.332180
## 32842 Urban major conurbation                     9.962049
## 32843 Urban major conurbation                    39.744848
## 32844 Urban major conurbation                    27.322404
##       employment_deprivation_rate_2015 avg_healthcare_distance_2022
## 32839                        18.165575                     1.326325
## 32840                        20.464602                     2.108100
## 32841                        17.647059                     1.754375
## 32842                         5.218216                     1.481900
## 32843                        18.645731                     2.177350
## 32844                        19.125683                     0.988875

After loading the dataset, the above code checks for any duplicate rows, and outputs a message indicating whether duplicates are found. The script then calculates the percentage of missing values (NA values) for each variable in the dataset, summarizing the results in a data frame that lists variables alongside their respective percentages of missingness. Subsequently, it removes all rows with any missing values from the dataset. Finally, the script displays the first and last six rows of the cleaned dataset, which serves as a quick check of the data after the aforementioned operations, giving a glimpse of the data structure post-cleaning. # Initial summary statistics

summary(combined_data)

lsoa_code local_authority_name Length:32715 Length:32715
Class :character Class :character
Mode :character Mode :character

no._of_income_deprived_individuals_2015 years_of_lives_lost_2013_2017 Min. : 10.0 Min. : 18.43
1st Qu.: 90.0 1st Qu.: 47.36
Median : 165.0 Median : 54.78
Mean : 214.3 Mean : 57.29
3rd Qu.: 300.0 3rd Qu.: 64.53
Max. :1435.0 Max. :184.73
housing_affordability_score_2016 Min. :-6.658000
1st Qu.:-1.277000
Median :-0.013000
Mean : 0.000486
3rd Qu.: 1.297000
Max. : 8.019000
proportion_of_above16_individuals_inschool_2010_2012 Min. :0.0380
1st Qu.:0.1360
Median :0.1780
Mean :0.1855
3rd Qu.:0.2280
Max. :0.5500
no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022 Min. : 10.0 Min. : 0.000
1st Qu.: 45.0 1st Qu.: 1.803
Median : 75.0 Median : 3.022
Mean : 96.5 Mean : 4.660
3rd Qu.:130.0 3rd Qu.: 5.251
Max. :670.0 Max. :50.528
avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022 Min. : 0.000 Min. : 0.000
1st Qu.: 1.561 1st Qu.: 1.725
Median : 2.614 Median : 2.936
Mean : 4.301 Mean : 4.857
3rd Qu.: 4.598 3rd Qu.: 5.187
Max. :53.912 Max. :94.747
avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015 Min. : 0.000 Min. : 614
1st Qu.: 1.502 1st Qu.:1448
Median : 2.383 Median :1599
Mean : 3.953 Mean :1669
3rd Qu.: 3.908 3rd Qu.:1802
Max. :64.166 Max. :9551
rural_urban_class_2011 income_deprivation_rate_2015 Length:32715 Min. : 0.1912
Class :character 1st Qu.: 5.5485
Mode :character Median : 9.9291
Mean :12.8100
3rd Qu.:17.7987
Max. :60.9177
employment_deprivation_rate_2015 avg_healthcare_distance_2022 Min. : 0.1912 Min. : 0.000
1st Qu.: 2.7865 1st Qu.: 1.877
Median : 4.6030 Median : 2.898
Mean : 5.8027 Mean : 4.443
3rd Qu.: 7.8489 3rd Qu.: 4.661
Max. :34.7452 Max. :60.310

# Descriptive table for numerical variables
stargazer(combined_data, type = "html", digits = 1, title = "Table 1: Summary Statistics",
    out = "table.html")

**Table 1: Summary Statistics**

Statistic	N	Mean	St. Dev.	Min	Max

no._of_income_deprived_individuals_2015	32,715	214.3	164.4	10	1,435
years_of_lives_lost_2013_2017	32,715	57.3	14.0	18.4	184.7
housing_affordability_score_2016	32,715	0.000	1.9	-6.7	8.0
proportion_of_above16_individuals_inschool_2010_2012	32,715	0.2	0.1	0.04	0.6
no._of_deprived_employment_2015_2016	32,715	96.5	68.7	10	670
avg_dist_gppractice_in_mins_2022	32,715	4.7	4.9	0.0	50.5
avg_dist_hospital_in_mins_2022	32,715	4.3	5.0	0.0	53.9
avg_dist_dentist_in_mins_2022	32,715	4.9	5.8	0.0	94.7
avg_dist_pharmacy_in_mins_2022	32,715	4.0	5.1	0.0	64.2
population_estimates_for_allages_2015	32,715	1,668.6	364.9	614	9,551
income_deprivation_rate_2015	32,715	12.8	9.4	0.2	60.9
employment_deprivation_rate_2015	32,715	5.8	4.0	0.2	34.7
avg_healthcare_distance_2022	32,715	4.4	4.8	0.0	60.3

# Use rstudioapi to view the table
rstudioapi::viewer("table.html")

NULL

## SD of numerical variable
sd(combined_data$lsoa_code)

[1] NA

sd(combined_data$no._of_income_deprived_individuals_2015)

[1] 164.3886

sd(combined_data$years_of_lives_lost_2013_2017)

[1] 14.01493

sd(combined_data$housing_affordability_score_2016)

[1] 1.904849

sd(combined_data$proportion_of_above16_individuals_inschool_2010_2012)

[1] 0.06606487

sd(combined_data$no._of_deprived_employment_2015_2016)

[1] 68.72312

sd(combined_data$avg_dist_gppractice_in_mins_2022)

[1] 4.932105

sd(combined_data$avg_dist_hospital_in_mins_2022)

[1] 4.991137

sd(combined_data$avg_dist_dentist_in_mins_2022)

[1] 5.822524

sd(combined_data$avg_dist_pharmacy_in_mins_2022)

[1] 5.062794

sd(combined_data$population_estimates_for_allages_2015)

[1] 364.8729

sd(combined_data$income_deprivation_rate_2015)

[1] 9.412047

sd(combined_data$employment_deprivation_rate_2015)

[1] 4.007918

sd(combined_data$avg_healthcare_distance_2022)

[1] 4.752529 The above code provides a summary of the combined_data dataset, first by generating a descriptive statistics table, which includes mean, median, minimum, maximum, and quartile values for each numerical variable in the dataset. This table is created using the stargazer package, formatted as HTML, titled “Table 1: Summary Statistics”, and saved to a file named “table.html”. It then uses rstudioapi::viewer to display this HTML table within RStudio’s viewer pane for easy inspection. Following that, the code calculates the standard deviation (SD) for each numerical variable within combined_data individually, which quantifies the amount of variation or dispersion of a set of values.

# The number of unique names in the rural_urban_class
# column
unique_names <- table(combined_data$rural_urban_class_2011)
print(unique_names)

## 
##                           Rural town and fringe 
##                                            2930 
##       Rural town and fringe in a sparse setting 
##                                             119 
##                     Rural village and dispersed 
##                                            2356 
## Rural village and dispersed in a sparse setting 
##                                             181 
##                             Urban city and town 
##                                           14399 
##         Urban city and town in a sparse setting 
##                                              59 
##                         Urban major conurbation 
##                                           11465 
##                         Urban minor conurbation 
##                                            1206

rural_urban_class_2011_counts <- as.numeric(unique_names)
print(rural_urban_class_2011_counts)

## [1]  2930   119  2356   181 14399    59 11465  1206

## Summary statistics for Rural_urban_class_2011
max(rural_urban_class_2011_counts)

## [1] 14399

min(rural_urban_class_2011_counts)

## [1] 59

median(rural_urban_class_2011_counts)

## [1] 1781

IQR(rural_urban_class_2011_counts)

## [1] 4898.25

# The number of unique names in the local_authority_name
# column
unique_names_1 <- table(combined_data$local_authority_name)
print(unique_names_1)

## 
##                                Adur                           Allerdale 
##                                  42                                  60 
##                        Amber Valley                                Arun 
##                                  78                                  94 
##                            Ashfield                             Ashford 
##                                  74                                  78 
##                      Aylesbury Vale                             Babergh 
##                                 114                                  54 
##                Barking and Dagenham                              Barnet 
##                                 110                                 209 
##                            Barnsley                   Barrow-in-Furness 
##                                 147                                  49 
##                            Basildon               Basingstoke and Deane 
##                                 109                                 109 
##                           Bassetlaw        Bath and North East Somerset 
##                                  70                                 114 
##                             Bedford                              Bexley 
##                                 103                                 146 
##                          Birmingham                               Blaby 
##                                 638                                  60 
##               Blackburn with Darwen                           Blackpool 
##                                  91                                  94 
##                            Bolsover                              Bolton 
##                                  48                                 177 
##                              Boston Bournemouth, Christchurch and Poole 
##                                  36                                 233 
##                    Bracknell Forest                            Bradford 
##                                  75                                 308 
##                           Braintree                           Breckland 
##                                  87                                  78 
##                               Brent                           Brentwood 
##                                 173                                  45 
##                   Brighton and Hove                    Bristol, City of 
##                                 165                                 262 
##                           Broadland                             Bromley 
##                                  84                                 195 
##                          Bromsgrove                          Broxbourne 
##                                  58                                  56 
##                            Broxtowe                             Burnley 
##                                  71                                  60 
##                                Bury                          Calderdale 
##                                 120                                 128 
##                           Cambridge                              Camden 
##                                  67                                 132 
##                       Cannock Chase                          Canterbury 
##                                  60                                  90 
##                            Carlisle                        Castle Point 
##                                  68                                  57 
##                Central Bedfordshire                           Charnwood 
##                                 157                                  99 
##                          Chelmsford                          Cheltenham 
##                                 107                                  75 
##                            Cherwell                       Cheshire East 
##                                  93                                 232 
##           Cheshire West and Chester                        Chesterfield 
##                                 212                                  69 
##                          Chichester                            Chiltern 
##                                  71                                  54 
##                             Chorley                      City of London 
##                                  66                                   4 
##                          Colchester                            Copeland 
##                                 105                                  49 
##                               Corby                            Cornwall 
##                                  41                                 326 
##                            Cotswold                       County Durham 
##                                  51                                 324 
##                            Coventry                              Craven 
##                                 195                                  32 
##                             Crawley                             Croydon 
##                                  66                                 220 
##                             Dacorum                          Darlington 
##                                  91                                  65 
##                            Dartford                            Daventry 
##                                  58                                  44 
##                               Derby                    Derbyshire Dales 
##                                 151                                  43 
##                           Doncaster                              Dorset 
##                                 194                                 218 
##                               Dover                              Dudley 
##                                  67                                 201 
##                              Ealing                 East Cambridgeshire 
##                                 196                                  50 
##                          East Devon                      East Hampshire 
##                                  81                                  71 
##                  East Hertfordshire                        East Lindsey 
##                                  84                                  81 
##               East Northamptonshire            East Riding of Yorkshire 
##                                  49                                 210 
##                  East Staffordshire                        East Suffolk 
##                                  72                                 146 
##                          Eastbourne                           Eastleigh 
##                                  61                                  77 
##                                Eden                           Elmbridge 
##                                  36                                  76 
##                             Enfield                       Epping Forest 
##                                 183                                  78 
##                     Epsom and Ewell                             Erewash 
##                                  44                                  73 
##                              Exeter                             Fareham 
##                                  74                                  73 
##                             Fenland                Folkestone and Hythe 
##                                  55                                  67 
##                      Forest of Dean                               Fylde 
##                                  50                                  51 
##                           Gateshead                             Gedling 
##                                 126                                  77 
##                          Gloucester                             Gosport 
##                                  78                                  53 
##                           Gravesham                      Great Yarmouth 
##                                  64                                  61 
##                           Greenwich                           Guildford 
##                                 150                                  82 
##                             Hackney                              Halton 
##                                 144                                  79 
##                           Hambleton              Hammersmith and Fulham 
##                                  52                                 113 
##                          Harborough                            Haringey 
##                                  47                                 145 
##                              Harlow                           Harrogate 
##                                  54                                 100 
##                              Harrow                                Hart 
##                                 137                                  56 
##                          Hartlepool                            Hastings 
##                                  58                                  53 
##                              Havant                            Havering 
##                                  78                                 150 
##            Herefordshire, County of                           Hertsmere 
##                                 116                                  61 
##                           High Peak                          Hillingdon 
##                                  59                                 161 
##               Hinckley and Bosworth                             Horsham 
##                                  66                                  81 
##                            Hounslow                     Huntingdonshire 
##                                 142                                 105 
##                            Hyndburn                             Ipswich 
##                                  52                                  85 
##                       Isle of Wight                     Isles of Scilly 
##                                  89                                   1 
##                           Islington              Kensington and Chelsea 
##                                 123                                  90 
##                           Kettering        King's Lynn and West Norfolk 
##                                  57                                  89 
##         Kingston upon Hull, City of                Kingston upon Thames 
##                                 166                                  98 
##                            Kirklees                            Knowsley 
##                                 259                                  98 
##                             Lambeth                           Lancaster 
##                                 178                                  89 
##                               Leeds                           Leicester 
##                                 480                                 192 
##                               Lewes                            Lewisham 
##                                  62                                 169 
##                           Lichfield                             Lincoln 
##                                  58                                  57 
##                           Liverpool                               Luton 
##                                 297                                 121 
##                           Maidstone                              Maldon 
##                                  94                                  40 
##                       Malvern Hills                          Manchester 
##                                  45                                 278 
##                           Mansfield                              Medway 
##                                  67                                 163 
##                              Melton                              Mendip 
##                                  30                                  66 
##                              Merton                           Mid Devon 
##                                 122                                  43 
##                         Mid Suffolk                          Mid Sussex 
##                                  56                                  86 
##                       Middlesbrough                       Milton Keynes 
##                                  86                                 152 
##                         Mole Valley                          New Forest 
##                                  52                                 113 
##                 Newark and Sherwood                Newcastle-under-Lyme 
##                                  70                                  80 
##                 Newcastle upon Tyne                              Newham 
##                                 174                                 162 
##                         North Devon               North East Derbyshire 
##                                  58                                  63 
##             North East Lincolnshire                 North Hertfordshire 
##                                 106                                  82 
##                      North Kesteven                  North Lincolnshire 
##                                  64                                 101 
##                       North Norfolk                      North Somerset 
##                                  62                                 135 
##                      North Tyneside                  North Warwickshire 
##                                 131                                  38 
##           North West Leicestershire                         Northampton 
##                                  58                                 133 
##                      Northumberland                             Norwich 
##                                 197                                  83 
##                          Nottingham               Nuneaton and Bedworth 
##                                 182                                  81 
##                   Oadby and Wigston                              Oldham 
##                                  36                                 141 
##                              Oxford                              Pendle 
##                                  79                                  57 
##                        Peterborough                            Plymouth 
##                                 112                                 161 
##                          Portsmouth                             Preston 
##                                 124                                  86 
##                             Reading                           Redbridge 
##                                  97                                 161 
##                Redcar and Cleveland                            Redditch 
##                                  88                                  55 
##                Reigate and Banstead                       Ribble Valley 
##                                  83                                  39 
##                Richmond upon Thames                       Richmondshire 
##                                 115                                  34 
##                            Rochdale                            Rochford 
##                                 134                                  53 
##                          Rossendale                              Rother 
##                                  43                                  58 
##                           Rotherham                               Rugby 
##                                 167                                  61 
##                           Runnymede                          Rushcliffe 
##                                  51                                  68 
##                            Rushmoor                             Rutland 
##                                  58                                  23 
##                             Ryedale                             Salford 
##                                  30                                 149 
##                            Sandwell                         Scarborough 
##                                 186                                  71 
##                           Sedgemoor                              Sefton 
##                                  70                                 189 
##                               Selby                           Sevenoaks 
##                                  50                                  73 
##                           Sheffield                          Shropshire 
##                                 343                                 193 
##                              Slough                            Solihull 
##                                  80                                 132 
##           Somerset West and Taunton                         South Bucks 
##                                  88                                  37 
##                South Cambridgeshire                    South Derbyshire 
##                                  96                                  58 
##               South Gloucestershire                          South Hams 
##                                 165                                  49 
##                       South Holland                      South Kesteven 
##                                  49                                  81 
##                      South Lakeland                       South Norfolk 
##                                  59                                  81 
##              South Northamptonshire                   South Oxfordshire 
##                                  51                                  87 
##                        South Ribble                      South Somerset 
##                                  70                                 103 
##                 South Staffordshire                      South Tyneside 
##                                  68                                 102 
##                         Southampton                     Southend-on-Sea 
##                                 148                                 107 
##                           Southwark                          Spelthorne 
##                                 162                                  60 
##                           St Albans                          St. Helens 
##                                  83                                 119 
##                            Stafford             Staffordshire Moorlands 
##                                  80                                  59 
##                           Stevenage                           Stockport 
##                                  52                                 190 
##                    Stockton-on-Tees                      Stoke-on-Trent 
##                                 120                                 159 
##                   Stratford-on-Avon                              Stroud 
##                                  73                                  69 
##                          Sunderland                        Surrey Heath 
##                                 185                                  53 
##                              Sutton                               Swale 
##                                 121                                  85 
##                             Swindon                            Tameside 
##                                 132                                 141 
##                            Tamworth                           Tandridge 
##                                  51                                  49 
##                         Teignbridge                  Telford and Wrekin 
##                                  84                                 108 
##                            Tendring                         Test Valley 
##                                  89                                  71 
##                          Tewkesbury                              Thanet 
##                                  50                                  84 
##                        Three Rivers                            Thurrock 
##                                  49                                  98 
##               Tonbridge and Malling                              Torbay 
##                                  71                                  89 
##                            Torridge                       Tower Hamlets 
##                                  37                                 144 
##                            Trafford                     Tunbridge Wells 
##                                 138                                  66 
##                          Uttlesford                 Vale of White Horse 
##                                  45                                  76 
##                           Wakefield                             Walsall 
##                                 209                                 167 
##                      Waltham Forest                          Wandsworth 
##                                 144                                 178 
##                          Warrington                             Warwick 
##                                 127                                  85 
##                             Watford                            Waverley 
##                                  53                                  81 
##                             Wealden                      Wellingborough 
##                                  95                                  47 
##                     Welwyn Hatfield                      West Berkshire 
##                                  65                                  96 
##                          West Devon                     West Lancashire 
##                                  31                                  73 
##                        West Lindsey                    West Oxfordshire 
##                                  52                                  66 
##                        West Suffolk                         Westminster 
##                                 100                                 120 
##                               Wigan                           Wiltshire 
##                                 200                                 285 
##                          Winchester              Windsor and Maidenhead 
##                                  69                                  88 
##                              Wirral                              Woking 
##                                 206                                  59 
##                           Wokingham                       Wolverhampton 
##                                  99                                 158 
##                           Worcester                            Worthing 
##                                  63                                  65 
##                            Wychavon                             Wycombe 
##                                  78                                 106 
##                                Wyre                         Wyre Forest 
##                                  69                                  65 
##                                York 
##                                 120

local_authority_name_counts <- as.numeric(unique_names_1)
print(local_authority_name_counts)

##   [1]  42  60  78  94  74  78 114  54 110 209 147  49 109 109  70 114 103 146
##  [19] 638  60  91  94  48 177  36 233  75 308  87  78 173  45 165 262  84 195
##  [37]  58  56  71  60 120 128  67 132  60  90  68  57 157  99 107  75  93 232
##  [55] 212  69  71  54  66   4 105  49  41 326  51 324 195  32  66 220  91  65
##  [73]  58  44 151  43 194 218  67 201 196  50  81  71  84  81  49 210  72 146
##  [91]  61  77  36  76 183  78  44  73  74  73  55  67  50  51 126  77  78  53
## [109]  64  61 150  82 144  79  52 113  47 145  54 100 137  56  58  53  78 150
## [127] 116  61  59 161  66  81 142 105  52  85  89   1 123  90  57  89 166  98
## [145] 259  98 178  89 480 192  62 169  58  57 297 121  94  40  45 278  67 163
## [163]  30  66 122  43  56  86  86 152  52 113  70  80 174 162  58  63 106  82
## [181]  64 101  62 135 131  38  58 133 197  83 182  81  36 141  79  57 112 161
## [199] 124  86  97 161  88  55  83  39 115  34 134  53  43  58 167  61  51  68
## [217]  58  23  30 149 186  71  70 189  50  73 343 193  80 132  88  37  96  58
## [235] 165  49  49  81  59  81  51  87  70 103  68 102 148 107 162  60  83 119
## [253]  80  59  52 190 120 159  73  69 185  53 121  85 132 141  51  49  84 108
## [271]  89  71  50  84  49  98  71  89  37 144 138  66  45  76 209 167 144 178
## [289] 127  85  53  81  95  47  65  96  31  73  52  66 100 120 200 285  69  88
## [307] 206  59  99 158  63  65  78 106  69  65 120

## Summary statistics for local_authority_name
max(local_authority_name_counts)

## [1] 638

min(local_authority_name_counts)

## [1] 1

median(local_authority_name_counts)

## [1] 81

IQR(local_authority_name_counts)

## [1] 72

## T-test Four sample independent t-test is used to compare
## the significance level in combined_data data set.
## Student’s t-test assumes both groups of data are
## normally distributed and the variances of the two
## distributions are the same.
x1 <- combined_data$income_deprivation_rate_2015
x2 <- combined_data$employment_deprivation_rate_2015
x3 <- combined_data$housing_affordability_score_2016
y <- combined_data$avg_healthcare_distance_2022

# Plot a 2x2 histogram grid
par(mfrow = c(2, 2))
hist(x1)
hist(x2)
hist(x3)
hist(y)

# Based on histogram plots, we have to perform a Welch's
# t-test for x1, x2, both of which are skewed, and for x3
# as well since DV is highly skewed despite x3 being
# normally distributed.
t_test1 <- t.test(x1, y)
t_test2 <- t.test(x2, y)
t_test3 <- t.test(x3, y)
t_test1

## 
##  Welch Two Sample t-test
## 
## data:  x1 and y
## t = 143.53, df = 48378, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  8.252976 8.481491
## sample estimates:
## mean of x mean of y 
## 12.809981  4.442748

t_test2

## 
##  Welch Two Sample t-test
## 
## data:  x2 and y
## t = 39.567, df = 63616, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.292606 1.427343
## sample estimates:
## mean of x mean of y 
##  5.802723  4.442748

t_test3

## 
##  Welch Two Sample t-test
## 
## data:  x3 and y
## t = -156.93, df = 42960, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.497746 -4.386779
## sample estimates:
##    mean of x    mean of y 
## 0.0004856182 4.4427479057

## Covariance values of our primary and secondary IV
## against primary DV
cov(x1, y)

## [1] -12.79764

cov(x2, y)

## [1] -5.549447

cov(x3, y)

## [1] -2.716934

## Correlation values of our primary and secondary IV
## against primary DV As x1, x2 are not normally
## distributed, we will use 'spearman' method. We will use
## 'spearman' method for x3 despite it being normally
## distributed as DV is highly skewed.
cor(x1, y, method = "spearman")

## [1] -0.3929755

cor(x2, y, method = "spearman")

## [1] -0.3911658

cor(x3, y, method = "spearman")

## [1] -0.4408718

The above code performs frequency analysis and summary statistics for unique categories within two categorical columns of the combined_data dataset, constructs histograms for four numerical variables to assess their distribution, conducts Welch’s t-tests to compare means between independent and dependent variables given non-normal distributions, calculates covariance to measure joint variability, and computes Spearman correlation coefficients to evaluate the strength and direction of association, accounting for non-parametric data.

Variable distributions

DistPlot(combined_data, "income_deprivation_rate_2015", "Income Deprivation Rate Across LSOAs in England (2015)",
    subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "avg_healthcare_distance_2022", "Average Minutes to Healthcare Services Across LSOAs in England (2022)",
    subtitle = "Mean Access Time to Healthcare Services per LSOA",
    caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "housing_affordability_score_2016", "Housing Affordability Indicator Across LSOAs in England (2016)",
    subtitle = "Higher Scores Indicate Increased Deprivation in Entering Ownership or Private Rental Market",
    caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "employment_deprivation_rate_2015", "Employment Deprivation Rate in England (2015)",
    subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

# Both the main explanatory variable and outcome variable
# have a very strong right skew. Thus, a log transformation
# in both of these is examined.

The above code generates distribution plots for four different variables within the combined_data dataset, each with its own title, subtitle, and source caption. It visualizes the distribution of income deprivation rate for 2015, the average distance to healthcare services for 2022, the housing affordability score for 2016, and the employment deprivation rate for 2015 across Lower Super Output Areas (LSOAs) in England. The code also notes a significant right skew in the main explanatory and outcome variables, suggesting that a logarithmic transformation may be examined for these variables to address the skewness.

Log transformations

## Adding log transformed variables to dataframe:
combined_data$log_avg_healthcare_distance_2022 <- log(combined_data$avg_healthcare_distance_2022)
combined_data$log_income_deprivation_rate_2015 <- log(combined_data$income_deprivation_rate_2015)
combined_data$log_employment_deprivation_rate_2015 <- log(combined_data$employment_deprivation_rate_2015)

## Checking distributions:
DistPlot(combined_data, "log_income_deprivation_rate_2015", "log(Income Deprivation Rate) Across LSOAs in England (2015)",
    subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "log_avg_healthcare_distance_2022", "log(Average Minutes to Healthcare Services) Across LSOAs in England (2022)",
    subtitle = "Logarithm of Mean Access Time to Healthcare Services per LSOA",
    caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "log_employment_deprivation_rate_2015",
    "log(Employment Deprivation Rate) Across LSOAs in England (2015)",
    subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

## Filtering out rows with infinite values and creating a
## complete case sample:
complete_case_sample <- combined_data %>%
    select(log_income_deprivation_rate_2015, log_avg_healthcare_distance_2022,
        housing_affordability_score_2016, rural_urban_class_2011) %>%
    filter(!is.infinite(log_avg_healthcare_distance_2022), !is.infinite(log_income_deprivation_rate_2015)) %>%
    na.omit()

## Saving as CSV
write.csv(complete_case_sample, file = "Group_7_complete_case.csv",
    row.names = FALSE)

# Checking linearity of relationships between x variables
ggpairs(complete_case_sample[, c("log_income_deprivation_rate_2015",
    "housing_affordability_score_2016", "log_avg_healthcare_distance_2022")])

PointPlot(combined_data, "log_employment_deprivation_rate_2015",
    "log_income_deprivation_rate_2015", title = "Income Deprivation Rate (2015) as a Function of Employment Deprivation Rate (2015)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Income Deprivation Rate) (2015)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021",
    add_regression_line = TRUE)

result <- cor.test(combined_data$log_employment_deprivation_rate_2015,
    combined_data$income_deprivation_rate_2015, method = "pearson")

# creating a df with correlation test results
model_summary <- data.frame(Estimate = round(result$estimate,
    4), t.value = round(result$statistic, 2), p = ifelse(result$p.value <
    0.05, "< 0.05", round(result$p.value, 10)))

# pretty table using stargazer
stargazer(model_summary, type = "html", title = "Results of Pearson's Correlation Test",
    summary = FALSE, omit.stat = c("all"), single.row = TRUE,
    rownames = FALSE, out = "x1x2.html")

**Results of Pearson’s Correlation Test**

Estimate	t.value	p

0.897	367	< 0.05

rstudioapi::viewer("x1x2.html")

NULL

# As employment deprivation rate and income deprivation
# rate are highly correlated, employment deprivation rate
# will be excluded from analyses (to avoid issues of
# multicollinearity).

# Examining linearity between xi and y
PointPlot(complete_case_sample, "log_income_deprivation_rate_2015",
    "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Income Deprivation Rate (2015)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "log(Income Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
    add_regression_line = TRUE)

PointPlot(complete_case_sample, "housing_affordability_score_2016",
    "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Housing Affordability Score (2016)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "Housing Affordability Score (2016)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
    add_regression_line = TRUE)

PointPlot(combined_data, "log_employment_deprivation_rate_2015",
    "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Employment Deprivation Rate (2015)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
    add_regression_line = TRUE)

The above code performs data transformation, cleaning, visualization, analysis, and reporting tasks on a dataset. It applies logarithmic transformations to skewed variables related to healthcare distance and deprivation rates to normalize their distributions, which are then visualized using custom DistPlot functions. A cleaning step filters out infinite and missing values, resulting in a complete_case_sample dataset that is saved as a CSV file. The ggpairs and PointPlot functions are used to check for linearity and visualize relationships between variables. A Pearson correlation test is conducted between the log-transformed employment deprivation rate and income deprivation rate, with results formatted into an HTML table using stargazer. Based on the high correlation observed, the employment deprivation rate is considered for exclusion from further analysis to prevent multicollinearity. The PointPlot visualizations with regression lines further explore the relationships between the remaining variables, assessing the linearity between independent variables and the dependent variable.

Summary statistics with log transformed variables

### PLotting of table for slide 11
combined_data_sum <- combined_data[c("local_authority_name",
    "housing_affordability_score_2016", "income_deprivation_rate_2015",
    "employment_deprivation_rate_2015", "avg_healthcare_distance_2022",
    "rural_urban_class_2011")]

html_table <- stargazer(combined_data_sum, type = "html", title = "Table 1: Summary Statistics",
    digits = 1, column.labels = c("Housing Affordability Score (2016)",
        "Income Deprivation Rate (2015)", "Employment Deprivation Rate (2015)",
        "Average Healthcare Distance (2022)"), out = "table.html")

**Table 1: Summary Statistics**

Statistic	N	Mean	St. Dev.	Min	Max

housing_affordability_score_2016	32,715	0.000	1.9	-6.7	8.0
income_deprivation_rate_2015	32,715	12.8	9.4	0.2	60.9
employment_deprivation_rate_2015	32,715	5.8	4.0	0.2	34.7
avg_healthcare_distance_2022	32,715	4.4	4.8	0.0	60.3

# See HTML table in R Studio
rstudioapi::viewer("table.html")

NULL

### Calculate missingness percentages for each variable for
### slide 11
missingness <- sapply(combined_data_sum, function(x) mean(is.na(x)) *
    100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 2: Missingness Summary/%",
    summary = FALSE, flip = TRUE)

**Table 2: Missingness Summary/%**

local_authority_name	0
housing_affordability_score_2016	0
income_deprivation_rate_2015	0
employment_deprivation_rate_2015	0
avg_healthcare_distance_2022	0
rural_urban_class_2011	0

html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### count of unique names for rural/urban for slide 12
unique_class2011 <- table(combined_data_sum$rural_urban_class_2011)
unique_class2011.1 <- as.data.frame(unique_class2011)

# Rename the columns
colnames(unique_class2011.1) <- c("Rural/Urban Class (2011)",
    "Count")

# Create HTML table using stargazer
html_table1 <- stargazer(unique_class2011.1, type = "html", title = "Unique Names and its Respective Counts",
    summary = FALSE)

**Unique Names and its Respective Counts**

	Rural/Urban Class (2011)	Count

1	Rural town and fringe	2,930
2	Rural town and fringe in a sparse setting	119
3	Rural village and dispersed	2,356
4	Rural village and dispersed in a sparse setting	181
5	Urban city and town	14,399
6	Urban city and town in a sparse setting	59
7	Urban major conurbation	11,465
8	Urban minor conurbation	1,206

cat(html_table1)

**Unique Names and its Respective Counts**

	Rural/Urban Class (2011)	Count

1	Rural town and fringe	2,930
2	Rural town and fringe in a sparse setting	119
3	Rural village and dispersed	2,356
4	Rural village and dispersed in a sparse setting	181
5	Urban city and town	14,399
6	Urban city and town in a sparse setting	59
7	Urban major conurbation	11,465
8	Urban minor conurbation	1,206

writeLines(html_table1, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

# Calculate summary statistics
summary_ruralurban <- c(Max = max(rural_urban_class_2011_counts),
    Min = min(rural_urban_class_2011_counts), Med = median(rural_urban_class_2011_counts),
    IQR = IQR(rural_urban_class_2011_counts))

# Create data frame for summary statistics
summary_df <- data.frame(statistic = names(summary_ruralurban),
    value = summary_ruralurban)

# Create HTML table using stargazer
html_table2 <- stargazer(summary_df, type = "html", title = "Summary Statistics",
    summary = FALSE, rownames = FALSE)

**Summary Statistics**

statistic	value

Max	14,399
Min	59
Med	1,781
IQR	4,898.250

cat(html_table2)

**Summary Statistics**

statistic	value

Max	14,399
Min	59
Med	1,781
IQR	4,898.250

writeLines(html_table2, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

## count of unique names for local authority names for
## slide 12
unique_localauthority <- table(combined_data_sum$local_authority_name)

# Convert the result to a data frame
unique_localauthority1 <- as.data.frame(unique_localauthority)

colnames(unique_localauthority1) <- c("Local Authority Names",
    "Count")

# Create HTML table using stargazer
html_table3 <- stargazer(unique_localauthority1, type = "html",
    title = "Unique Names and its Respective Counts", summary = FALSE)

**Unique Names and its Respective Counts**

	Local Authority Names	Count

1	Adur	42
2	Allerdale	60
3	Amber Valley	78
4	Arun	94
5	Ashfield	74
6	Ashford	78
7	Aylesbury Vale	114
8	Babergh	54
9	Barking and Dagenham	110
10	Barnet	209
11	Barnsley	147
12	Barrow-in-Furness	49
13	Basildon	109
14	Basingstoke and Deane	109
15	Bassetlaw	70
16	Bath and North East Somerset	114
17	Bedford	103
18	Bexley	146
19	Birmingham	638
20	Blaby	60
21	Blackburn with Darwen	91
22	Blackpool	94
23	Bolsover	48
24	Bolton	177
25	Boston	36
26	Bournemouth, Christchurch and Poole	233
27	Bracknell Forest	75
28	Bradford	308
29	Braintree	87
30	Breckland	78
31	Brent	173
32	Brentwood	45
33	Brighton and Hove	165
34	Bristol, City of	262
35	Broadland	84
36	Bromley	195
37	Bromsgrove	58
38	Broxbourne	56
39	Broxtowe	71
40	Burnley	60
41	Bury	120
42	Calderdale	128
43	Cambridge	67
44	Camden	132
45	Cannock Chase	60
46	Canterbury	90
47	Carlisle	68
48	Castle Point	57
49	Central Bedfordshire	157
50	Charnwood	99
51	Chelmsford	107
52	Cheltenham	75
53	Cherwell	93
54	Cheshire East	232
55	Cheshire West and Chester	212
56	Chesterfield	69
57	Chichester	71
58	Chiltern	54
59	Chorley	66
60	City of London	4
61	Colchester	105
62	Copeland	49
63	Corby	41
64	Cornwall	326
65	Cotswold	51
66	County Durham	324
67	Coventry	195
68	Craven	32
69	Crawley	66
70	Croydon	220
71	Dacorum	91
72	Darlington	65
73	Dartford	58
74	Daventry	44
75	Derby	151
76	Derbyshire Dales	43
77	Doncaster	194
78	Dorset	218
79	Dover	67
80	Dudley	201
81	Ealing	196
82	East Cambridgeshire	50
83	East Devon	81
84	East Hampshire	71
85	East Hertfordshire	84
86	East Lindsey	81
87	East Northamptonshire	49
88	East Riding of Yorkshire	210
89	East Staffordshire	72
90	East Suffolk	146
91	Eastbourne	61
92	Eastleigh	77
93	Eden	36
94	Elmbridge	76
95	Enfield	183
96	Epping Forest	78
97	Epsom and Ewell	44
98	Erewash	73
99	Exeter	74
100	Fareham	73
101	Fenland	55
102	Folkestone and Hythe	67
103	Forest of Dean	50
104	Fylde	51
105	Gateshead	126
106	Gedling	77
107	Gloucester	78
108	Gosport	53
109	Gravesham	64
110	Great Yarmouth	61
111	Greenwich	150
112	Guildford	82
113	Hackney	144
114	Halton	79
115	Hambleton	52
116	Hammersmith and Fulham	113
117	Harborough	47
118	Haringey	145
119	Harlow	54
120	Harrogate	100
121	Harrow	137
122	Hart	56
123	Hartlepool	58
124	Hastings	53
125	Havant	78
126	Havering	150
127	Herefordshire, County of	116
128	Hertsmere	61
129	High Peak	59
130	Hillingdon	161
131	Hinckley and Bosworth	66
132	Horsham	81
133	Hounslow	142
134	Huntingdonshire	105
135	Hyndburn	52
136	Ipswich	85
137	Isle of Wight	89
138	Isles of Scilly	1
139	Islington	123
140	Kensington and Chelsea	90
141	Kettering	57
142	King’s Lynn and West Norfolk	89
143	Kingston upon Hull, City of	166
144	Kingston upon Thames	98
145	Kirklees	259
146	Knowsley	98
147	Lambeth	178
148	Lancaster	89
149	Leeds	480
150	Leicester	192
151	Lewes	62
152	Lewisham	169
153	Lichfield	58
154	Lincoln	57
155	Liverpool	297
156	Luton	121
157	Maidstone	94
158	Maldon	40
159	Malvern Hills	45
160	Manchester	278
161	Mansfield	67
162	Medway	163
163	Melton	30
164	Mendip	66
165	Merton	122
166	Mid Devon	43
167	Mid Suffolk	56
168	Mid Sussex	86
169	Middlesbrough	86
170	Milton Keynes	152
171	Mole Valley	52
172	New Forest	113
173	Newark and Sherwood	70
174	Newcastle-under-Lyme	80
175	Newcastle upon Tyne	174
176	Newham	162
177	North Devon	58
178	North East Derbyshire	63
179	North East Lincolnshire	106
180	North Hertfordshire	82
181	North Kesteven	64
182	North Lincolnshire	101
183	North Norfolk	62
184	North Somerset	135
185	North Tyneside	131
186	North Warwickshire	38
187	North West Leicestershire	58
188	Northampton	133
189	Northumberland	197
190	Norwich	83
191	Nottingham	182
192	Nuneaton and Bedworth	81
193	Oadby and Wigston	36
194	Oldham	141
195	Oxford	79
196	Pendle	57
197	Peterborough	112
198	Plymouth	161
199	Portsmouth	124
200	Preston	86
201	Reading	97
202	Redbridge	161
203	Redcar and Cleveland	88
204	Redditch	55
205	Reigate and Banstead	83
206	Ribble Valley	39
207	Richmond upon Thames	115
208	Richmondshire	34
209	Rochdale	134
210	Rochford	53
211	Rossendale	43
212	Rother	58
213	Rotherham	167
214	Rugby	61
215	Runnymede	51
216	Rushcliffe	68
217	Rushmoor	58
218	Rutland	23
219	Ryedale	30
220	Salford	149
221	Sandwell	186
222	Scarborough	71
223	Sedgemoor	70
224	Sefton	189
225	Selby	50
226	Sevenoaks	73
227	Sheffield	343
228	Shropshire	193
229	Slough	80
230	Solihull	132
231	Somerset West and Taunton	88
232	South Bucks	37
233	South Cambridgeshire	96
234	South Derbyshire	58
235	South Gloucestershire	165
236	South Hams	49
237	South Holland	49
238	South Kesteven	81
239	South Lakeland	59
240	South Norfolk	81
241	South Northamptonshire	51
242	South Oxfordshire	87
243	South Ribble	70
244	South Somerset	103
245	South Staffordshire	68
246	South Tyneside	102
247	Southampton	148
248	Southend-on-Sea	107
249	Southwark	162
250	Spelthorne	60
251	St Albans	83
252	St. Helens	119
253	Stafford	80
254	Staffordshire Moorlands	59
255	Stevenage	52
256	Stockport	190
257	Stockton-on-Tees	120
258	Stoke-on-Trent	159
259	Stratford-on-Avon	73
260	Stroud	69
261	Sunderland	185
262	Surrey Heath	53
263	Sutton	121
264	Swale	85
265	Swindon	132
266	Tameside	141
267	Tamworth	51
268	Tandridge	49
269	Teignbridge	84
270	Telford and Wrekin	108
271	Tendring	89
272	Test Valley	71
273	Tewkesbury	50
274	Thanet	84
275	Three Rivers	49
276	Thurrock	98
277	Tonbridge and Malling	71
278	Torbay	89
279	Torridge	37
280	Tower Hamlets	144
281	Trafford	138
282	Tunbridge Wells	66
283	Uttlesford	45
284	Vale of White Horse	76
285	Wakefield	209
286	Walsall	167
287	Waltham Forest	144
288	Wandsworth	178
289	Warrington	127
290	Warwick	85
291	Watford	53
292	Waverley	81
293	Wealden	95
294	Wellingborough	47
295	Welwyn Hatfield	65
296	West Berkshire	96
297	West Devon	31
298	West Lancashire	73
299	West Lindsey	52
300	West Oxfordshire	66
301	West Suffolk	100
302	Westminster	120
303	Wigan	200
304	Wiltshire	285
305	Winchester	69
306	Windsor and Maidenhead	88
307	Wirral	206
308	Woking	59
309	Wokingham	99
310	Wolverhampton	158
311	Worcester	63
312	Worthing	65
313	Wychavon	78
314	Wycombe	106
315	Wyre	69
316	Wyre Forest	65
317	York	120

cat(html_table3)

**Unique Names and its Respective Counts**

	Local Authority Names	Count

1	Adur	42
2	Allerdale	60
3	Amber Valley	78
4	Arun	94
5	Ashfield	74
6	Ashford	78
7	Aylesbury Vale	114
8	Babergh	54
9	Barking and Dagenham	110
10	Barnet	209
11	Barnsley	147
12	Barrow-in-Furness	49
13	Basildon	109
14	Basingstoke and Deane	109
15	Bassetlaw	70
16	Bath and North East Somerset	114
17	Bedford	103
18	Bexley	146
19	Birmingham	638
20	Blaby	60
21	Blackburn with Darwen	91
22	Blackpool	94
23	Bolsover	48
24	Bolton	177
25	Boston	36
26	Bournemouth, Christchurch and Poole	233
27	Bracknell Forest	75
28	Bradford	308
29	Braintree	87
30	Breckland	78
31	Brent	173
32	Brentwood	45
33	Brighton and Hove	165
34	Bristol, City of	262
35	Broadland	84
36	Bromley	195
37	Bromsgrove	58
38	Broxbourne	56
39	Broxtowe	71
40	Burnley	60
41	Bury	120
42	Calderdale	128
43	Cambridge	67
44	Camden	132
45	Cannock Chase	60
46	Canterbury	90
47	Carlisle	68
48	Castle Point	57
49	Central Bedfordshire	157
50	Charnwood	99
51	Chelmsford	107
52	Cheltenham	75
53	Cherwell	93
54	Cheshire East	232
55	Cheshire West and Chester	212
56	Chesterfield	69
57	Chichester	71
58	Chiltern	54
59	Chorley	66
60	City of London	4
61	Colchester	105
62	Copeland	49
63	Corby	41
64	Cornwall	326
65	Cotswold	51
66	County Durham	324
67	Coventry	195
68	Craven	32
69	Crawley	66
70	Croydon	220
71	Dacorum	91
72	Darlington	65
73	Dartford	58
74	Daventry	44
75	Derby	151
76	Derbyshire Dales	43
77	Doncaster	194
78	Dorset	218
79	Dover	67
80	Dudley	201
81	Ealing	196
82	East Cambridgeshire	50
83	East Devon	81
84	East Hampshire	71
85	East Hertfordshire	84
86	East Lindsey	81
87	East Northamptonshire	49
88	East Riding of Yorkshire	210
89	East Staffordshire	72
90	East Suffolk	146
91	Eastbourne	61
92	Eastleigh	77
93	Eden	36
94	Elmbridge	76
95	Enfield	183
96	Epping Forest	78
97	Epsom and Ewell	44
98	Erewash	73
99	Exeter	74
100	Fareham	73
101	Fenland	55
102	Folkestone and Hythe	67
103	Forest of Dean	50
104	Fylde	51
105	Gateshead	126
106	Gedling	77
107	Gloucester	78
108	Gosport	53
109	Gravesham	64
110	Great Yarmouth	61
111	Greenwich	150
112	Guildford	82
113	Hackney	144
114	Halton	79
115	Hambleton	52
116	Hammersmith and Fulham	113
117	Harborough	47
118	Haringey	145
119	Harlow	54
120	Harrogate	100
121	Harrow	137
122	Hart	56
123	Hartlepool	58
124	Hastings	53
125	Havant	78
126	Havering	150
127	Herefordshire, County of	116
128	Hertsmere	61
129	High Peak	59
130	Hillingdon	161
131	Hinckley and Bosworth	66
132	Horsham	81
133	Hounslow	142
134	Huntingdonshire	105
135	Hyndburn	52
136	Ipswich	85
137	Isle of Wight	89
138	Isles of Scilly	1
139	Islington	123
140	Kensington and Chelsea	90
141	Kettering	57
142	King’s Lynn and West Norfolk	89
143	Kingston upon Hull, City of	166
144	Kingston upon Thames	98
145	Kirklees	259
146	Knowsley	98
147	Lambeth	178
148	Lancaster	89
149	Leeds	480
150	Leicester	192
151	Lewes	62
152	Lewisham	169
153	Lichfield	58
154	Lincoln	57
155	Liverpool	297
156	Luton	121
157	Maidstone	94
158	Maldon	40
159	Malvern Hills	45
160	Manchester	278
161	Mansfield	67
162	Medway	163
163	Melton	30
164	Mendip	66
165	Merton	122
166	Mid Devon	43
167	Mid Suffolk	56
168	Mid Sussex	86
169	Middlesbrough	86
170	Milton Keynes	152
171	Mole Valley	52
172	New Forest	113
173	Newark and Sherwood	70
174	Newcastle-under-Lyme	80
175	Newcastle upon Tyne	174
176	Newham	162
177	North Devon	58
178	North East Derbyshire	63
179	North East Lincolnshire	106
180	North Hertfordshire	82
181	North Kesteven	64
182	North Lincolnshire	101
183	North Norfolk	62
184	North Somerset	135
185	North Tyneside	131
186	North Warwickshire	38
187	North West Leicestershire	58
188	Northampton	133
189	Northumberland	197
190	Norwich	83
191	Nottingham	182
192	Nuneaton and Bedworth	81
193	Oadby and Wigston	36
194	Oldham	141
195	Oxford	79
196	Pendle	57
197	Peterborough	112
198	Plymouth	161
199	Portsmouth	124
200	Preston	86
201	Reading	97
202	Redbridge	161
203	Redcar and Cleveland	88
204	Redditch	55
205	Reigate and Banstead	83
206	Ribble Valley	39
207	Richmond upon Thames	115
208	Richmondshire	34
209	Rochdale	134
210	Rochford	53
211	Rossendale	43
212	Rother	58
213	Rotherham	167
214	Rugby	61
215	Runnymede	51
216	Rushcliffe	68
217	Rushmoor	58
218	Rutland	23
219	Ryedale	30
220	Salford	149
221	Sandwell	186
222	Scarborough	71
223	Sedgemoor	70
224	Sefton	189
225	Selby	50
226	Sevenoaks	73
227	Sheffield	343
228	Shropshire	193
229	Slough	80
230	Solihull	132
231	Somerset West and Taunton	88
232	South Bucks	37
233	South Cambridgeshire	96
234	South Derbyshire	58
235	South Gloucestershire	165
236	South Hams	49
237	South Holland	49
238	South Kesteven	81
239	South Lakeland	59
240	South Norfolk	81
241	South Northamptonshire	51
242	South Oxfordshire	87
243	South Ribble	70
244	South Somerset	103
245	South Staffordshire	68
246	South Tyneside	102
247	Southampton	148
248	Southend-on-Sea	107
249	Southwark	162
250	Spelthorne	60
251	St Albans	83
252	St. Helens	119
253	Stafford	80
254	Staffordshire Moorlands	59
255	Stevenage	52
256	Stockport	190
257	Stockton-on-Tees	120
258	Stoke-on-Trent	159
259	Stratford-on-Avon	73
260	Stroud	69
261	Sunderland	185
262	Surrey Heath	53
263	Sutton	121
264	Swale	85
265	Swindon	132
266	Tameside	141
267	Tamworth	51
268	Tandridge	49
269	Teignbridge	84
270	Telford and Wrekin	108
271	Tendring	89
272	Test Valley	71
273	Tewkesbury	50
274	Thanet	84
275	Three Rivers	49
276	Thurrock	98
277	Tonbridge and Malling	71
278	Torbay	89
279	Torridge	37
280	Tower Hamlets	144
281	Trafford	138
282	Tunbridge Wells	66
283	Uttlesford	45
284	Vale of White Horse	76
285	Wakefield	209
286	Walsall	167
287	Waltham Forest	144
288	Wandsworth	178
289	Warrington	127
290	Warwick	85
291	Watford	53
292	Waverley	81
293	Wealden	95
294	Wellingborough	47
295	Welwyn Hatfield	65
296	West Berkshire	96
297	West Devon	31
298	West Lancashire	73
299	West Lindsey	52
300	West Oxfordshire	66
301	West Suffolk	100
302	Westminster	120
303	Wigan	200
304	Wiltshire	285
305	Winchester	69
306	Windsor and Maidenhead	88
307	Wirral	206
308	Woking	59
309	Wokingham	99
310	Wolverhampton	158
311	Worcester	63
312	Worthing	65
313	Wychavon	78
314	Wycombe	106
315	Wyre	69
316	Wyre Forest	65
317	York	120

writeLines(html_table3, html_file)
rstudioapi::viewer(html_file)

NULL

# Calculate summary statistics
summary_localauthority <- c(Max = max(local_authority_name_counts),
    Min = min(local_authority_name_counts), Med = median(local_authority_name_counts),
    IQR = IQR(local_authority_name_counts))

summary_df <- data.frame(statistic = names(summary_localauthority),
    value = summary_localauthority)

# Create HTML table using stargazer
html_table4 <- stargazer(summary_df, type = "html", title = "Summary Statistics",
    summary = FALSE, rownames = FALSE)

**Summary Statistics**

statistic	value

Max	638
Min	1
Med	81
IQR	72

cat(html_table4)

**Summary Statistics**

statistic	value

Max	638
Min	1
Med	81
IQR	72

writeLines(html_table4, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### Repeat for complete_case.csv for slide 16 Read the
### complete_case file
complete_case_sum <- read.csv("Group_7_complete_case.csv")

html_table <- stargazer(complete_case_sum, type = "html", title = "Table 3: Summary Statistics",
    digits = 1, column.labels = c("Log Income Deprivation Rate (2015)",
        "Log Average Healthcare Distance (2022)", "Housing Affordability Score (2016)",
        "Rural/Urban Class (2011)"), out = "table.html")

**Table 3: Summary Statistics**

Statistic	N	Mean	St. Dev.	Min	Max

log_income_deprivation_rate_2015	32,714	2.3	0.8	-1.7	4.1
log_avg_healthcare_distance_2022	32,714	1.2	0.8	-1.5	4.1
housing_affordability_score_2016	32,714	0.001	1.9	-6.7	8.0

# See HTML table in R Studio
rstudioapi::viewer("table.html")

NULL

### Calculate missingness percentages for each variable for
### slide 16
missingness <- sapply(complete_case_sum, function(x) mean(is.na(x)) *
    100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 4: Missingness Summary/%",
    summary = FALSE, flip = TRUE)

**Table 4: Missingness Summary/%**

log_income_deprivation_rate_2015	0
log_avg_healthcare_distance_2022	0
housing_affordability_score_2016	0
rural_urban_class_2011	0

html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### covariance of x1,x2,x3 before log transformation for
### slide 17
x1 <- combined_data_sum$income_deprivation_rate_2015
x2 <- combined_data_sum$employment_deprivation_rate_2015
x3 <- combined_data_sum$housing_affordability_score_2016
y <- combined_data_sum$avg_healthcare_distance_2022
cov_x1y <- cov(x1, y)
cov_x2y <- cov(x2, y)
cov_x3y <- cov(x3, y)

# Create a data frame for the covariances
covariances_df <- data.frame(variables = c("x1 and y", "x2 and y",
    "x3 and y"), covariance = c(cov_x1y, cov_x2y, cov_x3y))

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances before log transformation",
    summary = FALSE, rownames = FALSE)

**Covariances before log transformation**

variables	covariance

x1 and y	-12.798
x2 and y	-5.549
x3 and y	-2.717

html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### covariance of x4,x5 after log transformation for slide
### 17
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <- complete_case_sum$log_income_deprivation_rate_2015
x5 <- complete_case_sum$housing_affordability_score_2016
y1 <- complete_case_sum$log_avg_healthcare_distance_2022
cov_x4y1 <- cov(x4, y1)
cov_x5y1 <- cov(x5, y1)

# Create a data frame for the covariances
covariances_df <- data.frame(variables = c("x4 and y1", "x5 and y1"),
    covariance = c(cov_x4y1, cov_x5y1))

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances after log transformation",
    summary = FALSE, rownames = FALSE)

**Covariances after log transformation**

variables	covariance

x4 and y1	-0.219
x5 and y1	-0.597

html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

## Correlation test for slide 18
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <- complete_case_sum$log_income_deprivation_rate_2015
x5 <- complete_case_sum$housing_affordability_score_2016
y1 <- complete_case_sum$log_avg_healthcare_distance_2022
corx4y1 <- cor.test(x4, y1, method = "pearson")
corx5y1 <- cor.test(x5, y1, method = "pearson")

# Create a data frame to store correlation results
# format.pval function is used to format the p-value to
# make it into a string ie. a range
corr_df <- data.frame(variables = c("x4 and y1", "x5 and y1"),
    Pearson_Correlation = c(corx4y1$estimate, corx5y1$estimate),
    t_value = c(corx4y1$statistic, corx5y1$statistic), Estimate = c(corx4y1$estimate,
        corx5y1$estimate), p_value = ifelse(corx4y1$p.value ==
        0, "< 0.001", format.pval(corx4y1$p.value)))

# Create HTML table using stargazer
html_table <- stargazer(corr_df, type = "html", title = "Pearson Correlations After Log Transformation",
    summary = FALSE, rownames = FALSE)

**Pearson Correlations After Log Transformation**

variables	Pearson_Correlation	t_value	Estimate	p_value

x4 and y1	-0.367	-71.408	-0.367	< 0.001
x5 and y1	-0.408	-80.848	-0.408	< 0.001

html_file <- "cor_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL The above code is a sequence of commands for generating HTML tables that display summary statistics, missingness percentages, unique category counts, and measures of covariance and correlation. It begins with extracting and tabulating a subset of the dataset for slide 11, continues with analyzing missing data and category counts for slides 11 and 12, then moves on to slide 16 where it reads a cleaned dataset to produce similar summaries. For slide 17, it calculates covariances pre- and post-log transformation, and for slide 18, it conducts Pearson correlation tests, with results formatted and exported as HTML tables viewable within RStudio.

Model creation

# Model 1:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate.
mod1 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(mod1)

## 
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, 
##     data = complete_case_sample)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.70482 -0.47303 -0.06624  0.38161  2.80484 
## 
## Coefficients:
##                                   Estimate Std. Error t value
## (Intercept)                       1.977622   0.012243  161.53
## log_income_deprivation_rate_2015 -0.363913   0.005096  -71.41
##                                             Pr(>|t|)    
## (Intercept)                      <0.0000000000000002 ***
## log_income_deprivation_rate_2015 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7149 on 32712 degrees of freedom
## Multiple R-squared:  0.1349, Adjusted R-squared:  0.1348 
## F-statistic:  5099 on 1 and 32712 DF,  p-value: < 0.00000000000000022

# Model 1 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate. As both the dependent and independent variables are log transformed, the coefficients may be interpreted in terms of percentage change. That is, a 1% increase in the income deprivation rate is associated with a 0.35% decrease in the average travel time to a healthcare service. This coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). Thus, according to this model, we may reject the null hypothesis that there is no relationship between income deprivation rate and average healthcare travel distance. Further, the p-value associated with the F-statistic is also significant considering the same alpha, sugegsting at least one variable in the model has some explanatory power regarding average healthcare distance. This statistically is not that useful regarding this model however, as only one explanatory variable is considered. The coefficient of determination for this model indicates that we are able to explain 13.1% of the variation in average travel time to healthcare services with this model.

# Model 2:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate and the housing affordability indicator
mod2 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)

# Exponentiating untransformed variable coefficients to allow easier interpretation
mod2$coefficients["housing_affordability_score_2016"] <- exp(mod2$coefficients["housing_affordability_score_2016"])

# Viewing model results
summary(mod2)

## 
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + 
##     housing_affordability_score_2016, data = complete_case_sample)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5103 -0.4599 -0.0744  0.3675  2.7243 
## 
## Coefficients:
##                                   Estimate Std. Error t value
## (Intercept)                       1.604985   0.014034  114.36
## log_income_deprivation_rate_2015 -0.200002   0.005940  -33.67
## housing_affordability_score_2016  0.887820   0.002419  367.07
##                                             Pr(>|t|)    
## (Intercept)                      <0.0000000000000002 ***
## log_income_deprivation_rate_2015 <0.0000000000000002 ***
## housing_affordability_score_2016 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6899 on 32711 degrees of freedom
## Multiple R-squared:  0.1945, Adjusted R-squared:  0.1944 
## F-statistic:  3948 on 2 and 32711 DF,  p-value: < 0.00000000000000022

# Model 2 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate and the housing affordability indicator. Here, we can see that a 1% increase in the income deprivation rate is associated with a 0.19% decrease in the average travel time to a healthcare service. Thus, we can see that when we control for a further underlying economic variable, the explanatory power of income deprivation alone decreases. Again, this coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). For interpretation of the housing affordability indicator, the exponential of the coefficient is considered (0.88). Thus, we can see that a 1 unit increase in the housing affordability indicator is associated with a 12% decrease in the average travel time to a healthcare service, again significant considering an alpha of 0.05. Remembering that increases in the housing affordability indicator signify increased inability to afford to enter owner-occupation or the private rental market, this again appears to indicate that more economically deprived areas have greater geographical access to healthcare services. Further, the p-value associated with the F-statistic is also significant at the 5% level, again suggesting at least one variable in the model has some explanatory power regarding average healthcare distance. The adjusted coefficient of determination for this model indicates that we are able to explain 19.3% of the variation in average travel time to healthcare services with this model.

# Model 3:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate, the housing affordability indicator, and the re-leveled Rural-Urban Classification (RUC11) with "Urban city and town" as the reference category
mod3 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, "Urban city and town"), data = complete_case_sample)

# Exponentiated coefficient data
mod3_exp_coeff <- data.frame(exp(coef(mod3)))

# Exponentiating untransformed variable coefficients WITHIN the model to allow easier interpretation
mod3$coefficients[3:10] <- exp(mod3$coefficients[3:10])

# Viewing model results
summary(mod3)

## 
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + 
##     housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, 
##     "Urban city and town"), data = complete_case_sample)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35847 -0.33319  0.02095  0.34803  2.03045 
## 
## Coefficients:
##                                                                                                                     Estimate
## (Intercept)                                                                                                         1.309527
## log_income_deprivation_rate_2015                                                                                   -0.103974
## housing_affordability_score_2016                                                                                    0.931051
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                            1.763662
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting        1.208809
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                      4.439979
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting  6.326992
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting          0.958377
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                          0.781509
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                          0.861055
##                                                                                                                    Std. Error
## (Intercept)                                                                                                          0.011329
## log_income_deprivation_rate_2015                                                                                     0.004576
## housing_affordability_score_2016                                                                                     0.001921
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                             0.010651
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting         0.048121
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                       0.011797
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting   0.039126
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting           0.068209
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                           0.006801
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                           0.015798
##                                                                                                                    t value
## (Intercept)                                                                                                         115.59
## log_income_deprivation_rate_2015                                                                                    -22.72
## housing_affordability_score_2016                                                                                    484.75
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                            165.58
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting         25.12
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                      376.37
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting  161.71
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting           14.05
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                          114.91
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                           54.50
##                                                                                                                               Pr(>|t|)
## (Intercept)                                                                                                        <0.0000000000000002
## log_income_deprivation_rate_2015                                                                                   <0.0000000000000002
## housing_affordability_score_2016                                                                                   <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                           <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting       <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                     <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting         <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                         <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                         <0.0000000000000002
##                                                                                                                       
## (Intercept)                                                                                                        ***
## log_income_deprivation_rate_2015                                                                                   ***
## housing_affordability_score_2016                                                                                   ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                           ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting       ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                     ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting         ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                         ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                         ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5228 on 32704 degrees of freedom
## Multiple R-squared:  0.5376, Adjusted R-squared:  0.5374 
## F-statistic:  4224 on 9 and 32704 DF,  p-value: < 0.00000000000000022

# Creating data for confidence interval plot
ConfidencePlotData <- data.frame (
  "names" = as.factor (c("Rural town and fringe", "Rural town and fringe in a sparse setting", "Rural village and dispersed", "Rural village and dispersed in a sparse setting", "Urban city and town in a sparse setting", "Urban major conurbation", "Urban minor conurbation")), # Creating predictor names (row names)
  "coefs" = mod3_exp_coeff[4:10, 1],
  "lower" = confint (mod3) [4:10, 1], # Creating a lower confidence interval [ , 1] list for predictors 2-3 in the model [2:5 , ]
  "upper" = confint (mod3) [4:10, 2] # Creating an upper confidence interval [ , 2] list for predictors 2-3 in the model [2:5 , ]
)
ConfidencePlotData$names <- factor(ConfidencePlotData$names, levels = unique(ConfidencePlotData$names))

# Creating confidence interval plot for RUC coefficients
ConfidencePlot(ConfidencePlotData, "Healthcare Access by Rural-Urban Classification in England", "Reference Category: Urban City and Town", x_label = "Exponentiated Coefficient", middle_point = 1, caption = "*95% CI Indicated by Green Error Bars. Red dashed line indicates reference group. Controlling for income deprivation (2015) and housing affordability indicators (2016).")

# Model 3 expands on Model 2 by including the Rural-Urban Classification (RUC11) as a factor variable, allowing us to account for geographic differences in average travel time to healthcare services. The model estimates the logarithm of average travel time to a healthcare service within a Lower Layer Super Output Area (LSOA) as a function of the logarithm of the income deprivation rate, the housing affordability indicator, and the RUC11 categories. We observe that a 1% increase in the income deprivation rate is associated with a 0.10% decrease in average travel time to healthcare services, when holding the other variables constant. This relationship is statistically significant at the 5% level, with a p-value less than 0.05, indicating that income deprivation continues to be an important factor in explaining variations in healthcare access, albeit with a smaller coefficient compared to Model 2. Similarly, a 1 unit increase in the housing affordability indicator, which reflects increased difficulty in entering owner-occupation or the private rental market, is associated with a 7% decrease in average travel time to healthcare services. This relationship is also statistically significant at the 5% level. The new variables introduced, representing the RUC11 categories, show significant variations in healthcare access across different geographic areas. For instance, compared to Urban city and town areas, Rural village and dispersed areas experience a 344% increase in average travel time to healthcare services, indicating that rural areas face substantial barriers in healthcare access. These relationships are statistically significant at the 5% level, with the exception of Urban city and town in a sparse setting category, which does not show a significant difference in healthcare access compared to Urban city and town areas. The adjusted coefficient of determination for this model is 0.537, suggesting that the model explains approximately 53.7% of the variation in average travel time to healthcare services. The inclusion of the RUC11 categories significantly enhances the explanatory power of the model compared to Model 2. The p-value associated with the F-statistic is also significant at the 5% level, indicating that at least one variable in the model has some explanatory power regarding average healthcare distance. In conclusion, Model 3 provides a more nuanced understanding of the factors affecting healthcare access, by accounting for both economic and geographic variables. The significant differences in healthcare access across the RUC11 categories highlight the importance of considering geographic disparities when analyzing healthcare access.

In these three regression models, the dependent variable—logarithm of average healthcare distance—is modeled against various predictors to study their effects on healthcare accessibility within Lower Layer Super Output Areas (LSOAs). Model 1 examines the impact of income deprivation alone, indicating a statistically significant inverse relationship where an increase in deprivation is associated with a decrease in travel time. Model 2 introduces housing affordability, showing a similar inverse relationship with both predictors, and a slightly improved explanatory power. Finally, Model 3 expands the analysis by adding Rural-Urban Classification, revealing significant geographic disparities in healthcare access; rural areas notably have longer travel times to healthcare services. Each model’s coefficient of determination increases progressively, demonstrating an improved ability to explain the variation in healthcare access by including additional socio-economic and geographic factors.

Model results

results <- stargazer(mod1, mod2, mod3, type = "html", dep.var.labels = c("log(Mean Healthcare Service Distance [MHSD])"),
    covariate.labels = c("log(Income Deprivation Rate 2015)",
        "Housing Affordability Score 2016 [HAS]", "Rural town and fringe [RUC 2011]",
        "Rural town and fringe in a sparse setting [RUC 2011]",
        "Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]",
        "Urban city and town in a sparse setting [RUC 2011]",
        "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
    notes = "Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.",
    ci = TRUE)


	Dependent variable:

	log(Mean Healthcare Service Distance [MHSD])
	(1)	(2)	(3)

log(Income Deprivation Rate 2015)	-0.364^***	-0.200^***	-0.104^***
	(-0.374, -0.354)	(-0.212, -0.188)	(-0.113, -0.095)

Housing Affordability Score 2016 [HAS]		0.888^***	0.931^***
		(0.883, 0.893)	(0.927, 0.935)

Rural town and fringe [RUC 2011]			1.764^***
			(1.743, 1.785)

Rural town and fringe in a sparse setting [RUC 2011]			1.209^***
			(1.114, 1.303)

Rural village and dispersed [RUC 2011]			4.440^***
			(4.417, 4.463)

Rural village and dispersed in a sparse setting [RUC 2011]			6.327^***
			(6.250, 6.404)

Urban city and town in a sparse setting [RUC 2011]			0.958^***
			(0.825, 1.092)

Urban major conurbation [RUC 2011]			0.782^***
			(0.768, 0.795)

Urban minor conurbation [RUC 2011]			0.861^***
			(0.830, 0.892)

Constant	1.978^***	1.605^***	1.310^***
	(1.954, 2.002)	(1.577, 1.632)	(1.287, 1.332)


Observations	32,714	32,714	32,714
R²	0.135	0.194	0.538
Adjusted R²	0.135	0.194	0.537
Residual Std. Error	0.715 (df = 32712)	0.690 (df = 32711)	0.523 (df = 32704)
F Statistic	5,099.128^*** (df = 1; 32712)	3,948.166^*** (df = 2; 32711)	4,224.042^*** (df = 9; 32704)

Note:	p<0.1; p<0.05; p<0.01
	Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.

Assumption checks

# Checking assumptions: Model 1 Adding residual values to
# dataframe
complete_case_sample$resids1 <- residuals(mod1)

# Linearity (using partial regression plots)
PartialRegressionPlot(mod1, "log_income_deprivation_rate_2015",
    x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR)")

# Constant variance of residuals
HomoskedasticityPlot(mod1, add_loess_line = TRUE, title = "Model 1 Scale-Location")

# Durbin-Watson test
dw1 <- dwtest(mod1)

# Create a data frame with the Durbin-Watson test results
dw_df <- data.frame(Statistic = dw1$statistic, P = format.pval(dw1$p.value,
    digits = 16))

# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df, type = "html", title = "Durbin-Watson Test Results",
    summary = FALSE, rownames = FALSE, out = "dw_test_results.html")

**Durbin-Watson Test Results**

Statistic	P

0.831	< 0.00000000000000022204460492503

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results.html")

NULL

# Normality of residuals check
QQPlotModel(mod1, subtitle = "QQ-Plot for Model 1")

# Zero conditional mean of residual values
mean(mod1$residuals)

[1] -0.00000000000000006509453

resid_mod1 <- lm(resids1 ~ log_income_deprivation_rate_2015,
    data = complete_case_sample)
summary(resid_mod1)

Call: lm(formula = resids1 ~ log_income_deprivation_rate_2015, data = complete_case_sample)

Residuals: Min 1Q Median 3Q Max -2.70482 -0.47303 -0.06624 0.38161 2.80484

Coefficients: Estimate Std. Error (Intercept) 0.0000000000000019171 0.0122433939839822377 log_income_deprivation_rate_2015 -0.0000000000000004258 0.0050962369734882749 t value Pr(>|t|) (Intercept) 0 1 log_income_deprivation_rate_2015 0 1

Residual standard error: 0.7149 on 32712 degrees of freedom Multiple R-squared: 1.914e-30, Adjusted R-squared: -3.057e-05 F-statistic: 6.261e-26 on 1 and 32712 DF, p-value: 1

# Zero conditional mean regression results
stargazer(resid_mod1, type = "html", dep.var.labels = c("Residuals of Model 1"),
    covariate.labels = c("log(Income Deprivation Rate 2015)"),
    out = "residmod1.html")


	Dependent variable:

	Residuals of Model 1

log(Income Deprivation Rate 2015)	-0.000
	(0.005)

Constant	0.000
	(0.012)


Observations	32,714
R²	0.000
Adjusted R²	-0.00003
Residual Std. Error	0.715 (df = 32712)
F Statistic	0.000 (df = 1; 32712)

Note:	p<0.1; p<0.05; p<0.01

rstudioapi::viewer("residmod1.html")

NULL

# Checking assumptions: Model 2 Adding residual values to
# dataframe
complete_case_sample$resids2 <- residuals(mod2)

# Linearity (using partial regression plots)
PartialRegressionPlot(mod2, "log_income_deprivation_rate_2015",
    x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot(mod2, "housing_affordability_score_2016",
    x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values1 <- vif(mod2)

# storing VIF in a dataframe
vif_df1 <- data.frame(Variable = names(vif_values1), VIF = vif_values1)
vif_df1$Variable[1] <- "log(Income Deprivation Rate 2015)"
vif_df1$Variable[2] <- "Housing Affordability Score 2016"

# creating a pretty table
stargazer(vif_df1, type = "html", title = "VIF Values", summary = FALSE,
    rownames = FALSE, out = "mod2vif.html")

**VIF Values**

Variable	VIF

log(Income Deprivation Rate 2015)	1.459
Housing Affordability Score 2016	1.459

# using rstudioapi to view the table
rstudioapi::viewer("mod2vif.html")

NULL

# Constant variance of residuals
HomoskedasticityPlot(mod2, add_loess_line = TRUE, title = "Model 2 Scale-Location")

# Durbin-Watson test
dw2 <- dwtest(mod2)

# Create a data frame with the Durbin-Watson test results
dw_df2 <- data.frame(Statistic = dw2$statistic, P = ifelse(dw2$p.value >
    0.05, "> 0.05", ifelse(dw2$p.value < 0.01, "< 0.01", "< 0.05")))

# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df2, type = "html", title = "Durbin-Watson Test Results",
    summary = FALSE, rownames = FALSE, out = "dw_test_results2.html")

**Durbin-Watson Test Results**

Statistic	P

0.885	< 0.01

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results2.html")

NULL

# Normality of residuals check
QQPlotModel(mod2, subtitle = "QQ-Plot for Model 2")

# Zero conditional mean of residual values
mean(mod2$residuals)

[1] -0.000000000000001023337

resid_mod2 <- lm(resids2 ~ log_income_deprivation_rate_2015 +
    housing_affordability_score_2016, data = complete_case_sample)
summary(resid_mod2)

Call: lm(formula = resids2 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)

Residuals: Min 1Q Median 3Q Max -2.5103 -0.4599 -0.0744 0.3675 2.7243

Coefficients: Estimate (Intercept) -0.00000000000000123474 log_income_deprivation_rate_2015 0.00000000000000012189 housing_affordability_score_2016 0.00000000000000007123 Std. Error t value Pr(>|t|) (Intercept) 0.01403408525346747093 0 1 log_income_deprivation_rate_2015 0.00594008978139718563 0 1 housing_affordability_score_2016 0.00241867829270426540 0 1

Residual standard error: 0.6899 on 32711 degrees of freedom Multiple R-squared: 2.489e-30, Adjusted R-squared: -6.114e-05 F-statistic: 4.071e-26 on 2 and 32711 DF, p-value: 1

# Zero conditional mean regression results
stargazer(resid_mod2, type = "html", dep.var.labels = c("Residuals of Model 2"),
    covariate.labels = c("log(Income Deprivation Rate 2015)",
        "Housing Affordability Score 2016 [HAS]"), out = "residmod2.html")


	Dependent variable:

	Residuals of Model 2

log(Income Deprivation Rate 2015)	0.000
	(0.006)

Housing Affordability Score 2016 [HAS]	0.000
	(0.002)

Constant	-0.000
	(0.014)


Observations	32,714
R²	0.000
Adjusted R²	-0.0001
Residual Std. Error	0.690 (df = 32711)
F Statistic	0.000 (df = 2; 32711)

Note:	p<0.1; p<0.05; p<0.01

rstudioapi::viewer("residmod2.html")

NULL

# Checking assumptions: Model 3 Adding residual values to
# dataframe
complete_case_sample$resids3 <- residuals(mod3)

# Linearity (using partial regression plots)
PartialRegressionPlot(mod3, "log_income_deprivation_rate_2015",
    x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot(mod3, "housing_affordability_score_2016",
    x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values2 <- data.frame(vif(mod3))
vif_values2$Variable <- c("log(Income Deprivation Rate 2015)",
    "Housing Affordability Score 2016", "Rural Urban Classification 2011")
vif_df2 = vif_values2[, c("Variable", "GVIF")]

# creating a pretty table
stargazer(vif_df2, type = "html", title = "GVIF Values", summary = FALSE,
    rownames = FALSE, out = "mod3vif.html")

**GVIF Values**

Variable	GVIF

log(Income Deprivation Rate 2015)	1.508
Housing Affordability Score 2016	1.602
Rural Urban Classification 2011	1.197

# using rstudioapi to view the table
rstudioapi::viewer("mod3vif.html")

NULL

# Constant variance of residuals plot
HomoskedasticityPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location")

# Plot indicating groups
HomoskedasticityDetailPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location",
    dataframe = complete_case_sample, categorical_var_name = "rural_urban_class_2011")

# Testing for non-constant variance across groups:
leveneTest(complete_case_sample$resids3, complete_case_sample$rural_urban_class_2011,
    center = mean)

Levene’s Test for Homogeneity of Variance (center = mean) Df F value Pr(>F)
group 7 189.04 < 0.00000000000000022 *** 32706
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

# df for levene's test results
levene_results <- data.frame(`Degrees of Freedom (Group)` = 7,
    `Degrees of Freedom (Residual)` = 32706, `F Value` = 189.04,
    `P Value` = "< 0.01")

names(levene_results) <- c("Degrees of Freedom (Group)", "Degrees of Freedom (Residual)",
    "F Value", "P Value")

# Viewing the table with descriptive column names
stargazer(levene_results, type = "html", title = "Levene's Test for Homogeneity of Variance",
    out = "levene_test.html", summary = FALSE, rownames = FALSE)

**Levene’s Test for Homogeneity of Variance**

Degrees of Freedom (Group)	Degrees of Freedom (Residual)	F Value	P Value

7	32,706	189.040	< 0.01

rstudioapi::viewer("levene_test.html")

NULL

# Durbin-Watson test
dw3 <- dwtest(mod3)

# Create a data frame with the Durbin-Watson test results
dw_df3 <- data.frame(Statistic = dw3$statistic, P = ifelse(dw3$p.value >
    0.05, "> 0.05", ifelse(dw3$p.value < 0.01, "< 0.01", "< 0.05")))

# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df3, type = "html", title = "Durbin-Watson Test Results",
    summary = FALSE, rownames = FALSE, out = "dw_test_results3.html")

**Durbin-Watson Test Results**

Statistic	P

1.184	< 0.01

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results3.html")

NULL

# When considering the independence of residuals, we might
# have reason to think that this assumption is not
# satisfied within the model, as it is likely that some
# degree of spatial autocorrelation could be influencing
# our results - that is, that LSOAs closer to each-other
# could theoretically have more similar values of X. This
# idea is supported by the results of the durbin-watson
# test statistics shown - notably, getting closer to 2
# (representing less reduced autocorrelation) when
# including a variable that partially accounts for
# spatiality (the RUC2011 indicator). As such, we do see a
# considerable amount of positive autocorrelation in the
# data (according to a Durbin-Watson statistic of 0.83
# being considerably below 2). This means that residuals
# that follow each other in the data tend to be slightly
# similar in magnitude. Considering the null hypothesis of
# no autocorrelation and considering the p-value being
# significant considering an alpha of 0.05, we fail to
# reject the null hypothesis that there is no
# autocorrelation presented by the variables within our
# model. Thus, we might not be confident that the
# independence of residuals assumption is satisfied.

# Normality of residuals check
QQPlotModel(mod3, subtitle = "QQ-Plot for Model 3")

# Zero conditional mean of residual values
mean(mod3$residuals)

[1] 0.00000000000000001521903

resid_mod3 <- lm(resids3 ~ log_income_deprivation_rate_2015 +
    housing_affordability_score_2016 + rural_urban_class_2011,
    data = complete_case_sample)
summary(resid_mod3)

Call: lm(formula = resids3 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + rural_urban_class_2011, data = complete_case_sample)

Residuals: Min 1Q Median 3Q Max -2.35847 -0.33319 0.02095 0.34803 2.03045

Coefficients: Estimate (Intercept) -0.00000000000000309640 log_income_deprivation_rate_2015 -0.00000000000000014066 housing_affordability_score_2016 0.00000000000000002189 rural_urban_class_2011Rural town and fringe in a sparse setting 0.00000000000000313182 rural_urban_class_2011Rural village and dispersed 0.00000000000000311998 rural_urban_class_2011Rural village and dispersed in a sparse setting 0.00000000000000369754 rural_urban_class_2011Urban city and town 0.00000000000000390593 rural_urban_class_2011Urban city and town in a sparse setting 0.00000000000000223619 rural_urban_class_2011Urban major conurbation 0.00000000000000350995 rural_urban_class_2011Urban minor conurbation 0.00000000000000338580 Std. Error (Intercept) 0.01398036896695923660 log_income_deprivation_rate_2015 0.00457597976149103861 housing_affordability_score_2016 0.00192069232350872252 rural_urban_class_2011Rural town and fringe in a sparse setting 0.04890182920944522588 rural_urban_class_2011Rural village and dispersed 0.01450616708669712727 rural_urban_class_2011Rural village and dispersed in a sparse setting 0.04004073360934087833 rural_urban_class_2011Urban city and town 0.01065145323950207612 rural_urban_class_2011Urban city and town in a sparse setting 0.06876884655176458694 rural_urban_class_2011Urban major conurbation 0.01120479904872859027 rural_urban_class_2011Urban minor conurbation 0.01802400600454943935 t value (Intercept) 0 log_income_deprivation_rate_2015 0 housing_affordability_score_2016 0 rural_urban_class_2011Rural town and fringe in a sparse setting 0 rural_urban_class_2011Rural village and dispersed 0 rural_urban_class_2011Rural village and dispersed in a sparse setting 0 rural_urban_class_2011Urban city and town 0 rural_urban_class_2011Urban city and town in a sparse setting 0 rural_urban_class_2011Urban major conurbation 0 rural_urban_class_2011Urban minor conurbation 0 Pr(>|t|) (Intercept) 1 log_income_deprivation_rate_2015 1 housing_affordability_score_2016 1 rural_urban_class_2011Rural town and fringe in a sparse setting 1 rural_urban_class_2011Rural village and dispersed 1 rural_urban_class_2011Rural village and dispersed in a sparse setting 1 rural_urban_class_2011Urban city and town 1 rural_urban_class_2011Urban city and town in a sparse setting 1 rural_urban_class_2011Urban major conurbation 1 rural_urban_class_2011Urban minor conurbation 1

Residual standard error: 0.5228 on 32704 degrees of freedom Multiple R-squared: 3.712e-30, Adjusted R-squared: -0.0002752 F-statistic: 1.349e-26 on 9 and 32704 DF, p-value: 1

# Zero conditional mean regression results
stargazer(resid_mod3, type = "html", dep.var.labels = c("Residuals of Model 3"),
    covariate.labels = c("log(Income Deprivation Rate 2015)",
        "Housing Affordability Score 2016 [HAS]", "Rural town and fringe [RUC 2011]",
        "Rural town and fringe in a sparse setting [RUC 2011]",
        "Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]",
        "Urban city and town in a sparse setting [RUC 2011]",
        "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
    out = "residmod3.html")


	Dependent variable:

	Residuals of Model 3

log(Income Deprivation Rate 2015)	-0.000
	(0.005)

Housing Affordability Score 2016 [HAS]	0.000
	(0.002)

Rural town and fringe [RUC 2011]	0.000
	(0.049)

Rural town and fringe in a sparse setting [RUC 2011]	0.000
	(0.015)

Rural village and dispersed [RUC 2011]	0.000
	(0.040)

Rural village and dispersed in a sparse setting [RUC 2011]	0.000
	(0.011)

Urban city and town in a sparse setting [RUC 2011]	0.000
	(0.069)

Urban major conurbation [RUC 2011]	0.000
	(0.011)

Urban minor conurbation [RUC 2011]	0.000
	(0.018)

Constant	-0.000
	(0.014)


Observations	32,714
R²	0.000
Adjusted R²	-0.0003
Residual Std. Error	0.523 (df = 32704)
F Statistic	0.000 (df = 9; 32704)

Note:	p<0.1; p<0.05; p<0.01

rstudioapi::viewer("residmod3.html")

NULL

Appendix (complete code)

library(knitr)
knitr::opts_chunk$set(echo = T,  warning = FALSE, message = FALSE)
opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=TRUE)
# Libraries used
library(tidyverse)
library(haven)
library(dplyr)
library(readxl)
library(GGally)
library(forcats)
library(stargazer)
library(lme4)
library(car)
library(rstudioapi)
library(knitr)
library(kableExtra)
# Turning off scientific notation
options(scipen = 9999)
# Bespoke functions used ----
PointPlot <- function(dataset, independent_var, dependent_var, colour_var = NULL, title = NULL, subtitle = NULL, caption = NULL, x_label = "X variable", y_label = "Y variable", point_size = 1, line_intercept = NULL, line_slope = NULL, line_colour = "black", line_linewidth = 1, line_linetype = "solid", add_regression_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Adjust the aesthetic mapping based on the presence of the third variable
  if(!is.null(colour_var)) {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var, colour = colour_var))
  } else {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var))
  }
  
  # Base point plot
  plot <- plot + geom_point(alpha = 1, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'),
          axis.title = element_text(family = 'Roboto'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = ifelse(is.null(colour_var), 'none', 'right'),
          legend.text = element_text(family = 'Roboto')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = y_label)
  
  # Add manual line if intercept and slope are provided
  if (!is.null(line_intercept) & !is.null(line_slope)) {
    plot <- plot + geom_abline(intercept = line_intercept, 
                               slope = line_slope, 
                               colour = line_colour, 
                               linewidth = line_linewidth, 
                               linetype = line_linetype)
  }
  
  # Add regression line with confidence interval if requested
  if (add_regression_line) {
    plot <- plot + geom_smooth(method = "lm", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}

DistPlot <- function(dataset, variable, formatted_name, subtitle = NULL, caption = NULL, y_label = NULL) {
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  plot <- ggplot(dataset, aes_string(x = variable)) +
    geom_histogram(aes(y = ..density..),
                   colour = 1, fill = "white") +
    geom_density(fill = '#0072C6', colour = '#0072C6', alpha = 0.25) +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                      family = 'Roboto'),
          axis.title.x = element_blank(),  # This removes the x-axis title
          axis.title.y = element_text(size = 15.5,
                                      family = 'Roboto',
                                      colour = 'black'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = 'none') +
    labs(title = paste0("Distribution of ", formatted_name),
         subtitle = subtitle,
         caption = caption,
         y = y_label)
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  plot
}

ConfidencePlot <- function(data, title, subtitle, middle_point = 0, x_label = "Estimated Coefficient", caption = NULL) {
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  if (!middle_point %in% c(0, 1)) {
    stop("middle_point must be 0 or 1")
  }
  
  conf_plot <- ggplot(data, aes(x = coefs, y = names)) +
    geom_errorbar(aes(xmin = lower, xmax = upper),
                  col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "forestgreen"),
                  width = 0.5, size = 1, alpha = 0.5) +
    geom_point(col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "black"), size = 1, alpha = 1) +
    geom_vline(xintercept = middle_point, col = 'red', size = 0.5, linetype="dashed") +
    theme_classic() +
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = "") +
    theme(axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          axis.title = element_text(family = 'Roboto', size = 15.5),
          plot.title = element_text(hjust = 0, colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto', margin = margin(b = 10)),
          plot.subtitle = element_text(hjust = 0, colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto', margin = margin(b = 5)),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'))
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", conf_plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = conf_plot, width = 1200/100, height = 740/100, dpi = 100)  # Adjust height as necessary
  
  return(conf_plot)
}

PartialRegressionPlot <- function(model, variable, x_var_name = "X1", y_var_name = "Dependent Variable", title = NULL) {
  # Close any existing plotting device
  graphics.off()
  
  # Construct axis labels using the specified names
  xlab <- paste("Residuals of", x_var_name, "Controlling for Other Variables")
  ylab <- paste("Residuals of", y_var_name, "(Excluding", x_var_name, ")")
  
  # Create the plot without a main title
  avPlots(model, variable, xlab = xlab, ylab = ylab)
  
  # Add a title using the title function
  title(main = title)
}

HomoskedasticityPlot <- function(model, title = "Homoskedasticity Check", subtitle = NULL, 
                                 x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                 point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, StdResiduals = std_residuals)
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  if (bp_test$p.value < 0.05) {
    p_value_text <- "p < 0.05"
  } else if (bp_test$p.value > 0.05) {
    p_value_text <- "p > 0.05"
  } else {
    p_value_text <- "p = 0.05"
  }
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 50, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 30, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 50, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 30, family = 'Roboto'),
          axis.text = element_text(size = 30, family = 'Roboto', colour = 'black'),
          legend.position = 'none') + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 5.91, height = 4.50, dpi = 300)
  
  return(plot)
}

HomoskedasticityDetailPlot <- function(model, dataframe, categorical_var_name, 
                                       title = "Homoskedasticity Check", subtitle = NULL, 
                                       x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                       point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Ensure the categorical variable exists in the dataframe
  if (!categorical_var_name %in% names(dataframe)) {
    stop("Specified categorical variable not found in the provided dataframe.")
  }
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, 
                          StdResiduals = std_residuals, 
                          Category = dataframe[[categorical_var_name]])
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  p_value_text <- ifelse(bp_test$p.value < 0.05, "p < 0.05",
                         ifelse(bp_test$p.value > 0.05, "p > 0.05", "p = 0.05"))
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals, color = Category)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 15.5, family = 'Roboto'),
          axis.text = element_text(size = 15.5, family = 'Roboto', colour = 'black'),
          legend.position = 'right', legend.text = element_text(size = 12.5, family = 'Roboto')) + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2, se = FALSE)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 10, height = 6.17, dpi = 100)
  
  return(plot)
}

QQPlotModel <- function(model, title = 'Normality of Residuals Check', subtitle = 'QQ-Plot', caption = NULL) {
  
  # Loading required libraries
  library(ggplot2)
  
  # Extract residuals from the model
  model_residuals <- residuals(model)
  
  # Create the QQ-plot
  plot <- ggplot() +
    stat_qq(aes(sample = model_residuals)) +
    stat_qq_line(aes(sample = model_residuals), colour = "blue") +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic'),
          axis.title = element_text(),
          axis.text = element_text(size = 15.5, colour = 'black')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = "Theoretical Quantiles",
         y = "Sample Quantiles")
  
  # Create a filename from the plot title or use a default one
  file_name <- if (!is.null(title)) {
    paste0(gsub(" ", "_", title), ".png")
  } else {
    "QQPlot.png"
  }
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}
AHAH_df <- read.csv('Group_7_AHAH_V3_0.csv') # Access to healthy assets and hazards V3 data
IOD_income_df <- read.csv('Group_7_IoD2019_Income.csv') # Index of Multiple Deprivation Income Domian Indicator Data (measured in 2015)
IOD_health_df <- read.csv('Group_7_IoD2019_Health.csv') # Index of Multiple Deprivation Health Domian Indicator Data
IOD_barriers_df <- read.csv('Group_7_IoD2019_Barriers.csv') # Index of Multiple Deprivation Barriers to Housing and Services Domain Indicator Data
IOD_education_df <- read.csv('Group_7_IoD2019_Education.csv') # Index of Multiple Deprivation Education, Skills and Training Deprivation Domain Indicator Data
IOD_employment_df <- read.csv('Group_7_IoD2019_Employment.csv') # Index of Multiple Deprivation Employment Deprivation Domain Indicator Data (measured in 2015)
ONS_MIDYEAR_2015 <- read.csv('Group_7_ONS_MIDYEAR_2015.csv') # Mid-year population estimates for 2015 in England and Wales (for creating rates from the numerator values given in the income and employment measures)
RUC2011 <- read.csv('Group_7_RUC2011.csv')
## Renaming columns for consistent joining
AHAH_df <- AHAH_df %>% rename("LSOA.code..2011." = lsoa11)
ONS_MIDYEAR_2015 <- ONS_MIDYEAR_2015 %>% rename("LSOA.code..2011." = Area.Codes)
RUC2011 <- RUC2011 %>% rename("LSOA.code..2011." = LSOA11CD)

## Creating complete combined dataset
complete_combined_data <- IOD_barriers_df %>%
  left_join(IOD_education_df, by = "LSOA.code..2011.") %>%
  left_join(IOD_employment_df, by = "LSOA.code..2011.") %>%
  left_join(IOD_health_df, by = "LSOA.code..2011.") %>%
  left_join(IOD_income_df, by = "LSOA.code..2011.") %>%
  left_join(AHAH_df, by = "LSOA.code..2011.") %>%
  left_join(ONS_MIDYEAR_2015, by = "LSOA.code..2011.") %>%
  left_join(RUC2011, by = "LSOA.code..2011.")

## Retaining only columns of interest
combined_data <- complete_combined_data %>% 
  select(
    LSOA.code..2011.,
    Local.Authority.District.name..2019.,
    Income.Domain.numerator,
    Years.of.potential.life.lost.indicator,
    Housing.affordability.indicator,
    Staying.on.in.education.post.16.indicator,
    Employment.Domain.numerator,
    ah3gp, 
    ah3hosp, 
    ah3dent, 
    ah3phar,
    All.Ages,
    RUC11
  )

## Removing commas and converting columns 3:12 to numeric
combined_data <- combined_data %>% 
  mutate(across(.cols = 3:12, 
                .fns = ~ as.numeric(str_remove_all(.x, ","))))

## Creating income and employment deprivation rate variables
combined_data$income_deprivation_rate <- (combined_data$Income.Domain.numerator/combined_data$All.Ages) * 100
combined_data$employment_deprivation_rate <- (combined_data$Employment.Domain.numerator /combined_data$All.Ages) * 100

## Creating average distance (in minutes) to a healthcare service (including GP practice, hospital, dentist and pharmacy)
combined_data$avg_healthcare_distance <- rowMeans(combined_data[, c("ah3gp", "ah3hosp", "ah3dent", "ah3phar")], na.rm = TRUE)

## Renaming column names for understanding
colnames(combined_data) <- c('lsoa_code', 'local_authority_name', 'no._of_income_deprived_individuals_2015', 'years_of_lives_lost_2013_2017', 'housing_affordability_score_2016', 'proportion_of_above16_individuals_inschool_2010_2012', 'no._of_deprived_employment_2015_2016', 'avg_dist_gppractice_in_mins_2022', 'avg_dist_hospital_in_mins_2022', 'avg_dist_dentist_in_mins_2022', 'avg_dist_pharmacy_in_mins_2022', 'population_estimates_for_allages_2015', 'rural_urban_class_2011','income_deprivation_rate_2015','employment_deprivation_rate_2015','avg_healthcare_distance_2022')
## Saving as CSV
write.csv(combined_data, file = "Group_7_combined_data.csv", row.names = FALSE)
write.csv(complete_combined_data, file = "Group_7_complete_combined_data.csv", row.names = FALSE)
## Loading combined data 
combined_data <- read.csv('Group_7_combined_data.csv')

## Checking for duplicates
There_are_duplicates <- any(duplicated(combined_data))
if (There_are_duplicates) {
  cat("There are duplicates in the combined_data.\n")
} else {
  cat("There are no duplicates in combined_data.\n")
}

# Calculate missingness percentages for each numerical variable (IV and DV)
missingness <- sapply(combined_data, function(x) mean(is.na(x)) * 100)
missingness_summary <- data.frame(variable = names(missingness), Missingness_percentage = missingness)
missingness_summary

# Removing Null Values from data set
combined_data <- na.omit(combined_data)

# First 6 rows of data
head(combined_data)

# Last 6 rows of data
tail(combined_data)
summary(combined_data)

# Descriptive table for numerical variables 
stargazer(combined_data, type = "html", digits = 1, title = "Table 1: Summary Statistics", out = "table.html")

# Use rstudioapi to view the table
rstudioapi::viewer("table.html")

## SD of numerical variable
sd(combined_data$lsoa_code)
sd(combined_data$no._of_income_deprived_individuals_2015)
sd(combined_data$years_of_lives_lost_2013_2017)
sd(combined_data$housing_affordability_score_2016)
sd(combined_data$proportion_of_above16_individuals_inschool_2010_2012)
sd(combined_data$no._of_deprived_employment_2015_2016)
sd(combined_data$avg_dist_gppractice_in_mins_2022)
sd(combined_data$avg_dist_hospital_in_mins_2022)
sd(combined_data$avg_dist_dentist_in_mins_2022)
sd(combined_data$avg_dist_pharmacy_in_mins_2022)
sd(combined_data$population_estimates_for_allages_2015)
sd(combined_data$income_deprivation_rate_2015)
sd(combined_data$employment_deprivation_rate_2015)
sd(combined_data$avg_healthcare_distance_2022)
# The number of unique names in the rural_urban_class column 
unique_names <- table(combined_data$rural_urban_class_2011)
print(unique_names)
rural_urban_class_2011_counts <- as.numeric(unique_names)
print(rural_urban_class_2011_counts)

## Summary statistics for Rural_urban_class_2011
max(rural_urban_class_2011_counts)
min(rural_urban_class_2011_counts)
median(rural_urban_class_2011_counts)
IQR(rural_urban_class_2011_counts)

# The number of unique names in the local_authority_name column 
unique_names_1 <- table(combined_data$local_authority_name)
print(unique_names_1)
local_authority_name_counts <- as.numeric(unique_names_1)
print(local_authority_name_counts)

## Summary statistics for local_authority_name
max(local_authority_name_counts)
min(local_authority_name_counts)
median(local_authority_name_counts)
IQR(local_authority_name_counts)

## T-test
# Four sample independent t-test is used to compare the significance level in combined_data data set. Student’s t-test assumes both groups of data are normally distributed and the variances of the two distributions are the same.
x1 <-combined_data$income_deprivation_rate_2015
x2 <-combined_data$employment_deprivation_rate_2015
x3 <-combined_data$housing_affordability_score_2016
y <-combined_data$avg_healthcare_distance_2022

# Plot a 2x2 histogram grid
par(mfrow=c(2, 2))
hist(x1)
hist(x2)
hist(x3)
hist(y)

#Based on histogram plots, we have to perform a Welch's t-test for x1, x2, both of which are skewed, and for x3 as well since DV is highly skewed despite x3 being normally distributed.
t_test1<- t.test(x1,y)    
t_test2<- t.test(x2,y)
t_test3<- t.test(x3,y)
t_test1
t_test2
t_test3

## Covariance values of our primary and secondary IV against primary DV
cov(x1,y)
cov(x2,y)
cov(x3,y)

## Correlation values of our primary and secondary IV against primary DV
# As x1, x2 are not normally distributed, we will use 'spearman' method. We will use 'spearman' method for x3 despite it being normally distributed as DV is highly skewed.
cor(x1,y, method='spearman')
cor(x2,y, method='spearman')
cor(x3,y, method='spearman')
DistPlot(combined_data, "income_deprivation_rate_2015", "Income Deprivation Rate Across LSOAs in England (2015)", subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "avg_healthcare_distance_2022", "Average Minutes to Healthcare Services Across LSOAs in England (2022)", subtitle = "Mean Access Time to Healthcare Services per LSOA", caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "housing_affordability_score_2016", "Housing Affordability Indicator Across LSOAs in England (2016)", subtitle = "Higher Scores Indicate Increased Deprivation in Entering Ownership or Private Rental Market", caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "employment_deprivation_rate_2015", "Employment Deprivation Rate in England (2015)", subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

# Both the main explanatory variable and outcome variable have a very strong right skew. Thus, a log transformation in both of these is examined.
## Adding log transformed variables to dataframe:
combined_data$log_avg_healthcare_distance_2022 <- log(combined_data$avg_healthcare_distance_2022)
combined_data$log_income_deprivation_rate_2015 <- log(combined_data$income_deprivation_rate_2015)
combined_data$log_employment_deprivation_rate_2015 <- log(combined_data$employment_deprivation_rate_2015)

## Checking distributions:
DistPlot(combined_data, "log_income_deprivation_rate_2015", 'log(Income Deprivation Rate) Across LSOAs in England (2015)', subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "log_avg_healthcare_distance_2022", "log(Average Minutes to Healthcare Services) Across LSOAs in England (2022)", subtitle = "Logarithm of Mean Access Time to Healthcare Services per LSOA", caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "log_employment_deprivation_rate_2015", 'log(Employment Deprivation Rate) Across LSOAs in England (2015)', subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

## Filtering out rows with infinite values and creating a complete case sample:
complete_case_sample <- combined_data %>%
  select(log_income_deprivation_rate_2015, log_avg_healthcare_distance_2022, 
         housing_affordability_score_2016, rural_urban_class_2011) %>%
  filter(!is.infinite(log_avg_healthcare_distance_2022), 
         !is.infinite(log_income_deprivation_rate_2015)) %>%
  na.omit()

## Saving as CSV
write.csv(complete_case_sample, file = "Group_7_complete_case.csv", row.names = FALSE)

# Checking linearity of relationships between x variables
ggpairs(complete_case_sample[,c("log_income_deprivation_rate_2015", "housing_affordability_score_2016", "log_avg_healthcare_distance_2022")])

PointPlot(combined_data, "log_employment_deprivation_rate_2015", "log_income_deprivation_rate_2015", title = "Income Deprivation Rate (2015) as a Function of Employment Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Income Deprivation Rate) (2015)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021", add_regression_line = TRUE)

result <- cor.test(combined_data$log_employment_deprivation_rate_2015, combined_data$income_deprivation_rate_2015, method = "pearson")

# creating a df with correlation test results
model_summary <- data.frame(
  Estimate = round(result$estimate, 4),
  t.value = round(result$statistic, 2),
  p = ifelse(result$p.value < 0.05, "< 0.05", round(result$p.value, 10))
)

# pretty table using stargazer
stargazer(model_summary, type = "html", 
          title = "Results of Pearson's Correlation Test",
          summary = FALSE,
          omit.stat = c("all"), 
          single.row = TRUE, rownames = FALSE, out = 'x1x2.html')
rstudioapi::viewer("x1x2.html")

# As employment deprivation rate and income deprivation rate are highly correlated, employment deprivation rate will be excluded from analyses (to avoid issues of multicollinearity).

# Examining linearity between xi and y
PointPlot(complete_case_sample, "log_income_deprivation_rate_2015", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Income Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Income Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)

PointPlot(complete_case_sample, "housing_affordability_score_2016", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Housing Affordability Score (2016)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "Housing Affordability Score (2016)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)

PointPlot(combined_data, "log_employment_deprivation_rate_2015", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Employment Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)
### PLotting of table for slide 11
combined_data_sum <- combined_data[c("local_authority_name","housing_affordability_score_2016", "income_deprivation_rate_2015", "employment_deprivation_rate_2015","avg_healthcare_distance_2022", "rural_urban_class_2011")]

html_table <- stargazer(
  combined_data_sum,
  type = "html",
  title = "Table 1: Summary Statistics",
  digits = 1,
  column.labels = c(
    "Housing Affordability Score (2016)",
    "Income Deprivation Rate (2015)",
    "Employment Deprivation Rate (2015)",
    "Average Healthcare Distance (2022)"
  ),
  out = "table.html"
)

# See HTML table in R Studio
rstudioapi::viewer("table.html")


### Calculate missingness percentages for each variable for slide 11
missingness <- sapply(combined_data_sum, function(x) mean(is.na(x)) * 100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 2: Missingness Summary/%", summary = FALSE, flip=TRUE)
html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


### count of unique names for rural/urban for slide 12
unique_class2011 <- table(combined_data_sum$rural_urban_class_2011)
unique_class2011.1 <- as.data.frame(unique_class2011)

# Rename the columns
colnames(unique_class2011.1) <- c("Rural/Urban Class (2011)", "Count")

# Create HTML table using stargazer
html_table1 <- stargazer(unique_class2011.1, type = "html", title = "Unique Names and its Respective Counts", summary = FALSE)
cat(html_table1)
writeLines(html_table1, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

# Calculate summary statistics
summary_ruralurban <- c(
  Max = max(rural_urban_class_2011_counts),
  Min = min(rural_urban_class_2011_counts),
  Med = median(rural_urban_class_2011_counts),
  IQR = IQR(rural_urban_class_2011_counts)
)

# Create data frame for summary statistics
summary_df <- data.frame(statistic = names(summary_ruralurban), value = summary_ruralurban)

# Create HTML table using stargazer
html_table2 <- stargazer(summary_df, type = "html", title = "Summary Statistics", summary = FALSE, rownames = FALSE)
cat(html_table2)
writeLines(html_table2, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


## count of unique names for local authority names for slide 12
unique_localauthority <- table(combined_data_sum$local_authority_name)

# Convert the result to a data frame
unique_localauthority1 <- as.data.frame(unique_localauthority)

colnames(unique_localauthority1) <- c("Local Authority Names", "Count")

# Create HTML table using stargazer
html_table3 <- stargazer(unique_localauthority1, type = "html", title = "Unique Names and its Respective Counts", summary = FALSE)
cat(html_table3)
writeLines(html_table3, html_file)
rstudioapi::viewer(html_file)

# Calculate summary statistics
summary_localauthority <- c(
  Max = max(local_authority_name_counts),
  Min = min(local_authority_name_counts),
  Med = median(local_authority_name_counts),
  IQR = IQR(local_authority_name_counts)
)

summary_df <- data.frame(statistic = names(summary_localauthority ), value = summary_localauthority )

# Create HTML table using stargazer
html_table4 <- stargazer(summary_df, type = "html", title = "Summary Statistics", summary = FALSE, rownames = FALSE)
cat(html_table4)
writeLines(html_table4, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


### Repeat for complete_case.csv for slide 16
## Read the complete_case file
complete_case_sum <- read.csv("Group_7_complete_case.csv")

html_table <- stargazer(
  complete_case_sum,
  type = "html",
  title = "Table 3: Summary Statistics",
  digits = 1,
  column.labels = c(
    "Log Income Deprivation Rate (2015)",
    "Log Average Healthcare Distance (2022)",
    "Housing Affordability Score (2016)",
    "Rural/Urban Class (2011)"
  ),
  out = "table.html"
)

# See HTML table in R Studio
rstudioapi::viewer("table.html")

### Calculate missingness percentages for each variable for slide 16
missingness <- sapply(complete_case_sum, function(x) mean(is.na(x)) * 100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 4: Missingness Summary/%", summary = FALSE, flip=TRUE)
html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


### covariance of x1,x2,x3 before log transformation for slide 17
x1 <-combined_data_sum$income_deprivation_rate_2015
x2 <-combined_data_sum$employment_deprivation_rate_2015
x3 <-combined_data_sum$housing_affordability_score_2016
y <-combined_data_sum$avg_healthcare_distance_2022
cov_x1y <- cov(x1, y)
cov_x2y <- cov(x2, y)
cov_x3y <- cov(x3, y)

# Create a data frame for the covariances
covariances_df <- data.frame(
  variables = c("x1 and y", "x2 and y", "x3 and y"),
  covariance = c(cov_x1y, cov_x2y, cov_x3y)
)

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances before log transformation", summary = FALSE, rownames = FALSE)
html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


###covariance of x4,x5 after log transformation for slide 17
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <-complete_case_sum$log_income_deprivation_rate_2015
x5 <-complete_case_sum$housing_affordability_score_2016
y1 <-complete_case_sum$log_avg_healthcare_distance_2022
cov_x4y1 <- cov(x4, y1)
cov_x5y1 <- cov(x5, y1)

# Create a data frame for the covariances
covariances_df <- data.frame(
  variables = c("x4 and y1", "x5 and y1"),
  covariance = c(cov_x4y1, cov_x5y1)
)

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances after log transformation", summary = FALSE, rownames = FALSE)
html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


## Correlation test for slide 18
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <-complete_case_sum$log_income_deprivation_rate_2015
x5 <-complete_case_sum$housing_affordability_score_2016
y1 <-complete_case_sum$log_avg_healthcare_distance_2022
corx4y1 <- cor.test(x4, y1, method="pearson")
corx5y1 <- cor.test(x5, y1, method="pearson")

# Create a data frame to store correlation results
# format.pval function is used to format the p-value to make it into a string ie. a range
corr_df <- data.frame(
  variables = c("x4 and y1", "x5 and y1"),
  Pearson_Correlation = c(corx4y1$estimate, corx5y1$estimate),
  t_value = c(corx4y1$statistic, corx5y1$statistic),
  Estimate = c(corx4y1$estimate, corx5y1$estimate),
  p_value = ifelse(corx4y1$p.value == 0, "< 0.001", format.pval(corx4y1$p.value))
)

# Create HTML table using stargazer
html_table <- stargazer(corr_df, type = "html", title = "Pearson Correlations After Log Transformation", summary = FALSE, rownames = FALSE)
html_file <- "cor_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)
# Model 1:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate.
mod1 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(mod1)

# Model 1 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate. As both the dependent and independent variables are log transformed, the coefficients may be interpreted in terms of percentage change. That is, a 1% increase in the income deprivation rate is associated with a 0.35% decrease in the average travel time to a healthcare service. This coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). Thus, according to this model, we may reject the null hypothesis that there is no relationship between income deprivation rate and average healthcare travel distance. Further, the p-value associated with the F-statistic is also significant considering the same alpha, sugegsting at least one variable in the model has some explanatory power regarding average healthcare distance. This statistically is not that useful regarding this model however, as only one explanatory variable is considered. The coefficient of determination for this model indicates that we are able to explain 13.1% of the variation in average travel time to healthcare services with this model.

# Model 2:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate and the housing affordability indicator
mod2 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)

# Exponentiating untransformed variable coefficients to allow easier interpretation
mod2$coefficients["housing_affordability_score_2016"] <- exp(mod2$coefficients["housing_affordability_score_2016"])

# Viewing model results
summary(mod2)

# Model 2 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate and the housing affordability indicator. Here, we can see that a 1% increase in the income deprivation rate is associated with a 0.19% decrease in the average travel time to a healthcare service. Thus, we can see that when we control for a further underlying economic variable, the explanatory power of income deprivation alone decreases. Again, this coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). For interpretation of the housing affordability indicator, the exponential of the coefficient is considered (0.88). Thus, we can see that a 1 unit increase in the housing affordability indicator is associated with a 12% decrease in the average travel time to a healthcare service, again significant considering an alpha of 0.05. Remembering that increases in the housing affordability indicator signify increased inability to afford to enter owner-occupation or the private rental market, this again appears to indicate that more economically deprived areas have greater geographical access to healthcare services. Further, the p-value associated with the F-statistic is also significant at the 5% level, again suggesting at least one variable in the model has some explanatory power regarding average healthcare distance. The adjusted coefficient of determination for this model indicates that we are able to explain 19.3% of the variation in average travel time to healthcare services with this model.

# Model 3:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate, the housing affordability indicator, and the re-leveled Rural-Urban Classification (RUC11) with "Urban city and town" as the reference category
mod3 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, "Urban city and town"), data = complete_case_sample)

# Exponentiated coefficient data
mod3_exp_coeff <- data.frame(exp(coef(mod3)))

# Exponentiating untransformed variable coefficients WITHIN the model to allow easier interpretation
mod3$coefficients[3:10] <- exp(mod3$coefficients[3:10])

# Viewing model results
summary(mod3)

# Creating data for confidence interval plot
ConfidencePlotData <- data.frame (
  "names" = as.factor (c("Rural town and fringe", "Rural town and fringe in a sparse setting", "Rural village and dispersed", "Rural village and dispersed in a sparse setting", "Urban city and town in a sparse setting", "Urban major conurbation", "Urban minor conurbation")), # Creating predictor names (row names)
  "coefs" = mod3_exp_coeff[4:10, 1],
  "lower" = confint (mod3) [4:10, 1], # Creating a lower confidence interval [ , 1] list for predictors 2-3 in the model [2:5 , ]
  "upper" = confint (mod3) [4:10, 2] # Creating an upper confidence interval [ , 2] list for predictors 2-3 in the model [2:5 , ]
)
ConfidencePlotData$names <- factor(ConfidencePlotData$names, levels = unique(ConfidencePlotData$names))

# Creating confidence interval plot for RUC coefficients
ConfidencePlot(ConfidencePlotData, "Healthcare Access by Rural-Urban Classification in England", "Reference Category: Urban City and Town", x_label = "Exponentiated Coefficient", middle_point = 1, caption = "*95% CI Indicated by Green Error Bars. Red dashed line indicates reference group. Controlling for income deprivation (2015) and housing affordability indicators (2016).")

# Model 3 expands on Model 2 by including the Rural-Urban Classification (RUC11) as a factor variable, allowing us to account for geographic differences in average travel time to healthcare services. The model estimates the logarithm of average travel time to a healthcare service within a Lower Layer Super Output Area (LSOA) as a function of the logarithm of the income deprivation rate, the housing affordability indicator, and the RUC11 categories. We observe that a 1% increase in the income deprivation rate is associated with a 0.10% decrease in average travel time to healthcare services, when holding the other variables constant. This relationship is statistically significant at the 5% level, with a p-value less than 0.05, indicating that income deprivation continues to be an important factor in explaining variations in healthcare access, albeit with a smaller coefficient compared to Model 2. Similarly, a 1 unit increase in the housing affordability indicator, which reflects increased difficulty in entering owner-occupation or the private rental market, is associated with a 7% decrease in average travel time to healthcare services. This relationship is also statistically significant at the 5% level. The new variables introduced, representing the RUC11 categories, show significant variations in healthcare access across different geographic areas. For instance, compared to Urban city and town areas, Rural village and dispersed areas experience a 344% increase in average travel time to healthcare services, indicating that rural areas face substantial barriers in healthcare access. These relationships are statistically significant at the 5% level, with the exception of Urban city and town in a sparse setting category, which does not show a significant difference in healthcare access compared to Urban city and town areas. The adjusted coefficient of determination for this model is 0.537, suggesting that the model explains approximately 53.7% of the variation in average travel time to healthcare services. The inclusion of the RUC11 categories significantly enhances the explanatory power of the model compared to Model 2. The p-value associated with the F-statistic is also significant at the 5% level, indicating that at least one variable in the model has some explanatory power regarding average healthcare distance. In conclusion, Model 3 provides a more nuanced understanding of the factors affecting healthcare access, by accounting for both economic and geographic variables. The significant differences in healthcare access across the RUC11 categories highlight the importance of considering geographic disparities when analyzing healthcare access.
results <- stargazer(mod1, mod2, mod3, type="html",
          dep.var.labels=c("log(Mean Healthcare Service Distance [MHSD])"),
          covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]","Rural town and fringe [RUC 2011]","Rural town and fringe in a sparse setting [RUC 2011]","Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]", "Urban city and town in a sparse setting [RUC 2011]", "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
          notes = "Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.",
          ci = TRUE)
# Checking assumptions: Model 1 
# Adding residual values to dataframe
complete_case_sample$resids1 <- residuals(mod1)

# Linearity (using partial regression plots)
PartialRegressionPlot (mod1, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR)")

# Constant variance of residuals 
HomoskedasticityPlot(mod1, add_loess_line = TRUE, title = "Model 1 Scale-Location")

# Durbin-Watson test
dw1 <- dwtest(mod1)

# Create a data frame with the Durbin-Watson test results
dw_df <- data.frame(
  Statistic = dw1$statistic,
  P = format.pval(dw1$p.value, digits = 16)
)

# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df, type = "html", title = "Durbin-Watson Test Results", 
          summary = FALSE, rownames = FALSE, out = 'dw_test_results.html')

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results.html')

# Normality of residuals check
QQPlotModel(mod1, subtitle = "QQ-Plot for Model 1")

# Zero conditional mean of residual values
mean(mod1$residuals)
resid_mod1 <- lm(resids1 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(resid_mod1)

# Zero conditional mean regression results
stargazer(resid_mod1, type="html",
          dep.var.labels=c("Residuals of Model 1"),
          covariate.labels=c("log(Income Deprivation Rate 2015)"), out = "residmod1.html")
rstudioapi::viewer("residmod1.html")

# Checking assumptions: Model 2
# Adding residual values to dataframe
complete_case_sample$resids2 <- residuals(mod2)

# Linearity (using partial regression plots)
PartialRegressionPlot (mod2, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot (mod2, 'housing_affordability_score_2016', x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values1 <- vif(mod2)

# storing VIF in a dataframe 
vif_df1 <- data.frame(Variable = names(vif_values1), VIF = vif_values1)
vif_df1$Variable[1] <- "log(Income Deprivation Rate 2015)"
vif_df1$Variable[2] <- "Housing Affordability Score 2016"

# creating a pretty table
stargazer(vif_df1, type = "html", title = "VIF Values", summary = FALSE, 
          rownames = FALSE, out = 'mod2vif.html')

# using rstudioapi to view the table
rstudioapi::viewer("mod2vif.html")

# Constant variance of residuals 
HomoskedasticityPlot(mod2, add_loess_line = TRUE, title = "Model 2 Scale-Location")

# Durbin-Watson test
dw2 <- dwtest(mod2)

# Create a data frame with the Durbin-Watson test results
dw_df2 <- data.frame(
  Statistic = dw2$statistic,
  P = ifelse(dw2$p.value > 0.05, '> 0.05',
             ifelse(dw2$p.value < 0.01, '< 0.01', '< 0.05'))
)

# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df2, type = "html", title = "Durbin-Watson Test Results", 
          summary = FALSE, rownames = FALSE, out = 'dw_test_results2.html')

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results2.html')

# Normality of residuals check
QQPlotModel(mod2, subtitle = "QQ-Plot for Model 2")

# Zero conditional mean of residual values
mean(mod2$residuals)
resid_mod2 <- lm(resids2 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)
summary(resid_mod2)

# Zero conditional mean regression results
stargazer(resid_mod2, type="html",
          dep.var.labels=c("Residuals of Model 2"),
          covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]"), out = "residmod2.html")
rstudioapi::viewer("residmod2.html")

# Checking assumptions: Model 3 
# Adding residual values to dataframe
complete_case_sample$resids3 <- residuals(mod3)

# Linearity (using partial regression plots)
PartialRegressionPlot (mod3, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot (mod3, 'housing_affordability_score_2016', x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values2 <- data.frame(vif(mod3))
vif_values2$Variable <- c('log(Income Deprivation Rate 2015)', 'Housing Affordability Score 2016', 'Rural Urban Classification 2011')
vif_df2 = vif_values2[, c("Variable", "GVIF")] 

# creating a pretty table
stargazer(vif_df2, type = "html", title = "GVIF Values", summary = FALSE, 
          rownames = FALSE, out = 'mod3vif.html')

# using rstudioapi to view the table
rstudioapi::viewer("mod3vif.html")

# Constant variance of residuals plot
HomoskedasticityPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location")

# Plot indicating groups
HomoskedasticityDetailPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location", dataframe = complete_case_sample, categorical_var_name = "rural_urban_class_2011")

# Testing for non-constant variance across groups:
leveneTest(complete_case_sample$resids3, complete_case_sample$rural_urban_class_2011, center=mean)

# df for levene's test results
levene_results <- data.frame(
  `Degrees of Freedom (Group)` = 7,
  `Degrees of Freedom (Residual)` = 32706,
  `F Value` = 189.04,
  `P Value` = '< 0.01'
)

names(levene_results) <- c("Degrees of Freedom (Group)", 
                           "Degrees of Freedom (Residual)", 
                           "F Value", 
                           "P Value")

# Viewing the table with descriptive column names
stargazer(levene_results, 
          type = "html", 
          title = "Levene's Test for Homogeneity of Variance", 
          out = "levene_test.html", 
          summary = FALSE, 
          rownames = FALSE)
rstudioapi::viewer('levene_test.html')



# Durbin-Watson test
dw3 <- dwtest(mod3)

# Create a data frame with the Durbin-Watson test results
dw_df3 <- data.frame(
  Statistic = dw3$statistic,
  P = ifelse(dw3$p.value > 0.05, '> 0.05',
             ifelse(dw3$p.value < 0.01, '< 0.01', '< 0.05'))
)

# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df3, type = "html", title = "Durbin-Watson Test Results", 
          summary = FALSE, rownames = FALSE, out = 'dw_test_results3.html')

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results3.html')

# When considering the independence of residuals, we might have reason to think that this assumption is not satisfied within the model, as it is likely that some degree of spatial autocorrelation could be influencing our results - that is, that LSOAs closer to each-other could theoretically have more similar values of X. This idea is supported by the results of the durbin-watson test statistics shown - notably, getting closer to 2 (representing less reduced autocorrelation) when including a variable that partially accounts for spatiality (the RUC2011 indicator). As such, we do see a considerable amount of positive autocorrelation in the data (according to a Durbin-Watson statistic of 0.83 being considerably below 2). This means that residuals that follow each other in the data tend to be slightly similar in magnitude. Considering the null hypothesis of no autocorrelation and considering the p-value being significant considering an alpha of 0.05, we fail to reject the null hypothesis that there is no autocorrelation presented by the variables within our model. Thus, we might not be confident that the independence of residuals assumption is satisfied.

# Normality of residuals check
QQPlotModel(mod3, subtitle = "QQ-Plot for Model 3")

# Zero conditional mean of residual values
mean(mod3$residuals)
resid_mod3 <- lm(resids3 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + rural_urban_class_2011, data = complete_case_sample)
summary(resid_mod3)

# Zero conditional mean regression results
stargazer(resid_mod3, type="html",
          dep.var.labels=c("Residuals of Model 3"),
          covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]","Rural town and fringe [RUC 2011]","Rural town and fringe in a sparse setting [RUC 2011]","Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]", "Urban city and town in a sparse setting [RUC 2011]", "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"), out = "residmod3.html")
rstudioapi::viewer("residmod3.html")

Assessing the Impact of Economic Deprivation on Healthcare Access Across LSOAs in England

Abstract

Libraries used

Functions created for the purposes of this research

Cleaning and joining the data

Variable distributions

Log transformations

Summary statistics with log transformed variables

Model creation

Model results

Assumption checks

Appendix (complete code)