Abstract

Healthcare deprivation has long been influenced by factors such as geographical access and living conditions. Despite prior research that investigated such factors, there is still a dearth of studies that support how economic deprivation influences geographical access to such services. Our analyses reveal that while economic deprivation somewhat decreases travel time, this relationship is significantly mediated by housing costs and urban-rural classification. The findings, which also reflect on model assumptions and are supported by external literature, highlight the complexities of healthcare access and underscore the need for inclusive approaches in future research to capture the multifaceted nature of economic influences.

Libraries used

# Libraries used
library(tidyverse)
library(haven)
library(dplyr)
library(readxl)
library(GGally)
library(forcats)
library(stargazer)
library(lme4)
library(car)
library(rstudioapi)
library(knitr)
library(kableExtra)

This chunk loads the necessary libraries for data manipulation, analysis, and plotting.

# Turning off scientific notation
options(scipen = 9999)

This line sets the option to avoid scientific notation in outputs.

Functions created for the purposes of this research

# Bespoke functions used ----
PointPlot <- function(dataset, independent_var, dependent_var, colour_var = NULL, title = NULL, subtitle = NULL, caption = NULL, x_label = "X variable", y_label = "Y variable", point_size = 1, line_intercept = NULL, line_slope = NULL, line_colour = "black", line_linewidth = 1, line_linetype = "solid", add_regression_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Adjust the aesthetic mapping based on the presence of the third variable
  if(!is.null(colour_var)) {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var, colour = colour_var))
  } else {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var))
  }
  
  # Base point plot
  plot <- plot + geom_point(alpha = 1, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'),
          axis.title = element_text(family = 'Roboto'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = ifelse(is.null(colour_var), 'none', 'right'),
          legend.text = element_text(family = 'Roboto')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = y_label)
  
  # Add manual line if intercept and slope are provided
  if (!is.null(line_intercept) & !is.null(line_slope)) {
    plot <- plot + geom_abline(intercept = line_intercept, 
                               slope = line_slope, 
                               colour = line_colour, 
                               linewidth = line_linewidth, 
                               linetype = line_linetype)
  }
  
  # Add regression line with confidence interval if requested
  if (add_regression_line) {
    plot <- plot + geom_smooth(method = "lm", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}

DistPlot <- function(dataset, variable, formatted_name, subtitle = NULL, caption = NULL, y_label = NULL) {
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  plot <- ggplot(dataset, aes_string(x = variable)) +
    geom_histogram(aes(y = ..density..),
                   colour = 1, fill = "white") +
    geom_density(fill = '#0072C6', colour = '#0072C6', alpha = 0.25) +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                      family = 'Roboto'),
          axis.title.x = element_blank(),  # This removes the x-axis title
          axis.title.y = element_text(size = 15.5,
                                      family = 'Roboto',
                                      colour = 'black'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = 'none') +
    labs(title = paste0("Distribution of ", formatted_name),
         subtitle = subtitle,
         caption = caption,
         y = y_label)
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  plot
}

ConfidencePlot <- function(data, title, subtitle, middle_point = 0, x_label = "Estimated Coefficient", caption = NULL) {
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  if (!middle_point %in% c(0, 1)) {
    stop("middle_point must be 0 or 1")
  }
  
  conf_plot <- ggplot(data, aes(x = coefs, y = names)) +
    geom_errorbar(aes(xmin = lower, xmax = upper),
                  col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "forestgreen"),
                  width = 0.5, size = 1, alpha = 0.5) +
    geom_point(col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "black"), size = 1, alpha = 1) +
    geom_vline(xintercept = middle_point, col = 'red', size = 0.5, linetype="dashed") +
    theme_classic() +
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = "") +
    theme(axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          axis.title = element_text(family = 'Roboto', size = 15.5),
          plot.title = element_text(hjust = 0, colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto', margin = margin(b = 10)),
          plot.subtitle = element_text(hjust = 0, colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto', margin = margin(b = 5)),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'))
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", conf_plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = conf_plot, width = 1200/100, height = 740/100, dpi = 100)  # Adjust height as necessary
  
  return(conf_plot)
}

PartialRegressionPlot <- function(model, variable, x_var_name = "X1", y_var_name = "Dependent Variable", title = NULL) {
  # Close any existing plotting device
  graphics.off()
  
  # Construct axis labels using the specified names
  xlab <- paste("Residuals of", x_var_name, "Controlling for Other Variables")
  ylab <- paste("Residuals of", y_var_name, "(Excluding", x_var_name, ")")
  
  # Create the plot without a main title
  avPlots(model, variable, xlab = xlab, ylab = ylab)
  
  # Add a title using the title function
  title(main = title)
}

HomoskedasticityPlot <- function(model, title = "Homoskedasticity Check", subtitle = NULL, 
                                 x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                 point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, StdResiduals = std_residuals)
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  if (bp_test$p.value < 0.05) {
    p_value_text <- "p < 0.05"
  } else if (bp_test$p.value > 0.05) {
    p_value_text <- "p > 0.05"
  } else {
    p_value_text <- "p = 0.05"
  }
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 50, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 30, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 50, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 30, family = 'Roboto'),
          axis.text = element_text(size = 30, family = 'Roboto', colour = 'black'),
          legend.position = 'none') + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 5.91, height = 4.50, dpi = 300)
  
  return(plot)
}

HomoskedasticityDetailPlot <- function(model, dataframe, categorical_var_name, 
                                       title = "Homoskedasticity Check", subtitle = NULL, 
                                       x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                       point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Ensure the categorical variable exists in the dataframe
  if (!categorical_var_name %in% names(dataframe)) {
    stop("Specified categorical variable not found in the provided dataframe.")
  }
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, 
                          StdResiduals = std_residuals, 
                          Category = dataframe[[categorical_var_name]])
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  p_value_text <- ifelse(bp_test$p.value < 0.05, "p < 0.05",
                         ifelse(bp_test$p.value > 0.05, "p > 0.05", "p = 0.05"))
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals, color = Category)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 15.5, family = 'Roboto'),
          axis.text = element_text(size = 15.5, family = 'Roboto', colour = 'black'),
          legend.position = 'right', legend.text = element_text(size = 12.5, family = 'Roboto')) + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2, se = FALSE)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 10, height = 6.17, dpi = 100)
  
  return(plot)
}

QQPlotModel <- function(model, title = 'Normality of Residuals Check', subtitle = 'QQ-Plot', caption = NULL) {
  
  # Loading required libraries
  library(ggplot2)
  
  # Extract residuals from the model
  model_residuals <- residuals(model)
  
  # Create the QQ-plot
  plot <- ggplot() +
    stat_qq(aes(sample = model_residuals)) +
    stat_qq_line(aes(sample = model_residuals), colour = "blue") +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic'),
          axis.title = element_text(),
          axis.text = element_text(size = 15.5, colour = 'black')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = "Theoretical Quantiles",
         y = "Sample Quantiles")
  
  # Create a filename from the plot title or use a default one
  file_name <- if (!is.null(title)) {
    paste0(gsub(" ", "_", title), ".png")
  } else {
    "QQPlot.png"
  }
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}

Cleaning and joining the data

AHAH_df <- read.csv("Group_7_AHAH_V3_0.csv")  # Access to healthy assets and hazards V3 data
IOD_income_df <- read.csv("Group_7_IoD2019_Income.csv")  # Index of Multiple Deprivation Income Domian Indicator Data (measured in 2015)
IOD_health_df <- read.csv("Group_7_IoD2019_Health.csv")  # Index of Multiple Deprivation Health Domian Indicator Data
IOD_barriers_df <- read.csv("Group_7_IoD2019_Barriers.csv")  # Index of Multiple Deprivation Barriers to Housing and Services Domain Indicator Data
IOD_education_df <- read.csv("Group_7_IoD2019_Education.csv")  # Index of Multiple Deprivation Education, Skills and Training Deprivation Domain Indicator Data
IOD_employment_df <- read.csv("Group_7_IoD2019_Employment.csv")  # Index of Multiple Deprivation Employment Deprivation Domain Indicator Data (measured in 2015)
ONS_MIDYEAR_2015 <- read.csv("Group_7_ONS_MIDYEAR_2015.csv")  # Mid-year population estimates for 2015 in England and Wales (for creating rates from the numerator values given in the income and employment measures)
RUC2011 <- read.csv("Group_7_RUC2011.csv")

The above code reads in the raw data, available to download from the sources provided.

## Renaming columns for consistent joining
AHAH_df <- AHAH_df %>%
    rename(LSOA.code..2011. = lsoa11)
ONS_MIDYEAR_2015 <- ONS_MIDYEAR_2015 %>%
    rename(LSOA.code..2011. = Area.Codes)
RUC2011 <- RUC2011 %>%
    rename(LSOA.code..2011. = LSOA11CD)

## Creating complete combined dataset
complete_combined_data <- IOD_barriers_df %>%
    left_join(IOD_education_df, by = "LSOA.code..2011.") %>%
    left_join(IOD_employment_df, by = "LSOA.code..2011.") %>%
    left_join(IOD_health_df, by = "LSOA.code..2011.") %>%
    left_join(IOD_income_df, by = "LSOA.code..2011.") %>%
    left_join(AHAH_df, by = "LSOA.code..2011.") %>%
    left_join(ONS_MIDYEAR_2015, by = "LSOA.code..2011.") %>%
    left_join(RUC2011, by = "LSOA.code..2011.")

## Retaining only columns of interest
combined_data <- complete_combined_data %>%
    select(LSOA.code..2011., Local.Authority.District.name..2019.,
        Income.Domain.numerator, Years.of.potential.life.lost.indicator,
        Housing.affordability.indicator, Staying.on.in.education.post.16.indicator,
        Employment.Domain.numerator, ah3gp, ah3hosp, ah3dent,
        ah3phar, All.Ages, RUC11)

## Removing commas and converting columns 3:12 to numeric
combined_data <- combined_data %>%
    mutate(across(.cols = 3:12, .fns = ~as.numeric(str_remove_all(.x,
        ","))))

## Creating income and employment deprivation rate
## variables
combined_data$income_deprivation_rate <- (combined_data$Income.Domain.numerator/combined_data$All.Ages) *
    100
combined_data$employment_deprivation_rate <- (combined_data$Employment.Domain.numerator/combined_data$All.Ages) *
    100

## Creating average distance (in minutes) to a healthcare
## service (including GP practice, hospital, dentist and
## pharmacy)
combined_data$avg_healthcare_distance <- rowMeans(combined_data[,
    c("ah3gp", "ah3hosp", "ah3dent", "ah3phar")], na.rm = TRUE)

## Renaming column names for understanding
colnames(combined_data) <- c("lsoa_code", "local_authority_name",
    "no._of_income_deprived_individuals_2015", "years_of_lives_lost_2013_2017",
    "housing_affordability_score_2016", "proportion_of_above16_individuals_inschool_2010_2012",
    "no._of_deprived_employment_2015_2016", "avg_dist_gppractice_in_mins_2022",
    "avg_dist_hospital_in_mins_2022", "avg_dist_dentist_in_mins_2022",
    "avg_dist_pharmacy_in_mins_2022", "population_estimates_for_allages_2015",
    "rural_urban_class_2011", "income_deprivation_rate_2015",
    "employment_deprivation_rate_2015", "avg_healthcare_distance_2022")

The above code reads in multiple CSV files containing various indicators of deprivation and population data. It then proceeds to clean and merge these datasets into a single comprehensive dataset by renaming inconsistent column names to facilitate joining. A series of left joins creates our combined dataset which is further refined by selecting only the columns of interest. Subsequent operations include cleaning numeric data by removing commas, calculating rates of income and employment deprivation, and computing the average distance to healthcare services. The final steps involve renaming columns to more descriptive titles and preparing the dataset for further analysis, ensuring there are no duplicate entries.

## Saving as CSV
write.csv(combined_data, file = "Group_7_combined_data.csv",
    row.names = FALSE)
write.csv(complete_combined_data, file = "Group_7_complete_combined_data.csv",
    row.names = FALSE)

The above code saves our shortened dataframes as csv files.

## Loading combined data
combined_data <- read.csv("Group_7_combined_data.csv")

## Checking for duplicates
There_are_duplicates <- any(duplicated(combined_data))
if (There_are_duplicates) {
    cat("There are duplicates in the combined_data.\n")
} else {
    cat("There are no duplicates in combined_data.\n")
}
## There are no duplicates in combined_data.
# Calculate missingness percentages for each numerical
# variable (IV and DV)
missingness <- sapply(combined_data, function(x) mean(is.na(x)) *
    100)
missingness_summary <- data.frame(variable = names(missingness),
    Missingness_percentage = missingness)
missingness_summary
##                                                                                                  variable
## lsoa_code                                                                                       lsoa_code
## local_authority_name                                                                 local_authority_name
## no._of_income_deprived_individuals_2015                           no._of_income_deprived_individuals_2015
## years_of_lives_lost_2013_2017                                               years_of_lives_lost_2013_2017
## housing_affordability_score_2016                                         housing_affordability_score_2016
## proportion_of_above16_individuals_inschool_2010_2012 proportion_of_above16_individuals_inschool_2010_2012
## no._of_deprived_employment_2015_2016                                 no._of_deprived_employment_2015_2016
## avg_dist_gppractice_in_mins_2022                                         avg_dist_gppractice_in_mins_2022
## avg_dist_hospital_in_mins_2022                                             avg_dist_hospital_in_mins_2022
## avg_dist_dentist_in_mins_2022                                               avg_dist_dentist_in_mins_2022
## avg_dist_pharmacy_in_mins_2022                                             avg_dist_pharmacy_in_mins_2022
## population_estimates_for_allages_2015                               population_estimates_for_allages_2015
## rural_urban_class_2011                                                             rural_urban_class_2011
## income_deprivation_rate_2015                                                 income_deprivation_rate_2015
## employment_deprivation_rate_2015                                         employment_deprivation_rate_2015
## avg_healthcare_distance_2022                                                 avg_healthcare_distance_2022
##                                                      Missingness_percentage
## lsoa_code                                                        0.00000000
## local_authority_name                                             0.00000000
## no._of_income_deprived_individuals_2015                          0.09438558
## years_of_lives_lost_2013_2017                                    0.00000000
## housing_affordability_score_2016                                 0.00000000
## proportion_of_above16_individuals_inschool_2010_2012             0.00000000
## no._of_deprived_employment_2015_2016                             0.36231884
## avg_dist_gppractice_in_mins_2022                                 0.00000000
## avg_dist_hospital_in_mins_2022                                   0.00000000
## avg_dist_dentist_in_mins_2022                                    0.00000000
## avg_dist_pharmacy_in_mins_2022                                   0.00000000
## population_estimates_for_allages_2015                            0.00000000
## rural_urban_class_2011                                           0.00000000
## income_deprivation_rate_2015                                     0.09438558
## employment_deprivation_rate_2015                                 0.36231884
## avg_healthcare_distance_2022                                     0.00000000
# Removing Null Values from data set
combined_data <- na.omit(combined_data)

# First 6 rows of data
head(combined_data)
##   lsoa_code local_authority_name no._of_income_deprived_individuals_2015
## 2 E01000002       City of London                                      40
## 3 E01000003       City of London                                     115
## 4 E01000005       City of London                                     235
## 5 E01000006 Barking and Dagenham                                     235
## 6 E01000007 Barking and Dagenham                                     435
## 7 E01000008 Barking and Dagenham                                     420
##   years_of_lives_lost_2013_2017 housing_affordability_score_2016
## 2                        69.045                            1.586
## 3                        82.254                            3.063
## 4                        67.035                            3.383
## 5                        59.895                            3.515
## 6                        66.861                            3.183
## 7                        73.391                            2.932
##   proportion_of_above16_individuals_inschool_2010_2012
## 2                                                0.279
## 3                                                0.201
## 4                                                0.272
## 5                                                0.100
## 6                                                0.113
## 7                                                0.291
##   no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022
## 2                                   15                         3.146895
## 3                                   70                         1.151000
## 4                                   95                         1.462800
## 5                                   75                         1.585000
## 6                                  150                         0.177500
## 7                                  145                         2.641000
##   avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022
## 2                       1.448895                        0.8875
## 3                       0.649500                        0.0000
## 4                       1.224050                        0.5650
## 5                       1.585000                        2.1500
## 6                       0.517500                        0.6475
## 7                       2.861000                        3.1110
##   avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015
## 2                      0.8791789                                  1156
## 3                      0.3475000                                  1350
## 4                      0.9426000                                  1121
## 5                      1.8675000                                  2040
## 6                      1.0775000                                  2101
## 7                      3.2585000                                  1566
##    rural_urban_class_2011 income_deprivation_rate_2015
## 2 Urban major conurbation                     3.460208
## 3 Urban major conurbation                     8.518519
## 4 Urban major conurbation                    20.963426
## 5 Urban major conurbation                    11.519608
## 6 Urban major conurbation                    20.704426
## 7 Urban major conurbation                    26.819923
##   employment_deprivation_rate_2015 avg_healthcare_distance_2022
## 2                         1.297578                     1.590617
## 3                         5.185185                     0.537000
## 4                         8.474576                     1.048612
## 5                         3.676471                     1.796875
## 6                         7.139457                     0.605000
## 7                         9.259259                     2.967875
# Last 6 rows of data
tail(combined_data)
##       lsoa_code local_authority_name no._of_income_deprived_individuals_2015
## 32839 E01033763            Liverpool                                     630
## 32840 E01033764            Liverpool                                    1275
## 32841 E01033765            Liverpool                                     525
## 32842 E01033766            Liverpool                                     105
## 32843 E01033767            Liverpool                                     405
## 32844 E01033768            Liverpool                                     350
##       years_of_lives_lost_2013_2017 housing_affordability_score_2016
## 32839                       101.024                            1.262
## 32840                        86.855                            1.415
## 32841                        75.078                            1.396
## 32842                        75.244                           -1.862
## 32843                        83.476                            1.752
## 32844                        91.496                            1.505
##       proportion_of_above16_individuals_inschool_2010_2012
## 32839                                                0.329
## 32840                                                0.241
## 32841                                                0.207
## 32842                                                0.142
## 32843                                                0.175
## 32844                                                0.175
##       no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022
## 32839                                  305                          1.19380
## 32840                                  555                          1.54425
## 32841                                  255                          1.77000
## 32842                                   55                          1.07000
## 32843                                  190                          3.26270
## 32844                                  245                          0.85200
##       avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022
## 32839                        0.82660                        2.3109
## 32840                        2.07095                        3.1923
## 32841                        1.43750                        1.9050
## 32842                        1.07000                        2.7176
## 32843                        1.16050                        2.6112
## 32844                        0.85200                        1.0795
##       avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015
## 32839                         0.9740                                  1679
## 32840                         1.6249                                  2712
## 32841                         1.9050                                  1445
## 32842                         1.0700                                  1054
## 32843                         1.6750                                  1019
## 32844                         1.1720                                  1281
##        rural_urban_class_2011 income_deprivation_rate_2015
## 32839 Urban major conurbation                    37.522335
## 32840 Urban major conurbation                    47.013274
## 32841 Urban major conurbation                    36.332180
## 32842 Urban major conurbation                     9.962049
## 32843 Urban major conurbation                    39.744848
## 32844 Urban major conurbation                    27.322404
##       employment_deprivation_rate_2015 avg_healthcare_distance_2022
## 32839                        18.165575                     1.326325
## 32840                        20.464602                     2.108100
## 32841                        17.647059                     1.754375
## 32842                         5.218216                     1.481900
## 32843                        18.645731                     2.177350
## 32844                        19.125683                     0.988875

After loading the dataset, the above code checks for any duplicate rows, and outputs a message indicating whether duplicates are found. The script then calculates the percentage of missing values (NA values) for each variable in the dataset, summarizing the results in a data frame that lists variables alongside their respective percentages of missingness. Subsequently, it removes all rows with any missing values from the dataset. Finally, the script displays the first and last six rows of the cleaned dataset, which serves as a quick check of the data after the aforementioned operations, giving a glimpse of the data structure post-cleaning. # Initial summary statistics

summary(combined_data)

lsoa_code local_authority_name Length:32715 Length:32715
Class :character Class :character
Mode :character Mode :character

no._of_income_deprived_individuals_2015 years_of_lives_lost_2013_2017 Min. : 10.0 Min. : 18.43
1st Qu.: 90.0 1st Qu.: 47.36
Median : 165.0 Median : 54.78
Mean : 214.3 Mean : 57.29
3rd Qu.: 300.0 3rd Qu.: 64.53
Max. :1435.0 Max. :184.73
housing_affordability_score_2016 Min. :-6.658000
1st Qu.:-1.277000
Median :-0.013000
Mean : 0.000486
3rd Qu.: 1.297000
Max. : 8.019000
proportion_of_above16_individuals_inschool_2010_2012 Min. :0.0380
1st Qu.:0.1360
Median :0.1780
Mean :0.1855
3rd Qu.:0.2280
Max. :0.5500
no._of_deprived_employment_2015_2016 avg_dist_gppractice_in_mins_2022 Min. : 10.0 Min. : 0.000
1st Qu.: 45.0 1st Qu.: 1.803
Median : 75.0 Median : 3.022
Mean : 96.5 Mean : 4.660
3rd Qu.:130.0 3rd Qu.: 5.251
Max. :670.0 Max. :50.528
avg_dist_hospital_in_mins_2022 avg_dist_dentist_in_mins_2022 Min. : 0.000 Min. : 0.000
1st Qu.: 1.561 1st Qu.: 1.725
Median : 2.614 Median : 2.936
Mean : 4.301 Mean : 4.857
3rd Qu.: 4.598 3rd Qu.: 5.187
Max. :53.912 Max. :94.747
avg_dist_pharmacy_in_mins_2022 population_estimates_for_allages_2015 Min. : 0.000 Min. : 614
1st Qu.: 1.502 1st Qu.:1448
Median : 2.383 Median :1599
Mean : 3.953 Mean :1669
3rd Qu.: 3.908 3rd Qu.:1802
Max. :64.166 Max. :9551
rural_urban_class_2011 income_deprivation_rate_2015 Length:32715 Min. : 0.1912
Class :character 1st Qu.: 5.5485
Mode :character Median : 9.9291
Mean :12.8100
3rd Qu.:17.7987
Max. :60.9177
employment_deprivation_rate_2015 avg_healthcare_distance_2022 Min. : 0.1912 Min. : 0.000
1st Qu.: 2.7865 1st Qu.: 1.877
Median : 4.6030 Median : 2.898
Mean : 5.8027 Mean : 4.443
3rd Qu.: 7.8489 3rd Qu.: 4.661
Max. :34.7452 Max. :60.310

# Descriptive table for numerical variables
stargazer(combined_data, type = "html", digits = 1, title = "Table 1: Summary Statistics",
    out = "table.html")
Table 1: Summary Statistics
Statistic N Mean St. Dev. Min Max
no._of_income_deprived_individuals_2015 32,715 214.3 164.4 10 1,435
years_of_lives_lost_2013_2017 32,715 57.3 14.0 18.4 184.7
housing_affordability_score_2016 32,715 0.000 1.9 -6.7 8.0
proportion_of_above16_individuals_inschool_2010_2012 32,715 0.2 0.1 0.04 0.6
no._of_deprived_employment_2015_2016 32,715 96.5 68.7 10 670
avg_dist_gppractice_in_mins_2022 32,715 4.7 4.9 0.0 50.5
avg_dist_hospital_in_mins_2022 32,715 4.3 5.0 0.0 53.9
avg_dist_dentist_in_mins_2022 32,715 4.9 5.8 0.0 94.7
avg_dist_pharmacy_in_mins_2022 32,715 4.0 5.1 0.0 64.2
population_estimates_for_allages_2015 32,715 1,668.6 364.9 614 9,551
income_deprivation_rate_2015 32,715 12.8 9.4 0.2 60.9
employment_deprivation_rate_2015 32,715 5.8 4.0 0.2 34.7
avg_healthcare_distance_2022 32,715 4.4 4.8 0.0 60.3
# Use rstudioapi to view the table
rstudioapi::viewer("table.html")

NULL

## SD of numerical variable
sd(combined_data$lsoa_code)

[1] NA

sd(combined_data$no._of_income_deprived_individuals_2015)

[1] 164.3886

sd(combined_data$years_of_lives_lost_2013_2017)

[1] 14.01493

sd(combined_data$housing_affordability_score_2016)

[1] 1.904849

sd(combined_data$proportion_of_above16_individuals_inschool_2010_2012)

[1] 0.06606487

sd(combined_data$no._of_deprived_employment_2015_2016)

[1] 68.72312

sd(combined_data$avg_dist_gppractice_in_mins_2022)

[1] 4.932105

sd(combined_data$avg_dist_hospital_in_mins_2022)

[1] 4.991137

sd(combined_data$avg_dist_dentist_in_mins_2022)

[1] 5.822524

sd(combined_data$avg_dist_pharmacy_in_mins_2022)

[1] 5.062794

sd(combined_data$population_estimates_for_allages_2015)

[1] 364.8729

sd(combined_data$income_deprivation_rate_2015)

[1] 9.412047

sd(combined_data$employment_deprivation_rate_2015)

[1] 4.007918

sd(combined_data$avg_healthcare_distance_2022)

[1] 4.752529 The above code provides a summary of the combined_data dataset, first by generating a descriptive statistics table, which includes mean, median, minimum, maximum, and quartile values for each numerical variable in the dataset. This table is created using the stargazer package, formatted as HTML, titled “Table 1: Summary Statistics”, and saved to a file named “table.html”. It then uses rstudioapi::viewer to display this HTML table within RStudio’s viewer pane for easy inspection. Following that, the code calculates the standard deviation (SD) for each numerical variable within combined_data individually, which quantifies the amount of variation or dispersion of a set of values.

# The number of unique names in the rural_urban_class
# column
unique_names <- table(combined_data$rural_urban_class_2011)
print(unique_names)
## 
##                           Rural town and fringe 
##                                            2930 
##       Rural town and fringe in a sparse setting 
##                                             119 
##                     Rural village and dispersed 
##                                            2356 
## Rural village and dispersed in a sparse setting 
##                                             181 
##                             Urban city and town 
##                                           14399 
##         Urban city and town in a sparse setting 
##                                              59 
##                         Urban major conurbation 
##                                           11465 
##                         Urban minor conurbation 
##                                            1206
rural_urban_class_2011_counts <- as.numeric(unique_names)
print(rural_urban_class_2011_counts)
## [1]  2930   119  2356   181 14399    59 11465  1206
## Summary statistics for Rural_urban_class_2011
max(rural_urban_class_2011_counts)
## [1] 14399
min(rural_urban_class_2011_counts)
## [1] 59
median(rural_urban_class_2011_counts)
## [1] 1781
IQR(rural_urban_class_2011_counts)
## [1] 4898.25
# The number of unique names in the local_authority_name
# column
unique_names_1 <- table(combined_data$local_authority_name)
print(unique_names_1)
## 
##                                Adur                           Allerdale 
##                                  42                                  60 
##                        Amber Valley                                Arun 
##                                  78                                  94 
##                            Ashfield                             Ashford 
##                                  74                                  78 
##                      Aylesbury Vale                             Babergh 
##                                 114                                  54 
##                Barking and Dagenham                              Barnet 
##                                 110                                 209 
##                            Barnsley                   Barrow-in-Furness 
##                                 147                                  49 
##                            Basildon               Basingstoke and Deane 
##                                 109                                 109 
##                           Bassetlaw        Bath and North East Somerset 
##                                  70                                 114 
##                             Bedford                              Bexley 
##                                 103                                 146 
##                          Birmingham                               Blaby 
##                                 638                                  60 
##               Blackburn with Darwen                           Blackpool 
##                                  91                                  94 
##                            Bolsover                              Bolton 
##                                  48                                 177 
##                              Boston Bournemouth, Christchurch and Poole 
##                                  36                                 233 
##                    Bracknell Forest                            Bradford 
##                                  75                                 308 
##                           Braintree                           Breckland 
##                                  87                                  78 
##                               Brent                           Brentwood 
##                                 173                                  45 
##                   Brighton and Hove                    Bristol, City of 
##                                 165                                 262 
##                           Broadland                             Bromley 
##                                  84                                 195 
##                          Bromsgrove                          Broxbourne 
##                                  58                                  56 
##                            Broxtowe                             Burnley 
##                                  71                                  60 
##                                Bury                          Calderdale 
##                                 120                                 128 
##                           Cambridge                              Camden 
##                                  67                                 132 
##                       Cannock Chase                          Canterbury 
##                                  60                                  90 
##                            Carlisle                        Castle Point 
##                                  68                                  57 
##                Central Bedfordshire                           Charnwood 
##                                 157                                  99 
##                          Chelmsford                          Cheltenham 
##                                 107                                  75 
##                            Cherwell                       Cheshire East 
##                                  93                                 232 
##           Cheshire West and Chester                        Chesterfield 
##                                 212                                  69 
##                          Chichester                            Chiltern 
##                                  71                                  54 
##                             Chorley                      City of London 
##                                  66                                   4 
##                          Colchester                            Copeland 
##                                 105                                  49 
##                               Corby                            Cornwall 
##                                  41                                 326 
##                            Cotswold                       County Durham 
##                                  51                                 324 
##                            Coventry                              Craven 
##                                 195                                  32 
##                             Crawley                             Croydon 
##                                  66                                 220 
##                             Dacorum                          Darlington 
##                                  91                                  65 
##                            Dartford                            Daventry 
##                                  58                                  44 
##                               Derby                    Derbyshire Dales 
##                                 151                                  43 
##                           Doncaster                              Dorset 
##                                 194                                 218 
##                               Dover                              Dudley 
##                                  67                                 201 
##                              Ealing                 East Cambridgeshire 
##                                 196                                  50 
##                          East Devon                      East Hampshire 
##                                  81                                  71 
##                  East Hertfordshire                        East Lindsey 
##                                  84                                  81 
##               East Northamptonshire            East Riding of Yorkshire 
##                                  49                                 210 
##                  East Staffordshire                        East Suffolk 
##                                  72                                 146 
##                          Eastbourne                           Eastleigh 
##                                  61                                  77 
##                                Eden                           Elmbridge 
##                                  36                                  76 
##                             Enfield                       Epping Forest 
##                                 183                                  78 
##                     Epsom and Ewell                             Erewash 
##                                  44                                  73 
##                              Exeter                             Fareham 
##                                  74                                  73 
##                             Fenland                Folkestone and Hythe 
##                                  55                                  67 
##                      Forest of Dean                               Fylde 
##                                  50                                  51 
##                           Gateshead                             Gedling 
##                                 126                                  77 
##                          Gloucester                             Gosport 
##                                  78                                  53 
##                           Gravesham                      Great Yarmouth 
##                                  64                                  61 
##                           Greenwich                           Guildford 
##                                 150                                  82 
##                             Hackney                              Halton 
##                                 144                                  79 
##                           Hambleton              Hammersmith and Fulham 
##                                  52                                 113 
##                          Harborough                            Haringey 
##                                  47                                 145 
##                              Harlow                           Harrogate 
##                                  54                                 100 
##                              Harrow                                Hart 
##                                 137                                  56 
##                          Hartlepool                            Hastings 
##                                  58                                  53 
##                              Havant                            Havering 
##                                  78                                 150 
##            Herefordshire, County of                           Hertsmere 
##                                 116                                  61 
##                           High Peak                          Hillingdon 
##                                  59                                 161 
##               Hinckley and Bosworth                             Horsham 
##                                  66                                  81 
##                            Hounslow                     Huntingdonshire 
##                                 142                                 105 
##                            Hyndburn                             Ipswich 
##                                  52                                  85 
##                       Isle of Wight                     Isles of Scilly 
##                                  89                                   1 
##                           Islington              Kensington and Chelsea 
##                                 123                                  90 
##                           Kettering        King's Lynn and West Norfolk 
##                                  57                                  89 
##         Kingston upon Hull, City of                Kingston upon Thames 
##                                 166                                  98 
##                            Kirklees                            Knowsley 
##                                 259                                  98 
##                             Lambeth                           Lancaster 
##                                 178                                  89 
##                               Leeds                           Leicester 
##                                 480                                 192 
##                               Lewes                            Lewisham 
##                                  62                                 169 
##                           Lichfield                             Lincoln 
##                                  58                                  57 
##                           Liverpool                               Luton 
##                                 297                                 121 
##                           Maidstone                              Maldon 
##                                  94                                  40 
##                       Malvern Hills                          Manchester 
##                                  45                                 278 
##                           Mansfield                              Medway 
##                                  67                                 163 
##                              Melton                              Mendip 
##                                  30                                  66 
##                              Merton                           Mid Devon 
##                                 122                                  43 
##                         Mid Suffolk                          Mid Sussex 
##                                  56                                  86 
##                       Middlesbrough                       Milton Keynes 
##                                  86                                 152 
##                         Mole Valley                          New Forest 
##                                  52                                 113 
##                 Newark and Sherwood                Newcastle-under-Lyme 
##                                  70                                  80 
##                 Newcastle upon Tyne                              Newham 
##                                 174                                 162 
##                         North Devon               North East Derbyshire 
##                                  58                                  63 
##             North East Lincolnshire                 North Hertfordshire 
##                                 106                                  82 
##                      North Kesteven                  North Lincolnshire 
##                                  64                                 101 
##                       North Norfolk                      North Somerset 
##                                  62                                 135 
##                      North Tyneside                  North Warwickshire 
##                                 131                                  38 
##           North West Leicestershire                         Northampton 
##                                  58                                 133 
##                      Northumberland                             Norwich 
##                                 197                                  83 
##                          Nottingham               Nuneaton and Bedworth 
##                                 182                                  81 
##                   Oadby and Wigston                              Oldham 
##                                  36                                 141 
##                              Oxford                              Pendle 
##                                  79                                  57 
##                        Peterborough                            Plymouth 
##                                 112                                 161 
##                          Portsmouth                             Preston 
##                                 124                                  86 
##                             Reading                           Redbridge 
##                                  97                                 161 
##                Redcar and Cleveland                            Redditch 
##                                  88                                  55 
##                Reigate and Banstead                       Ribble Valley 
##                                  83                                  39 
##                Richmond upon Thames                       Richmondshire 
##                                 115                                  34 
##                            Rochdale                            Rochford 
##                                 134                                  53 
##                          Rossendale                              Rother 
##                                  43                                  58 
##                           Rotherham                               Rugby 
##                                 167                                  61 
##                           Runnymede                          Rushcliffe 
##                                  51                                  68 
##                            Rushmoor                             Rutland 
##                                  58                                  23 
##                             Ryedale                             Salford 
##                                  30                                 149 
##                            Sandwell                         Scarborough 
##                                 186                                  71 
##                           Sedgemoor                              Sefton 
##                                  70                                 189 
##                               Selby                           Sevenoaks 
##                                  50                                  73 
##                           Sheffield                          Shropshire 
##                                 343                                 193 
##                              Slough                            Solihull 
##                                  80                                 132 
##           Somerset West and Taunton                         South Bucks 
##                                  88                                  37 
##                South Cambridgeshire                    South Derbyshire 
##                                  96                                  58 
##               South Gloucestershire                          South Hams 
##                                 165                                  49 
##                       South Holland                      South Kesteven 
##                                  49                                  81 
##                      South Lakeland                       South Norfolk 
##                                  59                                  81 
##              South Northamptonshire                   South Oxfordshire 
##                                  51                                  87 
##                        South Ribble                      South Somerset 
##                                  70                                 103 
##                 South Staffordshire                      South Tyneside 
##                                  68                                 102 
##                         Southampton                     Southend-on-Sea 
##                                 148                                 107 
##                           Southwark                          Spelthorne 
##                                 162                                  60 
##                           St Albans                          St. Helens 
##                                  83                                 119 
##                            Stafford             Staffordshire Moorlands 
##                                  80                                  59 
##                           Stevenage                           Stockport 
##                                  52                                 190 
##                    Stockton-on-Tees                      Stoke-on-Trent 
##                                 120                                 159 
##                   Stratford-on-Avon                              Stroud 
##                                  73                                  69 
##                          Sunderland                        Surrey Heath 
##                                 185                                  53 
##                              Sutton                               Swale 
##                                 121                                  85 
##                             Swindon                            Tameside 
##                                 132                                 141 
##                            Tamworth                           Tandridge 
##                                  51                                  49 
##                         Teignbridge                  Telford and Wrekin 
##                                  84                                 108 
##                            Tendring                         Test Valley 
##                                  89                                  71 
##                          Tewkesbury                              Thanet 
##                                  50                                  84 
##                        Three Rivers                            Thurrock 
##                                  49                                  98 
##               Tonbridge and Malling                              Torbay 
##                                  71                                  89 
##                            Torridge                       Tower Hamlets 
##                                  37                                 144 
##                            Trafford                     Tunbridge Wells 
##                                 138                                  66 
##                          Uttlesford                 Vale of White Horse 
##                                  45                                  76 
##                           Wakefield                             Walsall 
##                                 209                                 167 
##                      Waltham Forest                          Wandsworth 
##                                 144                                 178 
##                          Warrington                             Warwick 
##                                 127                                  85 
##                             Watford                            Waverley 
##                                  53                                  81 
##                             Wealden                      Wellingborough 
##                                  95                                  47 
##                     Welwyn Hatfield                      West Berkshire 
##                                  65                                  96 
##                          West Devon                     West Lancashire 
##                                  31                                  73 
##                        West Lindsey                    West Oxfordshire 
##                                  52                                  66 
##                        West Suffolk                         Westminster 
##                                 100                                 120 
##                               Wigan                           Wiltshire 
##                                 200                                 285 
##                          Winchester              Windsor and Maidenhead 
##                                  69                                  88 
##                              Wirral                              Woking 
##                                 206                                  59 
##                           Wokingham                       Wolverhampton 
##                                  99                                 158 
##                           Worcester                            Worthing 
##                                  63                                  65 
##                            Wychavon                             Wycombe 
##                                  78                                 106 
##                                Wyre                         Wyre Forest 
##                                  69                                  65 
##                                York 
##                                 120
local_authority_name_counts <- as.numeric(unique_names_1)
print(local_authority_name_counts)
##   [1]  42  60  78  94  74  78 114  54 110 209 147  49 109 109  70 114 103 146
##  [19] 638  60  91  94  48 177  36 233  75 308  87  78 173  45 165 262  84 195
##  [37]  58  56  71  60 120 128  67 132  60  90  68  57 157  99 107  75  93 232
##  [55] 212  69  71  54  66   4 105  49  41 326  51 324 195  32  66 220  91  65
##  [73]  58  44 151  43 194 218  67 201 196  50  81  71  84  81  49 210  72 146
##  [91]  61  77  36  76 183  78  44  73  74  73  55  67  50  51 126  77  78  53
## [109]  64  61 150  82 144  79  52 113  47 145  54 100 137  56  58  53  78 150
## [127] 116  61  59 161  66  81 142 105  52  85  89   1 123  90  57  89 166  98
## [145] 259  98 178  89 480 192  62 169  58  57 297 121  94  40  45 278  67 163
## [163]  30  66 122  43  56  86  86 152  52 113  70  80 174 162  58  63 106  82
## [181]  64 101  62 135 131  38  58 133 197  83 182  81  36 141  79  57 112 161
## [199] 124  86  97 161  88  55  83  39 115  34 134  53  43  58 167  61  51  68
## [217]  58  23  30 149 186  71  70 189  50  73 343 193  80 132  88  37  96  58
## [235] 165  49  49  81  59  81  51  87  70 103  68 102 148 107 162  60  83 119
## [253]  80  59  52 190 120 159  73  69 185  53 121  85 132 141  51  49  84 108
## [271]  89  71  50  84  49  98  71  89  37 144 138  66  45  76 209 167 144 178
## [289] 127  85  53  81  95  47  65  96  31  73  52  66 100 120 200 285  69  88
## [307] 206  59  99 158  63  65  78 106  69  65 120
## Summary statistics for local_authority_name
max(local_authority_name_counts)
## [1] 638
min(local_authority_name_counts)
## [1] 1
median(local_authority_name_counts)
## [1] 81
IQR(local_authority_name_counts)
## [1] 72
## T-test Four sample independent t-test is used to compare
## the significance level in combined_data data set.
## Student’s t-test assumes both groups of data are
## normally distributed and the variances of the two
## distributions are the same.
x1 <- combined_data$income_deprivation_rate_2015
x2 <- combined_data$employment_deprivation_rate_2015
x3 <- combined_data$housing_affordability_score_2016
y <- combined_data$avg_healthcare_distance_2022

# Plot a 2x2 histogram grid
par(mfrow = c(2, 2))
hist(x1)
hist(x2)
hist(x3)
hist(y)

# Based on histogram plots, we have to perform a Welch's
# t-test for x1, x2, both of which are skewed, and for x3
# as well since DV is highly skewed despite x3 being
# normally distributed.
t_test1 <- t.test(x1, y)
t_test2 <- t.test(x2, y)
t_test3 <- t.test(x3, y)
t_test1
## 
##  Welch Two Sample t-test
## 
## data:  x1 and y
## t = 143.53, df = 48378, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  8.252976 8.481491
## sample estimates:
## mean of x mean of y 
## 12.809981  4.442748
t_test2
## 
##  Welch Two Sample t-test
## 
## data:  x2 and y
## t = 39.567, df = 63616, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.292606 1.427343
## sample estimates:
## mean of x mean of y 
##  5.802723  4.442748
t_test3
## 
##  Welch Two Sample t-test
## 
## data:  x3 and y
## t = -156.93, df = 42960, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.497746 -4.386779
## sample estimates:
##    mean of x    mean of y 
## 0.0004856182 4.4427479057
## Covariance values of our primary and secondary IV
## against primary DV
cov(x1, y)
## [1] -12.79764
cov(x2, y)
## [1] -5.549447
cov(x3, y)
## [1] -2.716934
## Correlation values of our primary and secondary IV
## against primary DV As x1, x2 are not normally
## distributed, we will use 'spearman' method. We will use
## 'spearman' method for x3 despite it being normally
## distributed as DV is highly skewed.
cor(x1, y, method = "spearman")
## [1] -0.3929755
cor(x2, y, method = "spearman")
## [1] -0.3911658
cor(x3, y, method = "spearman")
## [1] -0.4408718

The above code performs frequency analysis and summary statistics for unique categories within two categorical columns of the combined_data dataset, constructs histograms for four numerical variables to assess their distribution, conducts Welch’s t-tests to compare means between independent and dependent variables given non-normal distributions, calculates covariance to measure joint variability, and computes Spearman correlation coefficients to evaluate the strength and direction of association, accounting for non-parametric data.

Variable distributions

DistPlot(combined_data, "income_deprivation_rate_2015", "Income Deprivation Rate Across LSOAs in England (2015)",
    subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "avg_healthcare_distance_2022", "Average Minutes to Healthcare Services Across LSOAs in England (2022)",
    subtitle = "Mean Access Time to Healthcare Services per LSOA",
    caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "housing_affordability_score_2016", "Housing Affordability Indicator Across LSOAs in England (2016)",
    subtitle = "Higher Scores Indicate Increased Deprivation in Entering Ownership or Private Rental Market",
    caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "employment_deprivation_rate_2015", "Employment Deprivation Rate in England (2015)",
    subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

# Both the main explanatory variable and outcome variable
# have a very strong right skew. Thus, a log transformation
# in both of these is examined.

The above code generates distribution plots for four different variables within the combined_data dataset, each with its own title, subtitle, and source caption. It visualizes the distribution of income deprivation rate for 2015, the average distance to healthcare services for 2022, the housing affordability score for 2016, and the employment deprivation rate for 2015 across Lower Super Output Areas (LSOAs) in England. The code also notes a significant right skew in the main explanatory and outcome variables, suggesting that a logarithmic transformation may be examined for these variables to address the skewness.

Log transformations

## Adding log transformed variables to dataframe:
combined_data$log_avg_healthcare_distance_2022 <- log(combined_data$avg_healthcare_distance_2022)
combined_data$log_income_deprivation_rate_2015 <- log(combined_data$income_deprivation_rate_2015)
combined_data$log_employment_deprivation_rate_2015 <- log(combined_data$employment_deprivation_rate_2015)

## Checking distributions:
DistPlot(combined_data, "log_income_deprivation_rate_2015", "log(Income Deprivation Rate) Across LSOAs in England (2015)",
    subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "log_avg_healthcare_distance_2022", "log(Average Minutes to Healthcare Services) Across LSOAs in England (2022)",
    subtitle = "Logarithm of Mean Access Time to Healthcare Services per LSOA",
    caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "log_employment_deprivation_rate_2015",
    "log(Employment Deprivation Rate) Across LSOAs in England (2015)",
    subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

## Filtering out rows with infinite values and creating a
## complete case sample:
complete_case_sample <- combined_data %>%
    select(log_income_deprivation_rate_2015, log_avg_healthcare_distance_2022,
        housing_affordability_score_2016, rural_urban_class_2011) %>%
    filter(!is.infinite(log_avg_healthcare_distance_2022), !is.infinite(log_income_deprivation_rate_2015)) %>%
    na.omit()

## Saving as CSV
write.csv(complete_case_sample, file = "Group_7_complete_case.csv",
    row.names = FALSE)

# Checking linearity of relationships between x variables
ggpairs(complete_case_sample[, c("log_income_deprivation_rate_2015",
    "housing_affordability_score_2016", "log_avg_healthcare_distance_2022")])

PointPlot(combined_data, "log_employment_deprivation_rate_2015",
    "log_income_deprivation_rate_2015", title = "Income Deprivation Rate (2015) as a Function of Employment Deprivation Rate (2015)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Income Deprivation Rate) (2015)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021",
    add_regression_line = TRUE)

result <- cor.test(combined_data$log_employment_deprivation_rate_2015,
    combined_data$income_deprivation_rate_2015, method = "pearson")

# creating a df with correlation test results
model_summary <- data.frame(Estimate = round(result$estimate,
    4), t.value = round(result$statistic, 2), p = ifelse(result$p.value <
    0.05, "< 0.05", round(result$p.value, 10)))

# pretty table using stargazer
stargazer(model_summary, type = "html", title = "Results of Pearson's Correlation Test",
    summary = FALSE, omit.stat = c("all"), single.row = TRUE,
    rownames = FALSE, out = "x1x2.html")
Results of Pearson’s Correlation Test
Estimate t.value p
0.897 367 < 0.05
rstudioapi::viewer("x1x2.html")

NULL

# As employment deprivation rate and income deprivation
# rate are highly correlated, employment deprivation rate
# will be excluded from analyses (to avoid issues of
# multicollinearity).

# Examining linearity between xi and y
PointPlot(complete_case_sample, "log_income_deprivation_rate_2015",
    "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Income Deprivation Rate (2015)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "log(Income Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
    add_regression_line = TRUE)

PointPlot(complete_case_sample, "housing_affordability_score_2016",
    "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Housing Affordability Score (2016)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "Housing Affordability Score (2016)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
    add_regression_line = TRUE)

PointPlot(combined_data, "log_employment_deprivation_rate_2015",
    "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Employment Deprivation Rate (2015)",
    subtitle = "For Lower layer Super Output Areas Across England",
    x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)",
    caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)",
    add_regression_line = TRUE)

The above code performs data transformation, cleaning, visualization, analysis, and reporting tasks on a dataset. It applies logarithmic transformations to skewed variables related to healthcare distance and deprivation rates to normalize their distributions, which are then visualized using custom DistPlot functions. A cleaning step filters out infinite and missing values, resulting in a complete_case_sample dataset that is saved as a CSV file. The ggpairs and PointPlot functions are used to check for linearity and visualize relationships between variables. A Pearson correlation test is conducted between the log-transformed employment deprivation rate and income deprivation rate, with results formatted into an HTML table using stargazer. Based on the high correlation observed, the employment deprivation rate is considered for exclusion from further analysis to prevent multicollinearity. The PointPlot visualizations with regression lines further explore the relationships between the remaining variables, assessing the linearity between independent variables and the dependent variable.

Summary statistics with log transformed variables

### PLotting of table for slide 11
combined_data_sum <- combined_data[c("local_authority_name",
    "housing_affordability_score_2016", "income_deprivation_rate_2015",
    "employment_deprivation_rate_2015", "avg_healthcare_distance_2022",
    "rural_urban_class_2011")]

html_table <- stargazer(combined_data_sum, type = "html", title = "Table 1: Summary Statistics",
    digits = 1, column.labels = c("Housing Affordability Score (2016)",
        "Income Deprivation Rate (2015)", "Employment Deprivation Rate (2015)",
        "Average Healthcare Distance (2022)"), out = "table.html")
Table 1: Summary Statistics
Statistic N Mean St. Dev. Min Max
housing_affordability_score_2016 32,715 0.000 1.9 -6.7 8.0
income_deprivation_rate_2015 32,715 12.8 9.4 0.2 60.9
employment_deprivation_rate_2015 32,715 5.8 4.0 0.2 34.7
avg_healthcare_distance_2022 32,715 4.4 4.8 0.0 60.3
# See HTML table in R Studio
rstudioapi::viewer("table.html")

NULL

### Calculate missingness percentages for each variable for
### slide 11
missingness <- sapply(combined_data_sum, function(x) mean(is.na(x)) *
    100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 2: Missingness Summary/%",
    summary = FALSE, flip = TRUE)
Table 2: Missingness Summary/%
local_authority_name 0
housing_affordability_score_2016 0
income_deprivation_rate_2015 0
employment_deprivation_rate_2015 0
avg_healthcare_distance_2022 0
rural_urban_class_2011 0
html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### count of unique names for rural/urban for slide 12
unique_class2011 <- table(combined_data_sum$rural_urban_class_2011)
unique_class2011.1 <- as.data.frame(unique_class2011)

# Rename the columns
colnames(unique_class2011.1) <- c("Rural/Urban Class (2011)",
    "Count")

# Create HTML table using stargazer
html_table1 <- stargazer(unique_class2011.1, type = "html", title = "Unique Names and its Respective Counts",
    summary = FALSE)
Unique Names and its Respective Counts
Rural/Urban Class (2011) Count
1 Rural town and fringe 2,930
2 Rural town and fringe in a sparse setting 119
3 Rural village and dispersed 2,356
4 Rural village and dispersed in a sparse setting 181
5 Urban city and town 14,399
6 Urban city and town in a sparse setting 59
7 Urban major conurbation 11,465
8 Urban minor conurbation 1,206
cat(html_table1)
Unique Names and its Respective Counts
Rural/Urban Class (2011) Count
1 Rural town and fringe 2,930
2 Rural town and fringe in a sparse setting 119
3 Rural village and dispersed 2,356
4 Rural village and dispersed in a sparse setting 181
5 Urban city and town 14,399
6 Urban city and town in a sparse setting 59
7 Urban major conurbation 11,465
8 Urban minor conurbation 1,206
writeLines(html_table1, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

# Calculate summary statistics
summary_ruralurban <- c(Max = max(rural_urban_class_2011_counts),
    Min = min(rural_urban_class_2011_counts), Med = median(rural_urban_class_2011_counts),
    IQR = IQR(rural_urban_class_2011_counts))

# Create data frame for summary statistics
summary_df <- data.frame(statistic = names(summary_ruralurban),
    value = summary_ruralurban)

# Create HTML table using stargazer
html_table2 <- stargazer(summary_df, type = "html", title = "Summary Statistics",
    summary = FALSE, rownames = FALSE)
Summary Statistics
statistic value
Max 14,399
Min 59
Med 1,781
IQR 4,898.250
cat(html_table2)
Summary Statistics
statistic value
Max 14,399
Min 59
Med 1,781
IQR 4,898.250
writeLines(html_table2, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

## count of unique names for local authority names for
## slide 12
unique_localauthority <- table(combined_data_sum$local_authority_name)

# Convert the result to a data frame
unique_localauthority1 <- as.data.frame(unique_localauthority)

colnames(unique_localauthority1) <- c("Local Authority Names",
    "Count")

# Create HTML table using stargazer
html_table3 <- stargazer(unique_localauthority1, type = "html",
    title = "Unique Names and its Respective Counts", summary = FALSE)
Unique Names and its Respective Counts
Local Authority Names Count
1 Adur 42
2 Allerdale 60
3 Amber Valley 78
4 Arun 94
5 Ashfield 74
6 Ashford 78
7 Aylesbury Vale 114
8 Babergh 54
9 Barking and Dagenham 110
10 Barnet 209
11 Barnsley 147
12 Barrow-in-Furness 49
13 Basildon 109
14 Basingstoke and Deane 109
15 Bassetlaw 70
16 Bath and North East Somerset 114
17 Bedford 103
18 Bexley 146
19 Birmingham 638
20 Blaby 60
21 Blackburn with Darwen 91
22 Blackpool 94
23 Bolsover 48
24 Bolton 177
25 Boston 36
26 Bournemouth, Christchurch and Poole 233
27 Bracknell Forest 75
28 Bradford 308
29 Braintree 87
30 Breckland 78
31 Brent 173
32 Brentwood 45
33 Brighton and Hove 165
34 Bristol, City of 262
35 Broadland 84
36 Bromley 195
37 Bromsgrove 58
38 Broxbourne 56
39 Broxtowe 71
40 Burnley 60
41 Bury 120
42 Calderdale 128
43 Cambridge 67
44 Camden 132
45 Cannock Chase 60
46 Canterbury 90
47 Carlisle 68
48 Castle Point 57
49 Central Bedfordshire 157
50 Charnwood 99
51 Chelmsford 107
52 Cheltenham 75
53 Cherwell 93
54 Cheshire East 232
55 Cheshire West and Chester 212
56 Chesterfield 69
57 Chichester 71
58 Chiltern 54
59 Chorley 66
60 City of London 4
61 Colchester 105
62 Copeland 49
63 Corby 41
64 Cornwall 326
65 Cotswold 51
66 County Durham 324
67 Coventry 195
68 Craven 32
69 Crawley 66
70 Croydon 220
71 Dacorum 91
72 Darlington 65
73 Dartford 58
74 Daventry 44
75 Derby 151
76 Derbyshire Dales 43
77 Doncaster 194
78 Dorset 218
79 Dover 67
80 Dudley 201
81 Ealing 196
82 East Cambridgeshire 50
83 East Devon 81
84 East Hampshire 71
85 East Hertfordshire 84
86 East Lindsey 81
87 East Northamptonshire 49
88 East Riding of Yorkshire 210
89 East Staffordshire 72
90 East Suffolk 146
91 Eastbourne 61
92 Eastleigh 77
93 Eden 36
94 Elmbridge 76
95 Enfield 183
96 Epping Forest 78
97 Epsom and Ewell 44
98 Erewash 73
99 Exeter 74
100 Fareham 73
101 Fenland 55
102 Folkestone and Hythe 67
103 Forest of Dean 50
104 Fylde 51
105 Gateshead 126
106 Gedling 77
107 Gloucester 78
108 Gosport 53
109 Gravesham 64
110 Great Yarmouth 61
111 Greenwich 150
112 Guildford 82
113 Hackney 144
114 Halton 79
115 Hambleton 52
116 Hammersmith and Fulham 113
117 Harborough 47
118 Haringey 145
119 Harlow 54
120 Harrogate 100
121 Harrow 137
122 Hart 56
123 Hartlepool 58
124 Hastings 53
125 Havant 78
126 Havering 150
127 Herefordshire, County of 116
128 Hertsmere 61
129 High Peak 59
130 Hillingdon 161
131 Hinckley and Bosworth 66
132 Horsham 81
133 Hounslow 142
134 Huntingdonshire 105
135 Hyndburn 52
136 Ipswich 85
137 Isle of Wight 89
138 Isles of Scilly 1
139 Islington 123
140 Kensington and Chelsea 90
141 Kettering 57
142 King’s Lynn and West Norfolk 89
143 Kingston upon Hull, City of 166
144 Kingston upon Thames 98
145 Kirklees 259
146 Knowsley 98
147 Lambeth 178
148 Lancaster 89
149 Leeds 480
150 Leicester 192
151 Lewes 62
152 Lewisham 169
153 Lichfield 58
154 Lincoln 57
155 Liverpool 297
156 Luton 121
157 Maidstone 94
158 Maldon 40
159 Malvern Hills 45
160 Manchester 278
161 Mansfield 67
162 Medway 163
163 Melton 30
164 Mendip 66
165 Merton 122
166 Mid Devon 43
167 Mid Suffolk 56
168 Mid Sussex 86
169 Middlesbrough 86
170 Milton Keynes 152
171 Mole Valley 52
172 New Forest 113
173 Newark and Sherwood 70
174 Newcastle-under-Lyme 80
175 Newcastle upon Tyne 174
176 Newham 162
177 North Devon 58
178 North East Derbyshire 63
179 North East Lincolnshire 106
180 North Hertfordshire 82
181 North Kesteven 64
182 North Lincolnshire 101
183 North Norfolk 62
184 North Somerset 135
185 North Tyneside 131
186 North Warwickshire 38
187 North West Leicestershire 58
188 Northampton 133
189 Northumberland 197
190 Norwich 83
191 Nottingham 182
192 Nuneaton and Bedworth 81
193 Oadby and Wigston 36
194 Oldham 141
195 Oxford 79
196 Pendle 57
197 Peterborough 112
198 Plymouth 161
199 Portsmouth 124
200 Preston 86
201 Reading 97
202 Redbridge 161
203 Redcar and Cleveland 88
204 Redditch 55
205 Reigate and Banstead 83
206 Ribble Valley 39
207 Richmond upon Thames 115
208 Richmondshire 34
209 Rochdale 134
210 Rochford 53
211 Rossendale 43
212 Rother 58
213 Rotherham 167
214 Rugby 61
215 Runnymede 51
216 Rushcliffe 68
217 Rushmoor 58
218 Rutland 23
219 Ryedale 30
220 Salford 149
221 Sandwell 186
222 Scarborough 71
223 Sedgemoor 70
224 Sefton 189
225 Selby 50
226 Sevenoaks 73
227 Sheffield 343
228 Shropshire 193
229 Slough 80
230 Solihull 132
231 Somerset West and Taunton 88
232 South Bucks 37
233 South Cambridgeshire 96
234 South Derbyshire 58
235 South Gloucestershire 165
236 South Hams 49
237 South Holland 49
238 South Kesteven 81
239 South Lakeland 59
240 South Norfolk 81
241 South Northamptonshire 51
242 South Oxfordshire 87
243 South Ribble 70
244 South Somerset 103
245 South Staffordshire 68
246 South Tyneside 102
247 Southampton 148
248 Southend-on-Sea 107
249 Southwark 162
250 Spelthorne 60
251 St Albans 83
252 St. Helens 119
253 Stafford 80
254 Staffordshire Moorlands 59
255 Stevenage 52
256 Stockport 190
257 Stockton-on-Tees 120
258 Stoke-on-Trent 159
259 Stratford-on-Avon 73
260 Stroud 69
261 Sunderland 185
262 Surrey Heath 53
263 Sutton 121
264 Swale 85
265 Swindon 132
266 Tameside 141
267 Tamworth 51
268 Tandridge 49
269 Teignbridge 84
270 Telford and Wrekin 108
271 Tendring 89
272 Test Valley 71
273 Tewkesbury 50
274 Thanet 84
275 Three Rivers 49
276 Thurrock 98
277 Tonbridge and Malling 71
278 Torbay 89
279 Torridge 37
280 Tower Hamlets 144
281 Trafford 138
282 Tunbridge Wells 66
283 Uttlesford 45
284 Vale of White Horse 76
285 Wakefield 209
286 Walsall 167
287 Waltham Forest 144
288 Wandsworth 178
289 Warrington 127
290 Warwick 85
291 Watford 53
292 Waverley 81
293 Wealden 95
294 Wellingborough 47
295 Welwyn Hatfield 65
296 West Berkshire 96
297 West Devon 31
298 West Lancashire 73
299 West Lindsey 52
300 West Oxfordshire 66
301 West Suffolk 100
302 Westminster 120
303 Wigan 200
304 Wiltshire 285
305 Winchester 69
306 Windsor and Maidenhead 88
307 Wirral 206
308 Woking 59
309 Wokingham 99
310 Wolverhampton 158
311 Worcester 63
312 Worthing 65
313 Wychavon 78
314 Wycombe 106
315 Wyre 69
316 Wyre Forest 65
317 York 120
cat(html_table3)
Unique Names and its Respective Counts
Local Authority Names Count
1 Adur 42
2 Allerdale 60
3 Amber Valley 78
4 Arun 94
5 Ashfield 74
6 Ashford 78
7 Aylesbury Vale 114
8 Babergh 54
9 Barking and Dagenham 110
10 Barnet 209
11 Barnsley 147
12 Barrow-in-Furness 49
13 Basildon 109
14 Basingstoke and Deane 109
15 Bassetlaw 70
16 Bath and North East Somerset 114
17 Bedford 103
18 Bexley 146
19 Birmingham 638
20 Blaby 60
21 Blackburn with Darwen 91
22 Blackpool 94
23 Bolsover 48
24 Bolton 177
25 Boston 36
26 Bournemouth, Christchurch and Poole 233
27 Bracknell Forest 75
28 Bradford 308
29 Braintree 87
30 Breckland 78
31 Brent 173
32 Brentwood 45
33 Brighton and Hove 165
34 Bristol, City of 262
35 Broadland 84
36 Bromley 195
37 Bromsgrove 58
38 Broxbourne 56
39 Broxtowe 71
40 Burnley 60
41 Bury 120
42 Calderdale 128
43 Cambridge 67
44 Camden 132
45 Cannock Chase 60
46 Canterbury 90
47 Carlisle 68
48 Castle Point 57
49 Central Bedfordshire 157
50 Charnwood 99
51 Chelmsford 107
52 Cheltenham 75
53 Cherwell 93
54 Cheshire East 232
55 Cheshire West and Chester 212
56 Chesterfield 69
57 Chichester 71
58 Chiltern 54
59 Chorley 66
60 City of London 4
61 Colchester 105
62 Copeland 49
63 Corby 41
64 Cornwall 326
65 Cotswold 51
66 County Durham 324
67 Coventry 195
68 Craven 32
69 Crawley 66
70 Croydon 220
71 Dacorum 91
72 Darlington 65
73 Dartford 58
74 Daventry 44
75 Derby 151
76 Derbyshire Dales 43
77 Doncaster 194
78 Dorset 218
79 Dover 67
80 Dudley 201
81 Ealing 196
82 East Cambridgeshire 50
83 East Devon 81
84 East Hampshire 71
85 East Hertfordshire 84
86 East Lindsey 81
87 East Northamptonshire 49
88 East Riding of Yorkshire 210
89 East Staffordshire 72
90 East Suffolk 146
91 Eastbourne 61
92 Eastleigh 77
93 Eden 36
94 Elmbridge 76
95 Enfield 183
96 Epping Forest 78
97 Epsom and Ewell 44
98 Erewash 73
99 Exeter 74
100 Fareham 73
101 Fenland 55
102 Folkestone and Hythe 67
103 Forest of Dean 50
104 Fylde 51
105 Gateshead 126
106 Gedling 77
107 Gloucester 78
108 Gosport 53
109 Gravesham 64
110 Great Yarmouth 61
111 Greenwich 150
112 Guildford 82
113 Hackney 144
114 Halton 79
115 Hambleton 52
116 Hammersmith and Fulham 113
117 Harborough 47
118 Haringey 145
119 Harlow 54
120 Harrogate 100
121 Harrow 137
122 Hart 56
123 Hartlepool 58
124 Hastings 53
125 Havant 78
126 Havering 150
127 Herefordshire, County of 116
128 Hertsmere 61
129 High Peak 59
130 Hillingdon 161
131 Hinckley and Bosworth 66
132 Horsham 81
133 Hounslow 142
134 Huntingdonshire 105
135 Hyndburn 52
136 Ipswich 85
137 Isle of Wight 89
138 Isles of Scilly 1
139 Islington 123
140 Kensington and Chelsea 90
141 Kettering 57
142 King’s Lynn and West Norfolk 89
143 Kingston upon Hull, City of 166
144 Kingston upon Thames 98
145 Kirklees 259
146 Knowsley 98
147 Lambeth 178
148 Lancaster 89
149 Leeds 480
150 Leicester 192
151 Lewes 62
152 Lewisham 169
153 Lichfield 58
154 Lincoln 57
155 Liverpool 297
156 Luton 121
157 Maidstone 94
158 Maldon 40
159 Malvern Hills 45
160 Manchester 278
161 Mansfield 67
162 Medway 163
163 Melton 30
164 Mendip 66
165 Merton 122
166 Mid Devon 43
167 Mid Suffolk 56
168 Mid Sussex 86
169 Middlesbrough 86
170 Milton Keynes 152
171 Mole Valley 52
172 New Forest 113
173 Newark and Sherwood 70
174 Newcastle-under-Lyme 80
175 Newcastle upon Tyne 174
176 Newham 162
177 North Devon 58
178 North East Derbyshire 63
179 North East Lincolnshire 106
180 North Hertfordshire 82
181 North Kesteven 64
182 North Lincolnshire 101
183 North Norfolk 62
184 North Somerset 135
185 North Tyneside 131
186 North Warwickshire 38
187 North West Leicestershire 58
188 Northampton 133
189 Northumberland 197
190 Norwich 83
191 Nottingham 182
192 Nuneaton and Bedworth 81
193 Oadby and Wigston 36
194 Oldham 141
195 Oxford 79
196 Pendle 57
197 Peterborough 112
198 Plymouth 161
199 Portsmouth 124
200 Preston 86
201 Reading 97
202 Redbridge 161
203 Redcar and Cleveland 88
204 Redditch 55
205 Reigate and Banstead 83
206 Ribble Valley 39
207 Richmond upon Thames 115
208 Richmondshire 34
209 Rochdale 134
210 Rochford 53
211 Rossendale 43
212 Rother 58
213 Rotherham 167
214 Rugby 61
215 Runnymede 51
216 Rushcliffe 68
217 Rushmoor 58
218 Rutland 23
219 Ryedale 30
220 Salford 149
221 Sandwell 186
222 Scarborough 71
223 Sedgemoor 70
224 Sefton 189
225 Selby 50
226 Sevenoaks 73
227 Sheffield 343
228 Shropshire 193
229 Slough 80
230 Solihull 132
231 Somerset West and Taunton 88
232 South Bucks 37
233 South Cambridgeshire 96
234 South Derbyshire 58
235 South Gloucestershire 165
236 South Hams 49
237 South Holland 49
238 South Kesteven 81
239 South Lakeland 59
240 South Norfolk 81
241 South Northamptonshire 51
242 South Oxfordshire 87
243 South Ribble 70
244 South Somerset 103
245 South Staffordshire 68
246 South Tyneside 102
247 Southampton 148
248 Southend-on-Sea 107
249 Southwark 162
250 Spelthorne 60
251 St Albans 83
252 St. Helens 119
253 Stafford 80
254 Staffordshire Moorlands 59
255 Stevenage 52
256 Stockport 190
257 Stockton-on-Tees 120
258 Stoke-on-Trent 159
259 Stratford-on-Avon 73
260 Stroud 69
261 Sunderland 185
262 Surrey Heath 53
263 Sutton 121
264 Swale 85
265 Swindon 132
266 Tameside 141
267 Tamworth 51
268 Tandridge 49
269 Teignbridge 84
270 Telford and Wrekin 108
271 Tendring 89
272 Test Valley 71
273 Tewkesbury 50
274 Thanet 84
275 Three Rivers 49
276 Thurrock 98
277 Tonbridge and Malling 71
278 Torbay 89
279 Torridge 37
280 Tower Hamlets 144
281 Trafford 138
282 Tunbridge Wells 66
283 Uttlesford 45
284 Vale of White Horse 76
285 Wakefield 209
286 Walsall 167
287 Waltham Forest 144
288 Wandsworth 178
289 Warrington 127
290 Warwick 85
291 Watford 53
292 Waverley 81
293 Wealden 95
294 Wellingborough 47
295 Welwyn Hatfield 65
296 West Berkshire 96
297 West Devon 31
298 West Lancashire 73
299 West Lindsey 52
300 West Oxfordshire 66
301 West Suffolk 100
302 Westminster 120
303 Wigan 200
304 Wiltshire 285
305 Winchester 69
306 Windsor and Maidenhead 88
307 Wirral 206
308 Woking 59
309 Wokingham 99
310 Wolverhampton 158
311 Worcester 63
312 Worthing 65
313 Wychavon 78
314 Wycombe 106
315 Wyre 69
316 Wyre Forest 65
317 York 120
writeLines(html_table3, html_file)
rstudioapi::viewer(html_file)

NULL

# Calculate summary statistics
summary_localauthority <- c(Max = max(local_authority_name_counts),
    Min = min(local_authority_name_counts), Med = median(local_authority_name_counts),
    IQR = IQR(local_authority_name_counts))

summary_df <- data.frame(statistic = names(summary_localauthority),
    value = summary_localauthority)

# Create HTML table using stargazer
html_table4 <- stargazer(summary_df, type = "html", title = "Summary Statistics",
    summary = FALSE, rownames = FALSE)
Summary Statistics
statistic value
Max 638
Min 1
Med 81
IQR 72
cat(html_table4)
Summary Statistics
statistic value
Max 638
Min 1
Med 81
IQR 72
writeLines(html_table4, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### Repeat for complete_case.csv for slide 16 Read the
### complete_case file
complete_case_sum <- read.csv("Group_7_complete_case.csv")

html_table <- stargazer(complete_case_sum, type = "html", title = "Table 3: Summary Statistics",
    digits = 1, column.labels = c("Log Income Deprivation Rate (2015)",
        "Log Average Healthcare Distance (2022)", "Housing Affordability Score (2016)",
        "Rural/Urban Class (2011)"), out = "table.html")
Table 3: Summary Statistics
Statistic N Mean St. Dev. Min Max
log_income_deprivation_rate_2015 32,714 2.3 0.8 -1.7 4.1
log_avg_healthcare_distance_2022 32,714 1.2 0.8 -1.5 4.1
housing_affordability_score_2016 32,714 0.001 1.9 -6.7 8.0
# See HTML table in R Studio
rstudioapi::viewer("table.html")

NULL

### Calculate missingness percentages for each variable for
### slide 16
missingness <- sapply(complete_case_sum, function(x) mean(is.na(x)) *
    100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 4: Missingness Summary/%",
    summary = FALSE, flip = TRUE)
Table 4: Missingness Summary/%
log_income_deprivation_rate_2015 0
log_avg_healthcare_distance_2022 0
housing_affordability_score_2016 0
rural_urban_class_2011 0
html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### covariance of x1,x2,x3 before log transformation for
### slide 17
x1 <- combined_data_sum$income_deprivation_rate_2015
x2 <- combined_data_sum$employment_deprivation_rate_2015
x3 <- combined_data_sum$housing_affordability_score_2016
y <- combined_data_sum$avg_healthcare_distance_2022
cov_x1y <- cov(x1, y)
cov_x2y <- cov(x2, y)
cov_x3y <- cov(x3, y)

# Create a data frame for the covariances
covariances_df <- data.frame(variables = c("x1 and y", "x2 and y",
    "x3 and y"), covariance = c(cov_x1y, cov_x2y, cov_x3y))

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances before log transformation",
    summary = FALSE, rownames = FALSE)
Covariances before log transformation
variables covariance
x1 and y -12.798
x2 and y -5.549
x3 and y -2.717
html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

### covariance of x4,x5 after log transformation for slide
### 17
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <- complete_case_sum$log_income_deprivation_rate_2015
x5 <- complete_case_sum$housing_affordability_score_2016
y1 <- complete_case_sum$log_avg_healthcare_distance_2022
cov_x4y1 <- cov(x4, y1)
cov_x5y1 <- cov(x5, y1)

# Create a data frame for the covariances
covariances_df <- data.frame(variables = c("x4 and y1", "x5 and y1"),
    covariance = c(cov_x4y1, cov_x5y1))

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances after log transformation",
    summary = FALSE, rownames = FALSE)
Covariances after log transformation
variables covariance
x4 and y1 -0.219
x5 and y1 -0.597
html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL

## Correlation test for slide 18
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <- complete_case_sum$log_income_deprivation_rate_2015
x5 <- complete_case_sum$housing_affordability_score_2016
y1 <- complete_case_sum$log_avg_healthcare_distance_2022
corx4y1 <- cor.test(x4, y1, method = "pearson")
corx5y1 <- cor.test(x5, y1, method = "pearson")

# Create a data frame to store correlation results
# format.pval function is used to format the p-value to
# make it into a string ie. a range
corr_df <- data.frame(variables = c("x4 and y1", "x5 and y1"),
    Pearson_Correlation = c(corx4y1$estimate, corx5y1$estimate),
    t_value = c(corx4y1$statistic, corx5y1$statistic), Estimate = c(corx4y1$estimate,
        corx5y1$estimate), p_value = ifelse(corx4y1$p.value ==
        0, "< 0.001", format.pval(corx4y1$p.value)))

# Create HTML table using stargazer
html_table <- stargazer(corr_df, type = "html", title = "Pearson Correlations After Log Transformation",
    summary = FALSE, rownames = FALSE)
Pearson Correlations After Log Transformation
variables Pearson_Correlation t_value Estimate p_value
x4 and y1 -0.367 -71.408 -0.367 < 0.001
x5 and y1 -0.408 -80.848 -0.408 < 0.001
html_file <- "cor_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

NULL The above code is a sequence of commands for generating HTML tables that display summary statistics, missingness percentages, unique category counts, and measures of covariance and correlation. It begins with extracting and tabulating a subset of the dataset for slide 11, continues with analyzing missing data and category counts for slides 11 and 12, then moves on to slide 16 where it reads a cleaned dataset to produce similar summaries. For slide 17, it calculates covariances pre- and post-log transformation, and for slide 18, it conducts Pearson correlation tests, with results formatted and exported as HTML tables viewable within RStudio.

Model creation

# Model 1:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate.
mod1 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(mod1)
## 
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, 
##     data = complete_case_sample)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.70482 -0.47303 -0.06624  0.38161  2.80484 
## 
## Coefficients:
##                                   Estimate Std. Error t value
## (Intercept)                       1.977622   0.012243  161.53
## log_income_deprivation_rate_2015 -0.363913   0.005096  -71.41
##                                             Pr(>|t|)    
## (Intercept)                      <0.0000000000000002 ***
## log_income_deprivation_rate_2015 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7149 on 32712 degrees of freedom
## Multiple R-squared:  0.1349, Adjusted R-squared:  0.1348 
## F-statistic:  5099 on 1 and 32712 DF,  p-value: < 0.00000000000000022
# Model 1 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate. As both the dependent and independent variables are log transformed, the coefficients may be interpreted in terms of percentage change. That is, a 1% increase in the income deprivation rate is associated with a 0.35% decrease in the average travel time to a healthcare service. This coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). Thus, according to this model, we may reject the null hypothesis that there is no relationship between income deprivation rate and average healthcare travel distance. Further, the p-value associated with the F-statistic is also significant considering the same alpha, sugegsting at least one variable in the model has some explanatory power regarding average healthcare distance. This statistically is not that useful regarding this model however, as only one explanatory variable is considered. The coefficient of determination for this model indicates that we are able to explain 13.1% of the variation in average travel time to healthcare services with this model.

# Model 2:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate and the housing affordability indicator
mod2 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)

# Exponentiating untransformed variable coefficients to allow easier interpretation
mod2$coefficients["housing_affordability_score_2016"] <- exp(mod2$coefficients["housing_affordability_score_2016"])

# Viewing model results
summary(mod2)
## 
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + 
##     housing_affordability_score_2016, data = complete_case_sample)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5103 -0.4599 -0.0744  0.3675  2.7243 
## 
## Coefficients:
##                                   Estimate Std. Error t value
## (Intercept)                       1.604985   0.014034  114.36
## log_income_deprivation_rate_2015 -0.200002   0.005940  -33.67
## housing_affordability_score_2016  0.887820   0.002419  367.07
##                                             Pr(>|t|)    
## (Intercept)                      <0.0000000000000002 ***
## log_income_deprivation_rate_2015 <0.0000000000000002 ***
## housing_affordability_score_2016 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6899 on 32711 degrees of freedom
## Multiple R-squared:  0.1945, Adjusted R-squared:  0.1944 
## F-statistic:  3948 on 2 and 32711 DF,  p-value: < 0.00000000000000022
# Model 2 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate and the housing affordability indicator. Here, we can see that a 1% increase in the income deprivation rate is associated with a 0.19% decrease in the average travel time to a healthcare service. Thus, we can see that when we control for a further underlying economic variable, the explanatory power of income deprivation alone decreases. Again, this coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). For interpretation of the housing affordability indicator, the exponential of the coefficient is considered (0.88). Thus, we can see that a 1 unit increase in the housing affordability indicator is associated with a 12% decrease in the average travel time to a healthcare service, again significant considering an alpha of 0.05. Remembering that increases in the housing affordability indicator signify increased inability to afford to enter owner-occupation or the private rental market, this again appears to indicate that more economically deprived areas have greater geographical access to healthcare services. Further, the p-value associated with the F-statistic is also significant at the 5% level, again suggesting at least one variable in the model has some explanatory power regarding average healthcare distance. The adjusted coefficient of determination for this model indicates that we are able to explain 19.3% of the variation in average travel time to healthcare services with this model.

# Model 3:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate, the housing affordability indicator, and the re-leveled Rural-Urban Classification (RUC11) with "Urban city and town" as the reference category
mod3 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, "Urban city and town"), data = complete_case_sample)

# Exponentiated coefficient data
mod3_exp_coeff <- data.frame(exp(coef(mod3)))

# Exponentiating untransformed variable coefficients WITHIN the model to allow easier interpretation
mod3$coefficients[3:10] <- exp(mod3$coefficients[3:10])

# Viewing model results
summary(mod3)
## 
## Call:
## lm(formula = log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + 
##     housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, 
##     "Urban city and town"), data = complete_case_sample)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35847 -0.33319  0.02095  0.34803  2.03045 
## 
## Coefficients:
##                                                                                                                     Estimate
## (Intercept)                                                                                                         1.309527
## log_income_deprivation_rate_2015                                                                                   -0.103974
## housing_affordability_score_2016                                                                                    0.931051
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                            1.763662
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting        1.208809
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                      4.439979
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting  6.326992
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting          0.958377
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                          0.781509
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                          0.861055
##                                                                                                                    Std. Error
## (Intercept)                                                                                                          0.011329
## log_income_deprivation_rate_2015                                                                                     0.004576
## housing_affordability_score_2016                                                                                     0.001921
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                             0.010651
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting         0.048121
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                       0.011797
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting   0.039126
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting           0.068209
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                           0.006801
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                           0.015798
##                                                                                                                    t value
## (Intercept)                                                                                                         115.59
## log_income_deprivation_rate_2015                                                                                    -22.72
## housing_affordability_score_2016                                                                                    484.75
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                            165.58
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting         25.12
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                      376.37
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting  161.71
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting           14.05
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                          114.91
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                           54.50
##                                                                                                                               Pr(>|t|)
## (Intercept)                                                                                                        <0.0000000000000002
## log_income_deprivation_rate_2015                                                                                   <0.0000000000000002
## housing_affordability_score_2016                                                                                   <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                           <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting       <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                     <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting         <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                         <0.0000000000000002
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                         <0.0000000000000002
##                                                                                                                       
## (Intercept)                                                                                                        ***
## log_income_deprivation_rate_2015                                                                                   ***
## housing_affordability_score_2016                                                                                   ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe                           ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural town and fringe in a sparse setting       ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed                     ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Rural village and dispersed in a sparse setting ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban city and town in a sparse setting         ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban major conurbation                         ***
## forcats::fct_relevel(rural_urban_class_2011, "Urban city and town")Urban minor conurbation                         ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5228 on 32704 degrees of freedom
## Multiple R-squared:  0.5376, Adjusted R-squared:  0.5374 
## F-statistic:  4224 on 9 and 32704 DF,  p-value: < 0.00000000000000022
# Creating data for confidence interval plot
ConfidencePlotData <- data.frame (
  "names" = as.factor (c("Rural town and fringe", "Rural town and fringe in a sparse setting", "Rural village and dispersed", "Rural village and dispersed in a sparse setting", "Urban city and town in a sparse setting", "Urban major conurbation", "Urban minor conurbation")), # Creating predictor names (row names)
  "coefs" = mod3_exp_coeff[4:10, 1],
  "lower" = confint (mod3) [4:10, 1], # Creating a lower confidence interval [ , 1] list for predictors 2-3 in the model [2:5 , ]
  "upper" = confint (mod3) [4:10, 2] # Creating an upper confidence interval [ , 2] list for predictors 2-3 in the model [2:5 , ]
)
ConfidencePlotData$names <- factor(ConfidencePlotData$names, levels = unique(ConfidencePlotData$names))

# Creating confidence interval plot for RUC coefficients
ConfidencePlot(ConfidencePlotData, "Healthcare Access by Rural-Urban Classification in England", "Reference Category: Urban City and Town", x_label = "Exponentiated Coefficient", middle_point = 1, caption = "*95% CI Indicated by Green Error Bars. Red dashed line indicates reference group. Controlling for income deprivation (2015) and housing affordability indicators (2016).")

# Model 3 expands on Model 2 by including the Rural-Urban Classification (RUC11) as a factor variable, allowing us to account for geographic differences in average travel time to healthcare services. The model estimates the logarithm of average travel time to a healthcare service within a Lower Layer Super Output Area (LSOA) as a function of the logarithm of the income deprivation rate, the housing affordability indicator, and the RUC11 categories. We observe that a 1% increase in the income deprivation rate is associated with a 0.10% decrease in average travel time to healthcare services, when holding the other variables constant. This relationship is statistically significant at the 5% level, with a p-value less than 0.05, indicating that income deprivation continues to be an important factor in explaining variations in healthcare access, albeit with a smaller coefficient compared to Model 2. Similarly, a 1 unit increase in the housing affordability indicator, which reflects increased difficulty in entering owner-occupation or the private rental market, is associated with a 7% decrease in average travel time to healthcare services. This relationship is also statistically significant at the 5% level. The new variables introduced, representing the RUC11 categories, show significant variations in healthcare access across different geographic areas. For instance, compared to Urban city and town areas, Rural village and dispersed areas experience a 344% increase in average travel time to healthcare services, indicating that rural areas face substantial barriers in healthcare access. These relationships are statistically significant at the 5% level, with the exception of Urban city and town in a sparse setting category, which does not show a significant difference in healthcare access compared to Urban city and town areas. The adjusted coefficient of determination for this model is 0.537, suggesting that the model explains approximately 53.7% of the variation in average travel time to healthcare services. The inclusion of the RUC11 categories significantly enhances the explanatory power of the model compared to Model 2. The p-value associated with the F-statistic is also significant at the 5% level, indicating that at least one variable in the model has some explanatory power regarding average healthcare distance. In conclusion, Model 3 provides a more nuanced understanding of the factors affecting healthcare access, by accounting for both economic and geographic variables. The significant differences in healthcare access across the RUC11 categories highlight the importance of considering geographic disparities when analyzing healthcare access.

In these three regression models, the dependent variable—logarithm of average healthcare distance—is modeled against various predictors to study their effects on healthcare accessibility within Lower Layer Super Output Areas (LSOAs). Model 1 examines the impact of income deprivation alone, indicating a statistically significant inverse relationship where an increase in deprivation is associated with a decrease in travel time. Model 2 introduces housing affordability, showing a similar inverse relationship with both predictors, and a slightly improved explanatory power. Finally, Model 3 expands the analysis by adding Rural-Urban Classification, revealing significant geographic disparities in healthcare access; rural areas notably have longer travel times to healthcare services. Each model’s coefficient of determination increases progressively, demonstrating an improved ability to explain the variation in healthcare access by including additional socio-economic and geographic factors.

Model results

results <- stargazer(mod1, mod2, mod3, type = "html", dep.var.labels = c("log(Mean Healthcare Service Distance [MHSD])"),
    covariate.labels = c("log(Income Deprivation Rate 2015)",
        "Housing Affordability Score 2016 [HAS]", "Rural town and fringe [RUC 2011]",
        "Rural town and fringe in a sparse setting [RUC 2011]",
        "Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]",
        "Urban city and town in a sparse setting [RUC 2011]",
        "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
    notes = "Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.",
    ci = TRUE)
Dependent variable:
log(Mean Healthcare Service Distance [MHSD])
(1) (2) (3)
log(Income Deprivation Rate 2015) -0.364*** -0.200*** -0.104***
(-0.374, -0.354) (-0.212, -0.188) (-0.113, -0.095)
Housing Affordability Score 2016 [HAS] 0.888*** 0.931***
(0.883, 0.893) (0.927, 0.935)
Rural town and fringe [RUC 2011] 1.764***
(1.743, 1.785)
Rural town and fringe in a sparse setting [RUC 2011] 1.209***
(1.114, 1.303)
Rural village and dispersed [RUC 2011] 4.440***
(4.417, 4.463)
Rural village and dispersed in a sparse setting [RUC 2011] 6.327***
(6.250, 6.404)
Urban city and town in a sparse setting [RUC 2011] 0.958***
(0.825, 1.092)
Urban major conurbation [RUC 2011] 0.782***
(0.768, 0.795)
Urban minor conurbation [RUC 2011] 0.861***
(0.830, 0.892)
Constant 1.978*** 1.605*** 1.310***
(1.954, 2.002) (1.577, 1.632) (1.287, 1.332)
Observations 32,714 32,714 32,714
R2 0.135 0.194 0.538
Adjusted R2 0.135 0.194 0.537
Residual Std. Error 0.715 (df = 32712) 0.690 (df = 32711) 0.523 (df = 32704)
F Statistic 5,099.128*** (df = 1; 32712) 3,948.166*** (df = 2; 32711) 4,224.042*** (df = 9; 32704)
Note: p<0.1; p<0.05; p<0.01
Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.

Assumption checks

# Checking assumptions: Model 1 Adding residual values to
# dataframe
complete_case_sample$resids1 <- residuals(mod1)

# Linearity (using partial regression plots)
PartialRegressionPlot(mod1, "log_income_deprivation_rate_2015",
    x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR)")

# Constant variance of residuals
HomoskedasticityPlot(mod1, add_loess_line = TRUE, title = "Model 1 Scale-Location")

# Durbin-Watson test
dw1 <- dwtest(mod1)

# Create a data frame with the Durbin-Watson test results
dw_df <- data.frame(Statistic = dw1$statistic, P = format.pval(dw1$p.value,
    digits = 16))

# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df, type = "html", title = "Durbin-Watson Test Results",
    summary = FALSE, rownames = FALSE, out = "dw_test_results.html")
Durbin-Watson Test Results
Statistic P
0.831 < 0.00000000000000022204460492503
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results.html")

NULL

# Normality of residuals check
QQPlotModel(mod1, subtitle = "QQ-Plot for Model 1")

# Zero conditional mean of residual values
mean(mod1$residuals)

[1] -0.00000000000000006509453

resid_mod1 <- lm(resids1 ~ log_income_deprivation_rate_2015,
    data = complete_case_sample)
summary(resid_mod1)

Call: lm(formula = resids1 ~ log_income_deprivation_rate_2015, data = complete_case_sample)

Residuals: Min 1Q Median 3Q Max -2.70482 -0.47303 -0.06624 0.38161 2.80484

Coefficients: Estimate Std. Error (Intercept) 0.0000000000000019171 0.0122433939839822377 log_income_deprivation_rate_2015 -0.0000000000000004258 0.0050962369734882749 t value Pr(>|t|) (Intercept) 0 1 log_income_deprivation_rate_2015 0 1

Residual standard error: 0.7149 on 32712 degrees of freedom Multiple R-squared: 1.914e-30, Adjusted R-squared: -3.057e-05 F-statistic: 6.261e-26 on 1 and 32712 DF, p-value: 1

# Zero conditional mean regression results
stargazer(resid_mod1, type = "html", dep.var.labels = c("Residuals of Model 1"),
    covariate.labels = c("log(Income Deprivation Rate 2015)"),
    out = "residmod1.html")
Dependent variable:
Residuals of Model 1
log(Income Deprivation Rate 2015) -0.000
(0.005)
Constant 0.000
(0.012)
Observations 32,714
R2 0.000
Adjusted R2 -0.00003
Residual Std. Error 0.715 (df = 32712)
F Statistic 0.000 (df = 1; 32712)
Note: p<0.1; p<0.05; p<0.01
rstudioapi::viewer("residmod1.html")

NULL

# Checking assumptions: Model 2 Adding residual values to
# dataframe
complete_case_sample$resids2 <- residuals(mod2)

# Linearity (using partial regression plots)
PartialRegressionPlot(mod2, "log_income_deprivation_rate_2015",
    x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot(mod2, "housing_affordability_score_2016",
    x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values1 <- vif(mod2)

# storing VIF in a dataframe
vif_df1 <- data.frame(Variable = names(vif_values1), VIF = vif_values1)
vif_df1$Variable[1] <- "log(Income Deprivation Rate 2015)"
vif_df1$Variable[2] <- "Housing Affordability Score 2016"

# creating a pretty table
stargazer(vif_df1, type = "html", title = "VIF Values", summary = FALSE,
    rownames = FALSE, out = "mod2vif.html")
VIF Values
Variable VIF
log(Income Deprivation Rate 2015) 1.459
Housing Affordability Score 2016 1.459
# using rstudioapi to view the table
rstudioapi::viewer("mod2vif.html")

NULL

# Constant variance of residuals
HomoskedasticityPlot(mod2, add_loess_line = TRUE, title = "Model 2 Scale-Location")

# Durbin-Watson test
dw2 <- dwtest(mod2)

# Create a data frame with the Durbin-Watson test results
dw_df2 <- data.frame(Statistic = dw2$statistic, P = ifelse(dw2$p.value >
    0.05, "> 0.05", ifelse(dw2$p.value < 0.01, "< 0.01", "< 0.05")))

# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df2, type = "html", title = "Durbin-Watson Test Results",
    summary = FALSE, rownames = FALSE, out = "dw_test_results2.html")
Durbin-Watson Test Results
Statistic P
0.885 < 0.01
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results2.html")

NULL

# Normality of residuals check
QQPlotModel(mod2, subtitle = "QQ-Plot for Model 2")

# Zero conditional mean of residual values
mean(mod2$residuals)

[1] -0.000000000000001023337

resid_mod2 <- lm(resids2 ~ log_income_deprivation_rate_2015 +
    housing_affordability_score_2016, data = complete_case_sample)
summary(resid_mod2)

Call: lm(formula = resids2 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)

Residuals: Min 1Q Median 3Q Max -2.5103 -0.4599 -0.0744 0.3675 2.7243

Coefficients: Estimate (Intercept) -0.00000000000000123474 log_income_deprivation_rate_2015 0.00000000000000012189 housing_affordability_score_2016 0.00000000000000007123 Std. Error t value Pr(>|t|) (Intercept) 0.01403408525346747093 0 1 log_income_deprivation_rate_2015 0.00594008978139718563 0 1 housing_affordability_score_2016 0.00241867829270426540 0 1

Residual standard error: 0.6899 on 32711 degrees of freedom Multiple R-squared: 2.489e-30, Adjusted R-squared: -6.114e-05 F-statistic: 4.071e-26 on 2 and 32711 DF, p-value: 1

# Zero conditional mean regression results
stargazer(resid_mod2, type = "html", dep.var.labels = c("Residuals of Model 2"),
    covariate.labels = c("log(Income Deprivation Rate 2015)",
        "Housing Affordability Score 2016 [HAS]"), out = "residmod2.html")
Dependent variable:
Residuals of Model 2
log(Income Deprivation Rate 2015) 0.000
(0.006)
Housing Affordability Score 2016 [HAS] 0.000
(0.002)
Constant -0.000
(0.014)
Observations 32,714
R2 0.000
Adjusted R2 -0.0001
Residual Std. Error 0.690 (df = 32711)
F Statistic 0.000 (df = 2; 32711)
Note: p<0.1; p<0.05; p<0.01
rstudioapi::viewer("residmod2.html")

NULL

# Checking assumptions: Model 3 Adding residual values to
# dataframe
complete_case_sample$resids3 <- residuals(mod3)

# Linearity (using partial regression plots)
PartialRegressionPlot(mod3, "log_income_deprivation_rate_2015",
    x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot(mod3, "housing_affordability_score_2016",
    x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values2 <- data.frame(vif(mod3))
vif_values2$Variable <- c("log(Income Deprivation Rate 2015)",
    "Housing Affordability Score 2016", "Rural Urban Classification 2011")
vif_df2 = vif_values2[, c("Variable", "GVIF")]

# creating a pretty table
stargazer(vif_df2, type = "html", title = "GVIF Values", summary = FALSE,
    rownames = FALSE, out = "mod3vif.html")
GVIF Values
Variable GVIF
log(Income Deprivation Rate 2015) 1.508
Housing Affordability Score 2016 1.602
Rural Urban Classification 2011 1.197
# using rstudioapi to view the table
rstudioapi::viewer("mod3vif.html")

NULL

# Constant variance of residuals plot
HomoskedasticityPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location")

# Plot indicating groups
HomoskedasticityDetailPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location",
    dataframe = complete_case_sample, categorical_var_name = "rural_urban_class_2011")

# Testing for non-constant variance across groups:
leveneTest(complete_case_sample$resids3, complete_case_sample$rural_urban_class_2011,
    center = mean)

Levene’s Test for Homogeneity of Variance (center = mean) Df F value Pr(>F)
group 7 189.04 < 0.00000000000000022 *** 32706
— Signif. codes: 0 ‘’ 0.001 ’’ 0.01 ’’ 0.05 ‘.’ 0.1 ’ ’ 1

# df for levene's test results
levene_results <- data.frame(`Degrees of Freedom (Group)` = 7,
    `Degrees of Freedom (Residual)` = 32706, `F Value` = 189.04,
    `P Value` = "< 0.01")

names(levene_results) <- c("Degrees of Freedom (Group)", "Degrees of Freedom (Residual)",
    "F Value", "P Value")

# Viewing the table with descriptive column names
stargazer(levene_results, type = "html", title = "Levene's Test for Homogeneity of Variance",
    out = "levene_test.html", summary = FALSE, rownames = FALSE)
Levene’s Test for Homogeneity of Variance
Degrees of Freedom (Group) Degrees of Freedom (Residual) F Value P Value
7 32,706 189.040 < 0.01
rstudioapi::viewer("levene_test.html")

NULL

# Durbin-Watson test
dw3 <- dwtest(mod3)

# Create a data frame with the Durbin-Watson test results
dw_df3 <- data.frame(Statistic = dw3$statistic, P = ifelse(dw3$p.value >
    0.05, "> 0.05", ifelse(dw3$p.value < 0.01, "< 0.01", "< 0.05")))

# Use stargazer to create an HTML table of the
# Durbin-Watson results
stargazer(dw_df3, type = "html", title = "Durbin-Watson Test Results",
    summary = FALSE, rownames = FALSE, out = "dw_test_results3.html")
Durbin-Watson Test Results
Statistic P
1.184 < 0.01
# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer("dw_test_results3.html")

NULL

# When considering the independence of residuals, we might
# have reason to think that this assumption is not
# satisfied within the model, as it is likely that some
# degree of spatial autocorrelation could be influencing
# our results - that is, that LSOAs closer to each-other
# could theoretically have more similar values of X. This
# idea is supported by the results of the durbin-watson
# test statistics shown - notably, getting closer to 2
# (representing less reduced autocorrelation) when
# including a variable that partially accounts for
# spatiality (the RUC2011 indicator). As such, we do see a
# considerable amount of positive autocorrelation in the
# data (according to a Durbin-Watson statistic of 0.83
# being considerably below 2). This means that residuals
# that follow each other in the data tend to be slightly
# similar in magnitude. Considering the null hypothesis of
# no autocorrelation and considering the p-value being
# significant considering an alpha of 0.05, we fail to
# reject the null hypothesis that there is no
# autocorrelation presented by the variables within our
# model. Thus, we might not be confident that the
# independence of residuals assumption is satisfied.

# Normality of residuals check
QQPlotModel(mod3, subtitle = "QQ-Plot for Model 3")

# Zero conditional mean of residual values
mean(mod3$residuals)

[1] 0.00000000000000001521903

resid_mod3 <- lm(resids3 ~ log_income_deprivation_rate_2015 +
    housing_affordability_score_2016 + rural_urban_class_2011,
    data = complete_case_sample)
summary(resid_mod3)

Call: lm(formula = resids3 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + rural_urban_class_2011, data = complete_case_sample)

Residuals: Min 1Q Median 3Q Max -2.35847 -0.33319 0.02095 0.34803 2.03045

Coefficients: Estimate (Intercept) -0.00000000000000309640 log_income_deprivation_rate_2015 -0.00000000000000014066 housing_affordability_score_2016 0.00000000000000002189 rural_urban_class_2011Rural town and fringe in a sparse setting 0.00000000000000313182 rural_urban_class_2011Rural village and dispersed 0.00000000000000311998 rural_urban_class_2011Rural village and dispersed in a sparse setting 0.00000000000000369754 rural_urban_class_2011Urban city and town 0.00000000000000390593 rural_urban_class_2011Urban city and town in a sparse setting 0.00000000000000223619 rural_urban_class_2011Urban major conurbation 0.00000000000000350995 rural_urban_class_2011Urban minor conurbation 0.00000000000000338580 Std. Error (Intercept) 0.01398036896695923660 log_income_deprivation_rate_2015 0.00457597976149103861 housing_affordability_score_2016 0.00192069232350872252 rural_urban_class_2011Rural town and fringe in a sparse setting 0.04890182920944522588 rural_urban_class_2011Rural village and dispersed 0.01450616708669712727 rural_urban_class_2011Rural village and dispersed in a sparse setting 0.04004073360934087833 rural_urban_class_2011Urban city and town 0.01065145323950207612 rural_urban_class_2011Urban city and town in a sparse setting 0.06876884655176458694 rural_urban_class_2011Urban major conurbation 0.01120479904872859027 rural_urban_class_2011Urban minor conurbation 0.01802400600454943935 t value (Intercept) 0 log_income_deprivation_rate_2015 0 housing_affordability_score_2016 0 rural_urban_class_2011Rural town and fringe in a sparse setting 0 rural_urban_class_2011Rural village and dispersed 0 rural_urban_class_2011Rural village and dispersed in a sparse setting 0 rural_urban_class_2011Urban city and town 0 rural_urban_class_2011Urban city and town in a sparse setting 0 rural_urban_class_2011Urban major conurbation 0 rural_urban_class_2011Urban minor conurbation 0 Pr(>|t|) (Intercept) 1 log_income_deprivation_rate_2015 1 housing_affordability_score_2016 1 rural_urban_class_2011Rural town and fringe in a sparse setting 1 rural_urban_class_2011Rural village and dispersed 1 rural_urban_class_2011Rural village and dispersed in a sparse setting 1 rural_urban_class_2011Urban city and town 1 rural_urban_class_2011Urban city and town in a sparse setting 1 rural_urban_class_2011Urban major conurbation 1 rural_urban_class_2011Urban minor conurbation 1

Residual standard error: 0.5228 on 32704 degrees of freedom Multiple R-squared: 3.712e-30, Adjusted R-squared: -0.0002752 F-statistic: 1.349e-26 on 9 and 32704 DF, p-value: 1

# Zero conditional mean regression results
stargazer(resid_mod3, type = "html", dep.var.labels = c("Residuals of Model 3"),
    covariate.labels = c("log(Income Deprivation Rate 2015)",
        "Housing Affordability Score 2016 [HAS]", "Rural town and fringe [RUC 2011]",
        "Rural town and fringe in a sparse setting [RUC 2011]",
        "Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]",
        "Urban city and town in a sparse setting [RUC 2011]",
        "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
    out = "residmod3.html")
Dependent variable:
Residuals of Model 3
log(Income Deprivation Rate 2015) -0.000
(0.005)
Housing Affordability Score 2016 [HAS] 0.000
(0.002)
Rural town and fringe [RUC 2011] 0.000
(0.049)
Rural town and fringe in a sparse setting [RUC 2011] 0.000
(0.015)
Rural village and dispersed [RUC 2011] 0.000
(0.040)
Rural village and dispersed in a sparse setting [RUC 2011] 0.000
(0.011)
Urban city and town in a sparse setting [RUC 2011] 0.000
(0.069)
Urban major conurbation [RUC 2011] 0.000
(0.011)
Urban minor conurbation [RUC 2011] 0.000
(0.018)
Constant -0.000
(0.014)
Observations 32,714
R2 0.000
Adjusted R2 -0.0003
Residual Std. Error 0.523 (df = 32704)
F Statistic 0.000 (df = 9; 32704)
Note: p<0.1; p<0.05; p<0.01
rstudioapi::viewer("residmod3.html")

NULL

Appendix (complete code)

library(knitr)
knitr::opts_chunk$set(echo = T,  warning = FALSE, message = FALSE)
opts_chunk$set(tidy.opts=list(width.cutoff=60),tidy=TRUE)
# Libraries used
library(tidyverse)
library(haven)
library(dplyr)
library(readxl)
library(GGally)
library(forcats)
library(stargazer)
library(lme4)
library(car)
library(rstudioapi)
library(knitr)
library(kableExtra)
# Turning off scientific notation
options(scipen = 9999)
# Bespoke functions used ----
PointPlot <- function(dataset, independent_var, dependent_var, colour_var = NULL, title = NULL, subtitle = NULL, caption = NULL, x_label = "X variable", y_label = "Y variable", point_size = 1, line_intercept = NULL, line_slope = NULL, line_colour = "black", line_linewidth = 1, line_linetype = "solid", add_regression_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Adjust the aesthetic mapping based on the presence of the third variable
  if(!is.null(colour_var)) {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var, colour = colour_var))
  } else {
    plot <- ggplot(dataset, aes_string(x = independent_var, y = dependent_var))
  }
  
  # Base point plot
  plot <- plot + geom_point(alpha = 1, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'),
          axis.title = element_text(family = 'Roboto'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = ifelse(is.null(colour_var), 'none', 'right'),
          legend.text = element_text(family = 'Roboto')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = y_label)
  
  # Add manual line if intercept and slope are provided
  if (!is.null(line_intercept) & !is.null(line_slope)) {
    plot <- plot + geom_abline(intercept = line_intercept, 
                               slope = line_slope, 
                               colour = line_colour, 
                               linewidth = line_linewidth, 
                               linetype = line_linetype)
  }
  
  # Add regression line with confidence interval if requested
  if (add_regression_line) {
    plot <- plot + geom_smooth(method = "lm", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}

DistPlot <- function(dataset, variable, formatted_name, subtitle = NULL, caption = NULL, y_label = NULL) {
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  plot <- ggplot(dataset, aes_string(x = variable)) +
    geom_histogram(aes(y = ..density..),
                   colour = 1, fill = "white") +
    geom_density(fill = '#0072C6', colour = '#0072C6', alpha = 0.25) +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 12.5, face = 'italic',
                                      family = 'Roboto'),
          axis.title.x = element_blank(),  # This removes the x-axis title
          axis.title.y = element_text(size = 15.5,
                                      family = 'Roboto',
                                      colour = 'black'),
          axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          legend.position = 'none') +
    labs(title = paste0("Distribution of ", formatted_name),
         subtitle = subtitle,
         caption = caption,
         y = y_label)
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  plot
}

ConfidencePlot <- function(data, title, subtitle, middle_point = 0, x_label = "Estimated Coefficient", caption = NULL) {
  library(ggplot2)
  library(showtext)
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  if (!middle_point %in% c(0, 1)) {
    stop("middle_point must be 0 or 1")
  }
  
  conf_plot <- ggplot(data, aes(x = coefs, y = names)) +
    geom_errorbar(aes(xmin = lower, xmax = upper),
                  col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "forestgreen"),
                  width = 0.5, size = 1, alpha = 0.5) +
    geom_point(col = ifelse((data$lower < middle_point & data$upper > middle_point), "grey70", "black"), size = 1, alpha = 1) +
    geom_vline(xintercept = middle_point, col = 'red', size = 0.5, linetype="dashed") +
    theme_classic() +
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = x_label,
         y = "") +
    theme(axis.text = element_text(size = 15.5,
                                   family = 'Roboto',
                                   colour = 'black'),
          axis.title = element_text(family = 'Roboto', size = 15.5),
          plot.title = element_text(hjust = 0, colour = 'black', size = 17.5, face = 'bold',
                                    family = 'Roboto', margin = margin(b = 10)),
          plot.subtitle = element_text(hjust = 0, colour = 'grey30', size = 12.5, face = 'italic',
                                       family = 'Roboto', margin = margin(b = 5)),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic',
                                      family = 'Roboto'))
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", conf_plot$labels$title), ".png")
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = conf_plot, width = 1200/100, height = 740/100, dpi = 100)  # Adjust height as necessary
  
  return(conf_plot)
}

PartialRegressionPlot <- function(model, variable, x_var_name = "X1", y_var_name = "Dependent Variable", title = NULL) {
  # Close any existing plotting device
  graphics.off()
  
  # Construct axis labels using the specified names
  xlab <- paste("Residuals of", x_var_name, "Controlling for Other Variables")
  ylab <- paste("Residuals of", y_var_name, "(Excluding", x_var_name, ")")
  
  # Create the plot without a main title
  avPlots(model, variable, xlab = xlab, ylab = ylab)
  
  # Add a title using the title function
  title(main = title)
}

HomoskedasticityPlot <- function(model, title = "Homoskedasticity Check", subtitle = NULL, 
                                 x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                 point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, StdResiduals = std_residuals)
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  if (bp_test$p.value < 0.05) {
    p_value_text <- "p < 0.05"
  } else if (bp_test$p.value > 0.05) {
    p_value_text <- "p > 0.05"
  } else {
    p_value_text <- "p = 0.05"
  }
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 50, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 30, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 50, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 30, family = 'Roboto'),
          axis.text = element_text(size = 30, family = 'Roboto', colour = 'black'),
          legend.position = 'none') + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 5.91, height = 4.50, dpi = 300)
  
  return(plot)
}

HomoskedasticityDetailPlot <- function(model, dataframe, categorical_var_name, 
                                       title = "Homoskedasticity Check", subtitle = NULL, 
                                       x_label = "Fitted Values", y_label = "Standardized Residuals", 
                                       point_size = 1.2, add_loess_line = FALSE) {
  
  # Loading required libraries
  library(ggplot2)
  library(showtext)
  library(lmtest) # For Breusch-Pagan test
  
  # Enable showtext to render text
  showtext_auto()
  
  font_add_google("Roboto")
  
  # Calculate standardized residuals
  std_residuals <- rstandard(model)
  
  # Fitted values
  fitted_values <- fitted(model)
  
  # Ensure the categorical variable exists in the dataframe
  if (!categorical_var_name %in% names(dataframe)) {
    stop("Specified categorical variable not found in the provided dataframe.")
  }
  
  # Create a data frame for plotting
  plot_data <- data.frame(FittedValues = fitted_values, 
                          StdResiduals = std_residuals, 
                          Category = dataframe[[categorical_var_name]])
  
  # Perform Breusch-Pagan test
  bp_test <- bptest(model)
  
  # Format the p-value for the caption
  p_value_text <- ifelse(bp_test$p.value < 0.05, "p < 0.05",
                         ifelse(bp_test$p.value > 0.05, "p > 0.05", "p = 0.05"))
  
  # Create a caption including the formatted p-value text
  caption <- paste("Breusch-Pagan Test:", p_value_text)
  
  # Base point plot with adjusted sizes for readability
  plot <- ggplot(plot_data, aes(x = FittedValues, y = StdResiduals, color = Category)) +
    geom_point(alpha = 0.5, shape = 19, size = point_size, position = 'jitter') +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold', family = 'Roboto'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic', family = 'Roboto'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic', family = 'Roboto'),
          axis.title = element_text(size = 15.5, family = 'Roboto'),
          axis.text = element_text(size = 15.5, family = 'Roboto', colour = 'black'),
          legend.position = 'right', legend.text = element_text(size = 12.5, family = 'Roboto')) + 
    labs(title = title, subtitle = subtitle, caption = caption, x = x_label, y = y_label)
  
  # Add loess smoothing line if requested
  if (add_loess_line) {
    plot <- plot + geom_smooth(method = "loess", alpha = 0.2, se = FALSE)
  }
  
  # Create a filename from the plot title
  file_name <- paste0(gsub(" ", "_", plot$labels$title), ".png")
  
  # Save the plot with dimensions that balance size and readability
  ggsave(filename = file_name, plot = plot, width = 10, height = 6.17, dpi = 100)
  
  return(plot)
}

QQPlotModel <- function(model, title = 'Normality of Residuals Check', subtitle = 'QQ-Plot', caption = NULL) {
  
  # Loading required libraries
  library(ggplot2)
  
  # Extract residuals from the model
  model_residuals <- residuals(model)
  
  # Create the QQ-plot
  plot <- ggplot() +
    stat_qq(aes(sample = model_residuals)) +
    stat_qq_line(aes(sample = model_residuals), colour = "blue") +
    theme_classic() +
    theme(plot.title = element_text(colour = 'black', size = 17.5, face = 'bold'),
          plot.subtitle = element_text(colour = 'grey30', size = 12.5, face = 'italic'),
          plot.caption = element_text(colour = 'grey30', size = 10, face = 'italic'),
          axis.title = element_text(),
          axis.text = element_text(size = 15.5, colour = 'black')) + 
    labs(title = title,
         subtitle = subtitle,
         caption = caption,
         x = "Theoretical Quantiles",
         y = "Sample Quantiles")
  
  # Create a filename from the plot title or use a default one
  file_name <- if (!is.null(title)) {
    paste0(gsub(" ", "_", title), ".png")
  } else {
    "QQPlot.png"
  }
  
  # Save the plot to the working directory
  ggsave(filename = file_name, plot = plot, width = 1000/100, height = 617/100, dpi = 100)
  
  return(plot)
}
AHAH_df <- read.csv('Group_7_AHAH_V3_0.csv') # Access to healthy assets and hazards V3 data
IOD_income_df <- read.csv('Group_7_IoD2019_Income.csv') # Index of Multiple Deprivation Income Domian Indicator Data (measured in 2015)
IOD_health_df <- read.csv('Group_7_IoD2019_Health.csv') # Index of Multiple Deprivation Health Domian Indicator Data
IOD_barriers_df <- read.csv('Group_7_IoD2019_Barriers.csv') # Index of Multiple Deprivation Barriers to Housing and Services Domain Indicator Data
IOD_education_df <- read.csv('Group_7_IoD2019_Education.csv') # Index of Multiple Deprivation Education, Skills and Training Deprivation Domain Indicator Data
IOD_employment_df <- read.csv('Group_7_IoD2019_Employment.csv') # Index of Multiple Deprivation Employment Deprivation Domain Indicator Data (measured in 2015)
ONS_MIDYEAR_2015 <- read.csv('Group_7_ONS_MIDYEAR_2015.csv') # Mid-year population estimates for 2015 in England and Wales (for creating rates from the numerator values given in the income and employment measures)
RUC2011 <- read.csv('Group_7_RUC2011.csv')
## Renaming columns for consistent joining
AHAH_df <- AHAH_df %>% rename("LSOA.code..2011." = lsoa11)
ONS_MIDYEAR_2015 <- ONS_MIDYEAR_2015 %>% rename("LSOA.code..2011." = Area.Codes)
RUC2011 <- RUC2011 %>% rename("LSOA.code..2011." = LSOA11CD)

## Creating complete combined dataset
complete_combined_data <- IOD_barriers_df %>%
  left_join(IOD_education_df, by = "LSOA.code..2011.") %>%
  left_join(IOD_employment_df, by = "LSOA.code..2011.") %>%
  left_join(IOD_health_df, by = "LSOA.code..2011.") %>%
  left_join(IOD_income_df, by = "LSOA.code..2011.") %>%
  left_join(AHAH_df, by = "LSOA.code..2011.") %>%
  left_join(ONS_MIDYEAR_2015, by = "LSOA.code..2011.") %>%
  left_join(RUC2011, by = "LSOA.code..2011.")

## Retaining only columns of interest
combined_data <- complete_combined_data %>% 
  select(
    LSOA.code..2011.,
    Local.Authority.District.name..2019.,
    Income.Domain.numerator,
    Years.of.potential.life.lost.indicator,
    Housing.affordability.indicator,
    Staying.on.in.education.post.16.indicator,
    Employment.Domain.numerator,
    ah3gp, 
    ah3hosp, 
    ah3dent, 
    ah3phar,
    All.Ages,
    RUC11
  )

## Removing commas and converting columns 3:12 to numeric
combined_data <- combined_data %>% 
  mutate(across(.cols = 3:12, 
                .fns = ~ as.numeric(str_remove_all(.x, ","))))

## Creating income and employment deprivation rate variables
combined_data$income_deprivation_rate <- (combined_data$Income.Domain.numerator/combined_data$All.Ages) * 100
combined_data$employment_deprivation_rate <- (combined_data$Employment.Domain.numerator /combined_data$All.Ages) * 100

## Creating average distance (in minutes) to a healthcare service (including GP practice, hospital, dentist and pharmacy)
combined_data$avg_healthcare_distance <- rowMeans(combined_data[, c("ah3gp", "ah3hosp", "ah3dent", "ah3phar")], na.rm = TRUE)

## Renaming column names for understanding
colnames(combined_data) <- c('lsoa_code', 'local_authority_name', 'no._of_income_deprived_individuals_2015', 'years_of_lives_lost_2013_2017', 'housing_affordability_score_2016', 'proportion_of_above16_individuals_inschool_2010_2012', 'no._of_deprived_employment_2015_2016', 'avg_dist_gppractice_in_mins_2022', 'avg_dist_hospital_in_mins_2022', 'avg_dist_dentist_in_mins_2022', 'avg_dist_pharmacy_in_mins_2022', 'population_estimates_for_allages_2015', 'rural_urban_class_2011','income_deprivation_rate_2015','employment_deprivation_rate_2015','avg_healthcare_distance_2022')
## Saving as CSV
write.csv(combined_data, file = "Group_7_combined_data.csv", row.names = FALSE)
write.csv(complete_combined_data, file = "Group_7_complete_combined_data.csv", row.names = FALSE)
## Loading combined data 
combined_data <- read.csv('Group_7_combined_data.csv')

## Checking for duplicates
There_are_duplicates <- any(duplicated(combined_data))
if (There_are_duplicates) {
  cat("There are duplicates in the combined_data.\n")
} else {
  cat("There are no duplicates in combined_data.\n")
}

# Calculate missingness percentages for each numerical variable (IV and DV)
missingness <- sapply(combined_data, function(x) mean(is.na(x)) * 100)
missingness_summary <- data.frame(variable = names(missingness), Missingness_percentage = missingness)
missingness_summary

# Removing Null Values from data set
combined_data <- na.omit(combined_data)

# First 6 rows of data
head(combined_data)

# Last 6 rows of data
tail(combined_data)
summary(combined_data)

# Descriptive table for numerical variables 
stargazer(combined_data, type = "html", digits = 1, title = "Table 1: Summary Statistics", out = "table.html")

# Use rstudioapi to view the table
rstudioapi::viewer("table.html")

## SD of numerical variable
sd(combined_data$lsoa_code)
sd(combined_data$no._of_income_deprived_individuals_2015)
sd(combined_data$years_of_lives_lost_2013_2017)
sd(combined_data$housing_affordability_score_2016)
sd(combined_data$proportion_of_above16_individuals_inschool_2010_2012)
sd(combined_data$no._of_deprived_employment_2015_2016)
sd(combined_data$avg_dist_gppractice_in_mins_2022)
sd(combined_data$avg_dist_hospital_in_mins_2022)
sd(combined_data$avg_dist_dentist_in_mins_2022)
sd(combined_data$avg_dist_pharmacy_in_mins_2022)
sd(combined_data$population_estimates_for_allages_2015)
sd(combined_data$income_deprivation_rate_2015)
sd(combined_data$employment_deprivation_rate_2015)
sd(combined_data$avg_healthcare_distance_2022)
# The number of unique names in the rural_urban_class column 
unique_names <- table(combined_data$rural_urban_class_2011)
print(unique_names)
rural_urban_class_2011_counts <- as.numeric(unique_names)
print(rural_urban_class_2011_counts)

## Summary statistics for Rural_urban_class_2011
max(rural_urban_class_2011_counts)
min(rural_urban_class_2011_counts)
median(rural_urban_class_2011_counts)
IQR(rural_urban_class_2011_counts)

# The number of unique names in the local_authority_name column 
unique_names_1 <- table(combined_data$local_authority_name)
print(unique_names_1)
local_authority_name_counts <- as.numeric(unique_names_1)
print(local_authority_name_counts)

## Summary statistics for local_authority_name
max(local_authority_name_counts)
min(local_authority_name_counts)
median(local_authority_name_counts)
IQR(local_authority_name_counts)

## T-test
# Four sample independent t-test is used to compare the significance level in combined_data data set. Student’s t-test assumes both groups of data are normally distributed and the variances of the two distributions are the same.
x1 <-combined_data$income_deprivation_rate_2015
x2 <-combined_data$employment_deprivation_rate_2015
x3 <-combined_data$housing_affordability_score_2016
y <-combined_data$avg_healthcare_distance_2022

# Plot a 2x2 histogram grid
par(mfrow=c(2, 2))
hist(x1)
hist(x2)
hist(x3)
hist(y)

#Based on histogram plots, we have to perform a Welch's t-test for x1, x2, both of which are skewed, and for x3 as well since DV is highly skewed despite x3 being normally distributed.
t_test1<- t.test(x1,y)    
t_test2<- t.test(x2,y)
t_test3<- t.test(x3,y)
t_test1
t_test2
t_test3

## Covariance values of our primary and secondary IV against primary DV
cov(x1,y)
cov(x2,y)
cov(x3,y)

## Correlation values of our primary and secondary IV against primary DV
# As x1, x2 are not normally distributed, we will use 'spearman' method. We will use 'spearman' method for x3 despite it being normally distributed as DV is highly skewed.
cor(x1,y, method='spearman')
cor(x2,y, method='spearman')
cor(x3,y, method='spearman')
DistPlot(combined_data, "income_deprivation_rate_2015", "Income Deprivation Rate Across LSOAs in England (2015)", subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "avg_healthcare_distance_2022", "Average Minutes to Healthcare Services Across LSOAs in England (2022)", subtitle = "Mean Access Time to Healthcare Services per LSOA", caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "housing_affordability_score_2016", "Housing Affordability Indicator Across LSOAs in England (2016)", subtitle = "Higher Scores Indicate Increased Deprivation in Entering Ownership or Private Rental Market", caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "employment_deprivation_rate_2015", "Employment Deprivation Rate in England (2015)", subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

# Both the main explanatory variable and outcome variable have a very strong right skew. Thus, a log transformation in both of these is examined.
## Adding log transformed variables to dataframe:
combined_data$log_avg_healthcare_distance_2022 <- log(combined_data$avg_healthcare_distance_2022)
combined_data$log_income_deprivation_rate_2015 <- log(combined_data$income_deprivation_rate_2015)
combined_data$log_employment_deprivation_rate_2015 <- log(combined_data$employment_deprivation_rate_2015)

## Checking distributions:
DistPlot(combined_data, "log_income_deprivation_rate_2015", 'log(Income Deprivation Rate) Across LSOAs in England (2015)', subtitle = "Income Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

DistPlot(combined_data, "log_avg_healthcare_distance_2022", "log(Average Minutes to Healthcare Services) Across LSOAs in England (2022)", subtitle = "Logarithm of Mean Access Time to Healthcare Services per LSOA", caption = "Data source: Consumer Data Reasearch Centre (2022)")

DistPlot(combined_data, "log_employment_deprivation_rate_2015", 'log(Employment Deprivation Rate) Across LSOAs in England (2015)', subtitle = "Employment Deprivation Rate per LSOA (2015) based on Mid-Year Population Estimates", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021")

## Filtering out rows with infinite values and creating a complete case sample:
complete_case_sample <- combined_data %>%
  select(log_income_deprivation_rate_2015, log_avg_healthcare_distance_2022, 
         housing_affordability_score_2016, rural_urban_class_2011) %>%
  filter(!is.infinite(log_avg_healthcare_distance_2022), 
         !is.infinite(log_income_deprivation_rate_2015)) %>%
  na.omit()

## Saving as CSV
write.csv(complete_case_sample, file = "Group_7_complete_case.csv", row.names = FALSE)

# Checking linearity of relationships between x variables
ggpairs(complete_case_sample[,c("log_income_deprivation_rate_2015", "housing_affordability_score_2016", "log_avg_healthcare_distance_2022")])

PointPlot(combined_data, "log_employment_deprivation_rate_2015", "log_income_deprivation_rate_2015", title = "Income Deprivation Rate (2015) as a Function of Employment Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Income Deprivation Rate) (2015)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021", add_regression_line = TRUE)

result <- cor.test(combined_data$log_employment_deprivation_rate_2015, combined_data$income_deprivation_rate_2015, method = "pearson")

# creating a df with correlation test results
model_summary <- data.frame(
  Estimate = round(result$estimate, 4),
  t.value = round(result$statistic, 2),
  p = ifelse(result$p.value < 0.05, "< 0.05", round(result$p.value, 10))
)

# pretty table using stargazer
stargazer(model_summary, type = "html", 
          title = "Results of Pearson's Correlation Test",
          summary = FALSE,
          omit.stat = c("all"), 
          single.row = TRUE, rownames = FALSE, out = 'x1x2.html')
rstudioapi::viewer("x1x2.html")

# As employment deprivation rate and income deprivation rate are highly correlated, employment deprivation rate will be excluded from analyses (to avoid issues of multicollinearity).

# Examining linearity between xi and y
PointPlot(complete_case_sample, "log_income_deprivation_rate_2015", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Income Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Income Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)

PointPlot(complete_case_sample, "housing_affordability_score_2016", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Housing Affordability Score (2016)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "Housing Affordability Score (2016)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)

PointPlot(combined_data, "log_employment_deprivation_rate_2015", "log_avg_healthcare_distance_2022", title = "Average Healthcare Distance (2022) as a Function of Employment Deprivation Rate (2015)", subtitle = "For Lower layer Super Output Areas Across England", x_label = "log(Employment Deprivation Rate) (2015)", y_label = "log(Mean Access Time to Healthcare Services) (2022)", caption = "Data sources: Ministry of Housing, Communities & Local Government, 2019; Office for National Statistics, 2021; Consumer Data Reasearch Centre (2022)", add_regression_line = TRUE)
### PLotting of table for slide 11
combined_data_sum <- combined_data[c("local_authority_name","housing_affordability_score_2016", "income_deprivation_rate_2015", "employment_deprivation_rate_2015","avg_healthcare_distance_2022", "rural_urban_class_2011")]

html_table <- stargazer(
  combined_data_sum,
  type = "html",
  title = "Table 1: Summary Statistics",
  digits = 1,
  column.labels = c(
    "Housing Affordability Score (2016)",
    "Income Deprivation Rate (2015)",
    "Employment Deprivation Rate (2015)",
    "Average Healthcare Distance (2022)"
  ),
  out = "table.html"
)

# See HTML table in R Studio
rstudioapi::viewer("table.html")


### Calculate missingness percentages for each variable for slide 11
missingness <- sapply(combined_data_sum, function(x) mean(is.na(x)) * 100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 2: Missingness Summary/%", summary = FALSE, flip=TRUE)
html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


### count of unique names for rural/urban for slide 12
unique_class2011 <- table(combined_data_sum$rural_urban_class_2011)
unique_class2011.1 <- as.data.frame(unique_class2011)

# Rename the columns
colnames(unique_class2011.1) <- c("Rural/Urban Class (2011)", "Count")

# Create HTML table using stargazer
html_table1 <- stargazer(unique_class2011.1, type = "html", title = "Unique Names and its Respective Counts", summary = FALSE)
cat(html_table1)
writeLines(html_table1, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)

# Calculate summary statistics
summary_ruralurban <- c(
  Max = max(rural_urban_class_2011_counts),
  Min = min(rural_urban_class_2011_counts),
  Med = median(rural_urban_class_2011_counts),
  IQR = IQR(rural_urban_class_2011_counts)
)

# Create data frame for summary statistics
summary_df <- data.frame(statistic = names(summary_ruralurban), value = summary_ruralurban)

# Create HTML table using stargazer
html_table2 <- stargazer(summary_df, type = "html", title = "Summary Statistics", summary = FALSE, rownames = FALSE)
cat(html_table2)
writeLines(html_table2, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


## count of unique names for local authority names for slide 12
unique_localauthority <- table(combined_data_sum$local_authority_name)

# Convert the result to a data frame
unique_localauthority1 <- as.data.frame(unique_localauthority)

colnames(unique_localauthority1) <- c("Local Authority Names", "Count")

# Create HTML table using stargazer
html_table3 <- stargazer(unique_localauthority1, type = "html", title = "Unique Names and its Respective Counts", summary = FALSE)
cat(html_table3)
writeLines(html_table3, html_file)
rstudioapi::viewer(html_file)

# Calculate summary statistics
summary_localauthority <- c(
  Max = max(local_authority_name_counts),
  Min = min(local_authority_name_counts),
  Med = median(local_authority_name_counts),
  IQR = IQR(local_authority_name_counts)
)

summary_df <- data.frame(statistic = names(summary_localauthority ), value = summary_localauthority )

# Create HTML table using stargazer
html_table4 <- stargazer(summary_df, type = "html", title = "Summary Statistics", summary = FALSE, rownames = FALSE)
cat(html_table4)
writeLines(html_table4, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


### Repeat for complete_case.csv for slide 16
## Read the complete_case file
complete_case_sum <- read.csv("Group_7_complete_case.csv")

html_table <- stargazer(
  complete_case_sum,
  type = "html",
  title = "Table 3: Summary Statistics",
  digits = 1,
  column.labels = c(
    "Log Income Deprivation Rate (2015)",
    "Log Average Healthcare Distance (2022)",
    "Housing Affordability Score (2016)",
    "Rural/Urban Class (2011)"
  ),
  out = "table.html"
)

# See HTML table in R Studio
rstudioapi::viewer("table.html")

### Calculate missingness percentages for each variable for slide 16
missingness <- sapply(complete_case_sum, function(x) mean(is.na(x)) * 100)

# Create HTML table using stargazer
html_table <- stargazer(missingness, type = "html", title = "Table 4: Missingness Summary/%", summary = FALSE, flip=TRUE)
html_file <- "missingness_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


### covariance of x1,x2,x3 before log transformation for slide 17
x1 <-combined_data_sum$income_deprivation_rate_2015
x2 <-combined_data_sum$employment_deprivation_rate_2015
x3 <-combined_data_sum$housing_affordability_score_2016
y <-combined_data_sum$avg_healthcare_distance_2022
cov_x1y <- cov(x1, y)
cov_x2y <- cov(x2, y)
cov_x3y <- cov(x3, y)

# Create a data frame for the covariances
covariances_df <- data.frame(
  variables = c("x1 and y", "x2 and y", "x3 and y"),
  covariance = c(cov_x1y, cov_x2y, cov_x3y)
)

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances before log transformation", summary = FALSE, rownames = FALSE)
html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


###covariance of x4,x5 after log transformation for slide 17
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <-complete_case_sum$log_income_deprivation_rate_2015
x5 <-complete_case_sum$housing_affordability_score_2016
y1 <-complete_case_sum$log_avg_healthcare_distance_2022
cov_x4y1 <- cov(x4, y1)
cov_x5y1 <- cov(x5, y1)

# Create a data frame for the covariances
covariances_df <- data.frame(
  variables = c("x4 and y1", "x5 and y1"),
  covariance = c(cov_x4y1, cov_x5y1)
)

# Create HTML table using stargazer
html_table <- stargazer(covariances_df, type = "html", title = "Covariances after log transformation", summary = FALSE, rownames = FALSE)
html_file <- "covariances_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)


## Correlation test for slide 18
complete_case_sum <- read.csv("Group_7_complete_case.csv")
x4 <-complete_case_sum$log_income_deprivation_rate_2015
x5 <-complete_case_sum$housing_affordability_score_2016
y1 <-complete_case_sum$log_avg_healthcare_distance_2022
corx4y1 <- cor.test(x4, y1, method="pearson")
corx5y1 <- cor.test(x5, y1, method="pearson")

# Create a data frame to store correlation results
# format.pval function is used to format the p-value to make it into a string ie. a range
corr_df <- data.frame(
  variables = c("x4 and y1", "x5 and y1"),
  Pearson_Correlation = c(corx4y1$estimate, corx5y1$estimate),
  t_value = c(corx4y1$statistic, corx5y1$statistic),
  Estimate = c(corx4y1$estimate, corx5y1$estimate),
  p_value = ifelse(corx4y1$p.value == 0, "< 0.001", format.pval(corx4y1$p.value))
)

# Create HTML table using stargazer
html_table <- stargazer(corr_df, type = "html", title = "Pearson Correlations After Log Transformation", summary = FALSE, rownames = FALSE)
html_file <- "cor_table.html"
writeLines(html_table, html_file)

# See HTML table in R Studio
rstudioapi::viewer(html_file)
# Model 1:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate.
mod1 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(mod1)

# Model 1 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate. As both the dependent and independent variables are log transformed, the coefficients may be interpreted in terms of percentage change. That is, a 1% increase in the income deprivation rate is associated with a 0.35% decrease in the average travel time to a healthcare service. This coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). Thus, according to this model, we may reject the null hypothesis that there is no relationship between income deprivation rate and average healthcare travel distance. Further, the p-value associated with the F-statistic is also significant considering the same alpha, sugegsting at least one variable in the model has some explanatory power regarding average healthcare distance. This statistically is not that useful regarding this model however, as only one explanatory variable is considered. The coefficient of determination for this model indicates that we are able to explain 13.1% of the variation in average travel time to healthcare services with this model.

# Model 2:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate and the housing affordability indicator
mod2 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)

# Exponentiating untransformed variable coefficients to allow easier interpretation
mod2$coefficients["housing_affordability_score_2016"] <- exp(mod2$coefficients["housing_affordability_score_2016"])

# Viewing model results
summary(mod2)

# Model 2 represents the logarithm of average travel time to a healthcare service within an LSOA as a function of the logarithm of the income deprivation rate and the housing affordability indicator. Here, we can see that a 1% increase in the income deprivation rate is associated with a 0.19% decrease in the average travel time to a healthcare service. Thus, we can see that when we control for a further underlying economic variable, the explanatory power of income deprivation alone decreases. Again, this coefficient is statistically significant considering a significance level of 5% (alpha = 0.05). For interpretation of the housing affordability indicator, the exponential of the coefficient is considered (0.88). Thus, we can see that a 1 unit increase in the housing affordability indicator is associated with a 12% decrease in the average travel time to a healthcare service, again significant considering an alpha of 0.05. Remembering that increases in the housing affordability indicator signify increased inability to afford to enter owner-occupation or the private rental market, this again appears to indicate that more economically deprived areas have greater geographical access to healthcare services. Further, the p-value associated with the F-statistic is also significant at the 5% level, again suggesting at least one variable in the model has some explanatory power regarding average healthcare distance. The adjusted coefficient of determination for this model indicates that we are able to explain 19.3% of the variation in average travel time to healthcare services with this model.

# Model 3:
# The logarithm of average healthcare distance modeled as a function of the logarithm of income deprivation rate, the housing affordability indicator, and the re-leveled Rural-Urban Classification (RUC11) with "Urban city and town" as the reference category
mod3 <- lm(log_avg_healthcare_distance_2022 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + forcats::fct_relevel(rural_urban_class_2011, "Urban city and town"), data = complete_case_sample)

# Exponentiated coefficient data
mod3_exp_coeff <- data.frame(exp(coef(mod3)))

# Exponentiating untransformed variable coefficients WITHIN the model to allow easier interpretation
mod3$coefficients[3:10] <- exp(mod3$coefficients[3:10])

# Viewing model results
summary(mod3)

# Creating data for confidence interval plot
ConfidencePlotData <- data.frame (
  "names" = as.factor (c("Rural town and fringe", "Rural town and fringe in a sparse setting", "Rural village and dispersed", "Rural village and dispersed in a sparse setting", "Urban city and town in a sparse setting", "Urban major conurbation", "Urban minor conurbation")), # Creating predictor names (row names)
  "coefs" = mod3_exp_coeff[4:10, 1],
  "lower" = confint (mod3) [4:10, 1], # Creating a lower confidence interval [ , 1] list for predictors 2-3 in the model [2:5 , ]
  "upper" = confint (mod3) [4:10, 2] # Creating an upper confidence interval [ , 2] list for predictors 2-3 in the model [2:5 , ]
)
ConfidencePlotData$names <- factor(ConfidencePlotData$names, levels = unique(ConfidencePlotData$names))

# Creating confidence interval plot for RUC coefficients
ConfidencePlot(ConfidencePlotData, "Healthcare Access by Rural-Urban Classification in England", "Reference Category: Urban City and Town", x_label = "Exponentiated Coefficient", middle_point = 1, caption = "*95% CI Indicated by Green Error Bars. Red dashed line indicates reference group. Controlling for income deprivation (2015) and housing affordability indicators (2016).")

# Model 3 expands on Model 2 by including the Rural-Urban Classification (RUC11) as a factor variable, allowing us to account for geographic differences in average travel time to healthcare services. The model estimates the logarithm of average travel time to a healthcare service within a Lower Layer Super Output Area (LSOA) as a function of the logarithm of the income deprivation rate, the housing affordability indicator, and the RUC11 categories. We observe that a 1% increase in the income deprivation rate is associated with a 0.10% decrease in average travel time to healthcare services, when holding the other variables constant. This relationship is statistically significant at the 5% level, with a p-value less than 0.05, indicating that income deprivation continues to be an important factor in explaining variations in healthcare access, albeit with a smaller coefficient compared to Model 2. Similarly, a 1 unit increase in the housing affordability indicator, which reflects increased difficulty in entering owner-occupation or the private rental market, is associated with a 7% decrease in average travel time to healthcare services. This relationship is also statistically significant at the 5% level. The new variables introduced, representing the RUC11 categories, show significant variations in healthcare access across different geographic areas. For instance, compared to Urban city and town areas, Rural village and dispersed areas experience a 344% increase in average travel time to healthcare services, indicating that rural areas face substantial barriers in healthcare access. These relationships are statistically significant at the 5% level, with the exception of Urban city and town in a sparse setting category, which does not show a significant difference in healthcare access compared to Urban city and town areas. The adjusted coefficient of determination for this model is 0.537, suggesting that the model explains approximately 53.7% of the variation in average travel time to healthcare services. The inclusion of the RUC11 categories significantly enhances the explanatory power of the model compared to Model 2. The p-value associated with the F-statistic is also significant at the 5% level, indicating that at least one variable in the model has some explanatory power regarding average healthcare distance. In conclusion, Model 3 provides a more nuanced understanding of the factors affecting healthcare access, by accounting for both economic and geographic variables. The significant differences in healthcare access across the RUC11 categories highlight the importance of considering geographic disparities when analyzing healthcare access.
results <- stargazer(mod1, mod2, mod3, type="html",
          dep.var.labels=c("log(Mean Healthcare Service Distance [MHSD])"),
          covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]","Rural town and fringe [RUC 2011]","Rural town and fringe in a sparse setting [RUC 2011]","Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]", "Urban city and town in a sparse setting [RUC 2011]", "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"),
          notes = "Coefficients for HAS and RUC 2011 are exponentiated, reflecting a multiplicative effect on MHSD.",
          ci = TRUE)
# Checking assumptions: Model 1 
# Adding residual values to dataframe
complete_case_sample$resids1 <- residuals(mod1)

# Linearity (using partial regression plots)
PartialRegressionPlot (mod1, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR)")

# Constant variance of residuals 
HomoskedasticityPlot(mod1, add_loess_line = TRUE, title = "Model 1 Scale-Location")

# Durbin-Watson test
dw1 <- dwtest(mod1)

# Create a data frame with the Durbin-Watson test results
dw_df <- data.frame(
  Statistic = dw1$statistic,
  P = format.pval(dw1$p.value, digits = 16)
)

# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df, type = "html", title = "Durbin-Watson Test Results", 
          summary = FALSE, rownames = FALSE, out = 'dw_test_results.html')

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results.html')

# Normality of residuals check
QQPlotModel(mod1, subtitle = "QQ-Plot for Model 1")

# Zero conditional mean of residual values
mean(mod1$residuals)
resid_mod1 <- lm(resids1 ~ log_income_deprivation_rate_2015, data = complete_case_sample)
summary(resid_mod1)

# Zero conditional mean regression results
stargazer(resid_mod1, type="html",
          dep.var.labels=c("Residuals of Model 1"),
          covariate.labels=c("log(Income Deprivation Rate 2015)"), out = "residmod1.html")
rstudioapi::viewer("residmod1.html")

# Checking assumptions: Model 2
# Adding residual values to dataframe
complete_case_sample$resids2 <- residuals(mod2)

# Linearity (using partial regression plots)
PartialRegressionPlot (mod2, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot (mod2, 'housing_affordability_score_2016', x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values1 <- vif(mod2)

# storing VIF in a dataframe 
vif_df1 <- data.frame(Variable = names(vif_values1), VIF = vif_values1)
vif_df1$Variable[1] <- "log(Income Deprivation Rate 2015)"
vif_df1$Variable[2] <- "Housing Affordability Score 2016"

# creating a pretty table
stargazer(vif_df1, type = "html", title = "VIF Values", summary = FALSE, 
          rownames = FALSE, out = 'mod2vif.html')

# using rstudioapi to view the table
rstudioapi::viewer("mod2vif.html")

# Constant variance of residuals 
HomoskedasticityPlot(mod2, add_loess_line = TRUE, title = "Model 2 Scale-Location")

# Durbin-Watson test
dw2 <- dwtest(mod2)

# Create a data frame with the Durbin-Watson test results
dw_df2 <- data.frame(
  Statistic = dw2$statistic,
  P = ifelse(dw2$p.value > 0.05, '> 0.05',
             ifelse(dw2$p.value < 0.01, '< 0.01', '< 0.05'))
)

# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df2, type = "html", title = "Durbin-Watson Test Results", 
          summary = FALSE, rownames = FALSE, out = 'dw_test_results2.html')

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results2.html')

# Normality of residuals check
QQPlotModel(mod2, subtitle = "QQ-Plot for Model 2")

# Zero conditional mean of residual values
mean(mod2$residuals)
resid_mod2 <- lm(resids2 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016, data = complete_case_sample)
summary(resid_mod2)

# Zero conditional mean regression results
stargazer(resid_mod2, type="html",
          dep.var.labels=c("Residuals of Model 2"),
          covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]"), out = "residmod2.html")
rstudioapi::viewer("residmod2.html")

# Checking assumptions: Model 3 
# Adding residual values to dataframe
complete_case_sample$resids3 <- residuals(mod3)

# Linearity (using partial regression plots)
PartialRegressionPlot (mod3, 'log_income_deprivation_rate_2015', x_var_name = "X1", y_var_name = "Y", title = "Partial Regression: log(MHSD) | log(IDR) + Controls")
PartialRegressionPlot (mod3, 'housing_affordability_score_2016', x_var_name = "X2", y_var_name = "Y", title = "Partial Regression: log(MHSD) | HAS + Controls")

# Multicollinearity (using VIF)
vif_values2 <- data.frame(vif(mod3))
vif_values2$Variable <- c('log(Income Deprivation Rate 2015)', 'Housing Affordability Score 2016', 'Rural Urban Classification 2011')
vif_df2 = vif_values2[, c("Variable", "GVIF")] 

# creating a pretty table
stargazer(vif_df2, type = "html", title = "GVIF Values", summary = FALSE, 
          rownames = FALSE, out = 'mod3vif.html')

# using rstudioapi to view the table
rstudioapi::viewer("mod3vif.html")

# Constant variance of residuals plot
HomoskedasticityPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location")

# Plot indicating groups
HomoskedasticityDetailPlot(mod3, add_loess_line = TRUE, title = "Model 3 Scale-Location", dataframe = complete_case_sample, categorical_var_name = "rural_urban_class_2011")

# Testing for non-constant variance across groups:
leveneTest(complete_case_sample$resids3, complete_case_sample$rural_urban_class_2011, center=mean)

# df for levene's test results
levene_results <- data.frame(
  `Degrees of Freedom (Group)` = 7,
  `Degrees of Freedom (Residual)` = 32706,
  `F Value` = 189.04,
  `P Value` = '< 0.01'
)

names(levene_results) <- c("Degrees of Freedom (Group)", 
                           "Degrees of Freedom (Residual)", 
                           "F Value", 
                           "P Value")

# Viewing the table with descriptive column names
stargazer(levene_results, 
          type = "html", 
          title = "Levene's Test for Homogeneity of Variance", 
          out = "levene_test.html", 
          summary = FALSE, 
          rownames = FALSE)
rstudioapi::viewer('levene_test.html')



# Durbin-Watson test
dw3 <- dwtest(mod3)

# Create a data frame with the Durbin-Watson test results
dw_df3 <- data.frame(
  Statistic = dw3$statistic,
  P = ifelse(dw3$p.value > 0.05, '> 0.05',
             ifelse(dw3$p.value < 0.01, '< 0.01', '< 0.05'))
)

# Use stargazer to create an HTML table of the Durbin-Watson results
stargazer(dw_df3, type = "html", title = "Durbin-Watson Test Results", 
          summary = FALSE, rownames = FALSE, out = 'dw_test_results3.html')

# Use rstudioapi to view the table in RStudio Viewer
rstudioapi::viewer('dw_test_results3.html')

# When considering the independence of residuals, we might have reason to think that this assumption is not satisfied within the model, as it is likely that some degree of spatial autocorrelation could be influencing our results - that is, that LSOAs closer to each-other could theoretically have more similar values of X. This idea is supported by the results of the durbin-watson test statistics shown - notably, getting closer to 2 (representing less reduced autocorrelation) when including a variable that partially accounts for spatiality (the RUC2011 indicator). As such, we do see a considerable amount of positive autocorrelation in the data (according to a Durbin-Watson statistic of 0.83 being considerably below 2). This means that residuals that follow each other in the data tend to be slightly similar in magnitude. Considering the null hypothesis of no autocorrelation and considering the p-value being significant considering an alpha of 0.05, we fail to reject the null hypothesis that there is no autocorrelation presented by the variables within our model. Thus, we might not be confident that the independence of residuals assumption is satisfied.

# Normality of residuals check
QQPlotModel(mod3, subtitle = "QQ-Plot for Model 3")

# Zero conditional mean of residual values
mean(mod3$residuals)
resid_mod3 <- lm(resids3 ~ log_income_deprivation_rate_2015 + housing_affordability_score_2016 + rural_urban_class_2011, data = complete_case_sample)
summary(resid_mod3)

# Zero conditional mean regression results
stargazer(resid_mod3, type="html",
          dep.var.labels=c("Residuals of Model 3"),
          covariate.labels=c("log(Income Deprivation Rate 2015)","Housing Affordability Score 2016 [HAS]","Rural town and fringe [RUC 2011]","Rural town and fringe in a sparse setting [RUC 2011]","Rural village and dispersed [RUC 2011]", "Rural village and dispersed in a sparse setting [RUC 2011]", "Urban city and town in a sparse setting [RUC 2011]", "Urban major conurbation [RUC 2011]", "Urban minor conurbation [RUC 2011]"), out = "residmod3.html")
rstudioapi::viewer("residmod3.html")