Set Up

Generating hypothetical / pseudo data

#### Generating hypothetical / pseudo data ####

# Set seed for reproducibility
set.seed(123)

# Number of local authorities
n <- 374

# Generate the base dataset with six capital variables
data <- tibble(
  Local_Authority = 1:n,
  Physical_Capital = runif(n, 1, 100),
  Intangible_Capital = runif(n, 1, 100),
  Financial_Capital = runif(n, 1, 100),
  Human_Capital = runif(n, 1, 100),
  Social_Capital = runif(n, 1, 100),
  Institutional_Capital = runif(n, 1, 100)
)

partial_correlation <- function(x, y, corr) {
  noise_x <- runif(length(x), -abs(corr) / 2, abs(corr) / 2)
  noise_y <- runif(length(y), -abs(corr) / 2, abs(corr) / 2)
  return((x + noise_x) * (y + noise_y) / 100)
}

data$Human_Capital <- partial_correlation(data$Physical_Capital, data$Human_Capital, 0.5)
data$Social_Capital <- partial_correlation(data$Physical_Capital, data$Social_Capital, 0.5)
data$Institutional_Capital <- partial_correlation(data$Physical_Capital, data$Institutional_Capital, 0.5)
data$Intangible_Capital <- partial_correlation(data$Physical_Capital, data$Intangible_Capital, 0.5)
data$Financial_Capital <- partial_correlation(data$Physical_Capital, data$Financial_Capital, 0.5)

# Normalize the variables to the 1-100 range
normalize <- function(x) {
  return((x - min(x)) / (max(x) - min(x)) * 99 + 1)
}

data$Physical_Capital <- normalize(data$Physical_Capital)
data$Intangible_Capital <- normalize(data$Intangible_Capital)
data$Financial_Capital <- normalize(data$Financial_Capital)
data$Human_Capital <- normalize(data$Human_Capital)
data$Social_Capital <- normalize(data$Social_Capital)
data$Institutional_Capital <- normalize(data$Institutional_Capital)

Adding Productivity variable

# Add the Productivity variable with a normal distribution
data$Productivity <- rnorm(n, mean = 50, sd = 15)

# Ensure each capital is weakly positively correlated with productivity
data$Productivity <- data$Productivity +
  0.1 * data$Physical_Capital +
  0.1 * data$Intangible_Capital +
  0.1 * data$Financial_Capital +
  0.1 * data$Human_Capital +
  0.1 * data$Social_Capital +
  0.1 * data$Institutional_Capital

# Normalize the Productivity variable to maintain the desired distribution
data$Productivity <- normalize(data$Productivity)

# Print the generated dataset
print(data)
## # A tibble: 374 × 8
##    Local_Authority Physical_Capital Intangible_Capital Financial_Capital
##              <int>            <dbl>              <dbl>             <dbl>
##  1               1            29.4               17.2              22.2 
##  2               2            79.1               79.5              78.5 
##  3               3            41.5               34.0              13.0 
##  4               4            88.5               20.7              24.1 
##  5               5            94.2               31.6              17.4 
##  6               6             5.45               6.22              1.36
##  7               7            53.3               33.2              31.8 
##  8               8            89.4               71.1              46.6 
##  9               9            55.6               22.8              29.7 
## 10              10            46.2               37.8              43.5 
## # ℹ 364 more rows
## # ℹ 4 more variables: Human_Capital <dbl>, Social_Capital <dbl>,
## #   Institutional_Capital <dbl>, Productivity <dbl>

Descriptive / Correlation Analysis

#### Descriptive analysis ####

#### Correlation Work ####

# Calculate univariate correlations between productivity and each of the capitals
correlations <- cor(data[, c("Productivity", "Physical_Capital", "Intangible_Capital", "Financial_Capital", "Human_Capital", "Social_Capital", "Institutional_Capital")])[1, -1]

# Create a tibble with the correlation results
correlation_data <- tibble(
  Capital = names(correlations),
  Correlation = correlations
)

# Visualize the correlations using a bar plot
ggplot(correlation_data, aes(x = Capital, y = Correlation, fill = Capital)) +
  geom_col(alpha = 0.9) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Correlations between Productivity and Capitals",
    subtitle = "Univariate correlations",
    x = "Capital",
    y = "Correlation"
  ) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "black") +
  scale_fill_brewer(palette = "Spectral")

# Create a table displaying the correlation coefficients
correlation_table <- kable(correlation_data, digits = 3, col.names = c("Capital", "Correlation Coefficient"), format = "markdown")
correlation_table
Capital Correlation Coefficient
Physical_Capital 0.615
Intangible_Capital 0.420
Financial_Capital 0.488
Human_Capital 0.453
Social_Capital 0.449
Institutional_Capital 0.423
# Scatterplot of Productivity vs. Physical_Capital
scatter_physical <- ggplot(data, aes(x = Physical_Capital, y = Productivity)) +
  geom_point(color = "navy", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    x = "Physical Capital",
    y = "Productivity"
  ) +
  theme_minimal()

# Scatterplot of Productivity vs. Intangible_Capital
scatter_intangible <- ggplot(data, aes(x = Intangible_Capital, y = Productivity)) +
  geom_point(color = "navy", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    x = "Intangible Capital",
    y = "Productivity"
  ) +
  theme_minimal()

# Scatterplot of Productivity vs. Financial_Capital
scatter_financial <- ggplot(data, aes(x = Financial_Capital, y = Productivity)) +
  geom_point(color = "navy", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    x = "Financial Capital",
    y = "Productivity"
  ) +
  theme_minimal()

# Scatterplot of Productivity vs. Human_Capital
scatter_human <- ggplot(data, aes(x = Human_Capital, y = Productivity)) +
  geom_point(color = "navy", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    x = "Human Capital",
    y = "Productivity"
  ) +
  theme_minimal()

# Scatterplot of Productivity vs. Social_Capital
scatter_social <- ggplot(data, aes(x = Social_Capital, y = Productivity)) +
  geom_point(color = "navy", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    x = "Social Capital",
    y = "Productivity"
  ) +
  theme_minimal()

# Scatterplot of Productivity vs. Institutional_Capital
scatter_institutional <- ggplot(data, aes(x = Institutional_Capital, y = Productivity)) +
  geom_point(color = "navy", alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    x = "Institutional Capital",
    y = "Productivity"
  ) +
  theme_minimal()

# Combine the scatterplots into a grid
combined_plots <- ggarrange(
  scatter_physical, scatter_intangible, scatter_financial,
  scatter_human, scatter_social, scatter_institutional,
  nrow = 2, ncol = 3
)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
# Display the combined plot
print(combined_plots)

Mapping the Six Capitals

#### Mapping Capitals ####

## Reading in shapefile of county and unitary authority boundaries (ONS)
shapefile <- read_sf(
  "/Users/Tom/downloads/LAD Shapefiles/LAD_DEC_2022_UK_BGC.shp"
) %>%
  sf::st_transform("+proj=longlat +datum=WGS84")


# simplifying shapefile data to reduce size
shapefile_simplified <-
  rmapshaper::ms_simplify(shapefile)

# see reduction in file size
object.size(shapefile_simplified)
## 993144 bytes
object.size(shapefile)
## 10243008 bytes
# subsetting to keep only useful geometry and ID info
shapefile_simplified <- shapefile_simplified %>%
  select(OBJECTID, LAD22CD, LAD22NM, geometry)

# Add a new column and assign values from 1 to n
n <- nrow(shapefile_simplified)
shapefile_simplified$Local_Authority <- 1:n

# Check the updated shapefile
print(shapefile_simplified)
## Simple feature collection with 374 features and 4 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -8.611241 ymin: 49.88571 xmax: 1.763207 ymax: 60.84568
## Geodetic CRS:  +proj=longlat +datum=WGS84
## # A tibble: 374 × 5
##    OBJECTID LAD22CD   LAD22NM                           geometry Local_Authority
##  *    <int> <chr>     <chr>                       <GEOMETRY [°]>           <int>
##  1        1 E06000001 Hartlepool       POLYGON ((-1.240951 54.7…               1
##  2        2 E06000002 Middlesbrough    POLYGON ((-1.201013 54.5…               2
##  3        3 E06000003 Redcar and Clev… POLYGON ((-0.7933648 54.…               3
##  4        4 E06000004 Stockton-on-Tees MULTIPOLYGON (((-1.22492…               4
##  5        5 E06000005 Darlington       POLYGON ((-1.438344 54.5…               5
##  6        6 E06000006 Halton           MULTIPOLYGON (((-2.59522…               6
##  7        7 E06000007 Warrington       POLYGON ((-2.449379 53.4…               7
##  8        8 E06000008 Blackburn with … POLYGON ((-2.465809 53.7…               8
##  9        9 E06000009 Blackpool        POLYGON ((-3.010658 53.8…               9
## 10       10 E06000010 Kingston upon H… MULTIPOLYGON (((-0.41920…              10
## # ℹ 364 more rows
# joining polygons to subnational data by a common id
joined <- left_join(shapefile_simplified,
  data,
  by = "Local_Authority"
)
head(joined)
## Simple feature collection with 6 features and 11 fields
## Geometry type: GEOMETRY
## Dimension:     XY
## Bounding box:  xmin: -2.826656 ymin: 53.30579 xmax: -0.7933648 ymax: 54.72716
## Geodetic CRS:  +proj=longlat +datum=WGS84
## # A tibble: 6 × 12
##   OBJECTID LAD22CD   LAD22NM                            geometry Local_Authority
##      <int> <chr>     <chr>                        <GEOMETRY [°]>           <int>
## 1        1 E06000001 Hartlepool        POLYGON ((-1.240951 54.7…               1
## 2        2 E06000002 Middlesbrough     POLYGON ((-1.201013 54.5…               2
## 3        3 E06000003 Redcar and Cleve… POLYGON ((-0.7933648 54.…               3
## 4        4 E06000004 Stockton-on-Tees  MULTIPOLYGON (((-1.22492…               4
## 5        5 E06000005 Darlington        POLYGON ((-1.438344 54.5…               5
## 6        6 E06000006 Halton            MULTIPOLYGON (((-2.59522…               6
## # ℹ 7 more variables: Physical_Capital <dbl>, Intangible_Capital <dbl>,
## #   Financial_Capital <dbl>, Human_Capital <dbl>, Social_Capital <dbl>,
## #   Institutional_Capital <dbl>, Productivity <dbl>
# Define the color palette using the Spectral palette from RColorBrewer
palette <- colorNumeric(palette = "Spectral", domain = joined$Physical_Capital)

# Create the Leaflet map
leaflet() %>%
  addTiles() %>%
  addLayersControl(
    baseGroups = "Base Map",
    overlayGroups = c(
      "Physical Capital",
      "Intangible Capital",
      "Financial Capital",
      "Human Capital",
      "Social Capital",
      "Institutional Capital"
    ),
    options = layersControlOptions(position = "topright")
  ) %>%
  addLegend(
    pal = palette,
    values = joined$Physical_Capital,
    title = "Physical Capital",
    position = "bottomright",
    opacity = 0.7
  ) %>%
  addPolygons(
    data = joined,
    fillColor = ~ palette(Physical_Capital),
    weight = 0.05,
    opacity = 0.7,
    color = "white",
    dashArray = "2",
    smoothFactor = 0.2,
    fillOpacity = 0.6,
    highlightOptions = highlightOptions(
      weight = 1,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~LAD22NM,
    group = "Physical Capital"
  ) %>%
  addPolygons(
    data = joined,
    fillColor = ~ palette(Intangible_Capital),
    weight = 0.05,
    opacity = 0.7,
    color = "white",
    dashArray = "2",
    smoothFactor = 0.2,
    fillOpacity = 0.6,
    highlightOptions = highlightOptions(
      weight = 1,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~LAD22NM,
    group = "Intangible Capital"
  ) %>%
  addPolygons(
    data = joined,
    fillColor = ~ palette(Financial_Capital),
    weight = 0.05,
    opacity = 0.7,
    color = "white",
    dashArray = "2",
    smoothFactor = 0.2,
    fillOpacity = 0.6,
    highlightOptions = highlightOptions(
      weight = 1,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~LAD22NM,
    group = "Financial Capital"
  ) %>%
  addPolygons(
    data = joined,
    fillColor = ~ palette(Human_Capital),
    weight = 0.05,
    opacity = 0.7,
    color = "white",
    dashArray = "2",
    smoothFactor = 0.2,
    fillOpacity = 0.6,
    highlightOptions = highlightOptions(
      weight = 1,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~LAD22NM,
    group = "Human Capital"
  ) %>%
  addPolygons(
    data = joined,
    fillColor = ~ palette(Social_Capital),
    weight = 0.05,
    opacity = 0.7,
    color = "white",
    dashArray = "2",
    smoothFactor = 0.2,
    fillOpacity = 0.6,
    highlightOptions = highlightOptions(
      weight = 1,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~LAD22NM,
    group = "Social Capital"
  ) %>%
  addPolygons(
    data = joined,
    fillColor = ~ palette(Institutional_Capital),
    opacity = 0.7,
    weight = 0.05,
    color = "white",
    dashArray = "2",
    smoothFactor = 0.2,
    fillOpacity = 0.6,
    highlightOptions = highlightOptions(
      weight = 1,
      color = "#666",
      dashArray = "",
      fillOpacity = 0.7,
      bringToFront = TRUE
    ),
    label = ~LAD22NM,
    group = "Institutional Capital"
  )

Cluster Analysis

#### Cluster Analysis ####

# K means clustering requires complete data, therefore removing NAs

cluster_data <- data %>%
  mutate_at(c(2:8), funs(c(as.numeric(.)))) # converting data into numeric format
## Warning: `funs()` was deprecated in dplyr 0.8.0.
## ℹ Please use a list of either functions or lambdas:
## 
## # Simple named list: list(mean = mean, median = median)
## 
## # Auto named with `tibble::lst()`: tibble::lst(mean, median)
## 
## # Using lambdas list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
cluster_data <- cluster_data[complete.cases(cluster_data), ]

# Standardize the data for clustering
standardize <- function(x) {
  return((x - mean(x)) / sd(x))
}

standardized_data <- cluster_data %>%
  select(-Local_Authority, -Productivity) %>%
  mutate_all(standardize)

# Determine the optimal number of clusters using the elbow method
wss <- purrr::map_dbl(1:10, function(k) {
  kmeans(standardized_data, centers = k, nstart = 25)$tot.withinss
})

# Visualize the elbow plot
ggplot(tibble(K = 1:10, WSS = wss), aes(x = K, y = WSS)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Elbow Method",
    x = "Number of clusters (K)",
    y = "Total within-cluster sum of squares"
  ) +
  theme_minimal()

# Perform k-means clustering with the optimal number of clusters
optimal_k <- 4 # Based on the elbow plot, adjust as necessary
clusters <- kmeans(standardized_data, centers = optimal_k, nstart = 25)

# Add the cluster assignment to the original data
data$Cluster <- factor(clusters$cluster)

# Visualize the clusters
clusplot(standardized_data,
  clus = clusters$cluster,
  labels = 0,
  stand = TRUE,
  main = "Clusters of Local Authorities Based on Capitals",
  sub = "Standardized Variables",
  col.p = clusters$cluster
)

# K means with 2 clusters
k_4 <- kmeans(standardized_data[, c(1:6)], centers = 4, nstart = 25)
str(k_4)
## List of 9
##  $ cluster     : int [1:374] 1 4 2 2 3 1 2 4 2 2 ...
##  $ centers     : num [1:4, 1:6] -1.022 0.169 1.136 1.132 -0.715 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:4] "1" "2" "3" "4"
##   .. ..$ : chr [1:6] "Physical_Capital" "Intangible_Capital" "Financial_Capital" "Human_Capital" ...
##  $ totss       : num 2238
##  $ withinss    : num [1:4] 132 320 239 320
##  $ tot.withinss: num 1011
##  $ betweenss   : num 1227
##  $ size        : int [1:4] 143 120 50 61
##  $ iter        : int 4
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# Visualising the two clusters in two dimension space
fviz_cluster(k_4, data = standardized_data[, c(1:6)])

Descriptive Statistics on Clusters

#### Descriptive Stats on Clusters ####

# Calculate the average levels of each capital for each cluster
cluster_averages <- data %>%
  select(-Local_Authority, -Productivity) %>%
  group_by(Cluster) %>%
  summarise_all(mean) %>%
  pivot_longer(-Cluster, names_to = "Capital", values_to = "Average_Value")

# Visualize the average levels of each capital for each cluster using a grouped bar plot
bar_plot <- ggplot(cluster_averages, aes(x = Capital, y = Average_Value, fill = Cluster)) +
  geom_col(position = "dodge") +
  labs(
    title = "Average Levels of Capitals by Cluster",
    x = "Capital",
    y = "Average Value"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
  scale_fill_manual(values = wes_palette("GrandBudapest2", n = 4, type = "continuous"))

bar_plot

ggplotly(bar_plot)
# Create a table displaying the average levels of each capital for each cluster
cluster_averages_table <- data %>%
  select(-Local_Authority, -Productivity) %>%
  group_by(Cluster) %>%
  summarise_all(mean)

# Print the table using kable function
cluster_averages_table
## # A tibble: 4 × 7
##   Cluster Physical_Capital Intangible_Capital Financial_Capital Human_Capital
##   <fct>              <dbl>              <dbl>             <dbl>         <dbl>
## 1 1                   55.0               31.2              26.9          25.2
## 2 2                   82.3               51.4              45.3          35.6
## 3 3                   82.8               32.0              48.8          55.6
## 4 4                   21.7               11.1              10.8          11.3
## # ℹ 2 more variables: Social_Capital <dbl>, Institutional_Capital <dbl>
knitr::kable(cluster_averages_table, digits = 2, format = "markdown", col.names = c("Cluster", colnames(data)[3:8]))
Cluster Intangible_Capital Financial_Capital Human_Capital Social_Capital Institutional_Capital Productivity
1 55.04 31.17 26.86 25.24 30.66 29.03
2 82.31 51.41 45.34 35.65 29.68 64.76
3 82.78 32.01 48.77 55.65 64.78 27.22
4 21.73 11.08 10.84 11.34 11.95 11.56

Radar plots of clusters

#### Radar plots of clusters ####

# Prepare the data for the radar plot
radar_data <- cluster_averages %>%
  spread(key = "Capital", value = "Average_Value") %>%
  mutate(Cluster = paste0("Cluster ", Cluster))

# Radar plot of cluster characteristics
ggradar_plot <- ggradar(
  radar_data,
  axis.label.size = 3,
  grid.label.size = 5,
  base.size = 1,
  values.radar = c("0", "0", "100"),
  grid.min = 0,
  grid.mid = 0,
  grid.max = 100,
  # Polygons
  group.line.width = 1,
  group.point.size = 3,
  group.colours = c(
    "lightblue",
    "turquoise",
    "blue",
    "navy"
  ),
  background.circle.colour = "white",
  gridline.mid.colour = "grey",
  legend.position = "bottom",
  legend.title = "Cluster",
  plot.title = "Cluster Radar Plot",
  fill = T,
  fill.alpha = 0.3
) +
  theme_minimal()

ggradar_plot

# Interactive plot
ggplotly(ggradar_plot)

Productivity decomposition analysis

#### Productivity Decomposition Analysis ####

#### Multivariate regression of productivity on capitals ####

# Fit a multiple linear regression model
model <- lm(Productivity ~ Physical_Capital + Intangible_Capital + Financial_Capital + Human_Capital + Social_Capital + Institutional_Capital, data = data)

# View the model summary
summary(model)
## 
## Call:
## lm(formula = Productivity ~ Physical_Capital + Intangible_Capital + 
##     Financial_Capital + Human_Capital + Social_Capital + Institutional_Capital, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.593  -7.798  -0.514   8.557  37.116 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           35.08919    1.40479  24.978  < 2e-16 ***
## Physical_Capital       0.11324    0.05442   2.081 0.038145 *  
## Intangible_Capital     0.08381    0.03964   2.114 0.035171 *  
## Financial_Capital      0.14691    0.04144   3.546 0.000442 ***
## Human_Capital          0.10558    0.04155   2.541 0.011463 *  
## Social_Capital         0.11072    0.03985   2.778 0.005747 ** 
## Institutional_Capital  0.08756    0.03635   2.409 0.016501 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13 on 367 degrees of freedom
## Multiple R-squared:  0.4257, Adjusted R-squared:  0.4163 
## F-statistic: 45.34 on 6 and 367 DF,  p-value: < 2.2e-16
# Extract the regression coefficients
coefficients <- coef(model)

# Display the coefficients
cat("Regression Coefficients:\n")
## Regression Coefficients:
print(coefficients)
##           (Intercept)      Physical_Capital    Intangible_Capital 
##           35.08918994            0.11324093            0.08381472 
##     Financial_Capital         Human_Capital        Social_Capital 
##            0.14691441            0.10557513            0.11072204 
## Institutional_Capital 
##            0.08755938
# Create diagnostic plots
par(mfrow = c(2, 2))
cat("Diagnostic Plots:\n")
## Diagnostic Plots:
plot(model)

stargazer(model, type = "text")
## 
## =================================================
##                           Dependent variable:    
##                       ---------------------------
##                              Productivity        
## -------------------------------------------------
## Physical_Capital                0.113**          
##                                 (0.054)          
##                                                  
## Intangible_Capital              0.084**          
##                                 (0.040)          
##                                                  
## Financial_Capital              0.147***          
##                                 (0.041)          
##                                                  
## Human_Capital                   0.106**          
##                                 (0.042)          
##                                                  
## Social_Capital                 0.111***          
##                                 (0.040)          
##                                                  
## Institutional_Capital           0.088**          
##                                 (0.036)          
##                                                  
## Constant                       35.089***         
##                                 (1.405)          
##                                                  
## -------------------------------------------------
## Observations                      374            
## R2                               0.426           
## Adjusted R2                      0.416           
## Residual Std. Error        13.000 (df = 367)     
## F Statistic             45.344*** (df = 6; 367)  
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01
# Fit univariate regression models
model_physical <- lm(Productivity ~ Physical_Capital, data = data)
model_intangible <- lm(Productivity ~ Intangible_Capital, data = data)
model_financial <- lm(Productivity ~ Financial_Capital, data = data)
model_human <- lm(Productivity ~ Human_Capital, data = data)
model_social <- lm(Productivity ~ Social_Capital, data = data)
model_institutional <- lm(Productivity ~ Institutional_Capital, data = data)

# Fit the multivariate regression model
model_all <- lm(Productivity ~ Physical_Capital + Intangible_Capital + Financial_Capital + Human_Capital + Social_Capital + Institutional_Capital, data = data)
summary(model_all)
## 
## Call:
## lm(formula = Productivity ~ Physical_Capital + Intangible_Capital + 
##     Financial_Capital + Human_Capital + Social_Capital + Institutional_Capital, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.593  -7.798  -0.514   8.557  37.116 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           35.08919    1.40479  24.978  < 2e-16 ***
## Physical_Capital       0.11324    0.05442   2.081 0.038145 *  
## Intangible_Capital     0.08381    0.03964   2.114 0.035171 *  
## Financial_Capital      0.14691    0.04144   3.546 0.000442 ***
## Human_Capital          0.10558    0.04155   2.541 0.011463 *  
## Social_Capital         0.11072    0.03985   2.778 0.005747 ** 
## Institutional_Capital  0.08756    0.03635   2.409 0.016501 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13 on 367 degrees of freedom
## Multiple R-squared:  0.4257, Adjusted R-squared:  0.4163 
## F-statistic: 45.34 on 6 and 367 DF,  p-value: < 2.2e-16
# Create a narrower regression table using stargazer
#stargazer(model_physical, model_intangible, model_financial, model_human, model_social, model_institutional, model_all,
#  title = "Regression Models of Productivity and Capitals",
#  align = TRUE,
#  covariate.labels = c("Physical", "Intangible", "Financial", "Human", "Social", "Institutional", "Constant"),
#  column.sep.width = "5pt",
#  keep.stat = c("rsq", "n"),
#  type = "text",
#  digits = 3
#)
#### Plotting fitted values ####

# Save the fitted values from the regression model
fitted_values <- fitted(model_all)
head(fitted_values)
##        1        2        3        4        5        6 
## 49.91620 73.47643 49.62179 59.07952 73.10600 38.10492
# Create a data frame for plotting
data$fitted_productivity <- fitted_values

# List of local authorities to label (e.g., London, Manchester,...)
local_authorities <- c(
  "1", "2", "3", "5", "250", "300", "294", "294",
  "253", "213", "188", "75"
)

# Fit the linear regression model
model_lm <- lm(Productivity ~ fitted_productivity, data = data)
slope <- coef(model_lm)[2]
r2 <- summary(model_lm)$r.squared

# Create scatter plot of fitted vs. actual values
fitted_plot <- ggplot(data, aes(x = fitted_productivity, y = Productivity, label = Local_Authority, color = (Productivity - fitted_productivity))) +
  geom_point(alpha = 0.7) +
  labs(
    title = "Scatter Plot of Productivity vs. Fitted Values",
    x = "Fitted Values",
    y = "Productivity (GVA)"
  ) +
  theme_minimal() +
  geom_smooth(method = "lm", se = TRUE, color = "navy") +
  scale_color_gradientn(colors = brewer.pal(11, "Spectral"), name = "Unexplained Productivity") +
  theme(legend.position = "top") +
  geom_label_repel(
    data = subset(data, Local_Authority %in% local_authorities),
    nudge_y = 0.5,
    color = "darkgrey",
    alpha = 0.7
  ) +
  geom_text(
    x = max(data$fitted_productivity),
    y = max(data$Productivity),
    label = paste("Slope:", round(slope, 4), "| R-sqrd:", round(r2, 2)),
    hjust = 1,
    vjust = 30,
    color = "darkgrey",
    size = 3
  )

# Add a line of best fit to the scatter plot
fitted_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?

ggplotly(fitted_plot)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: The following aesthetics were dropped during statistical transformation: label
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
##   the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
##   variable into a factor?
## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomLabelRepel() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues
#### Contribution Plot ####

# Identify example local authorities
low_productivity_example <- data %>% filter(Productivity == min(Productivity)) # e.g. Manchester
high_productivity_example <- data %>% filter(Productivity == max(Productivity)) # e.g. London

# Function to calculate the fitted values for each combination of capitals
calculate_fitted_values <- function(row, coefficients) {
  intercept <- coefficients[1]
  fitted_value <- intercept
  fitted_values <- c(fitted_value)

  for (i in 2:length(coefficients)) {
    fitted_value <- fitted_value + row[i - 1] * coefficients[i]
    fitted_values <- c(fitted_values, fitted_value)
  }

  return(fitted_values)
}

# Calculate the fitted values for low and high productivity examples
low_fitted_values <- calculate_fitted_values(low_productivity_example[, 2:7], coefficients) # e.g. Manchester
high_fitted_values <- calculate_fitted_values(high_productivity_example[, 2:7], coefficients) # e.g. London

# Calculate the contribution of each capital to the difference in productivity
contributions <- coefficients[-1] * (high_productivity_example[2:7] - low_productivity_example[2:7])

contributions_percent <- ((coefficients[-1] * (high_productivity_example[2:7] - low_productivity_example[2:7])) / (high_productivity_example[2:7] - low_productivity_example[2:7])) * 100

# Create a data frame for the contribution chart
contribution_data <- data.frame(
  Capital = colnames(data)[2:7],
  Contribution = as.numeric(contributions)
)

# Create the waterfall chart
contribution_chart <- ggplot(contribution_data, aes(x = Capital, y = Contribution)) +
  geom_bar(stat = "identity", position = "dodge", color = "navy", fill = "navy", alpha = 0.6) +
  labs(
    title = "Waterfall Chart of Productivity Differences",
    x = "Capital",
    y = "Contribution"
  ) +
  coord_flip() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  theme_minimal()

# Display the contribution chart
contribution_chart

# The waterfall chart shows the contribution of each capital to the
# difference in productivity between the low and high productivity examples.
# Each bar represents a capital, and the height of the bar indicates the
# contribution of that capital to the productivity difference. e.g. 10 = 10 productivity points


# Create a data frame for the contribution percent chart
contribution_percent_data <- data.frame(
  Capital = colnames(data)[2:7],
  Contribution = as.numeric(contributions_percent)
)

# Create the waterfall chart
contribution_percent_chart <- ggplot(contribution_percent_data, aes(x = Capital, y = Contribution)) +
  geom_bar(stat = "identity", position = "dodge", color = "navy", fill = "navy", alpha = 0.6) +
  labs(
    title = "Waterfall Chart of Productivity Differences",
    x = "Capital",
    y = "Contribution %"
  ) +
  coord_flip() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  ) +
  theme_minimal()

# Display the contribution percent chart
contribution_percent_chart

The waterfall chart shows the contribution of each capital to the difference in productivity between the low and high productivity examples. Each bar represents a capital, and the height of the bar indicates the contribution of that capital to the productivity difference.

#### 'Bridging the Gap'  Chart ####

# Calculate the contribution of each capital to the productivity difference
contributions <- coefficients[-1] * (high_productivity_example[2:7] - low_productivity_example[2:7])

# Calculate the fitted productivity for the low example
fitted_low <- sum(low_productivity_example[3:8] * coefficients[-1]) + coefficients[1]
fitted_low
## (Intercept) 
##    35.87074
# Calculate the fitted productivity for the high example
fitted_high <- sum(high_productivity_example[2:7] * coefficients[-1]) + coefficients[1]
fitted_high
## (Intercept) 
##    78.22226
# Create a data frame with the low productivity example
rf_chart_data <- data.frame(
  Category = "Low Productivity Example",
  Value = low_productivity_example$Productivity
)

# Add a row for the fitted productivity to the data frame
rf_chart_data <- rbind(rf_chart_data, data.frame(
  Category = "Fitted Low",
  Value = fitted_low
))
rf_chart_data
##                             Category    Value
## 1           Low Productivity Example  1.00000
## (Intercept)               Fitted Low 35.87074
# Add rows for the capital contributions to the data frame, cumulatively
cumulative_value <- fitted_low
for (i in 1:length(contributions)) {
  cumulative_value <- cumulative_value + contributions[i]
  row_label <- paste0("+", names(contributions)[i])
  new_row <- data.frame(
    Category = row_label,
    Value = cumulative_value
  )
  if (i == 1) {
    colnames(rf_chart_data) <- colnames(new_row)
  }
  rf_chart_data <- rbind(rf_chart_data, new_row)
}

# Add rows for the fitted high productivity and high productivity example
new_rows <- data.frame(
  Category = c("Fitted High Productivity Example", "High Productivity Example"),
  Value = c(fitted_high, high_productivity_example$Productivity)
)
colnames(new_rows) <- colnames(rf_chart_data)
rf_chart_data <- rbind(rf_chart_data, new_rows)

# Set the row names to NULL
rownames(rf_chart_data) <- NULL

# # Rename the "Value" column to "value"
# rf_chart_data <- rf_chart_data %>% rename(value = Physical_Capital)
# Rename an existing column
colnames(rf_chart_data)[colnames(rf_chart_data) == "Physical_Capital"] <- "value"

# Print the modified data frame
rf_chart_data
##                            Category     value
## 1          Low Productivity Example   1.00000
## 2                        Fitted Low  35.87074
## 3                 +Physical_Capital  45.74628
## 4               +Intangible_Capital  46.99035
## 5                +Financial_Capital  56.62055
## 6                    +Human_Capital  63.49349
## 7                   +Social_Capital  72.08545
## 8            +Institutional_Capital  78.20670
## 9  Fitted High Productivity Example  78.22226
## 10        High Productivity Example 100.00000
## Plotting chart

# Create a new column for the change in value
rf_chart_data <- mutate(rf_chart_data, Change = c(value[1], diff(value)))

# Define the order of the categories
category_order <- c(
  "Low Productivity Example", "Fitted Low",
  "+Physical_Capital", "+Intangible_Capital",
  "+Financial_Capital", "+Human_Capital",
  "+Social_Capital", "+Institutional_Capital",
  "Fitted High Productivity Example", "High Productivity Example"
)

# Arrange the levels of the "Category" factor
rf_chart_data$Category <- factor(rf_chart_data$Category, levels = category_order)

# Define the number of levels in the Category variable
num_levels <- length(levels(rf_chart_data$Category))

# Create the stacked bar chart with marginal contributions and a reversed spectral color palette
stacked_bar_chart <- ggplot(data = rf_chart_data, aes(x = Category, y = value)) +
  geom_bar(
    stat = "identity", color = "grey",
    position = position_stack(vjust = 0.5),
    fill = brewer.pal(num_levels, "Spectral")
  ) +
  labs(title = "Stacked Bar Chart of Productivity Contributions", y = "Productivity", x = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 10, angle = 0, hjust = 1)) +
  coord_flip()

# Display the stacked bar chart
stacked_bar_chart

ggplotly(stacked_bar_chart)
#### Waterfall plot ####

# Create the stacked bar chart with marginal contributions and a reversed spectral color palette

# Define a custom color palette similar to "Spectral"
color_palette <- brewer.pal(11, "Spectral")

# Create the stacked bar chart with marginal contributions
stacked_bar_chart <- waterfall(
  values = round(rf_chart_data$Change, 2), labels = rf_chart_data$Category,
  fill_by_sign = FALSE, fill_colours = color_palette[1:length(rf_chart_data$Change)],
  calc_total = TRUE, total_rect_color = "navy", total_rect_text_color = "white", draw_lines = F
)

# Customize the chart labels and appearance
stacked_bar_chart <- stacked_bar_chart +
  labs(title = "Waterfall Chart of Productivity Contributions", y = "Productivity", x = NULL) +
  scale_x_discrete(labels = rf_chart_data$Category) +
  theme_minimal() +
  theme(axis.text.x = element_text(size = 10, angle = 45, hjust = 1))
## Scale for x is already present.
## Adding another scale for x, which will replace the existing scale.
# Display the stacked bar chart
stacked_bar_chart

Interactions Analysis

#### Interactions analysis ####

# Exclude Local_Authority and Cluster variables and their interactions
exclude_vars <- c("Local_Authority", "Cluster")
interaction_vars <- combn(colnames(data)[2:7], 2, paste, collapse = "*")
exclude_formula <- paste(paste0("(", exclude_vars, ")"), collapse = "+")
interaction_formula <- paste0(interaction_vars, collapse = "+")

# Construct the formula excluding the specified variables and their interactions
formula_excluded <- as.formula(paste("Productivity ~ . -", exclude_formula, "-", interaction_formula))

# Fit the model with the updated formula
model_excluded <- lm(formula_excluded, data = data)

# Perform the ANOVA
anova_results <- anova(model_excluded)

# Remove the terms corresponding to Local_Authority and Cluster from the ANOVA table
exclude_terms <- paste0("(", exclude_vars, ")")
anova_results <- anova_results[!rownames(anova_results) %in% exclude_terms, ]

# Print the updated ANOVA table
print(anova_results)
## Analysis of Variance Table
## 
## Response: Productivity
##                                           Df Sum Sq Mean Sq  F value    Pr(>F)
## Financial_Capital                          1  25710 25710.5 153.6892 < 2.2e-16
## Human_Capital                              1   8783  8782.7  52.5003 2.766e-12
## Social_Capital                             1   4774  4773.8  28.5362 1.660e-07
## Institutional_Capital                      1   3852  3851.6  23.0236 2.373e-06
## Cluster                                    3   1195   398.4   2.3817  0.069293
## fitted_productivity                        1   1723  1722.7  10.2979  0.001455
## Physical_Capital                           1      1     0.6   0.0034  0.953198
## Physical_Capital:Financial_Capital         1    202   202.2   1.2084  0.272404
## Physical_Capital:Human_Capital             1    135   134.9   0.8066  0.369745
## Physical_Capital:Social_Capital            1     59    59.1   0.3531  0.552727
## Physical_Capital:Institutional_Capital     1     50    49.9   0.2985  0.585179
## Intangible_Capital:Financial_Capital       1      6     5.8   0.0346  0.852609
## Intangible_Capital:Human_Capital           1    806   806.0   4.8181  0.028819
## Intangible_Capital:Social_Capital          1      1     0.8   0.0047  0.945410
## Intangible_Capital:Institutional_Capital   1      0     0.2   0.0010  0.974212
## Financial_Capital:Human_Capital            1    308   307.6   1.8387  0.175973
## Financial_Capital:Social_Capital           1     39    39.0   0.2332  0.629469
## Financial_Capital:Institutional_Capital    1     36    36.5   0.2182  0.640741
## Human_Capital:Social_Capital               1   1435  1435.2   8.5792  0.003623
## Human_Capital:Institutional_Capital        1    242   241.8   1.4452  0.230119
## Social_Capital:Institutional_Capital       1     93    92.6   0.5533  0.457466
## Residuals                                350  58551   167.3                   
##                                             
## Financial_Capital                        ***
## Human_Capital                            ***
## Social_Capital                           ***
## Institutional_Capital                    ***
## Cluster                                  .  
## fitted_productivity                      ** 
## Physical_Capital                            
## Physical_Capital:Financial_Capital          
## Physical_Capital:Human_Capital              
## Physical_Capital:Social_Capital             
## Physical_Capital:Institutional_Capital      
## Intangible_Capital:Financial_Capital        
## Intangible_Capital:Human_Capital         *  
## Intangible_Capital:Social_Capital           
## Intangible_Capital:Institutional_Capital    
## Financial_Capital:Human_Capital             
## Financial_Capital:Social_Capital            
## Financial_Capital:Institutional_Capital     
## Human_Capital:Social_Capital             ** 
## Human_Capital:Institutional_Capital         
## Social_Capital:Institutional_Capital        
## Residuals                                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Create a formula with all the interaction terms
interaction_formula <- reformulate(interaction_vars, response = "Productivity")

# Fit the model with all the interaction terms
model_interactions <- lm(interaction_formula, data = data)

# Print the model summary
summary(model_interactions)
## 
## Call:
## lm(formula = interaction_formula, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.235  -7.512  -0.100   7.739  35.886 
## 
## Coefficients:
##                                            Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              33.1277141  2.2328722  14.836  < 2e-16
## Physical_Capital                         -0.0572301  0.1402649  -0.408  0.68351
## Intangible_Capital                        0.2190000  0.1432262   1.529  0.12715
## Financial_Capital                         0.3465964  0.1519348   2.281  0.02313
## Human_Capital                             0.3582178  0.1526474   2.347  0.01949
## Social_Capital                            0.1162259  0.1467823   0.792  0.42900
## Institutional_Capital                     0.0110995  0.1369947   0.081  0.93547
## Physical_Capital:Intangible_Capital       0.0007325  0.0024465   0.299  0.76482
## Physical_Capital:Financial_Capital       -0.0024251  0.0024728  -0.981  0.32739
## Physical_Capital:Human_Capital            0.0021660  0.0024722   0.876  0.38154
## Physical_Capital:Social_Capital           0.0026238  0.0022726   1.155  0.24906
## Physical_Capital:Institutional_Capital    0.0019153  0.0023371   0.819  0.41306
## Intangible_Capital:Financial_Capital     -0.0004424  0.0020681  -0.214  0.83072
## Intangible_Capital:Human_Capital         -0.0045512  0.0019455  -2.339  0.01988
## Intangible_Capital:Social_Capital        -0.0008208  0.0018399  -0.446  0.65578
## Intangible_Capital:Institutional_Capital -0.0002778  0.0017422  -0.159  0.87342
## Financial_Capital:Human_Capital           0.0019210  0.0019438   0.988  0.32370
## Financial_Capital:Social_Capital         -0.0010971  0.0018546  -0.592  0.55455
## Financial_Capital:Institutional_Capital  -0.0005900  0.0017378  -0.339  0.73444
## Human_Capital:Social_Capital             -0.0058812  0.0019708  -2.984  0.00304
## Human_Capital:Institutional_Capital      -0.0027179  0.0020287  -1.340  0.18119
## Social_Capital:Institutional_Capital      0.0013215  0.0016229   0.814  0.41603
##                                             
## (Intercept)                              ***
## Physical_Capital                            
## Intangible_Capital                          
## Financial_Capital                        *  
## Human_Capital                            *  
## Social_Capital                              
## Institutional_Capital                       
## Physical_Capital:Intangible_Capital         
## Physical_Capital:Financial_Capital          
## Physical_Capital:Human_Capital              
## Physical_Capital:Social_Capital             
## Physical_Capital:Institutional_Capital      
## Intangible_Capital:Financial_Capital        
## Intangible_Capital:Human_Capital         *  
## Intangible_Capital:Social_Capital           
## Intangible_Capital:Institutional_Capital    
## Financial_Capital:Human_Capital             
## Financial_Capital:Social_Capital            
## Financial_Capital:Institutional_Capital     
## Human_Capital:Social_Capital             ** 
## Human_Capital:Institutional_Capital         
## Social_Capital:Institutional_Capital        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.9 on 352 degrees of freedom
## Multiple R-squared:  0.4573, Adjusted R-squared:  0.425 
## F-statistic: 14.13 on 21 and 352 DF,  p-value: < 2.2e-16
stargazer(model_interactions, type = "text")
## 
## ====================================================================
##                                              Dependent variable:    
##                                          ---------------------------
##                                                 Productivity        
## --------------------------------------------------------------------
## Physical_Capital                                   -0.057           
##                                                    (0.140)          
##                                                                     
## Intangible_Capital                                  0.219           
##                                                    (0.143)          
##                                                                     
## Financial_Capital                                  0.347**          
##                                                    (0.152)          
##                                                                     
## Human_Capital                                      0.358**          
##                                                    (0.153)          
##                                                                     
## Social_Capital                                      0.116           
##                                                    (0.147)          
##                                                                     
## Institutional_Capital                               0.011           
##                                                    (0.137)          
##                                                                     
## Physical_Capital:Intangible_Capital                 0.001           
##                                                    (0.002)          
##                                                                     
## Physical_Capital:Financial_Capital                 -0.002           
##                                                    (0.002)          
##                                                                     
## Physical_Capital:Human_Capital                      0.002           
##                                                    (0.002)          
##                                                                     
## Physical_Capital:Social_Capital                     0.003           
##                                                    (0.002)          
##                                                                     
## Physical_Capital:Institutional_Capital              0.002           
##                                                    (0.002)          
##                                                                     
## Intangible_Capital:Financial_Capital               -0.0004          
##                                                    (0.002)          
##                                                                     
## Intangible_Capital:Human_Capital                  -0.005**          
##                                                    (0.002)          
##                                                                     
## Intangible_Capital:Social_Capital                  -0.001           
##                                                    (0.002)          
##                                                                     
## Intangible_Capital:Institutional_Capital           -0.0003          
##                                                    (0.002)          
##                                                                     
## Financial_Capital:Human_Capital                     0.002           
##                                                    (0.002)          
##                                                                     
## Financial_Capital:Social_Capital                   -0.001           
##                                                    (0.002)          
##                                                                     
## Financial_Capital:Institutional_Capital            -0.001           
##                                                    (0.002)          
##                                                                     
## Human_Capital:Social_Capital                      -0.006***         
##                                                    (0.002)          
##                                                                     
## Human_Capital:Institutional_Capital                -0.003           
##                                                    (0.002)          
##                                                                     
## Social_Capital:Institutional_Capital                0.001           
##                                                    (0.002)          
##                                                                     
## Constant                                          33.128***         
##                                                    (2.233)          
##                                                                     
## --------------------------------------------------------------------
## Observations                                         374            
## R2                                                  0.457           
## Adjusted R2                                         0.425           
## Residual Std. Error                           12.903 (df = 352)     
## F Statistic                               14.127*** (df = 21; 352)  
## ====================================================================
## Note:                                    *p<0.1; **p<0.05; ***p<0.01
## Plotting fitted values

# Save the fitted values from the regression model
fitted_values_interactions <- fitted(model_interactions)
head(fitted_values_interactions)
##        1        2        3        4        5        6 
## 51.23928 72.12787 49.77304 58.95625 60.98348 36.73503
# Create a data frame for plotting
data$fitted_int_productivity <- fitted_values_interactions

# Create scatter plot of fitted vs. actual values
scatter_plot <- ggplot(data, aes(x = fitted_int_productivity, y = Productivity)) +
  geom_point(color = "navy", alpha = 0.5) +
  labs(
    title = "Scatter Plot of Fitted Values vs. Productivity",
    x = "Fitted values",
    y = "Productivity"
  ) +
  theme_minimal()

# Add a line of best fit to the scatter plot
scatter_plot + geom_smooth(method = "lm", se = T, color = "darkred")
## `geom_smooth()` using formula = 'y ~ x'

By including all the interaction terms explicitly, we can directly examine the coefficients and their significance to understand the interaction effects. This approach allows for a more detailed exploration of the specific interactions between the variables. It’s worth noting that including all possible interaction terms can lead to a large number of predictors, and it may result in overfitting or multicollinearity issues if the sample size is limited. Therefore, careful consideration should be given to the complexity of the model and the interpretability of the results.

Feature importance analysis

Tree-Based Methods: These methods calculate feature importance based on the number of times a feature is used for splitting in the trees and the resulting improvement in the model’s performance.

#### Feature importance analysis ####

# Tree-Based Methods: For decision tree-based algorithms such as Random Forest or Gradient Boosting, feature importance is often readily available. These methods calculate feature importance based on the number of times a feature is used for splitting in the trees and the resulting improvement in the model's performance.

#### Decision Tree ####

# Separate the features and outcome variable
features <- data[, c("Physical_Capital", "Intangible_Capital", "Financial_Capital", "Human_Capital", "Social_Capital", "Institutional_Capital")]
outcome <- data$Productivity

# Create the control object with a maximum depth of 5
control <- rpart.control(maxdepth = 5)

# Fit the decision tree model with the control object
model_tree <- rpart(outcome ~ ., data = features, control = control)

# Generate a tree plot
prp(model_tree, extra = 1, branch = 0.8, box.col = "pink", shadow.col = "gray", branch.lty = 1, branch.lwd = 1, cex.main = 1)

#### Random Forest ####

# Separate the features and outcome variable
features <- data[, c("Physical_Capital", "Intangible_Capital", "Financial_Capital", "Human_Capital", "Social_Capital", "Institutional_Capital")]
outcome <- data$Productivity

# Fit a Random Forest model
model_rf <- randomForest(x = features, y = outcome, ntree = 500, importance = TRUE)

# Extract feature importance
importance_rf <- importance(model_rf)

# Print the feature importance
print(importance_rf)
##                         %IncMSE IncNodePurity
## Physical_Capital      20.549925      26218.40
## Intangible_Capital     8.018208      13090.92
## Financial_Capital     13.476384      17846.51
## Human_Capital         15.246239      17207.09
## Social_Capital        16.117092      15212.00
## Institutional_Capital 10.891295      12585.75
# Generate a plot of feature importance
plot(model_rf)

# Create a data frame for plotting
importance_df <- data.frame(Feature = row.names(importance_rf), Importance = importance_rf[, 1])

# Plot the feature importance
ggplot(importance_df, aes(x = Feature, y = Importance)) +
  geom_bar(stat = "identity", fill = "navy", alpha = 0.7) +
  labs(title = "Feature Importance - Random Forest", x = "Feature", y = "Importance (% Inc MSE)") +
  theme_minimal() +
  coord_flip()

# Variable importance
importance(model_rf)
##                         %IncMSE IncNodePurity
## Physical_Capital      20.549925      26218.40
## Intangible_Capital     8.018208      13090.92
## Financial_Capital     13.476384      17846.51
## Human_Capital         15.246239      17207.09
## Social_Capital        16.117092      15212.00
## Institutional_Capital 10.891295      12585.75
varImpPlot(model_rf)