Carga de datos

The aim of this study is to evaluate the relationship between various health indicators and the healthcare expenditure, which will be carried out using a correlation analysis.

library(dplyr)

objective5 = read.csv("gastsalenferm_bien.csv")
objective5$GastoRelativo <- objective5$Porcentaje.gastos_log / objective5$Gasto.total

objective5 <- objective5 %>%
  select(-c(Sexo, Porcentaje.gastos, Gasto.total))

summary(objective5)
##      Pais                Año        Prev.cardiac       Prev.diab      
##  Length:630         Min.   :2000   Min.   :-1.4954   Min.   :-1.5855  
##  Class :character   1st Qu.:2003   1st Qu.:-0.8155   1st Qu.:-0.7200  
##  Mode  :character   Median :2007   Median :-0.4197   Median :-0.2159  
##                     Mean   :2007   Mean   : 0.0000   Mean   : 0.0000  
##                     3rd Qu.:2011   3rd Qu.: 0.7628   3rd Qu.: 0.5375  
##                     Max.   :2014   Max.   : 2.2825   Max.   : 4.1153  
##    Prev.hiper          Region            Media.obes       Porcentaje.gastos_log
##  Min.   :-2.85139   Length:630         Min.   :-2.19386   Min.   :-2.5192      
##  1st Qu.:-0.57432   Class :character   1st Qu.:-0.68776   1st Qu.:-0.6656      
##  Median :-0.06262   Mode  :character   Median : 0.01942   Median : 0.2327      
##  Mean   : 0.00000                      Mean   : 0.00000   Mean   : 0.0000      
##  3rd Qu.: 0.55142                      3rd Qu.: 0.57652   3rd Qu.: 0.6668      
##  Max.   : 2.82849                      Max.   : 3.31600   Max.   : 1.9466      
##  GastoRelativo     
##  Min.   :-0.48917  
##  1st Qu.:-0.10264  
##  Median : 0.03060  
##  Mean   : 0.01522  
##  3rd Qu.: 0.09487  
##  Max.   : 1.44321
objective5
unique(objective5$Region)
## [1] "Americas"

Only countries from the Americas region are included in this dataset.

In order to choose the appropriate correlation method, it should be taken into account the data distribution of the variables.

datos = objective5[, c("Media.obes", "Prev.hiper", "Prev.cardiac", "Prev.diab", "GastoRelativo")]

par(mfrow = c(2, 3))
for (d in names(datos)) {
  hist(datos[[d]],
       main = d,
       xlab = "Escala estandarizada",
       col = "lightblue",
       breaks = 30)
}

par(mfrow = c(2, 3))

for (d in names(datos)) {
  boxplot(datos[[d]], main = d, col = "salmon")
}

Pearson is suitable for variables that are normally distributed and without outliers. Considering that variables like GastoRelativo exhibit a skewed distribution, Spearman seems more adequate. Additionally, variables such as media.obesidad and diabetes prevalence have a vast amount of outliers, which enforces the use of the Spearman correlation coefficient, as it is more robust tan Pearson.

Porcentaje Gastos: Expenditure on health as a percentage of GDP per capita(%). Gasto.total: General government expenditure on health as a percentage of total government expenditure (%).

corMatriz = cor(datos, method = "spearman")

corMatriz
##                Media.obes Prev.hiper Prev.cardiac   Prev.diab GastoRelativo
## Media.obes     1.00000000 -0.3850126  0.055856633  0.66421219   0.096934463
## Prev.hiper    -0.38501265  1.0000000  0.333959693 -0.11355417  -0.205383671
## Prev.cardiac   0.05585663  0.3339597  1.000000000  0.39835075   0.005579108
## Prev.diab      0.66421219 -0.1135542  0.398350753  1.00000000   0.068776594
## GastoRelativo  0.09693446 -0.2053837  0.005579108  0.06877659   1.000000000
library(ggplot2)

cortot = as.data.frame(as.table(corMatriz))

ggplot(cortot, aes(Var1, Var2, fill = Freq)) +
  geom_tile() +
  geom_text(aes(label = round(Freq, 2))) +
  scale_fill_gradient2(low = "blue3", high = "red3", mid = "white", midpoint = 0)

At first glance, the results do not appear to be related. However, the correlations will be shown separately for each country. With this, it will be possible to examine possible specific patterns that might not be evident in a general analysis.

corrgasto = objective5 %>%
  group_by(Pais) %>%
  summarize(cor_card = cor(GastoRelativo, Prev.cardiac, method = "spearman", use = "complete.obs"),
            cor_diab = cor(GastoRelativo, Prev.diab, method = "spearman", use = "complete.obs"),
            cor_hiper = cor(GastoRelativo, Prev.hiper, method = "spearman", use = "complete.obs"),
            cor_obes = cor(GastoRelativo, Media.obes, method = "spearman", use = "complete.obs"))
corrgasto = na.omit(corrgasto)
corrgasto
# Convertir a formato largo
corrTotal = corrgasto %>%
  pivot_longer(cols = starts_with("cor_"), 
               names_to = "Variable", 
               values_to = "Correlation")

# Cambiar nombres de variables
corrTotal$Variable = recode(corrTotal$Variable,
                             cor_card = "Cardiovascular",
                             cor_diab = "Diabetes",
                             cor_hiper = "Hypertension",
                             cor_obes = "Obesidad")

 
ggplot(corrTotal, aes(x = Variable, y = reorder(Pais, -Correlation), fill = Correlation)) +
  geom_tile(color = "white", linewidth = 0.5) + #de mayor a menor correlación
  geom_text(aes(label = round(Correlation, 2)), size = 3, color = "black") +
  scale_fill_gradient2(low = "blue3", mid = "white", high = "red3", midpoint = 0,
                       name = "Correlation") +
  labs(title = "Correlation between Healthcare and Illnesses by Country",
       x = "Illness", y = "Country") +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(size = 12))

library(NbClust)
cordatos = corrgasto[, -1]


# Clustering
distancias = dist(cordatos)
hc = hclust(distancias, method = "ward.D2")
nb = NbClust(cordatos, distance = "manhattan",
              min.nc = 2, max.nc = 6, method = "median")

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
## 

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 6 proposed 2 as the best number of clusters 
## * 11 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 1 proposed 5 as the best number of clusters 
## * 4 proposed 6 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************
clusters = cutree(hc, k = 3)

corrgasto$cluster = factor(clusters)
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
## 
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
## 
##     countries110
america = ne_countries(scale = "medium", returnclass = "sf") %>%
  filter(region_un == "Americas")

america = left_join(america, corrgasto[, c("Pais", "cluster")], by = c("admin" = "Pais"))

ggplot(america) +
  geom_sf(aes(fill = cluster)) +
  scale_fill_manual(values = c("1" = "salmon", "2" = "lightgreen", "3" = "skyblue"),
                    na.value = "lightgray") +
  labs(title = "Countries by Clusters about the Correlations with Healthcare", fill = "Cluster") + theme_minimal()

ggplot(corrgasto, aes(x = 1, y = Pais, fill = cluster)) +
  geom_col() +
  labs(title = "Clusters by Country", x = "Presence", y = "Country", fill = "Cluster") +
  scale_fill_manual(values = c("1" = "salmon", "2" = "lightgreen", "3" = "skyblue")) +
  theme_minimal()

medias = corrgasto %>%
  group_by(cluster) %>%
  summarise(across(where(is.numeric), mean))

mediastot <- medias %>%
  pivot_longer(cols = starts_with("cor_"), names_to = "Indicador", values_to = "Promedio") %>%
  mutate(Indicador = recode(Indicador,
                            cor_card = "Cardiovascular",
                            cor_diab = "Diabetes",
                            cor_hiper = "Hypertension",
                            cor_obes = "Obesity"))

ggplot(mediastot, aes(x = factor(cluster), y = Promedio, fill = factor(cluster))) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = round(Promedio, 2)), vjust = -0.3, size = 3) +
  facet_wrap(~ Indicador) +     # subgráfico por enfermedad
  scale_fill_manual(values = c("1" = "salmon", "2" = "lightgreen", "3" = "skyblue")) +
  labs(title = "Average correlations by Clusters",
       x = "Cluster", y = "Average Correlation", fill = "Cluster") +
  theme_minimal()