The aim of this study is to evaluate the relationship between various health indicators and the healthcare expenditure, which will be carried out using a correlation analysis.
library(dplyr)
objective5 = read.csv("gastsalenferm_bien.csv")
objective5$GastoRelativo <- objective5$Porcentaje.gastos_log / objective5$Gasto.total
objective5 <- objective5 %>%
select(-c(Sexo, Porcentaje.gastos, Gasto.total))
summary(objective5)
## Pais Año Prev.cardiac Prev.diab
## Length:630 Min. :2000 Min. :-1.4954 Min. :-1.5855
## Class :character 1st Qu.:2003 1st Qu.:-0.8155 1st Qu.:-0.7200
## Mode :character Median :2007 Median :-0.4197 Median :-0.2159
## Mean :2007 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.:2011 3rd Qu.: 0.7628 3rd Qu.: 0.5375
## Max. :2014 Max. : 2.2825 Max. : 4.1153
## Prev.hiper Region Media.obes Porcentaje.gastos_log
## Min. :-2.85139 Length:630 Min. :-2.19386 Min. :-2.5192
## 1st Qu.:-0.57432 Class :character 1st Qu.:-0.68776 1st Qu.:-0.6656
## Median :-0.06262 Mode :character Median : 0.01942 Median : 0.2327
## Mean : 0.00000 Mean : 0.00000 Mean : 0.0000
## 3rd Qu.: 0.55142 3rd Qu.: 0.57652 3rd Qu.: 0.6668
## Max. : 2.82849 Max. : 3.31600 Max. : 1.9466
## GastoRelativo
## Min. :-0.48917
## 1st Qu.:-0.10264
## Median : 0.03060
## Mean : 0.01522
## 3rd Qu.: 0.09487
## Max. : 1.44321
objective5
unique(objective5$Region)
## [1] "Americas"
Only countries from the Americas region are included in this dataset.
In order to choose the appropriate correlation method, it should be taken into account the data distribution of the variables.
datos = objective5[, c("Media.obes", "Prev.hiper", "Prev.cardiac", "Prev.diab", "GastoRelativo")]
par(mfrow = c(2, 3))
for (d in names(datos)) {
hist(datos[[d]],
main = d,
xlab = "Escala estandarizada",
col = "lightblue",
breaks = 30)
}
par(mfrow = c(2, 3))
for (d in names(datos)) {
boxplot(datos[[d]], main = d, col = "salmon")
}
Pearson is suitable for variables that are normally distributed and
without outliers. Considering that variables like GastoRelativo exhibit
a skewed distribution, Spearman seems more adequate. Additionally,
variables such as media.obesidad and diabetes prevalence have a vast
amount of outliers, which enforces the use of the Spearman correlation
coefficient, as it is more robust tan Pearson.
Porcentaje Gastos: Expenditure on health as a percentage of GDP per capita(%). Gasto.total: General government expenditure on health as a percentage of total government expenditure (%).
corMatriz = cor(datos, method = "spearman")
corMatriz
## Media.obes Prev.hiper Prev.cardiac Prev.diab GastoRelativo
## Media.obes 1.00000000 -0.3850126 0.055856633 0.66421219 0.096934463
## Prev.hiper -0.38501265 1.0000000 0.333959693 -0.11355417 -0.205383671
## Prev.cardiac 0.05585663 0.3339597 1.000000000 0.39835075 0.005579108
## Prev.diab 0.66421219 -0.1135542 0.398350753 1.00000000 0.068776594
## GastoRelativo 0.09693446 -0.2053837 0.005579108 0.06877659 1.000000000
library(ggplot2)
cortot = as.data.frame(as.table(corMatriz))
ggplot(cortot, aes(Var1, Var2, fill = Freq)) +
geom_tile() +
geom_text(aes(label = round(Freq, 2))) +
scale_fill_gradient2(low = "blue3", high = "red3", mid = "white", midpoint = 0)
At first glance, the results do not appear to be related. However, the
correlations will be shown separately for each country. With this, it
will be possible to examine possible specific patterns that might not be
evident in a general analysis.
corrgasto = objective5 %>%
group_by(Pais) %>%
summarize(cor_card = cor(GastoRelativo, Prev.cardiac, method = "spearman", use = "complete.obs"),
cor_diab = cor(GastoRelativo, Prev.diab, method = "spearman", use = "complete.obs"),
cor_hiper = cor(GastoRelativo, Prev.hiper, method = "spearman", use = "complete.obs"),
cor_obes = cor(GastoRelativo, Media.obes, method = "spearman", use = "complete.obs"))
corrgasto = na.omit(corrgasto)
corrgasto
# Convertir a formato largo
corrTotal = corrgasto %>%
pivot_longer(cols = starts_with("cor_"),
names_to = "Variable",
values_to = "Correlation")
# Cambiar nombres de variables
corrTotal$Variable = recode(corrTotal$Variable,
cor_card = "Cardiovascular",
cor_diab = "Diabetes",
cor_hiper = "Hypertension",
cor_obes = "Obesidad")
ggplot(corrTotal, aes(x = Variable, y = reorder(Pais, -Correlation), fill = Correlation)) +
geom_tile(color = "white", linewidth = 0.5) + #de mayor a menor correlación
geom_text(aes(label = round(Correlation, 2)), size = 3, color = "black") +
scale_fill_gradient2(low = "blue3", mid = "white", high = "red3", midpoint = 0,
name = "Correlation") +
labs(title = "Correlation between Healthcare and Illnesses by Country",
x = "Illness", y = "Country") +
theme_minimal(base_size = 12) +
theme(plot.title = element_text(size = 12))
library(NbClust)
cordatos = corrgasto[, -1]
# Clustering
distancias = dist(cordatos)
hc = hclust(distancias, method = "ward.D2")
nb = NbClust(cordatos, distance = "manhattan",
min.nc = 2, max.nc = 6, method = "median")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 6 proposed 2 as the best number of clusters
## * 11 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
clusters = cutree(hc, k = 3)
corrgasto$cluster = factor(clusters)
library(sf)
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
##
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
america = ne_countries(scale = "medium", returnclass = "sf") %>%
filter(region_un == "Americas")
america = left_join(america, corrgasto[, c("Pais", "cluster")], by = c("admin" = "Pais"))
ggplot(america) +
geom_sf(aes(fill = cluster)) +
scale_fill_manual(values = c("1" = "salmon", "2" = "lightgreen", "3" = "skyblue"),
na.value = "lightgray") +
labs(title = "Countries by Clusters about the Correlations with Healthcare", fill = "Cluster") + theme_minimal()
ggplot(corrgasto, aes(x = 1, y = Pais, fill = cluster)) +
geom_col() +
labs(title = "Clusters by Country", x = "Presence", y = "Country", fill = "Cluster") +
scale_fill_manual(values = c("1" = "salmon", "2" = "lightgreen", "3" = "skyblue")) +
theme_minimal()
medias = corrgasto %>%
group_by(cluster) %>%
summarise(across(where(is.numeric), mean))
mediastot <- medias %>%
pivot_longer(cols = starts_with("cor_"), names_to = "Indicador", values_to = "Promedio") %>%
mutate(Indicador = recode(Indicador,
cor_card = "Cardiovascular",
cor_diab = "Diabetes",
cor_hiper = "Hypertension",
cor_obes = "Obesity"))
ggplot(mediastot, aes(x = factor(cluster), y = Promedio, fill = factor(cluster))) +
geom_bar(stat = "identity") +
geom_text(aes(label = round(Promedio, 2)), vjust = -0.3, size = 3) +
facet_wrap(~ Indicador) + # subgráfico por enfermedad
scale_fill_manual(values = c("1" = "salmon", "2" = "lightgreen", "3" = "skyblue")) +
labs(title = "Average correlations by Clusters",
x = "Cluster", y = "Average Correlation", fill = "Cluster") +
theme_minimal()