# Load the per-capita alcohol consumption data and print a per-column summary
# to check column types and value ranges.
alcohol <- read.csv("alcoholpcap.csv")
summary(alcohol)
## Entity Code Year
## Length:4185 Length:4185 Min. :2000
## Class :character Class :character 1st Qu.:2005
## Mode :character Mode :character Median :2010
## Mean :2010
## 3rd Qu.:2015
## Max. :2020
## Total.alcohol.consumption.per.capita..liters.of.pure.alcohol..projected.estimates..15..years.of.age.
## Min. : 0.00
## 1st Qu.: 1.88
## Median : 4.92
## Mean : 5.49
## 3rd Qu.: 8.67
## Max. :19.40
# Interactive viewer call, left disabled.
#View(alcohol)
# Number of rows in the alcohol data frame.
nrow(alcohol)
## [1] 4185
Resumen de `alcohol` tras renombrar las columnas a `Pais`, `Code`, `Año` y `alcohol_pc`:
## Pais Code Año alcohol_pc
## Length:4185 Length:4185 Min. :2000 Min. : 0.00
## Class :character Class :character 1st Qu.:2005 1st Qu.: 1.88
## Mode :character Mode :character Median :2010 Median : 4.92
## Mean :2010 Mean : 5.49
## 3rd Qu.:2015 3rd Qu.: 8.67
## Max. :2020 Max. :19.40
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Mean per-capita alcohol consumption per entity over every year in the data,
# sorted from highest to lowest; only the first ten rows are kept.
# NOTE(review): `Pais` also contains aggregate entities (e.g. "World",
# "European Union (27)"); they are not excluded before ranking.
top_10_global <- alcohol %>%
  group_by(Pais) %>%
  summarise(media_consumo = mean(alcohol_pc, na.rm = TRUE)) %>%
  arrange(desc(media_consumo)) %>%
  slice_head(n = 10)

# Bar chart of those ten means; las = 2 draws the labels perpendicular to the
# axes so the country names fit.
with(
  top_10_global,
  barplot(
    media_consumo,
    names.arg = Pais,
    main = "Top 10 países por consumo medio de alcohol per cápita (2000–2020)",
    ylab = "Litros de alcohol puro por persona",
    col = "darkorange",
    las = 2
  )
)
# Same ranking as the global top ten, restricted to the years 2010 to 2020
# (both inclusive).
top_10_2010_2020 <- alcohol %>%
  filter(Año >= 2010, Año <= 2020) %>%
  group_by(Pais) %>%
  summarise(media_consumo = mean(alcohol_pc, na.rm = TRUE)) %>%
  arrange(desc(media_consumo)) %>%
  slice_head(n = 10)
print(top_10_2010_2020)
## # A tibble: 10 × 2
## Pais media_consumo
## <chr> <dbl>
## 1 Romania 17.1
## 2 Georgia 14.3
## 3 Estonia 13.7
## 4 Lithuania 13.6
## 5 Czechia 13.1
## 6 Uganda 13.0
## 7 Belarus 12.3
## 8 Germany 12.2
## 9 Austria 12.1
## 10 Latvia 11.9
# Bar chart of the 2010-2020 top ten; las = 2 draws the labels perpendicular
# to the axes.
with(
  top_10_2010_2020,
  barplot(
    media_consumo,
    names.arg = Pais,
    main = "Top 10 países por consumo medio de alcohol (2010–2020)",
    ylab = "Litros de alcohol puro per cápita",
    col = "steelblue",
    las = 2
  )
)
# Per-year mean, median, maximum and minimum of alcohol_pc across all
# entities. n = 21 makes the tibble printer show up to 21 rows instead of
# truncating at its default.
print(
  summarise(
    group_by(alcohol, Año),
    Media = mean(alcohol_pc, na.rm = TRUE),
    Mediana = median(alcohol_pc, na.rm = TRUE),
    Maximo = max(alcohol_pc, na.rm = TRUE),
    Minimo = min(alcohol_pc, na.rm = TRUE)
  ),
  n = 21
)
## # A tibble: 21 × 5
## Año Media Mediana Maximo Minimo
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 2000 5.44 4.57 19.0 0
## 2 2001 5.44 4.57 19.0 0
## 3 2002 5.42 4.57 19.4 0
## 4 2003 5.43 4.56 19.3 0
## 5 2004 5.48 4.55 18.7 0
## 6 2005 5.54 4.81 18.3 0
## 7 2006 5.61 4.82 18.1 0
## 8 2007 5.63 4.88 18.3 0
## 9 2008 5.59 4.71 18.4 0
## 10 2009 5.54 4.99 18.1 0
## 11 2010 5.59 5.23 17.6 0
## 12 2011 5.58 5.08 17.3 0
## 13 2012 5.59 5.09 17.2 0
## 14 2013 5.56 5.08 17.1 0
## 15 2014 5.53 5.1 17.0 0
## 16 2015 5.53 5.15 16.8 0
## 17 2016 5.46 5.03 16.8 0
## 18 2017 5.42 5.01 16.9 0
## 19 2018 5.40 4.95 17.0 0
## 20 2019 5.44 4.98 17.0 0
## 21 2020 5.05 4.3 16.8 0
# Country whose series is plotted; change it to any value of alcohol$Pais.
pais <- "Romania"
datos_pais <- alcohol %>% filter(Pais == pais)

# Line chart of that country's per-capita consumption by year.
with(
  datos_pais,
  plot(
    Año, alcohol_pc,
    type = "l", col = "darkgreen",
    main = paste("Evolución del consumo de alcohol en", pais),
    xlab = "Año", ylab = "Litros per cápita"
  )
)

# List every distinct entity name available in the alcohol data.
unique(alcohol$Pais)
## [1] "Afghanistan" "Albania"
## [3] "Algeria" "Andorra"
## [5] "Angola" "Antigua and Barbuda"
## [7] "Argentina" "Armenia"
## [9] "Australia" "Austria"
## [11] "Azerbaijan" "Bahamas"
## [13] "Bahrain" "Bangladesh"
## [15] "Barbados" "Belarus"
## [17] "Belgium" "Belize"
## [19] "Benin" "Bhutan"
## [21] "Bolivia" "Bosnia and Herzegovina"
## [23] "Botswana" "Brazil"
## [25] "Brunei" "Bulgaria"
## [27] "Burkina Faso" "Burundi"
## [29] "Cambodia" "Cameroon"
## [31] "Canada" "Cape Verde"
## [33] "Central African Republic" "Chad"
## [35] "Chile" "China"
## [37] "Colombia" "Comoros"
## [39] "Congo" "Costa Rica"
## [41] "Cote d'Ivoire" "Croatia"
## [43] "Cuba" "Cyprus"
## [45] "Czechia" "Democratic Republic of Congo"
## [47] "Denmark" "Djibouti"
## [49] "Dominica" "Dominican Republic"
## [51] "East Asia and Pacific (WB)" "East Timor"
## [53] "Ecuador" "Egypt"
## [55] "El Salvador" "Equatorial Guinea"
## [57] "Eritrea" "Estonia"
## [59] "Eswatini" "Ethiopia"
## [61] "Europe and Central Asia (WB)" "European Union (27)"
## [63] "Fiji" "Finland"
## [65] "France" "Gabon"
## [67] "Gambia" "Georgia"
## [69] "Germany" "Ghana"
## [71] "Greece" "Grenada"
## [73] "Guatemala" "Guinea"
## [75] "Guinea-Bissau" "Guyana"
## [77] "Haiti" "High-income countries"
## [79] "Honduras" "Hungary"
## [81] "Iceland" "India"
## [83] "Indonesia" "Iran"
## [85] "Iraq" "Ireland"
## [87] "Israel" "Italy"
## [89] "Jamaica" "Japan"
## [91] "Jordan" "Kazakhstan"
## [93] "Kenya" "Kiribati"
## [95] "Kuwait" "Kyrgyzstan"
## [97] "Laos" "Latin America and Caribbean (WB)"
## [99] "Latvia" "Lebanon"
## [101] "Lesotho" "Liberia"
## [103] "Libya" "Lithuania"
## [105] "Low-income countries" "Lower-middle-income countries"
## [107] "Luxembourg" "Madagascar"
## [109] "Malawi" "Malaysia"
## [111] "Maldives" "Mali"
## [113] "Malta" "Mauritania"
## [115] "Mauritius" "Mexico"
## [117] "Micronesia (country)" "Middle East and North Africa (WB)"
## [119] "Middle-income countries" "Moldova"
## [121] "Mongolia" "Montenegro"
## [123] "Morocco" "Mozambique"
## [125] "Myanmar" "Namibia"
## [127] "Nauru" "Nepal"
## [129] "Netherlands" "New Zealand"
## [131] "Nicaragua" "Niger"
## [133] "Nigeria" "North America (WB)"
## [135] "North Korea" "North Macedonia"
## [137] "Norway" "Oman"
## [139] "Pakistan" "Panama"
## [141] "Papua New Guinea" "Paraguay"
## [143] "Peru" "Philippines"
## [145] "Poland" "Portugal"
## [147] "Qatar" "Romania"
## [149] "Russia" "Rwanda"
## [151] "Saint Kitts and Nevis" "Saint Lucia"
## [153] "Saint Vincent and the Grenadines" "Samoa"
## [155] "Sao Tome and Principe" "Saudi Arabia"
## [157] "Senegal" "Serbia"
## [159] "Seychelles" "Sierra Leone"
## [161] "Singapore" "Slovakia"
## [163] "Slovenia" "Solomon Islands"
## [165] "Somalia" "South Africa"
## [167] "South Asia (WB)" "South Korea"
## [169] "Spain" "Sri Lanka"
## [171] "Sub-Saharan Africa (WB)" "Sudan"
## [173] "Suriname" "Sweden"
## [175] "Switzerland" "Syria"
## [177] "Tajikistan" "Tanzania"
## [179] "Thailand" "Togo"
## [181] "Tonga" "Trinidad and Tobago"
## [183] "Tunisia" "Turkey"
## [185] "Turkmenistan" "Tuvalu"
## [187] "Uganda" "Ukraine"
## [189] "United Arab Emirates" "United Kingdom"
## [191] "United States" "Upper-middle-income countries"
## [193] "Uruguay" "Uzbekistan"
## [195] "Vanuatu" "Venezuela"
## [197] "Vietnam" "World"
## [199] "Yemen" "Zambia"
## [201] "Zimbabwe"
# Load the drug-use dataset and print a per-column summary.
drogas <- read.csv("drogas.csv")
summary(drogas)
## measure location sex age
## Length:4334 Length:4334 Length:4334 Length:4334
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## cause metric year val
## Length:4334 Length:4334 Min. :2000 Min. : 8
## Class :character Class :character 1st Qu.:2005 1st Qu.: 6420
## Mode :character Mode :character Median :2010 Median : 38961
## Mean :2010 Mean : 679981
## 3rd Qu.:2016 3rd Qu.: 125853
## Max. :2021 Max. :53115936
## upper lower
## Min. : 11 Min. : 6
## 1st Qu.: 8053 1st Qu.: 5204
## Median : 47652 Median : 31934
## Mean : 797174 Mean : 589700
## 3rd Qu.: 154580 3rd Qu.: 103273
## Max. :61090513 Max. :46999805
# Number of rows in the drug-use data frame.
nrow(drogas)
## [1] 4334
# Preview of the first rows.
head(drogas)
## measure location sex age cause metric year
## 1 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2000
## 2 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2001
## 3 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2002
## 4 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2003
## 5 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2004
## 6 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2005
## val upper lower
## 1 32081.73 40445.17 25557.24
## 2 32969.77 41633.51 26381.13
## 3 34000.50 43044.28 27277.57
## 4 35145.11 44575.37 28256.00
## 5 36358.59 46071.35 29370.24
## 6 37654.04 47621.54 30286.06
#unique(drogas$location)
# Keep only the rows for both sexes combined and all ages.
# NOTE(review): `measure`, `cause` and `metric` are not filtered here; confirm
# the file holds a single combination of them, otherwise the means below mix
# different indicators.
drogas_filtrado <- filter(drogas, sex == "Ambos", age == "Todas las edades")

# Ten locations with the highest mean `val` over 2010-2020 (inclusive).
top_drogas <- drogas_filtrado %>%
  filter(year >= 2010 & year <= 2020) %>%
  group_by(location) %>%
  summarise(media_val = mean(val, na.rm = TRUE)) %>%
  arrange(desc(media_val)) %>%
  slice_head(n = 10)
print(top_drogas)
## # A tibble: 10 × 2
## location media_val
## <chr> <dbl>
## 1 Mundo 49818439.
## 2 Asia Oriental & Pacífico - BM 13856811.
## 3 América del Norte 10024918.
## 4 Estados Unidos de América 9259663.
## 5 Europa & Asia Central - BM 8375617.
## 6 Asia del Sur - BM 7459573.
## 7 India 5867460.
## 8 América Latina & Caribe - BM 4760204.
## 9 África Subsahariana - BM 3089650.
## 10 África del Norte & Medio Oriente - BM 2201053.
# Same 2010-2020 ranking, after dropping aggregate locations: every name that
# contains "BM", plus "Mundo" and "América del Norte".
drogas_filtrado_paises <- drogas_filtrado %>%
  filter(
    year >= 2010 & year <= 2020,
    !grepl("BM", location),
    location != "Mundo",
    location != "América del Norte"
  ) %>%
  group_by(location) %>%
  summarise(media_val = mean(val, na.rm = TRUE)) %>%
  arrange(desc(media_val)) %>%
  slice_head(n = 10)
drogas_filtrado_paises
## # A tibble: 10 × 2
## location media_val
## <chr> <dbl>
## 1 Estados Unidos de América 9259663.
## 2 India 5867460.
## 3 Brasil 2177749.
## 4 Rusia 1804688.
## 5 Reino Unido 1223663.
## 6 Indonesia 1218309.
## 7 Japón 827584.
## 8 Pakistán 782775.
## 9 Canadá 764784.
## 10 Alemania 693049.
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
# Yearly series (2010-2020) for the ten locations ranked in
# `drogas_filtrado_paises`, drawn as one line per location.
top_10_nombres <- drogas_filtrado_paises$location
datos_top10 <- drogas_filtrado %>%
  filter(
    year >= 2010, year <= 2020,
    location %in% top_10_nombres
  )

ggplot(datos_top10, aes(x = year, y = val, color = location)) +
  # `linewidth` replaces `size` for line geoms since ggplot2 3.4.0; using
  # `size` here triggers a deprecation warning.
  geom_line(linewidth = 1) +
  labs(
    title = "Evolución del impacto de drogas (2010–2020)",
    x = "Año",
    y = "Número estimado de personas",
    color = "País"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Give the drug data the same key names used by the other tables:
# column 2 (`location`) becomes `Pais` and column 7 (`year`) becomes `Año`.
names(drogas)[c(2, 7)] <- c("Pais", "Año")

# Load the physical-inactivity dataset and print a per-column summary.
deporte <- read.csv("deporte.csv")
summary(deporte)
## IndicatorCode Indicator ValueType ParentLocationCode
## Length:3690 Length:3690 Length:3690 Length:3690
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## ParentLocation Location.type SpatialDimValueCode Location
## Length:3690 Length:3690 Length:3690 Length:3690
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Period.type Period IsLatestYear Dim1.type
## Length:3690 Min. :2000 Length:3690 Length:3690
## Class :character 1st Qu.:2003 Class :character Class :character
## Mode :character Median :2007 Mode :character Mode :character
## Mean :2007
## 3rd Qu.:2011
## Max. :2014
## Dim1 Dim1ValueCode Dim2.type Dim2
## Length:3690 Length:3690 Length:3690 Length:3690
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Dim2ValueCode Dim3.type Dim3 Dim3ValueCode
## Length:3690 Mode:logical Mode:logical Mode:logical
## Class :character NA's:3690 NA's:3690 NA's:3690
## Mode :character
##
##
##
## DataSourceDimValueCode DataSource FactValueNumericPrefix FactValueNumeric
## Mode:logical Mode:logical Mode:logical Min. : 6.26
## NA's:3690 NA's:3690 NA's:3690 1st Qu.:17.71
## Median :24.59
## Mean :26.33
## 3rd Qu.:32.88
## Max. :68.77
## FactValueUoM FactValueNumericLowPrefix FactValueNumericLow
## Mode:logical Mode:logical Min. : 2.40
## NA's:3690 NA's:3690 1st Qu.: 7.72
## Median :12.98
## Mean :15.17
## 3rd Qu.:19.64
## Max. :57.78
## FactValueNumericHighPrefix FactValueNumericHigh Value
## Mode:logical Min. : 9.95 Length:3690
## NA's:3690 1st Qu.:29.28 Class :character
## Median :39.50 Mode :character
## Mean :40.46
## 3rd Qu.:50.86
## Max. :82.28
## FactValueTranslationID FactComments Language DateModified
## Mode:logical Mode:logical Length:3690 Length:3690
## NA's:3690 NA's:3690 Class :character Class :character
## Mode :character Mode :character
##
##
##
A partir de aquí vamos a unir los datos.
# Load the diabetes and obesity datasets; only the diabetes one is summarised.
diabetes <- read.csv("diabetes.csv")
obesidad <- read.csv("obesidad.csv")
summary(diabetes)
## Pais ISO Sexo Año
## Length:13650 Length:13650 Length:13650 Min. :1980
## Class :character Class :character Class :character 1st Qu.:1988
## Mode :character Mode :character Mode :character Median :1997
## Mean :1997
## 3rd Qu.:2006
## Max. :2014
## Prev.cruda
## Length:13650
## Class :character
## Mode :character
##
##
##
# Rename by position so the inactivity table shares key names with the others:
# column 8 (`Location`) becomes `Pais`, column 10 (`Period`) becomes `Año`,
# and column 24 (`FactValueNumeric`) becomes `inactividad_val`.
names(deporte)[c(8, 10, 24)] <- c("Pais", "Año", "inactividad_val")

# Ten countries with the highest mean inactivity value over all their rows.
# NOTE(review): the table also carries `Dim1`/`Dim2` breakdown columns that
# are not filtered here; confirm whether the mean should use a single
# category.
top10_deporte <- deporte %>%
  group_by(Pais) %>%
  summarise(media_prevalencia = mean(inactividad_val, na.rm = TRUE)) %>%
  arrange(desc(media_prevalencia)) %>%
  slice(1:10)

deporte_top10 <- deporte %>%
  filter(Pais %in% top10_deporte$Pais)

# Smoothed trend per country, without confidence bands.
ggplot(deporte_top10, aes(x = Año, y = inactividad_val, color = Pais)) +
  # `linewidth` replaces `size` for line geoms since ggplot2 3.4.0.
  geom_smooth(se = FALSE, linewidth = 1.2) +
  theme_minimal() +
  labs(
    title = "Evolución de la inactividad física (≥18 años)",
    subtitle = "Top 10 países con mayor prevalencia media",
    x = "Año", y = "Prevalencia de inactividad física (%)",
    color = "País"
  )
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'
library(readr) # para parse_number()
# Convert the text column Prev.cruda to a number and divide it by 100.
# NOTE(review): with readr's default locale, parse_number() treats "," as a
# grouping mark and drops it. If the raw file uses decimal commas, the /100
# rescale is only right for values written with exactly two decimals; verify
# against the raw file.
diabetes$Prev.cruda <- parse_number(as.character(diabetes$Prev.cruda)) / 100

# Collapse to one row per country and year (mean over the remaining rows).
diabetes <- diabetes %>%
  group_by(Pais, Año) %>%
  summarise(diabetes_val = mean(Prev.cruda, na.rm = TRUE), .groups = "drop")

# Obesity: turn decimal commas into dots, blank out anything that is not made
# only of digits and dots, then convert to numeric.
obesidad$Media <- gsub(",", ".", as.character(obesidad$Media))
obesidad$Media <- as.numeric(
  replace(obesidad$Media, !grepl("^[0-9\\.]+$", obesidad$Media), NA)
)

# Collapse to one row per country and year as well.
obesidad <- obesidad %>%
  group_by(Pais, Año) %>%
  summarise(obesidad_val = mean(Media, na.rm = TRUE), .groups = "drop")
# Full-join the five tables on country and year.
# NOTE(review): `drogas` and `deporte` are joined unfiltered and unaggregated,
# so a (Pais, Año) pair may match several rows and the result can fan out.
# NOTE(review): the drug table spells locations in Spanish (e.g. "Tayikistán")
# while the alcohol table uses English names (e.g. "Tajikistan"); only names
# spelled the same in both can line up on `Pais`.
obj6 <- deporte %>%
  full_join(drogas, by = c("Pais", "Año")) %>%
  full_join(alcohol, by = c("Pais", "Año")) %>%
  full_join(diabetes, by = c("Pais", "Año")) %>%
  full_join(obesidad, by = c("Pais", "Año"))

# Keep the keys and the five indicators; `val` (from the drug table) is
# renamed to `drogas_val` during selection.
obj <- obj6 %>%
  select(
    Pais, Año, inactividad_val, diabetes_val, obesidad_val, alcohol_pc,
    drogas_val = val
  )

# Rows with a value for every indicator.
obj_nonulos <- obj %>%
  filter(complete.cases(.))
nrow(obj_nonulos)
## [1] 1112
# Preview of the first complete-case rows.
head(obj_nonulos)
## Pais Año inactividad_val diabetes_val obesidad_val alcohol_pc drogas_val
## 1 Vanuatu 2014 10.45 1.309390 18.906667 1.73 1711.533
## 2 Zambia 2014 10.77 4.230610 7.303333 3.67 50689.314
## 3 Senegal 2014 11.92 5.103360 7.753333 0.36 31789.552
## 4 Togo 2014 12.45 4.920040 7.763333 1.80 14002.996
## 5 Somalia 2014 13.90 4.852695 10.280000 0.00 48655.065
## 6 Estonia 2014 14.40 9.316385 20.966667 15.64 19930.933
# Distinct years present in the complete-case table.
unique(obj_nonulos$Año)
## [1] 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000
# Distinct countries present in the complete-case table.
unique(obj_nonulos$Pais)
## [1] "Vanuatu" "Zambia" "Senegal" "Togo" "Somalia"
## [6] "Estonia" "Samoa" "Guinea" "Burkina Faso" "Eritrea"
## [11] "Nigeria" "Ecuador" "Liberia" "Dominica" "Albania"
## [16] "Georgia" "Guatemala" "Seychelles" "Yemen" "Tuvalu"
## [21] "Guyana" "Sri Lanka" "Tonga" "Colombia" "Chile"
## [26] "Paraguay" "Jamaica" "Uruguay" "Andorra" "El Salvador"
## [31] "Nicaragua" "Barbados" "Argentina" "Costa Rica" "Cuba"
## [36] "Portugal" "Uganda" "Serbia"
library(ggplot2)
library(tidyr)
# Working table: complete cases only, reduced to the first row found for each
# (Pais, Año) pair.
# NOTE(review): which duplicate survives depends on row order after the joins.
base_completa <- obj_nonulos %>%
  filter(complete.cases(.)) %>%
  group_by(Pais, Año) %>%
  slice_head(n = 1) %>%
  ungroup()
head(base_completa)
## # A tibble: 6 × 7
## Pais Año inactividad_val diabetes_val obesidad_val alcohol_pc drogas_val
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Albania 2000 16.0 5.72 12.8 4.43 16088.
## 2 Albania 2001 16.1 5.92 13.1 4.43 15896.
## 3 Albania 2002 16.2 6.13 13.4 4.57 15772.
## 4 Albania 2003 16.4 6.34 13.7 4.58 15684.
## 5 Albania 2004 16.5 6.55 14.1 4.98 15642.
## 6 Albania 2005 16.7 3.77 14.4 5.3 15640.
# Standardise the five indicators (scale() centres and scales each column)
# and turn the resulting matrix back into a data frame.
base_box <- as.data.frame(
  scale(
    select(
      base_completa,
      inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val
    )
  )
)

# Long format: one row per (variable, value) pair, as needed for faceting.
vars_long <- base_box %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "valor"
  )

# One histogram per indicator, each panel with its own axis ranges.
ggplot(vars_long, aes(x = valor)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  theme_minimal() +
  labs(title = "Distribución de cada variable")
# Keep the first row of every (Pais, Año) pair, then preview the result.
base_completa <- base_completa %>%
  group_by(Pais, Año) %>%
  slice_head(n = 1) %>%
  ungroup()
head(base_completa)
## # A tibble: 6 × 7
## Pais Año inactividad_val diabetes_val obesidad_val alcohol_pc drogas_val
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Albania 2000 16.0 5.72 12.8 4.43 16088.
## 2 Albania 2001 16.1 5.92 13.1 4.43 15896.
## 3 Albania 2002 16.2 6.13 13.4 4.57 15772.
## 4 Albania 2003 16.4 6.34 13.7 4.58 15684.
## 5 Albania 2004 16.5 6.55 14.1 4.98 15642.
## 6 Albania 2005 16.7 3.77 14.4 5.3 15640.
# Composite indices built as sums of z-scores.
# scale() returns a one-column matrix carrying "scaled:center" and
# "scaled:scale" attributes; as.numeric() strips both so the new columns are
# plain numeric vectors rather than matrix columns.
base_completa$unhealthy_index <- as.numeric(
  scale(base_completa$inactividad_val) +
    scale(base_completa$alcohol_pc) +
    scale(base_completa$drogas_val)
)
base_completa$diabesity_index <- as.numeric(
  scale(base_completa$diabetes_val) + scale(base_completa$obesidad_val)
)
# Reshape the five raw (unscaled) indicators to long format for plotting.
base_long_filtrada <- pivot_longer(
  select(
    base_completa,
    inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val
  ),
  cols = everything(),
  names_to = "variable",
  values_to = "valor"
)

# One histogram per indicator, each panel with its own axis ranges.
# NOTE(review): the title mentions removed outliers, but this block plots
# base_completa as-is; confirm whether an outlier filter is expected upstream.
ggplot(base_long_filtrada, aes(x = valor)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  theme_minimal() +
  labs(
    title = "Distribución de variables tras eliminar outliers",
    x = "Valor", y = "Frecuencia"
  )
# Replace the ranking table with the full yearly rows of those same
# countries. `top_10_global` is overwritten: from here on it holds one row per
# country and year instead of one row per country.
top_10_global <- alcohol %>%
  filter(Pais %in% top_10_global$Pais)

# Yearly consumption per country as lines with point markers.
ggplot(top_10_global, aes(x = Año, y = alcohol_pc, color = Pais)) +
  # `linewidth` replaces `size` for line geoms since ggplot2 3.4.0;
  # geom_point keeps `size`, which is still the point-size aesthetic.
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(
    title = "Evolución del consumo de alcohol per cápita",
    subtitle = "Top 10 países con mayor consumo medio anual",
    x = "Año", y = "Litros de alcohol puro por persona (por año)",
    color = "País"
  )
# Standardised copy of the five indicators.
base_escalada <- as.data.frame(
  scale(
    select(
      base_completa,
      inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val
    )
  )
)

# base_escalada already holds exactly these five columns in this order, so the
# box-plot input is the same table.
base_box <- base_escalada

base_long <- pivot_longer(
  base_box,
  cols = everything(), names_to = "variable", values_to = "valor"
)

# Box plot per standardised indicator; outliers are drawn as solid red dots.
ggplot(base_long, aes(x = variable, y = valor)) +
  geom_boxplot(fill = "orange", outlier.color = "red", outlier.shape = 16) +
  theme_minimal() +
  labs(
    title = "Boxplot de variables de salud (sin duplicados)",
    x = "Variable", y = "Valor"
  )
# Upper outlier fence for obesity: third quartile plus 1.5 times the
# interquartile range.
q1 <- quantile(base_completa$obesidad_val, probs = 0.25, na.rm = TRUE)
q3 <- quantile(base_completa$obesidad_val, probs = 0.75, na.rm = TRUE)
iqr <- q3 - q1
limite_sup <- q3 + 1.5 * iqr

# Rows above the fence, listed from the highest obesity value down.
outliers_obesidad <- filter(base_completa, obesidad_val > limite_sup)
arrange(
  select(outliers_obesidad, Pais, Año, obesidad_val),
  desc(obesidad_val)
)
## # A tibble: 45 × 3
## Pais Año obesidad_val
## <chr> <int> <dbl>
## 1 Tonga 2014 64.3
## 2 Tonga 2013 63.4
## 3 Tonga 2012 62.5
## 4 Tonga 2011 61.7
## 5 Tonga 2010 60.8
## 6 Tonga 2009 60.0
## 7 Tuvalu 2014 59.9
## 8 Tuvalu 2013 59.4
## 9 Tonga 2008 59.2
## 10 Tuvalu 2012 58.9
## # ℹ 35 more rows
# One row per country: the mean over years of each indicator and of the two
# composite indices (Año is left out before averaging).
# NOTE(review): the two indices are sums of z-scores of the other five
# columns, so those indicators enter the clustering input twice; confirm this
# weighting is intended.
datos_clustering <- base_completa %>%
  select(
    Pais, inactividad_val, alcohol_pc, drogas_val, obesidad_val,
    diabetes_val, unhealthy_index, diabesity_index
  ) %>%
  group_by(Pais) %>%
  summarise(across(everything(), \(x) mean(x, na.rm = TRUE)), .groups = "drop")

# Keep the country names aside and standardise the numeric columns.
nombres_paises <- datos_clustering$Pais
datos_numericos <- as.data.frame(scale(select(datos_clustering, -Pais)))
# Elbow method: total within-cluster sum of squares for k = 1..10.
# vapply() fixes the result type to one double per k, whereas sapply() would
# silently change shape if a call ever returned something else.
set.seed(123)
wss <- vapply(
  1:10,
  function(k) kmeans(datos_numericos, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)
plot(1:10, wss, type = "b", pch = 19,
     xlab = "Número de clústeres (k)",
     ylab = "Suma de cuadrados intra-cluster",
     main = "Método del codo para elegir k")

# Final model with k = 4; the seed is reset so the fit does not depend on how
# many random draws the elbow loop consumed.
set.seed(123)
kmeans_4 <- kmeans(datos_numericos, centers = 4, nstart = 25)
datos_clustering$cluster <- as.factor(kmeans_4$cluster)

# Number of countries per cluster.
table(datos_clustering$cluster)
##
## 1 2 3 4
## 15 3 17 3
# Mean of the two composite indices within each cluster.
# Extra arguments are passed through an anonymous function because supplying
# them via `...` in across() is deprecated since dplyr 1.1.0.
datos_clustering %>%
  group_by(cluster) %>%
  summarise(across(unhealthy_index:diabesity_index, \(x) mean(x, na.rm = TRUE)))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(unhealthy_index:diabesity_index, mean, na.rm = TRUE)`.
## ℹ In group 1: `cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 4 × 3
## cluster unhealthy_index diabesity_index
## <fct> <dbl> <dbl>
## 1 1 -1.42 -0.973
## 2 2 -1.10 1.46
## 3 3 0.988 0.670
## 4 4 3.68 0.00893
# PCA of the standardised per-country table. prcomp() is deterministic, so
# one fit serves both the scatter plot and the printed summary.
pca <- prcomp(datos_numericos)

# Scores on the first two components, labelled with cluster and country.
plot_df <- as.data.frame(pca$x[, 1:2])
plot_df$cluster <- datos_clustering$cluster
plot_df$Pais <- datos_clustering$Pais

# Countries in PC1-PC2 space, coloured by k-means cluster; check_overlap
# skips labels that would overlap an earlier one.
ggplot(plot_df, aes(PC1, PC2, color = cluster, label = Pais)) +
  geom_point(size = 3) +
  geom_text(check_overlap = TRUE, size = 3, vjust = -1) +
  theme_minimal() +
  labs(
    title = "Clustering de países (k = 4)",
    subtitle = "Según variables de estilo de vida y salud",
    color = "Grupo"
  )

# Print the standard deviations and the rotation (loadings) matrix.
pca
## Standard deviations (1, .., p=7):
## [1] 1.700626e+00 1.347256e+00 9.923901e-01 9.072133e-01 6.963480e-01
## [6] 2.825783e-16 1.839496e-16
##
## Rotation (n x k) = (7 x 7):
## PC1 PC2 PC3 PC4 PC5
## inactividad_val 0.4798780 0.1338865 -0.004298093 0.19219243 0.7476315
## alcohol_pc 0.3224834 -0.2952051 0.036661804 -0.80186427 -0.1473517
## drogas_val 0.1836989 -0.4766814 -0.501830687 0.46890498 -0.3546527
## obesidad_val 0.2169776 0.5955268 -0.435125893 -0.11149228 -0.2197580
## diabetes_val 0.3661399 -0.1195941 0.702522827 0.27489172 -0.2802223
## unhealthy_index 0.5085035 -0.3181621 -0.236100659 -0.06781491 0.1417656
## diabesity_index 0.4384778 0.4444334 0.090482121 0.08722973 -0.3829366
## PC6 PC7
## inactividad_val -0.3260077 0.2227135
## alcohol_pc -0.3120449 0.2131747
## drogas_val -0.3106539 0.2122244
## obesidad_val -0.3328654 -0.4872479
## diabetes_val -0.2547852 -0.3729543
## unhealthy_index 0.6177735 -0.4220344
## diabesity_index 0.3774681 0.5525373
# Country scores and variable loadings as data frames for a hand-built biplot.
scores <- as.data.frame(pca$x)
scores$Pais <- datos_clustering$Pais
loadings <- as.data.frame(pca$rotation)
loadings$Variable <- rownames(loadings)

# Points and labels are the countries; red arrows are the variable loadings,
# multiplied by 5 (5.2 for their labels) so they are visible at score scale.
ggplot() +
  geom_point(data = scores, aes(x = PC1, y = PC2), color = "gray30") +
  geom_text(
    data = scores, aes(x = PC1, y = PC2, label = Pais),
    size = 3, vjust = 1.2, alpha = 0.8
  ) +
  # `linewidth` replaces `size` for line geoms since ggplot2 3.4.0.
  geom_segment(
    data = loadings,
    aes(x = 0, y = 0, xend = PC1 * 5, yend = PC2 * 5),
    arrow = arrow(length = unit(0.3, "cm")), color = "red", linewidth = 1
  ) +
  geom_text(
    data = loadings,
    aes(x = PC1 * 5.2, y = PC2 * 5.2, label = Variable),
    color = "red", size = 4
  ) +
  labs(
    title = "Biplot de PCA",
    subtitle = "Paises en espacio PC1-PC2 + vectores de variables",
    x = "Componente Principal 1",
    y = "Componente Principal 2"
  ) +
  theme_minimal()
# Numeric columns for the correlation analysis: the five indicators plus the
# two composite indices.
vars_spearman <- select(
  base_completa,
  inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val,
  unhealthy_index, diabesity_index
)

# Spearman (rank) correlation matrix, using only rows without missing values.
matriz_spearman <- cor(vars_spearman, method = "spearman", use = "complete.obs")
library(ggplot2)
library(reshape2)
##
## Adjuntando el paquete: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Reshape the correlation matrix to long format (Var1, Var2, value) for ggplot.
cor_long <- melt(matriz_spearman)

# Heat map with the rounded coefficient printed in each tile. The fill scale
# is fixed to [-1, 1] and centred on 0.
ggplot(cor_long, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  # The argument is spelled `limits` in full; `limit` only worked through
  # partial argument matching.
  scale_fill_gradient2(
    low = "red", high = "blue", mid = "white",
    midpoint = 0, limits = c(-1, 1), space = "Lab"
  ) +
  geom_text(aes(label = round(value, 2)), size = 4) +
  theme_minimal() +
  labs(
    title = "Matriz de correlación (Spearman)",
    x = "", y = ""
  ) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(rnaturalearth)
## Warning: package 'rnaturalearth' was built under R version 4.4.3
library(rnaturalearthdata)
## Warning: package 'rnaturalearthdata' was built under R version 4.4.3
##
## Adjuntando el paquete: 'rnaturalearthdata'
## The following object is masked from 'package:rnaturalearth':
##
## countries110
library(sf)
## Warning: package 'sf' was built under R version 4.4.3
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
# Load world country polygons as an sf object.
mundo <- ne_countries(scale = "medium", returnclass = "sf")

# Keep only the clustered countries whose name appears in the map data.
# NOTE(review): countries spelled differently in `mundo$name` are dropped
# silently here; compare the row count before and after.
datos_clustering <- datos_clustering %>%
  filter(Pais %in% mundo$name)

# Fix the cluster factor to levels 1-4 in order.
datos_clustering$cluster <- factor(datos_clustering$cluster, levels = c(1, 2, 3, 4))

# Attach the cluster data to the polygons; unmatched countries get NA.
mapa_clusters <- left_join(mundo, datos_clustering, by = c("name" = "Pais"))

# One fill colour per cluster, intended to match the PCA scatter plot.
colores_cluster <- c("1" = "#F8766D",
                     "2" = "#7CAE00",
                     "3" = "#00BFC4",
                     "4" = "#C77CFF")

# Grey base layer of every country, with clustered countries filled on top.
# Polygon outline width is set with `linewidth`; since ggplot2 3.4.0 `size`
# in geom_sf applies to point size.
ggplot() +
  geom_sf(data = mundo, fill = "gray90", color = "white", linewidth = 0.1) +
  geom_sf(data = mapa_clusters, aes(fill = cluster), color = "white", linewidth = 0.1) +
  scale_fill_manual(values = colores_cluster, na.translate = FALSE) +
  labs(
    title = "Mapa mundial por clúster de estilo de vida y salud",
    fill = "Clúster"
  ) +
  theme_minimal()
# Load world country polygons as an sf object.
mundo <- ne_countries(scale = "medium", returnclass = "sf")

# Attach the cluster data to the polygons; unmatched countries get NA.
mapa_clusters <- left_join(mundo, datos_clustering, by = c("name" = "Pais"))

# One centroid per country, so small islands can be marked with a point.
centroides <- st_centroid(mapa_clusters)

# Keep only the centroids of countries that have a cluster.
centroides_cluster <- centroides %>% filter(!is.na(cluster))

# Shared palette for polygon fill and centroid outline.
colores_mapa <- c("1" = "red", "2" = "green", "3" = "cyan", "4" = "purple")

# Grey base layer, cluster-filled polygons, and hollow rings (shape 21 with
# fill = NA) at the centroids. Polygon outline width is set with `linewidth`;
# `size` is kept for the centroid points. The colour legend gets the same
# title as the fill legend instead of the raw expression `factor(cluster)`.
ggplot() +
  geom_sf(data = mundo, fill = "grey85", color = "white", linewidth = 0.1) +
  geom_sf(data = mapa_clusters, aes(fill = factor(cluster)), color = "white", linewidth = 0.1) +
  geom_sf(data = centroides_cluster, aes(color = factor(cluster)),
          shape = 21, size = 2, stroke = 1.2, fill = NA) +
  scale_fill_manual(values = colores_mapa) +
  scale_color_manual(values = colores_mapa) +
  labs(title = "Mapa mundial por clúster de estilo de vida y salud",
       fill = "Clúster", color = "Clúster") +
  theme_minimal()