limpieza

Resumen de los datos

Resumen estadístico de los datos de consumo de alcohol:

##      Pais               Code                Año         alcohol_pc   
##  Length:4185        Length:4185        Min.   :2000   Min.   : 0.00  
##  Class :character   Class :character   1st Qu.:2005   1st Qu.: 1.88  
##  Mode  :character   Mode  :character   Median :2010   Median : 4.92  
##                                        Mean   :2010   Mean   : 5.49  
##                                        3rd Qu.:2015   3rd Qu.: 8.67  
##                                        Max.   :2020   Max.   :19.40

library(dplyr)

## Warning: package 'dplyr' was built under R version 4.4.3

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Ten entries of `Pais` with the highest mean per-capita alcohol consumption
# over every year present in `alcohol` (missing values are ignored).
# NOTE(review): `Pais` also holds aggregates such as "World" or
# "European Union (27)"; they are not excluded from this ranking.
top_10_global <- alcohol %>%
  group_by(Pais) %>%
  summarise(media_consumo = mean(alcohol_pc, na.rm = TRUE)) %>%
  arrange(desc(media_consumo)) %>%
  slice_head(n = 10)

# Bar chart of the global top 10; las = 2 draws the country labels
# perpendicular to the axis.
barplot(
  height = top_10_global$media_consumo,
  names.arg = top_10_global$Pais,
  main = "Top 10 países por consumo medio de alcohol per cápita (2000–2020)",
  ylab = "Litros de alcohol puro por persona",
  las = 2,
  col = "darkorange"
)

# Same ranking as the global top 10, restricted to the years 2010 to 2020.
top_10_2010_2020 <- alcohol %>%
  filter(Año >= 2010, Año <= 2020) %>%
  group_by(Pais) %>%
  summarise(media_consumo = mean(alcohol_pc, na.rm = TRUE)) %>%
  arrange(desc(media_consumo)) %>%
  slice_head(n = 10)

# Show the 2010-2020 ranking table.
print(top_10_2010_2020)

## # A tibble: 10 × 2
##    Pais      media_consumo
##    <chr>             <dbl>
##  1 Romania            17.1
##  2 Georgia            14.3
##  3 Estonia            13.7
##  4 Lithuania          13.6
##  5 Czechia            13.1
##  6 Uganda             13.0
##  7 Belarus            12.3
##  8 Germany            12.2
##  9 Austria            12.1
## 10 Latvia             11.9

# Bar chart of the 2010-2020 top 10, with perpendicular axis labels (las = 2).
barplot(
  height = top_10_2010_2020$media_consumo,
  names.arg = top_10_2010_2020$Pais,
  main = "Top 10 países por consumo medio de alcohol (2010–2020)",
  ylab = "Litros de alcohol puro per cápita",
  las = 2,
  col = "steelblue"
)

# Per-year mean, median, maximum and minimum of per-capita consumption
# across all entries of `alcohol`.
alcohol %>%
  group_by(Año) %>%
  summarise(
    Media = mean(alcohol_pc, na.rm = TRUE),
    Mediana = median(alcohol_pc, na.rm = TRUE),
    Maximo = max(alcohol_pc, na.rm = TRUE),
    Minimo = min(alcohol_pc, na.rm = TRUE)
  ) %>%
  # n = Inf prints every row, so the table stays complete if the number of
  # years in the data changes (previously hard-coded to 21).
  print(n = Inf)

## # A tibble: 21 × 5
##      Año Media Mediana Maximo Minimo
##    <int> <dbl>   <dbl>  <dbl>  <dbl>
##  1  2000  5.44    4.57   19.0      0
##  2  2001  5.44    4.57   19.0      0
##  3  2002  5.42    4.57   19.4      0
##  4  2003  5.43    4.56   19.3      0
##  5  2004  5.48    4.55   18.7      0
##  6  2005  5.54    4.81   18.3      0
##  7  2006  5.61    4.82   18.1      0
##  8  2007  5.63    4.88   18.3      0
##  9  2008  5.59    4.71   18.4      0
## 10  2009  5.54    4.99   18.1      0
## 11  2010  5.59    5.23   17.6      0
## 12  2011  5.58    5.08   17.3      0
## 13  2012  5.59    5.09   17.2      0
## 14  2013  5.56    5.08   17.1      0
## 15  2014  5.53    5.1    17.0      0
## 16  2015  5.53    5.15   16.8      0
## 17  2016  5.46    5.03   16.8      0
## 18  2017  5.42    5.01   16.9      0
## 19  2018  5.40    4.95   17.0      0
## 20  2019  5.44    4.98   17.0      0
## 21  2020  5.05    4.3    16.8      0

# Country whose series is plotted; change this value to inspect another one.
pais <- "Romania"
datos_pais <- alcohol %>%
  filter(Pais == pais)

# Line chart of that country's per-capita consumption by year.
plot(
  x = datos_pais$Año,
  y = datos_pais$alcohol_pc,
  type = "l",
  col = "darkgreen",
  main = paste("Evolución del consumo de alcohol en", pais),
  xlab = "Año",
  ylab = "Litros per cápita"
)

# List every distinct value of `Pais`; the printed list includes aggregates
# (e.g. "World", income groups, "(WB)" regions) alongside countries.
unique(alcohol$Pais)

##   [1] "Afghanistan"                       "Albania"                          
##   [3] "Algeria"                           "Andorra"                          
##   [5] "Angola"                            "Antigua and Barbuda"              
##   [7] "Argentina"                         "Armenia"                          
##   [9] "Australia"                         "Austria"                          
##  [11] "Azerbaijan"                        "Bahamas"                          
##  [13] "Bahrain"                           "Bangladesh"                       
##  [15] "Barbados"                          "Belarus"                          
##  [17] "Belgium"                           "Belize"                           
##  [19] "Benin"                             "Bhutan"                           
##  [21] "Bolivia"                           "Bosnia and Herzegovina"           
##  [23] "Botswana"                          "Brazil"                           
##  [25] "Brunei"                            "Bulgaria"                         
##  [27] "Burkina Faso"                      "Burundi"                          
##  [29] "Cambodia"                          "Cameroon"                         
##  [31] "Canada"                            "Cape Verde"                       
##  [33] "Central African Republic"          "Chad"                             
##  [35] "Chile"                             "China"                            
##  [37] "Colombia"                          "Comoros"                          
##  [39] "Congo"                             "Costa Rica"                       
##  [41] "Cote d'Ivoire"                     "Croatia"                          
##  [43] "Cuba"                              "Cyprus"                           
##  [45] "Czechia"                           "Democratic Republic of Congo"     
##  [47] "Denmark"                           "Djibouti"                         
##  [49] "Dominica"                          "Dominican Republic"               
##  [51] "East Asia and Pacific (WB)"        "East Timor"                       
##  [53] "Ecuador"                           "Egypt"                            
##  [55] "El Salvador"                       "Equatorial Guinea"                
##  [57] "Eritrea"                           "Estonia"                          
##  [59] "Eswatini"                          "Ethiopia"                         
##  [61] "Europe and Central Asia (WB)"      "European Union (27)"              
##  [63] "Fiji"                              "Finland"                          
##  [65] "France"                            "Gabon"                            
##  [67] "Gambia"                            "Georgia"                          
##  [69] "Germany"                           "Ghana"                            
##  [71] "Greece"                            "Grenada"                          
##  [73] "Guatemala"                         "Guinea"                           
##  [75] "Guinea-Bissau"                     "Guyana"                           
##  [77] "Haiti"                             "High-income countries"            
##  [79] "Honduras"                          "Hungary"                          
##  [81] "Iceland"                           "India"                            
##  [83] "Indonesia"                         "Iran"                             
##  [85] "Iraq"                              "Ireland"                          
##  [87] "Israel"                            "Italy"                            
##  [89] "Jamaica"                           "Japan"                            
##  [91] "Jordan"                            "Kazakhstan"                       
##  [93] "Kenya"                             "Kiribati"                         
##  [95] "Kuwait"                            "Kyrgyzstan"                       
##  [97] "Laos"                              "Latin America and Caribbean (WB)" 
##  [99] "Latvia"                            "Lebanon"                          
## [101] "Lesotho"                           "Liberia"                          
## [103] "Libya"                             "Lithuania"                        
## [105] "Low-income countries"              "Lower-middle-income countries"    
## [107] "Luxembourg"                        "Madagascar"                       
## [109] "Malawi"                            "Malaysia"                         
## [111] "Maldives"                          "Mali"                             
## [113] "Malta"                             "Mauritania"                       
## [115] "Mauritius"                         "Mexico"                           
## [117] "Micronesia (country)"              "Middle East and North Africa (WB)"
## [119] "Middle-income countries"           "Moldova"                          
## [121] "Mongolia"                          "Montenegro"                       
## [123] "Morocco"                           "Mozambique"                       
## [125] "Myanmar"                           "Namibia"                          
## [127] "Nauru"                             "Nepal"                            
## [129] "Netherlands"                       "New Zealand"                      
## [131] "Nicaragua"                         "Niger"                            
## [133] "Nigeria"                           "North America (WB)"               
## [135] "North Korea"                       "North Macedonia"                  
## [137] "Norway"                            "Oman"                             
## [139] "Pakistan"                          "Panama"                           
## [141] "Papua New Guinea"                  "Paraguay"                         
## [143] "Peru"                              "Philippines"                      
## [145] "Poland"                            "Portugal"                         
## [147] "Qatar"                             "Romania"                          
## [149] "Russia"                            "Rwanda"                           
## [151] "Saint Kitts and Nevis"             "Saint Lucia"                      
## [153] "Saint Vincent and the Grenadines"  "Samoa"                            
## [155] "Sao Tome and Principe"             "Saudi Arabia"                     
## [157] "Senegal"                           "Serbia"                           
## [159] "Seychelles"                        "Sierra Leone"                     
## [161] "Singapore"                         "Slovakia"                         
## [163] "Slovenia"                          "Solomon Islands"                  
## [165] "Somalia"                           "South Africa"                     
## [167] "South Asia (WB)"                   "South Korea"                      
## [169] "Spain"                             "Sri Lanka"                        
## [171] "Sub-Saharan Africa (WB)"           "Sudan"                            
## [173] "Suriname"                          "Sweden"                           
## [175] "Switzerland"                       "Syria"                            
## [177] "Tajikistan"                        "Tanzania"                         
## [179] "Thailand"                          "Togo"                             
## [181] "Tonga"                             "Trinidad and Tobago"              
## [183] "Tunisia"                           "Turkey"                           
## [185] "Turkmenistan"                      "Tuvalu"                           
## [187] "Uganda"                            "Ukraine"                          
## [189] "United Arab Emirates"              "United Kingdom"                   
## [191] "United States"                     "Upper-middle-income countries"    
## [193] "Uruguay"                           "Uzbekistan"                       
## [195] "Vanuatu"                           "Venezuela"                        
## [197] "Vietnam"                           "World"                            
## [199] "Yemen"                             "Zambia"                           
## [201] "Zimbabwe"

# Load the drug-use dataset from the working directory and print
# per-column summary statistics.
drogas <- read.csv(file = "drogas.csv")
summary(drogas)

##    measure            location             sex                age           
##  Length:4334        Length:4334        Length:4334        Length:4334       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##     cause              metric               year           val          
##  Length:4334        Length:4334        Min.   :2000   Min.   :       8  
##  Class :character   Class :character   1st Qu.:2005   1st Qu.:    6420  
##  Mode  :character   Mode  :character   Median :2010   Median :   38961  
##                                        Mean   :2010   Mean   :  679981  
##                                        3rd Qu.:2016   3rd Qu.:  125853  
##                                        Max.   :2021   Max.   :53115936  
##      upper              lower         
##  Min.   :      11   Min.   :       6  
##  1st Qu.:    8053   1st Qu.:    5204  
##  Median :   47652   Median :   31934  
##  Mean   :  797174   Mean   :  589700  
##  3rd Qu.:  154580   3rd Qu.:  103273  
##  Max.   :61090513   Max.   :46999805

# Number of rows in the raw drug-use table.
nrow(drogas)

## [1] 4334

# Preview the first rows to check column contents.
head(drogas)

##       measure   location   sex              age             cause metric year
## 1 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2000
## 2 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2001
## 3 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2002
## 4 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2003
## 5 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2004
## 6 Prevalencia Tayikistán Ambos Todas las edades Consumo de drogas Número 2005
##        val    upper    lower
## 1 32081.73 40445.17 25557.24
## 2 32969.77 41633.51 26381.13
## 3 34000.50 43044.28 27277.57
## 4 35145.11 44575.37 28256.00
## 5 36358.59 46071.35 29370.24
## 6 37654.04 47621.54 30286.06

#unique(drogas$location)

# Keep only the rows for both sexes combined ("Ambos") and all ages
# ("Todas las edades").
drogas_filtrado <- filter(
  drogas,
  sex == "Ambos" & age == "Todas las edades"
)

# Ten locations with the highest mean `val` over 2010-2020. No aggregate
# regions are excluded at this step.
top_drogas <- drogas_filtrado %>%
  filter(year >= 2010 & year <= 2020) %>%
  group_by(location) %>%
  summarise(media_val = mean(val, na.rm = TRUE)) %>%
  arrange(desc(media_val)) %>%
  slice_head(n = 10)

# Show the unfiltered ranking (the output includes world and regional rows).
print(top_drogas)

## # A tibble: 10 × 2
##    location                              media_val
##    <chr>                                     <dbl>
##  1 Mundo                                 49818439.
##  2 Asia Oriental & Pacífico - BM         13856811.
##  3 América del Norte                     10024918.
##  4 Estados Unidos de América              9259663.
##  5 Europa &  Asia Central - BM            8375617.
##  6 Asia del Sur - BM                      7459573.
##  7 India                                  5867460.
##  8 América Latina & Caribe - BM           4760204.
##  9 África Subsahariana - BM               3089650.
## 10 África del Norte & Medio Oriente - BM  2201053.

# Top 10 by mean `val` for 2010-2020 after dropping aggregate rows:
# anything containing "BM", plus "Mundo" and "América del Norte".
drogas_filtrado_paises <- drogas_filtrado %>%
  filter(
    year >= 2010 & year <= 2020,
    !grepl("BM", location, fixed = TRUE),
    location != "Mundo" & location != "América del Norte"
  ) %>%
  group_by(location) %>%
  summarise(media_val = mean(val, na.rm = TRUE)) %>%
  arrange(desc(media_val)) %>%
  slice_head(n = 10)

# Auto-print the country-only ranking.
drogas_filtrado_paises

## # A tibble: 10 × 2
##    location                  media_val
##    <chr>                         <dbl>
##  1 Estados Unidos de América  9259663.
##  2 India                      5867460.
##  3 Brasil                     2177749.
##  4 Rusia                      1804688.
##  5 Reino Unido                1223663.
##  6 Indonesia                  1218309.
##  7 Japón                       827584.
##  8 Pakistán                    782775.
##  9 Canadá                      764784.
## 10 Alemania                    693049.

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.3

# Yearly rows (2010-2020) for the ten locations selected in
# `drogas_filtrado_paises`.
top_10_nombres <- drogas_filtrado_paises[["location"]]
datos_top10 <- filter(
  drogas_filtrado,
  location %in% top_10_nombres,
  year >= 2010 & year <= 2020
)


# Yearly trend of `val` for the ten selected locations, one line per location.
# Line thickness is set with `linewidth`; using `size` for lines is
# deprecated since ggplot2 3.4.0 and raised a warning when knitting.
ggplot(datos_top10, aes(x = year, y = val, color = location)) +
  geom_line(linewidth = 1) +
  labs(title = "Evolución del impacto de drogas (2010–2020)",
       x = "Año",
       y = "Número estimado de personas",
       color = "País") +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Give `drogas` the join keys used by the other tables. Columns are renamed
# by name rather than by position, so a change in the CSV column order fails
# loudly instead of silently renaming the wrong column.
drogas <- drogas %>%
  rename(Año = year, Pais = location)

# Load the physical-inactivity dataset and print per-column summary
# statistics.
deporte <- read.csv(file = "deporte.csv")
summary(deporte)

##  IndicatorCode       Indicator          ValueType         ParentLocationCode
##  Length:3690        Length:3690        Length:3690        Length:3690       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  ParentLocation     Location.type      SpatialDimValueCode   Location        
##  Length:3690        Length:3690        Length:3690         Length:3690       
##  Class :character   Class :character   Class :character    Class :character  
##  Mode  :character   Mode  :character   Mode  :character    Mode  :character  
##                                                                              
##                                                                              
##                                                                              
##  Period.type            Period     IsLatestYear        Dim1.type        
##  Length:3690        Min.   :2000   Length:3690        Length:3690       
##  Class :character   1st Qu.:2003   Class :character   Class :character  
##  Mode  :character   Median :2007   Mode  :character   Mode  :character  
##                     Mean   :2007                                        
##                     3rd Qu.:2011                                        
##                     Max.   :2014                                        
##      Dim1           Dim1ValueCode       Dim2.type             Dim2          
##  Length:3690        Length:3690        Length:3690        Length:3690       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Dim2ValueCode      Dim3.type        Dim3         Dim3ValueCode 
##  Length:3690        Mode:logical   Mode:logical   Mode:logical  
##  Class :character   NA's:3690      NA's:3690      NA's:3690     
##  Mode  :character                                               
##                                                                 
##                                                                 
##                                                                 
##  DataSourceDimValueCode DataSource     FactValueNumericPrefix FactValueNumeric
##  Mode:logical           Mode:logical   Mode:logical           Min.   : 6.26   
##  NA's:3690              NA's:3690      NA's:3690              1st Qu.:17.71   
##                                                               Median :24.59   
##                                                               Mean   :26.33   
##                                                               3rd Qu.:32.88   
##                                                               Max.   :68.77   
##  FactValueUoM   FactValueNumericLowPrefix FactValueNumericLow
##  Mode:logical   Mode:logical              Min.   : 2.40      
##  NA's:3690      NA's:3690                 1st Qu.: 7.72      
##                                           Median :12.98      
##                                           Mean   :15.17      
##                                           3rd Qu.:19.64      
##                                           Max.   :57.78      
##  FactValueNumericHighPrefix FactValueNumericHigh    Value          
##  Mode:logical               Min.   : 9.95        Length:3690       
##  NA's:3690                  1st Qu.:29.28        Class :character  
##                             Median :39.50        Mode  :character  
##                             Mean   :40.46                          
##                             3rd Qu.:50.86                          
##                             Max.   :82.28                          
##  FactValueTranslationID FactComments     Language         DateModified      
##  Mode:logical           Mode:logical   Length:3690        Length:3690       
##  NA's:3690              NA's:3690      Class :character   Class :character  
##                                        Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##

A PARTIR DE AQUÍ VAMOS A UNIR LOS DATOS

# Load the diabetes and obesity datasets; only the diabetes table is
# summarised here.
diabetes <- read.csv(file = "diabetes.csv")
obesidad <- read.csv(file = "obesidad.csv")
summary(diabetes)

##      Pais               ISO                Sexo                Año      
##  Length:13650       Length:13650       Length:13650       Min.   :1980  
##  Class :character   Class :character   Class :character   1st Qu.:1988  
##  Mode  :character   Mode  :character   Mode  :character   Median :1997  
##                                                           Mean   :1997  
##                                                           3rd Qu.:2006  
##                                                           Max.   :2014  
##   Prev.cruda       
##  Length:13650      
##  Class :character  
##  Mode  :character  
##                    
##                    
##

# Rename the columns used downstream by name (Location, Period and
# FactValueNumeric are columns 8, 10 and 24 of the raw file). Renaming by
# name keeps this correct if the column order of the CSV ever changes.
deporte <- deporte %>%
  rename(Pais = Location, Año = Period, inactividad_val = FactValueNumeric)

# Ten countries with the highest mean inactivity prevalence over all rows.
# NOTE(review): `deporte` carries Dim1/Dim2 breakdown columns; presumably
# several rows per country-year are averaged together here - confirm.
top10_deporte <- deporte %>%
  group_by(Pais) %>%
  summarise(media_prevalencia = mean(inactividad_val, na.rm = TRUE)) %>%
  arrange(desc(media_prevalencia)) %>%
  slice(1:10)

# All rows belonging to those ten countries.
deporte_top10 <- deporte %>%
  filter(Pais %in% top10_deporte$Pais)

# Smoothed trend per country. `linewidth` replaces `size`, which is
# deprecated for lines since ggplot2 3.4.0.
ggplot(deporte_top10, aes(x = Año, y = inactividad_val, color = Pais)) +
  geom_smooth(se = FALSE, linewidth = 1.2) +
  theme_minimal() +
  labs(title = "Evolución de la inactividad física (≥18 años)",
       subtitle = "Top 10 países con mayor prevalencia media",
       x = "Año", y = "Prevalencia de inactividad física (%)",
       color = "País")

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

library(readr)  # para parse_number()


# Parse the crude prevalence (stored as text) into a number, divide it by
# 100, then average it per country and year (rows for the different `Sexo`
# values are pooled).
# NOTE(review): the division by 100 presumably compensates for how
# parse_number() reads the raw text - confirm against diabetes.csv. The
# drop for Albania 2005 in the later output (3.77 after 6.55) looks
# suspicious.
diabetes <- diabetes %>%
  mutate(Prev.cruda = parse_number(as.character(Prev.cruda)) / 100) %>%
  group_by(Pais, Año) %>%
  summarise(diabetes_val = mean(Prev.cruda, na.rm = TRUE), .groups = "drop")


# Convert `Media` from text to numeric: swap decimal commas for dots, blank
# out anything that is not made only of digits and dots, then average per
# country and year.
obesidad <- obesidad %>%
  mutate(
    Media = gsub(",", ".", as.character(Media)),
    Media = as.numeric(replace(Media, !grepl("^[0-9\\.]+$", Media), NA))
  ) %>%
  group_by(Pais, Año) %>%
  summarise(obesidad_val = mean(Media, na.rm = TRUE), .groups = "drop")



# Full-join the five tables on country and year, left to right:
# deporte, drogas, alcohol, diabetes, obesidad.
# NOTE(review): the keys are matched as exact strings. `drogas` uses Spanish
# location names ("Tayikistán", "Estados Unidos de América") while `alcohol`
# uses English ones ("Tajikistan", "United States"), so only countries
# spelled identically in both languages can line up across tables.
obj6 <- Reduce(
  function(izquierda, derecha) {
    full_join(izquierda, derecha, by = c("Pais", "Año"))
  },
  list(deporte, drogas, alcohol, diabetes, obesidad)
)


# Keep the two keys and the five indicators; `val` (from the drug-use table)
# is renamed to `drogas_val` while selecting.
obj <- obj6 %>%
  select(
    Pais, Año,
    inactividad_val, diabetes_val, obesidad_val, alcohol_pc,
    drogas_val = val
  )

# Rows with no missing value in any column, and how many there are.
obj_nonulos <- filter(obj, complete.cases(obj))
nrow(obj_nonulos)

## [1] 1112

# Preview the complete-case table.
head(obj_nonulos)

##      Pais  Año inactividad_val diabetes_val obesidad_val alcohol_pc drogas_val
## 1 Vanuatu 2014           10.45     1.309390    18.906667       1.73   1711.533
## 2  Zambia 2014           10.77     4.230610     7.303333       3.67  50689.314
## 3 Senegal 2014           11.92     5.103360     7.753333       0.36  31789.552
## 4    Togo 2014           12.45     4.920040     7.763333       1.80  14002.996
## 5 Somalia 2014           13.90     4.852695    10.280000       0.00  48655.065
## 6 Estonia 2014           14.40     9.316385    20.966667      15.64  19930.933

# Years that survive the complete-case filter.
unique(obj_nonulos$Año)

##  [1] 2014 2013 2012 2011 2010 2009 2008 2007 2006 2005 2004 2003 2002 2001 2000

# Countries that survive the complete-case filter.
unique(obj_nonulos$Pais)

##  [1] "Vanuatu"      "Zambia"       "Senegal"      "Togo"         "Somalia"     
##  [6] "Estonia"      "Samoa"        "Guinea"       "Burkina Faso" "Eritrea"     
## [11] "Nigeria"      "Ecuador"      "Liberia"      "Dominica"     "Albania"     
## [16] "Georgia"      "Guatemala"    "Seychelles"   "Yemen"        "Tuvalu"      
## [21] "Guyana"       "Sri Lanka"    "Tonga"        "Colombia"     "Chile"       
## [26] "Paraguay"     "Jamaica"      "Uruguay"      "Andorra"      "El Salvador" 
## [31] "Nicaragua"    "Barbados"     "Argentina"    "Costa Rica"   "Cuba"        
## [36] "Portugal"     "Uganda"       "Serbia"

library(ggplot2)
library(tidyr)
# Complete cases reduced to one row per country and year.
# NOTE(review): when a country-year has several rows, slicing keeps
# whichever comes first; the 1112 complete rows against 38 countries and
# 15 years suggest such duplicates exist - confirm which row should win.
base_completa <- obj_nonulos %>%
  filter(complete.cases(.)) %>%
  group_by(Pais, Año) %>%
  slice_head(n = 1) %>%
  ungroup()
head(base_completa)

## # A tibble: 6 × 7
##   Pais      Año inactividad_val diabetes_val obesidad_val alcohol_pc drogas_val
##   <chr>   <int>           <dbl>        <dbl>        <dbl>      <dbl>      <dbl>
## 1 Albania  2000            16.0         5.72         12.8       4.43     16088.
## 2 Albania  2001            16.1         5.92         13.1       4.43     15896.
## 3 Albania  2002            16.2         6.13         13.4       4.57     15772.
## 4 Albania  2003            16.4         6.34         13.7       4.58     15684.
## 5 Albania  2004            16.5         6.55         14.1       4.98     15642.
## 6 Albania  2005            16.7         3.77         14.4       5.3      15640.

# Standardise the five indicators (centre and scale each column).
base_box <- as.data.frame(scale(select(
  base_completa,
  inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val
)))

# Long format: one row per (variable, value) pair, for faceting.
vars_long <- base_box %>%
  pivot_longer(cols = everything(),
               names_to = "variable",
               values_to = "valor")

# One histogram per standardised variable, each with free axes.
ggplot(data = vars_long, mapping = aes(x = valor)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  labs(title = "Distribución de cada variable") +
  theme_minimal()

# Keep a single row per country and year (the same reduction already applied
# when `base_completa` was built) and preview the result.
base_completa <- base_completa %>%
  group_by(Pais, Año) %>%
  slice_head(n = 1) %>%
  ungroup()
head(base_completa)

## # A tibble: 6 × 7
##   Pais      Año inactividad_val diabetes_val obesidad_val alcohol_pc drogas_val
##   <chr>   <int>           <dbl>        <dbl>        <dbl>      <dbl>      <dbl>
## 1 Albania  2000            16.0         5.72         12.8       4.43     16088.
## 2 Albania  2001            16.1         5.92         13.1       4.43     15896.
## 3 Albania  2002            16.2         6.13         13.4       4.57     15772.
## 4 Albania  2003            16.4         6.34         13.7       4.58     15684.
## 5 Albania  2004            16.5         6.55         14.1       4.98     15642.
## 6 Albania  2005            16.7         3.77         14.4       5.3      15640.

# Composite indices built as sums of z-scores.
# scale() returns a one-column matrix (with centring/scaling attributes), so
# the sums are converted to plain numeric vectors; otherwise the new columns
# would be stored as matrix-columns inside the tibble.
base_completa$unhealthy_index <- as.numeric(
  scale(base_completa$inactividad_val) +
    scale(base_completa$alcohol_pc) +
    scale(base_completa$drogas_val)
)

base_completa$diabesity_index <- as.numeric(
  scale(base_completa$diabetes_val) +
    scale(base_completa$obesidad_val)
)

# Reshape the five indicators to long format for plotting.
base_long_filtrada <- base_completa %>%
  select(inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val) %>%
  pivot_longer(cols = everything(),
               names_to = "variable",
               values_to = "valor")

# One histogram per variable, each with free axes.
# The title previously said outliers had been removed, but `base_completa`
# is plotted here without any outlier filtering, so it now describes what
# is actually shown (unscaled values).
ggplot(base_long_filtrada, aes(x = valor)) +
  geom_histogram(bins = 30, fill = "skyblue", color = "white") +
  facet_wrap(~ variable, scales = "free", ncol = 2) +
  theme_minimal() +
  labs(title = "Distribución de variables (valores originales)",
       x = "Valor", y = "Frecuencia")

# Yearly rows of `alcohol` for the countries listed in `top_10_global`.
# Note that this overwrites the ranking table `top_10_global` with the
# yearly data for those same countries.
top_10_global <- alcohol %>%
  filter(Pais %in% top_10_global$Pais)

# One line (with points) per country. `linewidth` sets the line thickness;
# `size` is deprecated for lines since ggplot2 3.4.0 and still applies to
# the points.
ggplot(top_10_global, aes(x = Año, y = alcohol_pc, color = Pais)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "Evolución del consumo de alcohol per cápita",
       subtitle = "Top 10 países con mayor consumo medio anual",
       x = "Año", y = "Litros de alcohol puro por persona (por año)",
       color = "País")

# Standardised copy of the five indicators.
base_escalada <- as.data.frame(scale(select(
  base_completa,
  inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val
)))

# Same five columns, reshaped to long format for the boxplot.
base_box <- select(
  base_escalada,
  inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val
)
base_long <- pivot_longer(
  base_box,
  cols = everything(),
  names_to = "variable",
  values_to = "valor"
)

# One box per variable on the standardised scale; outliers drawn in red.
ggplot(data = base_long, mapping = aes(x = variable, y = valor)) +
  geom_boxplot(fill = "orange", outlier.color = "red", outlier.shape = 16) +
  labs(title = "Boxplot de variables de salud (sin duplicados)",
       x = "Variable", y = "Valor") +
  theme_minimal()

# Upper outlier fence for obesity: third quartile plus 1.5 times the
# interquartile range.
q1 <- quantile(base_completa$obesidad_val, probs = 0.25, na.rm = TRUE)
q3 <- quantile(base_completa$obesidad_val, probs = 0.75, na.rm = TRUE)
iqr <- q3 - q1
limite_sup <- q3 + 1.5 * iqr

# Country-years above the fence, listed from highest to lowest. The rows are
# only listed here; they are not removed from `base_completa`.
outliers_obesidad <- filter(base_completa, obesidad_val > limite_sup)
outliers_obesidad %>%
  select(Pais, Año, obesidad_val) %>%
  arrange(desc(obesidad_val))

## # A tibble: 45 × 3
##    Pais     Año obesidad_val
##    <chr>  <int>        <dbl>
##  1 Tonga   2014         64.3
##  2 Tonga   2013         63.4
##  3 Tonga   2012         62.5
##  4 Tonga   2011         61.7
##  5 Tonga   2010         60.8
##  6 Tonga   2009         60.0
##  7 Tuvalu  2014         59.9
##  8 Tuvalu  2013         59.4
##  9 Tonga   2008         59.2
## 10 Tuvalu  2012         58.9
## # ℹ 35 more rows

# One row per country: the mean over years of every indicator and index.
# NOTE(review): the two composite indices are sums of the standardised
# indicators that are also included here, so the feature set is linearly
# dependent (the later PCA reports two standard deviations of about 1e-16)
# and those indicators are effectively counted twice in the clustering.
datos_clustering <- base_completa %>%
  select(
    Pais,
    inactividad_val, alcohol_pc, drogas_val, obesidad_val, diabetes_val,
    unhealthy_index, diabesity_index
  ) %>%
  group_by(Pais) %>%
  summarise(across(everything(), \(x) mean(x, na.rm = TRUE)), .groups = "drop")

# Keep the country names aside and standardise the numeric columns.
nombres_paises <- datos_clustering$Pais

datos_numericos <- as.data.frame(scale(select(datos_clustering, -Pais)))

# Total within-cluster sum of squares for k = 1..10 (25 random starts each),
# seeded for reproducibility.
set.seed(123)
wss <- vapply(
  1:10,
  function(k) kmeans(datos_numericos, centers = k, nstart = 25)$tot.withinss,
  numeric(1)
)

# Elbow plot: within-cluster sum of squares against the number of clusters.
plot(
  x = 1:10,
  y = wss,
  type = "b",
  pch = 19,
  main = "Método del codo para elegir k",
  xlab = "Número de clústeres (k)",
  ylab = "Suma de cuadrados intra-cluster"
)

# Final k-means fit with four clusters (seeded, 25 random starts). The
# assignment is stored as a factor and the cluster sizes are tabulated.
set.seed(123)
kmeans_4 <- kmeans(x = datos_numericos, centers = 4, nstart = 25)
datos_clustering$cluster <- factor(kmeans_4$cluster)
table(datos_clustering$cluster)

## 
##  1  2  3  4 
## 15  3 17  3

# Mean of the two composite indices within each cluster.
# Extra arguments are passed through an anonymous function because the `...`
# argument of across() is deprecated since dplyr 1.1.0.
datos_clustering %>%
  group_by(cluster) %>%
  summarise(across(unhealthy_index:diabesity_index, \(x) mean(x, na.rm = TRUE)))

## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(unhealthy_index:diabesity_index, mean, na.rm = TRUE)`.
## ℹ In group 1: `cluster = 1`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
## 
##   # Previously
##   across(a:b, mean, na.rm = TRUE)
## 
##   # Now
##   across(a:b, \(x) mean(x, na.rm = TRUE))

## # A tibble: 4 × 3
##   cluster unhealthy_index diabesity_index
##   <fct>             <dbl>           <dbl>
## 1 1                -1.42         -0.973  
## 2 2                -1.10          1.46   
## 3 3                 0.988         0.670  
## 4 4                 3.68          0.00893

# PCA on the standardised indicators; the first two components give a 2-D
# view of the countries, coloured by their k-means cluster.
pca = prcomp(datos_numericos)

plot_df = data.frame(
  pca$x[, c("PC1", "PC2")],
  cluster = datos_clustering$cluster,
  Pais = datos_clustering$Pais
)

# Scatter of PC1 vs PC2 with country labels (overlapping labels are skipped).
ggplot(plot_df, aes(x = PC1, y = PC2, colour = cluster, label = Pais)) +
  geom_point(size = 3) +
  geom_text(vjust = -1, size = 3, check_overlap = TRUE) +
  labs(
    title = "Clustering de países (k = 4)",
    subtitle = "Según variables de estilo de vida y salud",
    colour = "Grupo"
  ) +
  theme_minimal()

# Refit the PCA and print its standard deviations and rotation (loadings).
pca = prcomp(datos_numericos)
print(pca)

## Standard deviations (1, .., p=7):
## [1] 1.700626e+00 1.347256e+00 9.923901e-01 9.072133e-01 6.963480e-01
## [6] 2.825783e-16 1.839496e-16
## 
## Rotation (n x k) = (7 x 7):
##                       PC1        PC2          PC3         PC4        PC5
## inactividad_val 0.4798780  0.1338865 -0.004298093  0.19219243  0.7476315
## alcohol_pc      0.3224834 -0.2952051  0.036661804 -0.80186427 -0.1473517
## drogas_val      0.1836989 -0.4766814 -0.501830687  0.46890498 -0.3546527
## obesidad_val    0.2169776  0.5955268 -0.435125893 -0.11149228 -0.2197580
## diabetes_val    0.3661399 -0.1195941  0.702522827  0.27489172 -0.2802223
## unhealthy_index 0.5085035 -0.3181621 -0.236100659 -0.06781491  0.1417656
## diabesity_index 0.4384778  0.4444334  0.090482121  0.08722973 -0.3829366
##                        PC6        PC7
## inactividad_val -0.3260077  0.2227135
## alcohol_pc      -0.3120449  0.2131747
## drogas_val      -0.3106539  0.2122244
## obesidad_val    -0.3328654 -0.4872479
## diabetes_val    -0.2547852 -0.3729543
## unhealthy_index  0.6177735 -0.4220344
## diabesity_index  0.3774681  0.5525373

# Biplot inputs: country scores on the components and variable loadings.
scores = as.data.frame(pca$x)
scores$Pais = datos_clustering$Pais
loadings = as.data.frame(pca$rotation)
loadings$Variable = rownames(loadings)

# Biplot: countries as grey points, variables as red arrows from the origin.
# Loadings are multiplied by 5 (arrow tips) and 5.2 (labels), presumably so
# the arrows are visible at the scale of the country scores.
ggplot() +
  geom_point(data = scores, aes(x = PC1, y = PC2), color = "gray30") +
  geom_text(data = scores, aes(x = PC1, y = PC2, label = Pais),
            size = 3, vjust = 1.2, alpha = 0.8) +
  # Line thickness is set with linewidth; using size for lines is
  # deprecated since ggplot2 3.4.0.
  geom_segment(data = loadings,
               aes(x = 0, y = 0, xend = PC1 * 5, yend = PC2 * 5),
               arrow = arrow(length = unit(0.3, "cm")),
               color = "red", linewidth = 1) +
  geom_text(data = loadings,
            aes(x = PC1 * 5.2, y = PC2 * 5.2, label = Variable),
            color = "red", size = 4) +
  labs(title = "Biplot de PCA",
       subtitle = "Paises en espacio PC1-PC2 + vectores de variables",
       x = "Componente Principal 1",
       y = "Componente Principal 2") +
  theme_minimal()

# Indicators to correlate, taken from every row of base_completa.
vars_spearman = select(
  base_completa,
  inactividad_val, diabetes_val, obesidad_val, alcohol_pc, drogas_val,
  unhealthy_index, diabesity_index
)

# Spearman rank-correlation matrix; rows with any missing value are dropped.
matriz_spearman = cor(vars_spearman, use = "complete.obs", method = "spearman")
library(ggplot2)
library(reshape2)

## 
## Adjuntando el paquete: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

# Reshape the correlation matrix to long format (Var1, Var2, value) for ggplot.
cor_long = melt(matriz_spearman)

# Heatmap of the correlations with the rounded coefficient printed in each
# tile. The colour scale is pinned to [-1, 1] and centred on 0; the argument
# is spelled `limits` in full instead of relying on partial matching.
ggplot(cor_long, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "red", high = "blue", mid = "white",
                       midpoint = 0, limits = c(-1, 1), space = "Lab") +
  geom_text(aes(label = round(value, 2)), size = 4) +
  theme_minimal() +
  labs(title = "Matriz de correlación (Spearman)",
       x = "", y = "") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

library(rnaturalearth)

## Warning: package 'rnaturalearth' was built under R version 4.4.3

library(rnaturalearthdata)

## Warning: package 'rnaturalearthdata' was built under R version 4.4.3

## 
## Adjuntando el paquete: 'rnaturalearthdata'

## The following object is masked from 'package:rnaturalearth':
## 
##     countries110

library(sf)

## Warning: package 'sf' was built under R version 4.4.3

## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE

# Load the world country geometries (medium scale) as an sf object.
mundo = ne_countries(scale = "medium", returnclass = "sf")

# Only countries whose name appears in mundo$name can be drawn. Report the
# ones that do not match instead of dropping them silently, so their names
# can be recoded if they should be on the map.
paises_sin_mapa = setdiff(datos_clustering$Pais, mundo$name)
if (length(paises_sin_mapa) > 0) {
  message("Países sin coincidencia en el mapa (se omiten): ",
          paste(paises_sin_mapa, collapse = ", "))
}
datos_clustering = datos_clustering %>%
  filter(Pais %in% mundo$name)

# Make cluster a factor with an explicit level order.
datos_clustering$cluster = factor(datos_clustering$cluster, levels = c(1, 2, 3, 4))

# Attach the cluster data to the map; countries without a match get NA.
mapa_clusters = left_join(mundo, datos_clustering, by = c("name" = "Pais"))

# Fixed colour per cluster, intended to match the colours of the PCA plot.
colores_cluster = c("1" = "#F8766D",  # pink (cluster 1)
                    "2" = "#7CAE00",  # lime green (cluster 2)
                    "3" = "#00BFC4",  # turquoise (cluster 3)
                    "4" = "#C77CFF")  # lilac (cluster 4)

# World map coloured by cluster on a light grey background.
# Polygon borders are set with linewidth: since ggplot2 3.4.0 `size` only
# applies to points in geom_sf(), so `size = 0.1` did not thin the borders.
ggplot() +
  # Background: every country in grey.
  geom_sf(data = mundo, fill = "gray90", color = "white", linewidth = 0.1) +
  # mapa_clusters holds every country (left join from mundo); na.translate =
  # FALSE below keeps the countries without a cluster out of the fill scale.
  geom_sf(data = mapa_clusters, aes(fill = cluster), color = "white", linewidth = 0.1) +
  scale_fill_manual(values = colores_cluster, na.translate = FALSE) +
  labs(title = "Mapa mundial por clúster de estilo de vida y salud",
       fill = "Clúster") +
  theme_minimal()

# Reload the world geometries and attach each country's cluster data;
# countries without a match in datos_clustering get NA.
mundo = ne_countries(returnclass = "sf", scale = "medium")
mapa_clusters = mundo %>%
  left_join(datos_clustering, by = c("name" = "Pais"))

# One centroid per country, used to overlay a marker on small islands.
centroides = st_centroid(mapa_clusters)

## Warning: st_centroid assumes attributes are constant over geometries

# Keep only the centroids of countries that were assigned a cluster.
centroides_cluster = filter(centroides, !is.na(cluster))

# World map: every country in grey, clustered countries filled by cluster,
# and a ring at each clustered country's centroid so small islands show up.
ggplot() +
  # Background: all countries in grey. Borders use linewidth because since
  # ggplot2 3.4.0 `size` only applies to points in geom_sf().
  geom_sf(data = mundo, fill = "grey85", color = "white", linewidth = 0.1) +
  # Only countries with a cluster are drawn here; otherwise the NA rows of
  # the left join are filled with the scale's default NA colour on top of
  # the grey background and add an NA key to the legend.
  geom_sf(data = filter(mapa_clusters, !is.na(cluster)),
          aes(fill = factor(cluster)), color = "white", linewidth = 0.1) +
  # Hollow rings (shape 21 with no fill) coloured by cluster.
  geom_sf(data = centroides_cluster, aes(color = factor(cluster)),
          shape = 21, size = 2, stroke = 1.2, fill = NA) +
  scale_fill_manual(values = c("1" = "red", "2" = "green", "3" = "cyan", "4" = "purple")) +
  scale_color_manual(values = c("1" = "red", "2" = "green", "3" = "cyan", "4" = "purple")) +
  # Same title for both scales so the fill and colour legends merge instead
  # of showing a second legend titled "factor(cluster)".
  labs(title = "Mapa mundial por clúster de estilo de vida y salud",
       fill = "Clúster", color = "Clúster") +
  theme_minimal()

limpieza

fernanda mara de paula gonçalves

2025-05-25

R Markdown

Including Plots