# Read the suicide-rates CSV into a data frame; the first row is the header.
# NOTE(review): the path is absolute and specific to one machine.
dfsui <- read.csv(
  file = "/Users/juansebastianquintanacontreras/Documents/6 semestre/DATAVIZ/master 2.csv",
  header = TRUE
)
# Preview the first six rows as a quick check of the import.
head(dfsui, n = 6L)
##   country year    sex         age suicides_no population suicides.100k.pop
## 1 Albania 1987   male 15-24 years          21     312900              6.71
## 2 Albania 1987   male 35-54 years          16     308000              5.19
## 3 Albania 1987 female 15-24 years          14     289700              4.83
## 4 Albania 1987   male   75+ years           1      21800              4.59
## 5 Albania 1987   male 25-34 years           9     274300              3.28
## 6 Albania 1987 female   75+ years           1      35600              2.81
##   country.year HDI.for.year gdp_for_year.... gdp_per_capita....      generation
## 1  Albania1987           NA    2,156,624,900                796    Generation X
## 2  Albania1987           NA    2,156,624,900                796          Silent
## 3  Albania1987           NA    2,156,624,900                796    Generation X
## 4  Albania1987           NA    2,156,624,900                796 G.I. Generation
## 5  Albania1987           NA    2,156,624,900                796         Boomers
## 6  Albania1987           NA    2,156,624,900                796 G.I. Generation
# Per-column summary: quantiles for numeric columns, length/class for text ones.
summary(object = dfsui)
##    country               year          sex                age           
##  Length:27820       Min.   :1985   Length:27820       Length:27820      
##  Class :character   1st Qu.:1995   Class :character   Class :character  
##  Mode  :character   Median :2002   Mode  :character   Mode  :character  
##                     Mean   :2001                                        
##                     3rd Qu.:2008                                        
##                     Max.   :2016                                        
##                                                                         
##   suicides_no        population       suicides.100k.pop country.year      
##  Min.   :    0.0   Min.   :     278   Min.   :  0.00    Length:27820      
##  1st Qu.:    3.0   1st Qu.:   97498   1st Qu.:  0.92    Class :character  
##  Median :   25.0   Median :  430150   Median :  5.99    Mode  :character  
##  Mean   :  242.6   Mean   : 1844794   Mean   : 12.82                      
##  3rd Qu.:  131.0   3rd Qu.: 1486143   3rd Qu.: 16.62                      
##  Max.   :22338.0   Max.   :43805214   Max.   :224.97                      
##                                                                           
##   HDI.for.year   gdp_for_year....   gdp_per_capita....  generation       
##  Min.   :0.483   Length:27820       Min.   :   251     Length:27820      
##  1st Qu.:0.713   Class :character   1st Qu.:  3447     Class :character  
##  Median :0.779   Mode  :character   Median :  9372     Mode  :character  
##  Mean   :0.777                      Mean   : 16866                       
##  3rd Qu.:0.855                      3rd Qu.: 24874                       
##  Max.   :0.944                      Max.   :126352                       
##  NA's   :19456
# List the column names of the dataset.
colnames(dfsui)
##  [1] "country"            "year"               "sex"               
##  [4] "age"                "suicides_no"        "population"        
##  [7] "suicides.100k.pop"  "country.year"       "HDI.for.year"      
## [10] "gdp_for_year...."   "gdp_per_capita...." "generation"
# Number of rows and columns in the full dataset.
dim(dfsui)
## [1] 27820    12

Revisión de la Base de Datos

La base de datos dfsui contiene 27,820 observaciones y 12 variables.

  1. country (Categórica)
  2. sex (Categórica)
  3. age (Categórica)
  4. generation (Categórica)
  5. year (Numérica)
  6. suicides_no (Numérica)
  7. population (Numérica)
  8. suicides.100k.pop (Numérica)
  9. gdp_per_capita.... (Numérica)
  10. HDI.for.year (Numérica)
  11. gdp_for_year.... (Numérica, pero importada como texto por los separadores de miles)
  12. country.year (Categórica)

En el análisis inicial, observamos una gran cantidad de valores faltantes en la variable HDI.for.year, con 19,456 valores NA (aproximadamente el 70 % de las 27,820 observaciones).

Dado esto, y basándonos en la revisión de valores extraños realizada en la tarea #2, se procederá a una revisión más detallada de la base de datos utilizando operadores pipe para identificar de manera eficiente cualquier valor faltante o duplicado.

## [1] "Valores faltantes por variable:"
## # A tibble: 1 × 2
##   variable     n_faltantes
##   <chr>              <int>
## 1 HDI.for.year       19456
## [1] "Número de registros duplicados: 0"

Resultados del Análisis de Calidad de Datos

Después de realizar un análisis de la base de datos dfsui, se identificó que la variable HDI.for.year contiene valores faltantes (NA). Todas las demás variables de la base de datos contienen datos válidos y utilizables.

Además, se verificó que no hay registros duplicados en toda la base de datos. Esto asegura que los datos disponibles no están repetidos.

# Keep only the rows whose country is Colombia.
master_col <- filter(dfsui, country == "Colombia")

# Keep only the rows whose country is the United States.
master_eu <- filter(dfsui, country == "United States")

# Preview the first six rows of the Colombia subset.
head(master_col, n = 6L)
##    country year    sex         age suicides_no population suicides.100k.pop
## 1 Colombia 1985   male   75+ years          21     123400             17.02
## 2 Colombia 1985   male 55-74 years         113    1015200             11.13
## 3 Colombia 1985   male 25-34 years         193    2323700              8.31
## 4 Colombia 1985   male 15-24 years         256    3190200              8.02
## 5 Colombia 1985   male 35-54 years         188    2451100              7.67
## 6 Colombia 1985 female 15-24 years         117    3140700              3.73
##   country.year HDI.for.year gdp_for_year.... gdp_per_capita....      generation
## 1 Colombia1985        0.573   34,894,411,352               1393 G.I. Generation
## 2 Colombia1985        0.573   34,894,411,352               1393 G.I. Generation
## 3 Colombia1985        0.573   34,894,411,352               1393         Boomers
## 4 Colombia1985        0.573   34,894,411,352               1393    Generation X
## 5 Colombia1985        0.573   34,894,411,352               1393          Silent
## 6 Colombia1985        0.573   34,894,411,352               1393    Generation X
# Number of rows and columns in the Colombia subset.
dim(master_col)
## [1] 372  12
# Preview the first six rows of the United States subset.
head(master_eu, n = 6L)
##         country year    sex         age suicides_no population
## 1 United States 1985   male   75+ years        2177    4064000
## 2 United States 1985   male 55-74 years        5302   17971000
## 3 United States 1985   male 25-34 years        5134   20986000
## 4 United States 1985   male 35-54 years        6053   26589000
## 5 United States 1985   male 15-24 years        4267   19962000
## 6 United States 1985 female 35-54 years        2105   27763000
##   suicides.100k.pop      country.year HDI.for.year  gdp_for_year....
## 1             53.57 United States1985        0.841 4,346,734,000,000
## 2             29.50 United States1985        0.841 4,346,734,000,000
## 3             24.46 United States1985        0.841 4,346,734,000,000
## 4             22.77 United States1985        0.841 4,346,734,000,000
## 5             21.38 United States1985        0.841 4,346,734,000,000
## 6              7.58 United States1985        0.841 4,346,734,000,000
##   gdp_per_capita....      generation
## 1              19693 G.I. Generation
## 2              19693 G.I. Generation
## 3              19693         Boomers
## 4              19693          Silent
## 5              19693    Generation X
## 6              19693          Silent
# Number of rows and columns in the United States subset.
dim(master_eu)
## [1] 372  12

Análisis de Evolución de Suicidios por cada 100K, PIB per Cápita e IDH

En Colombia:

# master_col holds several rows per year (one per sex and age group), so
# plotting the raw rows draws many values per year: a zigzag line and repeated
# bars. Collapse the data to one record per year before plotting.

# Yearly national rate: total suicides over total population, per 100,000.
col_anual <- aggregate(
  cbind(suicides_no, population) ~ year,
  data = master_col, FUN = sum
)
col_anual$tasa_100k <- 1e5 * col_anual$suicides_no / col_anual$population

# GDP per capita and HDI are repeated on the rows of a year (see the head()
# output), so the first row of each year is kept.
# NOTE(review): confirm both are constant within every year.
col_pais <- master_col[
  !duplicated(master_col$year),
  c("year", "gdp_per_capita....", "HDI.for.year")
]
col_pais <- col_pais[order(col_pais$year), ]

# Suicides per 100,000 inhabitants in Colombia (one point per year)
plot(col_anual$year, col_anual$tasa_100k,
     type = "l", col = "#EE5C42",
     xlab = "Año", ylab = "Suicidios por 100,000 Habitantes",
     main = "Suicidios por 100,000 Habitantes en Colombia")

# GDP per capita in Colombia (one bar per year)
barplot(col_pais$gdp_per_capita...., names.arg = col_pais$year, las = 2,
        col = "#836FFF", xlab = "Año", ylab = "PIB per Cápita",
        main = "PIB per Cápita en Colombia")

# HDI in Colombia. Years with a missing HDI are dropped so the line joins the
# available observations; type = "o" also marks each observed year, which keeps
# isolated observations visible.
col_hdi_ok <- !is.na(col_pais$HDI.for.year)
plot(col_pais$year[col_hdi_ok], col_pais$HDI.for.year[col_hdi_ok],
     type = "o", col = "#6C7B8B",
     xlab = "Año", ylab = "IDH",
     main = "Evolución del IDH en Colombia")

En EE. UU.:

# master_eu holds several rows per year (one per sex and age group), so
# plotting the raw rows draws many values per year: a zigzag line and repeated
# bars. Collapse the data to one record per year before plotting.

# Yearly national rate: total suicides over total population, per 100,000.
eu_anual <- aggregate(
  cbind(suicides_no, population) ~ year,
  data = master_eu, FUN = sum
)
eu_anual$tasa_100k <- 1e5 * eu_anual$suicides_no / eu_anual$population

# GDP per capita and HDI are repeated on the rows of a year (see the head()
# output), so the first row of each year is kept.
# NOTE(review): confirm both are constant within every year.
eu_pais <- master_eu[
  !duplicated(master_eu$year),
  c("year", "gdp_per_capita....", "HDI.for.year")
]
eu_pais <- eu_pais[order(eu_pais$year), ]

# Suicides per 100,000 inhabitants in the United States (one point per year)
plot(eu_anual$year, eu_anual$tasa_100k,
     type = "l", col = "#FFD700",
     xlab = "Año", ylab = "Suicidios por 100,000 Habitantes",
     main = "Suicidios por 100,000 Habitantes en Estados Unidos")

# GDP per capita in the United States (one bar per year)
barplot(eu_pais$gdp_per_capita...., names.arg = eu_pais$year, las = 2,
        col = "aquamarine4", xlab = "Año", ylab = "PIB per Cápita",
        main = "PIB per Cápita en Estados Unidos")

# HDI in the United States. Years with a missing HDI are dropped so the line
# joins the available observations; type = "o" also marks each observed year,
# which keeps isolated observations visible.
eu_hdi_ok <- !is.na(eu_pais$HDI.for.year)
plot(eu_pais$year[eu_hdi_ok], eu_pais$HDI.for.year[eu_hdi_ok],
     type = "o", col = "#556B2F",
     xlab = "Año", ylab = "IDH",
     main = "Evolución del IDH en Estados Unidos")

Análisis de la Evolución de Variables por Género en Colombia y Estados Unidos

En Colombia:

# Yearly means by sex for Colombia. `.groups = "drop"` returns ungrouped
# results (instead of leaving them grouped by year) and silences the
# "summarise() has grouped output" message.
# NOTE(review): suicides_mean is the unweighted mean of the per-row rates; a
# population-weighted rate would be 1e5 * sum(suicides_no) / sum(population).
generc <- master_col %>%
  group_by(year, sex) %>%
  summarise(suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
            .groups = "drop")

# Mean GDP per capita per year and sex.
percac <- master_col %>%
  group_by(year, sex) %>%
  summarise(gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
            .groups = "drop")

# Mean HDI per year and sex; a group whose HDI values are all NA yields NaN
# because na.rm = TRUE leaves nothing to average.
hdic <- master_col %>%
  group_by(year, sex) %>%
  summarise(hdi_mean = mean(HDI.for.year, na.rm = TRUE),
            .groups = "drop")

## Warning: Removed 2 rows containing missing values (`geom_line()`).

En EE. UU.:

# Yearly means by sex for the United States. `.groups = "drop"` returns
# ungrouped results (instead of leaving them grouped by year) and silences the
# "summarise() has grouped output" message.
# NOTE(review): suicides_mean is the unweighted mean of the per-row rates; a
# population-weighted rate would be 1e5 * sum(suicides_no) / sum(population).
genereu <- master_eu %>%
  group_by(year, sex) %>%
  summarise(suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
            .groups = "drop")

# Mean GDP per capita per year and sex.
percaeu <- master_eu %>%
  group_by(year, sex) %>%
  summarise(gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
            .groups = "drop")

# Mean HDI per year and sex; a group whose HDI values are all NA yields NaN
# because na.rm = TRUE leaves nothing to average.
hdieu <- master_eu %>%
  group_by(year, sex) %>%
  summarise(hdi_mean = mean(HDI.for.year, na.rm = TRUE),
            .groups = "drop")

## Warning: Removed 2 rows containing missing values (`geom_line()`).

Análisis de la Evolución de Variables por Edad en Colombia y Estados Unidos

En Colombia:

# Yearly means by age group for Colombia. `.groups = "drop"` returns ungrouped
# results (instead of leaving them grouped by year) and silences the
# "summarise() has grouped output" message.
# NOTE(review): suicides_mean is the unweighted mean of the per-row rates; a
# population-weighted rate would be 1e5 * sum(suicides_no) / sum(population).
suiciedac <- master_col %>%
  group_by(year, age) %>%
  summarise(suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
            .groups = "drop")

# Mean GDP per capita per year and age group.
percaedac <- master_col %>%
  group_by(year, age) %>%
  summarise(gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
            .groups = "drop")

# Mean HDI per year and age group; a group whose HDI values are all NA yields
# NaN because na.rm = TRUE leaves nothing to average.
hdiedac <- master_col %>%
  group_by(year, age) %>%
  summarise(hdi_mean = mean(HDI.for.year, na.rm = TRUE),
            .groups = "drop")
# Line chart of the mean suicide rate per year, one coloured line per age group.
ggplot(data = suiciedac,
       mapping = aes(x = year, y = suicides_mean, colour = age)) +
  geom_line() +
  ggtitle("Suicidios por 100,000 Habitantes por Grupo de Edad en Colombia") +
  xlab("Año") +
  ylab("Suicidios por 100,000 Habitantes") +
  theme_minimal()

## Warning: Removed 6 rows containing missing values (`geom_line()`).

En EE. UU.:

# Yearly means by age group for the United States. `.groups = "drop"` returns
# ungrouped results (instead of leaving them grouped by year) and silences the
# "summarise() has grouped output" message.
# NOTE(review): suicides_mean is the unweighted mean of the per-row rates; a
# population-weighted rate would be 1e5 * sum(suicides_no) / sum(population).
suiciedaeu <- master_eu %>%
  group_by(year, age) %>%
  summarise(suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
            .groups = "drop")

# Mean GDP per capita per year and age group.
percaedaeu <- master_eu %>%
  group_by(year, age) %>%
  summarise(gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
            .groups = "drop")

# Mean HDI per year and age group; a group whose HDI values are all NA yields
# NaN because na.rm = TRUE leaves nothing to average.
hdiedaeu <- master_eu %>%
  group_by(year, age) %>%
  summarise(hdi_mean = mean(HDI.for.year, na.rm = TRUE),
            .groups = "drop")
# Line chart of the mean suicide rate per year, one coloured line per age group.
ggplot(data = suiciedaeu,
       mapping = aes(x = year, y = suicides_mean, colour = age)) +
  geom_line() +
  ggtitle("Suicidios por 100,000 Habitantes por Grupo de Edad en Estados Unidos") +
  xlab("Año") +
  ylab("Suicidios por 100,000 Habitantes") +
  theme_minimal()

## Warning: Removed 6 rows containing missing values (`geom_line()`).