# Load the suicide-statistics CSV into a data frame and preview its first rows.
dfsui <- read.csv(
  file = "/Users/juansebastianquintanacontreras/Documents/6 semestre/DATAVIZ/master 2.csv"
)
head(x = dfsui)
## country year sex age suicides_no population suicides.100k.pop
## 1 Albania 1987 male 15-24 years 21 312900 6.71
## 2 Albania 1987 male 35-54 years 16 308000 5.19
## 3 Albania 1987 female 15-24 years 14 289700 4.83
## 4 Albania 1987 male 75+ years 1 21800 4.59
## 5 Albania 1987 male 25-34 years 9 274300 3.28
## 6 Albania 1987 female 75+ years 1 35600 2.81
## country.year HDI.for.year gdp_for_year.... gdp_per_capita.... generation
## 1 Albania1987 NA 2,156,624,900 796 Generation X
## 2 Albania1987 NA 2,156,624,900 796 Silent
## 3 Albania1987 NA 2,156,624,900 796 Generation X
## 4 Albania1987 NA 2,156,624,900 796 G.I. Generation
## 5 Albania1987 NA 2,156,624,900 796 Boomers
## 6 Albania1987 NA 2,156,624,900 796 G.I. Generation
# Per-column summary statistics for the full dataset.
summary(object = dfsui)
## country year sex age
## Length:27820 Min. :1985 Length:27820 Length:27820
## Class :character 1st Qu.:1995 Class :character Class :character
## Mode :character Median :2002 Mode :character Mode :character
## Mean :2001
## 3rd Qu.:2008
## Max. :2016
##
## suicides_no population suicides.100k.pop country.year
## Min. : 0.0 Min. : 278 Min. : 0.00 Length:27820
## 1st Qu.: 3.0 1st Qu.: 97498 1st Qu.: 0.92 Class :character
## Median : 25.0 Median : 430150 Median : 5.99 Mode :character
## Mean : 242.6 Mean : 1844794 Mean : 12.82
## 3rd Qu.: 131.0 3rd Qu.: 1486143 3rd Qu.: 16.62
## Max. :22338.0 Max. :43805214 Max. :224.97
##
## HDI.for.year gdp_for_year.... gdp_per_capita.... generation
## Min. :0.483 Length:27820 Min. : 251 Length:27820
## 1st Qu.:0.713 Class :character 1st Qu.: 3447 Class :character
## Median :0.779 Mode :character Median : 9372 Mode :character
## Mean :0.777 Mean : 16866
## 3rd Qu.:0.855 3rd Qu.: 24874
## Max. :0.944 Max. :126352
## NA's :19456
# Column names of the dataset.
colnames(dfsui)
## [1] "country" "year" "sex"
## [4] "age" "suicides_no" "population"
## [7] "suicides.100k.pop" "country.year" "HDI.for.year"
## [10] "gdp_for_year...." "gdp_per_capita...." "generation"
# Number of rows and columns of the dataset.
c(nrow(dfsui), ncol(dfsui))
## [1] 27820 12
La base de datos dfsui contiene 27,820 observaciones y
12 variables.
Las variables son:

- country (Categórica)
- sex (Categórica)
- age (Categórica)
- generation (Categórica)
- year (Numérica)
- suicides_no (Numérica)
- population (Numérica)
- suicides.100k.pop (Numérica)
- gdp_per_capita.... (Numérica)
- HDI.for.year (Numérica)
- gdp_for_year.... (Numérica, aunque se lee como texto por los separadores de miles)
- country.year (Categórica: texto que concatena país y año, p. ej. Albania1987)

En el análisis inicial, observamos una gran cantidad de valores
faltantes en la variable HDI.for.year, con 19,456 valores
NA.
Dado esto, y basándonos en la revisión de valores extraños realizada en la tarea #2, se procederá a una revisión más detallada de la base de datos utilizando operadores pipe para identificar de manera eficiente cualquier valor faltante o registro duplicado.
## [1] "Valores faltantes por variable:"
## # A tibble: 1 × 2
## variable n_faltantes
## <chr> <int>
## 1 HDI.for.year 19456
## [1] "Número de registros duplicados: 0"
Después de realizar un análisis de la base de datos
dfsui, se identificó que solo la variable
HDI.for.year contiene valores faltantes (19,456 NA).
Todas las demás variables de la base de datos contienen datos válidos y
utilizables.
Además, se verificó que no hay registros duplicados en toda la base de datos. Esto asegura que los datos disponibles no están repetidos.
# Keep only the rows for Colombia.
master_col <- filter(dfsui, country == "Colombia")
# Keep only the rows for the United States.
master_eu <- filter(dfsui, country == "United States")
# Preview the Colombian subset.
head(x = master_col)
## country year sex age suicides_no population suicides.100k.pop
## 1 Colombia 1985 male 75+ years 21 123400 17.02
## 2 Colombia 1985 male 55-74 years 113 1015200 11.13
## 3 Colombia 1985 male 25-34 years 193 2323700 8.31
## 4 Colombia 1985 male 15-24 years 256 3190200 8.02
## 5 Colombia 1985 male 35-54 years 188 2451100 7.67
## 6 Colombia 1985 female 15-24 years 117 3140700 3.73
## country.year HDI.for.year gdp_for_year.... gdp_per_capita.... generation
## 1 Colombia1985 0.573 34,894,411,352 1393 G.I. Generation
## 2 Colombia1985 0.573 34,894,411,352 1393 G.I. Generation
## 3 Colombia1985 0.573 34,894,411,352 1393 Boomers
## 4 Colombia1985 0.573 34,894,411,352 1393 Generation X
## 5 Colombia1985 0.573 34,894,411,352 1393 Silent
## 6 Colombia1985 0.573 34,894,411,352 1393 Generation X
# Number of rows and columns of the Colombian subset.
c(nrow(master_col), ncol(master_col))
## [1] 372 12
# Preview the United States subset.
head(x = master_eu)
## country year sex age suicides_no population
## 1 United States 1985 male 75+ years 2177 4064000
## 2 United States 1985 male 55-74 years 5302 17971000
## 3 United States 1985 male 25-34 years 5134 20986000
## 4 United States 1985 male 35-54 years 6053 26589000
## 5 United States 1985 male 15-24 years 4267 19962000
## 6 United States 1985 female 35-54 years 2105 27763000
## suicides.100k.pop country.year HDI.for.year gdp_for_year....
## 1 53.57 United States1985 0.841 4,346,734,000,000
## 2 29.50 United States1985 0.841 4,346,734,000,000
## 3 24.46 United States1985 0.841 4,346,734,000,000
## 4 22.77 United States1985 0.841 4,346,734,000,000
## 5 21.38 United States1985 0.841 4,346,734,000,000
## 6 7.58 United States1985 0.841 4,346,734,000,000
## gdp_per_capita.... generation
## 1 19693 G.I. Generation
## 2 19693 G.I. Generation
## 3 19693 Boomers
## 4 19693 Silent
## 5 19693 Generation X
## 6 19693 Silent
# Number of rows and columns of the United States subset.
c(nrow(master_eu), ncol(master_eu))
## [1] 372 12
# Suicides per 100,000 inhabitants in Colombia.
# master_col has several rows per year (one per sex/age group), so plotting the
# raw column with type = "l" zigzags through every group within each year.
# Collapse to a single national rate per year first: total suicides over total
# population, scaled to 100,000 inhabitants. aggregate() returns the years in
# ascending order, so the line runs chronologically.
tasa_anual_col <- aggregate(
  cbind(suicides_no, population) ~ year,
  data = master_col,
  FUN = sum
)
tasa_anual_col$tasa_100k <-
  1e5 * tasa_anual_col$suicides_no / tasa_anual_col$population
plot(tasa_anual_col$year, tasa_anual_col$tasa_100k,
  type = "l", col = "#EE5C42",
  xlab = "Año", ylab = "Suicidios por 100,000 Habitantes",
  main = "Suicidios por 100,000 Habitantes en Colombia")
# GDP per capita in Colombia.
# The GDP figure is repeated on every sex/age row of a given year, so passing
# the raw column to barplot() draws one bar per row instead of one per year.
# Keep the first row of each year and sort chronologically before plotting.
pib_anual_col <- master_col[
  !duplicated(master_col$year),
  c("year", "gdp_per_capita....")
]
pib_anual_col <- pib_anual_col[order(pib_anual_col$year), ]
barplot(pib_anual_col$gdp_per_capita....,
  names.arg = pib_anual_col$year, las = 2,
  col = "#836FFF", xlab = "Año", ylab = "PIB per Cápita",
  main = "PIB per Cápita en Colombia")
# HDI in Colombia.
# The HDI value is repeated on every sex/age row of a year, and HDI.for.year
# has many missing values in the dataset. With type = "l" on the raw column,
# NA rows break the line and isolated years are not drawn. Keep one row per
# year, drop years without HDI, and sort chronologically.
hdi_anual_col <- master_col[
  !duplicated(master_col$year),
  c("year", "HDI.for.year")
]
hdi_anual_col <- hdi_anual_col[!is.na(hdi_anual_col$HDI.for.year), ]
hdi_anual_col <- hdi_anual_col[order(hdi_anual_col$year), ]
# type = "b" marks each observed year with a point, so gaps between the
# available years remain visible even though the line connects them.
plot(hdi_anual_col$year, hdi_anual_col$HDI.for.year,
  type = "b", col = "#6C7B8B",
  xlab = "Año", ylab = "IDH",
  main = "Evolución del IDH en Colombia")
# Suicides per 100,000 inhabitants in the United States.
# master_eu has several rows per year (one per sex/age group), so plotting the
# raw column with type = "l" zigzags through every group within each year.
# Collapse to a single national rate per year first: total suicides over total
# population, scaled to 100,000 inhabitants. aggregate() returns the years in
# ascending order, so the line runs chronologically.
tasa_anual_eu <- aggregate(
  cbind(suicides_no, population) ~ year,
  data = master_eu,
  FUN = sum
)
tasa_anual_eu$tasa_100k <-
  1e5 * tasa_anual_eu$suicides_no / tasa_anual_eu$population
plot(tasa_anual_eu$year, tasa_anual_eu$tasa_100k,
  type = "l", col = "#FFD700",
  xlab = "Año", ylab = "Suicidios por 100,000 Habitantes",
  main = "Suicidios por 100,000 Habitantes en Estados Unidos")
# GDP per capita in the United States.
# The GDP figure is repeated on every sex/age row of a given year, so passing
# the raw column to barplot() draws one bar per row instead of one per year.
# Keep the first row of each year and sort chronologically before plotting.
pib_anual_eu <- master_eu[
  !duplicated(master_eu$year),
  c("year", "gdp_per_capita....")
]
pib_anual_eu <- pib_anual_eu[order(pib_anual_eu$year), ]
barplot(pib_anual_eu$gdp_per_capita....,
  names.arg = pib_anual_eu$year, las = 2,
  col = "aquamarine4", xlab = "Año", ylab = "PIB per Cápita",
  main = "PIB per Cápita en Estados Unidos")
# HDI in the United States.
# The HDI value is repeated on every sex/age row of a year, and HDI.for.year
# has many missing values in the dataset. With type = "l" on the raw column,
# NA rows break the line and isolated years are not drawn. Keep one row per
# year, drop years without HDI, and sort chronologically.
hdi_anual_eu <- master_eu[
  !duplicated(master_eu$year),
  c("year", "HDI.for.year")
]
hdi_anual_eu <- hdi_anual_eu[!is.na(hdi_anual_eu$HDI.for.year), ]
hdi_anual_eu <- hdi_anual_eu[order(hdi_anual_eu$year), ]
# type = "b" marks each observed year with a point, so gaps between the
# available years remain visible even though the line connects them.
plot(hdi_anual_eu$year, hdi_anual_eu$HDI.for.year,
  type = "b", col = "#556B2F",
  xlab = "Año", ylab = "IDH",
  main = "Evolución del IDH en Estados Unidos")
# Mean suicide rate per 100,000 inhabitants in Colombia by year and sex
# (unweighted mean over the rows of each year/sex group).
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
generc <- master_col %>%
  group_by(year, sex) %>%
  summarise(
    suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean GDP per capita in Colombia by year and sex.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
percac <- master_col %>%
  group_by(year, sex) %>%
  summarise(
    gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean HDI in Colombia by year and sex.
# Groups whose HDI values are all NA yield NaN, because na.rm = TRUE leaves
# nothing to average.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
hdic <- master_col %>%
  group_by(year, sex) %>%
  summarise(
    hdi_mean = mean(HDI.for.year, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values (`geom_line()`).
# Mean suicide rate per 100,000 inhabitants in the United States by year and
# sex (unweighted mean over the rows of each year/sex group).
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
genereu <- master_eu %>%
  group_by(year, sex) %>%
  summarise(
    suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean GDP per capita in the United States by year and sex.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
percaeu <- master_eu %>%
  group_by(year, sex) %>%
  summarise(
    gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean HDI in the United States by year and sex.
# Groups whose HDI values are all NA yield NaN, because na.rm = TRUE leaves
# nothing to average.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
hdieu <- master_eu %>%
  group_by(year, sex) %>%
  summarise(
    hdi_mean = mean(HDI.for.year, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
## Warning: Removed 2 rows containing missing values (`geom_line()`).
# Mean suicide rate per 100,000 inhabitants in Colombia by year and age group
# (unweighted mean over the rows of each year/age group).
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
suiciedac <- master_col %>%
  group_by(year, age) %>%
  summarise(
    suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean GDP per capita in Colombia by year and age group.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
percaedac <- master_col %>%
  group_by(year, age) %>%
  summarise(
    gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean HDI in Colombia by year and age group.
# Groups whose HDI values are all NA yield NaN, because na.rm = TRUE leaves
# nothing to average.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
hdiedac <- master_col %>%
  group_by(year, age) %>%
  summarise(
    hdi_mean = mean(HDI.for.year, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Line chart of the mean suicide rate per 100,000 inhabitants in Colombia,
# with one coloured line per age group.
ggplot(
  data = suiciedac,
  mapping = aes(x = year, y = suicides_mean, colour = age)
) +
  geom_line() +
  theme_minimal() +
  labs(
    x = "Año",
    y = "Suicidios por 100,000 Habitantes",
    title = "Suicidios por 100,000 Habitantes por Grupo de Edad en Colombia"
  )
## Warning: Removed 6 rows containing missing values (`geom_line()`).
# Mean suicide rate per 100,000 inhabitants in the United States by year and
# age group (unweighted mean over the rows of each year/age group).
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
suiciedaeu <- master_eu %>%
  group_by(year, age) %>%
  summarise(
    suicides_mean = mean(suicides.100k.pop, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean GDP per capita in the United States by year and age group.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
percaedaeu <- master_eu %>%
  group_by(year, age) %>%
  summarise(
    gdp_mean = mean(gdp_per_capita...., na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Mean HDI in the United States by year and age group.
# Groups whose HDI values are all NA yield NaN, because na.rm = TRUE leaves
# nothing to average.
# .groups = "drop" returns an ungrouped result; without it the output stays
# grouped by year and summarise() prints a message about it.
hdiedaeu <- master_eu %>%
  group_by(year, age) %>%
  summarise(
    hdi_mean = mean(HDI.for.year, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Line chart of the mean suicide rate per 100,000 inhabitants in the United
# States, with one coloured line per age group.
ggplot(
  data = suiciedaeu,
  mapping = aes(x = year, y = suicides_mean, colour = age)
) +
  geom_line() +
  theme_minimal() +
  labs(
    x = "Año",
    y = "Suicidios por 100,000 Habitantes",
    title = "Suicidios por 100,000 Habitantes por Grupo de Edad en Estados Unidos"
  )
## Warning: Removed 6 rows containing missing values (`geom_line()`).