data <- data %>%
filter(year != 2016)
# b) excluding countries with <= 3 years of data:
minimum_years <- data %>%
group_by(country) %>%
summarize(rows = n(),
years = rows / 12) %>%
arrange(years)
data <- data %>%
filter(!(country %in% head(minimum_years$country, 7)))
# Making age ordinal
data$age <- factor(data$age,
ordered = T,
levels = c("5-14 years",
"15-24 years",
"25-34 years",
"35-54 years",
"55-74 years",
"75+ years"))
data$country <- as.factor(data$country)
data$age <- as.factor(data$age)
data$sex <- as.factor(data$sex)
data %>% head() %>% knitr::kable()
| country | year | sex | age | suicides_no | population | gdp_for_year | suicides_per_100K | gdp_per_capita |
|---|---|---|---|---|---|---|---|---|
| Albania | 1987 | male | 15-24 years | 21 | 312900 | 2156624900 | 6.71 | 796 |
| Albania | 1987 | male | 35-54 years | 16 | 308000 | 2156624900 | 5.19 | 796 |
| Albania | 1987 | female | 15-24 years | 14 | 289700 | 2156624900 | 4.83 | 796 |
| Albania | 1987 | male | 75+ years | 1 | 21800 | 2156624900 | 4.59 | 796 |
| Albania | 1987 | male | 25-34 years | 9 | 274300 | 2156624900 | 3.28 | 796 |
| Albania | 1987 | female | 75+ years | 1 | 35600 | 2156624900 | 2.81 | 796 |
data %>% dim()
[1] 27492 9
data %>% summary()
country year sex age suicides_no
Argentina: 372 Min. :1985 female:13746 5-14 years :4582 Min. : 0.0
Austria : 372 1st Qu.:1995 male :13746 15-24 years:4582 1st Qu.: 3.0
Belgium : 372 Median :2002 25-34 years:4582 Median : 25.0
Brazil : 372 Mean :2001 35-54 years:4582 Mean : 244.9
Chile : 372 3rd Qu.:2008 55-74 years:4582 3rd Qu.: 133.0
Colombia : 372 Max. :2015 75+ years :4582 Max. :22338.0
(Other) :25260
population gdp_for_year suicides_per_100K gdp_per_capita
Min. : 278 Min. :4.692e+07 Min. : 0.00 Min. : 251
1st Qu.: 99298 1st Qu.:9.025e+09 1st Qu.: 0.95 1st Qu.: 3418
Median : 436562 Median :4.819e+10 Median : 6.04 Median : 9283
Mean : 1861366 Mean :4.497e+11 Mean : 12.87 Mean : 16799
3rd Qu.: 1503556 3rd Qu.:2.617e+11 3rd Qu.: 16.68 3rd Qu.: 24870
Max. :43805214 Max. :1.812e+13 Max. :224.97 Max. :126352
# the global rate over the time period will be useful:
average <- (sum(as.numeric(data$suicides_no)) / sum(as.numeric(data$population))) * 100000
data %>%
group_by(year) %>%
summarize(population = sum(population),
suicides = sum(suicides_no),
suicides_per_100k = (suicides / population) * 100000) %>%
ggplot(aes(x = year, y = suicides_per_100k)) +
geom_line(col = "coral", size = 1) +
geom_point(col = "coral", size = 2) +
geom_hline(yintercept = average, linetype = 2, color = "brown", size = 1) +
labs(title = "Global Suicides per 100k",
subtitle = "Trend over time, 1985 - 2015.",
x = "Year",
y = "Suicides per 100k") +
scale_x_continuous(breaks = seq(1985, 2015, 5) , minor_breaks = FALSE) +
scale_y_continuous(breaks = seq(10, 20))
Insights
The peak global suicide rate reached 15.3 deaths per 100k population in 1995. Since then, it has steadily decreased, reaching 11.5 deaths per 100k in 2015. This represents a reduction of approximately 25% over the course of 20 years.
sex_plot <- data %>%
group_by(sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = sex, y = suicide_per_100k, fill = sex)) +
geom_bar(stat = "identity") +
labs(title = "Global suicides (per 100k), by Sex",
x = "Sex",
y = "Suicides per 100k") +
theme(legend.position = "none") +
scale_y_continuous(breaks = seq(0, 25), minor_breaks = F)
### with time
sex_time_plot <- data %>%
group_by(year, sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = factor(sex))) +
facet_grid(sex ~ ., scales = "free_y") +
geom_line() +
geom_point() +
labs(title = "Trends Over Time, by Sex",
x = "Year",
y = "Suicides per 100k",
color = "Sex") +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F)
grid.arrange(sex_plot, sex_time_plot, ncol = 2)
Insights
The global suicide rate has consistently been approximately 3.5 times higher for men compared to women. Both male and female suicide rates reached their highest point in 1995 and have been declining since then. This ratio of 3.5:1 (male to female) has remained relatively stable since the mid-1990s. However, during the 1980s, this ratio was as low as 2.7:1 (male to female).
age_plot <- data %>%
group_by(age) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = age, y = suicide_per_100k, fill = age)) +
geom_bar(stat = "identity") +
labs(title = "Global suicides per 100k, by Age",
x = "Age",
y = "Suicides per 100k") +
theme(legend.position = "none") +
scale_y_continuous(breaks = seq(0, 30, 1), minor_breaks = F) +
theme(
text = element_text(size = 10) ,
axis.text.x = element_text(angle=45) , legend.position="none")
### with time
age_time_plot <- data %>%
group_by(year, age) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = age)) +
facet_grid(age ~ ., scales = "free_y") +
geom_line() +
geom_point() +
labs(title = "Trends Over Time, by Age",
x = "Year",
y = "Suicides per 100k",
color = "Age") +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F)
grid.arrange(age_plot, age_time_plot, ncol = 2)
Insights
Globally, there is a clear correlation between age and the likelihood of suicide, with the risk increasing as individuals get older. Moreover, starting from 1995, there has been a consistent linear decrease in the suicide rate for individuals aged 15 and above. Notably, the suicide rate among individuals aged 75 and older has declined by over 50% since 1990. In contrast, the suicide rate in the ‘5-14’ age category has remained relatively stable and low, with less than 1 per 100k population per year.
country <- data %>%
group_by(country) %>%
summarize(
suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
arrange(desc(suicide_per_100k))
country$country <- factor(country$country,
ordered = T,
levels = rev(country$country))
kable(country)
| country | suicide_per_100k |
|---|---|
| Lithuania | 41.4641006 |
| Russian Federation | 32.7772066 |
| Sri Lanka | 30.4839387 |
| Belarus | 30.3446849 |
| Hungary | 30.0224072 |
| Latvia | 28.4710108 |
| Kazakhstan | 26.8986144 |
| Slovenia | 26.3604769 |
| Estonia | 25.9645245 |
| Ukraine | 24.8703975 |
| Finland | 23.7286200 |
| Japan | 21.9205801 |
| Belgium | 20.6925347 |
| Austria | 20.6761930 |
| Guyana | 20.6452838 |
| Croatia | 20.1285551 |
| France | 19.6992767 |
| Republic of Korea | 19.3166521 |
| Serbia | 19.1683634 |
| Suriname | 18.6094161 |
| Switzerland | 17.4786980 |
| Czech Republic | 16.6037953 |
| Cuba | 16.4494134 |
| Poland | 16.0587863 |
| Bulgaria | 15.6885430 |
| Uruguay | 15.6276498 |
| Luxembourg | 15.1160072 |
| Sweden | 14.9548512 |
| Germany | 14.3841922 |
| New Zealand | 13.9518285 |
| Denmark | 13.6358508 |
| Norway | 13.2777918 |
| Iceland | 13.1217940 |
| Canada | 13.0210902 |
| Trinidad and Tobago | 12.9276557 |
| Australia | 12.9265987 |
| United States | 12.8384592 |
| Romania | 12.6829028 |
| Slovakia | 12.0151604 |
| Mauritius | 11.7090106 |
| Ireland | 11.5332736 |
| Kyrgyzstan | 11.3653939 |
| Singapore | 10.7053679 |
| Netherlands | 10.6147679 |
| Chile | 9.5632835 |
| El Salvador | 9.3701103 |
| Portugal | 9.2096414 |
| Puerto Rico | 8.5807853 |
| Spain | 8.1620246 |
| Montenegro | 8.0893073 |
| Aruba | 8.0179284 |
| Argentina | 7.9363085 |
| United Kingdom | 7.8679282 |
| Italy | 7.7000953 |
| Turkmenistan | 7.4617040 |
| Uzbekistan | 7.1548906 |
| Kiribati | 7.1466520 |
| Seychelles | 7.0361053 |
| Thailand | 6.9563081 |
| Costa Rica | 6.6473023 |
| Israel | 6.5256158 |
| Nicaragua | 6.3416289 |
| Ecuador | 6.0000931 |
| Saint Lucia | 5.8815707 |
| Belize | 5.5334579 |
| Malta | 5.2104659 |
| Saint Vincent and Grenadines | 5.0868414 |
| Panama | 4.9569022 |
| Colombia | 4.7908647 |
| Brazil | 4.6665699 |
| Mexico | 4.0079566 |
| Greece | 3.9127341 |
| Paraguay | 3.8373912 |
| Fiji | 3.7130265 |
| Cyprus | 3.4507703 |
| Georgia | 3.4444886 |
| Albania | 3.1608267 |
| Barbados | 2.8485485 |
| Bahrain | 2.7635314 |
| Guatemala | 2.6686723 |
| Qatar | 2.5740911 |
| Armenia | 2.4523624 |
| Turkey | 2.0990775 |
| Philippines | 2.0026887 |
| United Arab Emirates | 1.7040034 |
| Kuwait | 1.6914729 |
| Grenada | 1.6752826 |
| Azerbaijan | 1.4813450 |
| Bahamas | 1.4183212 |
| South Africa | 0.8384769 |
| Maldives | 0.6895967 |
| Antigua and Barbuda | 0.5527005 |
| Jamaica | 0.4660373 |
country %>%
# slice_max( order_by =suicide_per_100k , n= 50 ) %>%
ggplot( aes(x = country, y = suicide_per_100k , fill =suicide_per_100k )) +
geom_bar(stat = "identity" ) +
geom_hline(yintercept = average, linetype = 2, color = "brown", size = 1) +
labs(title = "Global suicides per 100k, by Country",
x = "Country",
y = "Suicides per 100k") +
coord_flip() +
scale_y_continuous(breaks = seq(0, 45, 2)) +
scale_fill_gradient(low = "sienna1", high = "sienna4") + # Change the color scale here
theme(
text = element_text(size = 10 ) ,
axis.text.y = element_text(size=7) ,
axis.text.x = element_text(angle=45) , legend.position="none" )
Insights
Lithuania has consistently maintained the highest suicide rate by a significant margin, with a rate exceeding 41 suicides per 100k population per year.
library(ggplot2)
library(maps)
library(RColorBrewer)
df_subset <- data %>% group_by(country ) %>% summarize(suicides_per_100K = mean(suicides_per_100K))
# Load world map dataset
world_map <- map_data("world")
# Merge data with world map dataset
merged_data <- merge(world_map, df_subset, by.x = "region", by.y = "country", all.x = TRUE)
# Choose a color palette
palette <- colorRampPalette(brewer.pal(9, "YlOrRd"))
merged_data <- merged_data %>% filter( region != "Russia" )
# Create the plot
p <- ggplot() +
geom_polygon(data = merged_data, aes(x = long, y = lat, group = group, fill = suicides_per_100K) ) +
scale_fill_gradientn(colors = palette(100), name = "suicides per 100K") +
coord_map() +
theme_void()
plotly::ggplotly(p)
country_mean_gdp <- data %>%
group_by(country) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000,
gdp_per_capita = mean(gdp_per_capita))
ggplot(country_mean_gdp, aes(x = gdp_per_capita, y = suicide_per_100k)) +
geom_point() +
scale_x_continuous(labels=scales::dollar_format(prefix="$"), breaks = seq(0, 70000, 10000)) +
labs(title = "Correlation between GDP (per capita) and Suicides per 100k",
subtitle = "Plot containing every country",
x = "GDP (per capita)",
y = "Suicides per 100k")
The presence of several high leverage and residual countries, such as Lithuania (located in the top left quadrant),
Below, I provide an assessment of the statistics for this model after removing the outliers.
model1 <- lm(suicide_per_100k ~ gdp_per_capita, data = country_mean_gdp)
gdp_suicide_no_outliers <- model1 %>%
augment() %>%
arrange(desc(.cooksd)) %>%
filter(.cooksd < 4/nrow(.)) %>% # removes 5/93 countries
inner_join(country_mean_gdp,
by = c("suicide_per_100k" ="suicide_per_100k" , "gdp_per_capita" ="gdp_per_capita")) %>%
dplyr::select(country, suicide_per_100k ,gdp_per_capita)
model2 <- lm(suicide_per_100k ~ gdp_per_capita, data = gdp_suicide_no_outliers)
summary(model2)
Call:
lm(formula = suicide_per_100k ~ gdp_per_capita, data = gdp_suicide_no_outliers)
Residuals:
Min 1Q Median 3Q Max
-11.769 -5.145 -1.724 3.227 20.221
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 8.772e+00 1.119e+00 7.839 1.12e-11 ***
gdp_per_capita 1.115e-04 5.015e-05 2.223 0.0288 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 7.331 on 86 degrees of freedom
Multiple R-squared: 0.05436, Adjusted R-squared: 0.04337
F-statistic: 4.944 on 1 and 86 DF, p-value: 0.02881
The model’s p-value is 0.0288 < 0.05, allowing us to reject the hypothesis that a country’s GDP (per capita) has no association with its suicide rate (per 100k).
The r-squared value is 0.0544, indicating that GDP (per capita) explains only a small portion of the overall variance in suicide rate.
Results
Richer countries tend to have higher suicide rates, indicating a weak yet significant positive linear relationship
ggplot(gdp_suicide_no_outliers, aes(x = gdp_per_capita, y = suicide_per_100k)) +
geom_point() +
geom_smooth(method = "lm", aes(group = 1)) +
scale_x_continuous(labels=scales::dollar_format(prefix="$"), breaks = seq(0, 70000, 10000)) +
labs(title = "Correlation between GDP (per capita) and Suicides per 100k",
subtitle = "Plot with high CooksD countries removed (5/93 total)",
x = "GDP (per capita)",
y = "Suicides per 100k") +
theme(legend.position = "none")