library(tidyverse) # general
library(ggalt) # dumbbell plots
library(countrycode) # continent
library(rworldmap) # quick country-level heat maps
library(gridExtra) # plots
library(broom) # significant trends within countries
theme_set(theme_light())
# 1) Import & data cleaning
data <- read_csv("C:/Users/13728/Desktop/master.csv.zip")
# glimpse(data) # will tidy up these variable names
# sum(is.na(data$`HDI for year`)) # remove, > 2/3 missing, not useable
# table(data$age, data$generation) # don't like this variable
data <- data %>%
select(-c(`HDI for year`, `suicides/100k pop`)) %>%
rename(gdp_for_year = `gdp_for_year ($)`,
gdp_per_capita = `gdp_per_capita ($)`,
country_year = `country-year`) %>%
as.data.frame()
# 2) OTHER ISSUES
# a) this SHOULD give 12 rows for every county-year combination (6 age bands * 2 genders):
# data %>%
# group_by(country_year) %>%
# count() %>%
# filter(n != 12) # note: there appears to be an issue with 2016 data
# not only are there few countries with data, but those that do have data are incomplete
data <- data %>%
filter(year != 2016) %>% # I therefore exclude 2016 data
select(-country_year)
# b) excluding countries with <= 3 years of data:
minimum_years <- data %>%
group_by(country) %>%
summarize(rows = n(),
years = rows / 12) %>%
arrange(years)
data <- data %>%
filter(!(country %in% head(minimum_years$country, 7)))
# no other major data issues found yet
# 3) TIDYING DATAFRAME
data$age <- gsub(" years", "", data$age)
data$sex <- ifelse(data$sex == "male", "Male", "Female")
# getting continent data:
data$continent <- countrycode(sourcevar = data[, "country"],
origin = "country.name",
destination = "continent")
# Nominal factors
data_nominal <- c('country', 'sex', 'continent')
data[data_nominal] <- lapply(data[data_nominal], function(x){factor(x)})
# Making age ordinal
data$age <- factor(data$age,
ordered = T,
levels = c("5-14",
"15-24",
"25-34",
"35-54",
"55-74",
"75+"))
# Making generation ordinal
data$generation <- factor(data$generation,
ordered = T,
levels = c("G.I. Generation",
"Silent",
"Boomers",
"Generation X",
"Millenials",
"Generation Z"))
data <- as_tibble(data)
# the global rate over the time period will be useful:
global_average <- (sum(as.numeric(data$suicides_no)) / sum(as.numeric(data$population))) * 100000
# view the finalized data
glimpse(data)
## Observations: 27,492
## Variables: 10
## $ country <fct> Albania, Albania, Albania, Albania, Albania, Al...
## $ year <int> 1987, 1987, 1987, 1987, 1987, 1987, 1987, 1987,...
## $ sex <fct> Male, Male, Female, Male, Male, Female, Female,...
## $ age <ord> 15-24, 35-54, 15-24, 75+, 25-34, 75+, 35-54, 25...
## $ suicides_no <int> 21, 16, 14, 1, 9, 1, 6, 4, 1, 0, 0, 0, 2, 17, 1...
## $ population <int> 312900, 308000, 289700, 21800, 274300, 35600, 2...
## $ gdp_for_year <dbl> 2156624900, 2156624900, 2156624900, 2156624900,...
## $ gdp_per_capita <int> 796, 796, 796, 796, 796, 796, 796, 796, 796, 79...
## $ generation <ord> Generation X, Silent, Generation X, G.I. Genera...
## $ continent <fct> Europe, Europe, Europe, Europe, Europe, Europe,...
The dashed line is the global average suicide rate from 1985 - 2015: 13.1 deaths (per 100k, per year).
data %>%
group_by(year) %>%
summarize(population = sum(population),
suicides = sum(suicides_no),
suicides_per_100k = (suicides / population) * 100000) %>%
ggplot(aes(x = year, y = suicides_per_100k)) +
geom_line(col = "deepskyblue3", size = 1) +
geom_point(col = "deepskyblue3", size = 2) +
geom_hline(yintercept = global_average, linetype = 2, color = "grey35", size = 1) +
labs(title = "Global Suicides (per 100k)",
subtitle = "Trend over time, 1985 - 2015.",
x = "Year",
y = "Suicides per 100k") +
scale_x_continuous(breaks = seq(1985, 2015, 2)) +
scale_y_continuous(breaks = seq(10, 20))
Insights
continent <- data %>%
group_by(continent) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
arrange(suicide_per_100k)
continent$continent <- factor(continent$continent, ordered = T, levels = continent$continent)
continent_plot <- ggplot(continent, aes(x = continent, y = suicide_per_100k, fill = continent)) +
geom_bar(stat = "identity") +
labs(title = "Global Suicides (per 100k), by Continent",
x = "Continent",
y = "Suicides per 100k",
fill = "Continent") +
theme(legend.position = "none", title = element_text(size = 10)) +
scale_y_continuous(breaks = seq(0, 20, 1), minor_breaks = F)
continent_time <- data %>%
group_by(year, continent) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000)
continent_time$continent <- factor(continent_time$continent, ordered = T, levels = continent$continent)
continent_time_plot <- ggplot(continent_time, aes(x = year, y = suicide_per_100k, col = factor(continent))) +
facet_grid(continent ~ ., scales = "free_y") +
geom_line() +
geom_point() +
labs(title = "Trends Over Time, by Continent",
x = "Year",
y = "Suicides per 100k",
color = "Continent") +
theme(legend.position = "none", title = element_text(size = 10)) +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F)
grid.arrange(continent_plot, continent_time_plot, ncol = 2)
Insights
sex_plot <- data %>%
group_by(sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = sex, y = suicide_per_100k, fill = sex)) +
geom_bar(stat = "identity") +
labs(title = "Global suicides (per 100k), by Sex",
x = "Sex",
y = "Suicides per 100k") +
theme(legend.position = "none") +
scale_y_continuous(breaks = seq(0, 25), minor_breaks = F)
### with time
sex_time_plot <- data %>%
group_by(year, sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = factor(sex))) +
facet_grid(sex ~ ., scales = "free_y") +
geom_line() +
geom_point() +
labs(title = "Trends Over Time, by Sex",
x = "Year",
y = "Suicides per 100k",
color = "Sex") +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F)
grid.arrange(sex_plot, sex_time_plot, ncol = 2)
Insights
age_plot <- data %>%
group_by(age) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = age, y = suicide_per_100k, fill = age)) +
geom_bar(stat = "identity") +
labs(title = "Global suicides per 100k, by Age",
x = "Age",
y = "Suicides per 100k") +
theme(legend.position = "none") +
scale_y_continuous(breaks = seq(0, 30, 1), minor_breaks = F)
### with time
age_time_plot <- data %>%
group_by(year, age) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = age)) +
facet_grid(age ~ ., scales = "free_y") +
geom_line() +
geom_point() +
labs(title = "Trends Over Time, by Age",
x = "Year",
y = "Suicides per 100k",
color = "Age") +
theme(legend.position = "none") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F)
grid.arrange(age_plot, age_time_plot, ncol = 2)
Insights
country <- data %>%
group_by(country, continent) %>%
summarize(n = n(),
suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
arrange(desc(suicide_per_100k))
country$country <- factor(country$country,
ordered = T,
levels = rev(country$country))
ggplot(country, aes(x = country, y = suicide_per_100k, fill = continent)) +
geom_bar(stat = "identity") +
geom_hline(yintercept = global_average, linetype = 2, color = "grey35", size = 1) +
labs(title = "Global suicides per 100k, by Country",
x = "Country",
y = "Suicides per 100k",
fill = "Continent") +
coord_flip() +
scale_y_continuous(breaks = seq(0, 45, 2)) +
theme(legend.position = "bottom")
Insights
Below is a geographical heat map of the suicide rates between the timeframe of this analysis - note the lack of data for Africa and Asia, and bear in mind that 7 countries have been removed due to insufficient data.
country <- data %>%
group_by(country) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000)
countrydata <- joinCountryData2Map(country, joinCode = "NAME", nameJoinColumn = "country")
par(mar=c(0, 0, 0, 0)) # margins
mapCountryData(countrydata,
nameColumnToPlot="suicide_per_100k",
mapTitle="",
colourPalette = "heat",
oceanCol="lightblue",
missingCountryCol="grey65",
catMethod = "pretty")
mapCountryData(countrydata,
nameColumnToPlot="suicide_per_100k",
mapTitle="",
mapRegion = "eurasia",
colourPalette = "heat",
oceanCol="lightblue",
missingCountryCol="grey65",
addLegend = FALSE,
catMethod = "pretty")
It’s important to note that looking at figures at a global/continent level might not truly be representative of the globe/continent for these reasons.
Comparing the raw suicide rates of countries may also lead to some issues - the definition of suicide (and the reliability that a death is recorded as suicide) will likely vary between countries.
However, trends over time (within countries) are likely to be reliable. I address this next.
I’m interested in how the suicide rate is changing over time within each country. Instead of visualizing all 93 countries rates across time, I fit a simple linear regression to every countries data. I extract those with a ‘year’ p-value (corrected for multiple comparisons) of < 0.05.
In other words: as time goes on, I look for countries where the suicide rate is linearly increasing or decreasing over time. These can then be rank ordered by their ‘year’ coefficient, which would be their rate of change as time goes on.
country_year <- data %>%
group_by(country, year) %>%
summarize(suicides = sum(suicides_no),
population = sum(population),
suicide_per_100k = (suicides / population) * 100000,
gdp_per_capita = mean(gdp_per_capita))
country_year_trends <- country_year %>%
ungroup() %>%
nest(-country) %>% # format: country, rest of data (in list column)
mutate(model = map(data, ~ lm(suicide_per_100k ~ year, data = .)), # for each item in 'data', fit a linear model
tidied = map(model, tidy)) %>% # tidy each of these into dataframe format - call this list 'tidied'
unnest(tidied)
country_year_sig_trends <- country_year_trends %>%
filter(term == "year") %>%
mutate(p.adjusted = p.adjust(p.value, method = "holm")) %>%
filter(p.adjusted < .05) %>%
arrange(estimate)
country_year_sig_trends$country <- factor(country_year_sig_trends$country,
ordered = T,
levels = country_year_sig_trends$country)
# plot 1
ggplot(country_year_sig_trends, aes(x=country, y=estimate, col = estimate)) +
geom_point(stat='identity', size = 4) +
geom_hline(yintercept = 0, col = "grey", size = 1) +
scale_color_gradient(low = "green", high = "red") +
geom_segment(aes(y = 0,
x = country,
yend = estimate,
xend = country), size = 1) +
labs(title="Change per year (Suicides per 100k)",
subtitle="Of countries with significant trends (p < 0.05)",
x = "Country", y = "Change Per Year (Suicides per 100k)") +
scale_y_continuous(breaks = seq(-2, 2, 0.2), limits = c(-1.5, 1.5)) +
theme(legend.position = "none") +
coord_flip()
Insights
Steepest increasing trends:
### Lets look at those countries with the steepest increasing trends
top12_increasing <- tail(country_year_sig_trends$country, 12)
country_year %>%
filter(country %in% top12_increasing) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = country)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~ country) +
theme(legend.position = "none") +
labs(title="12 Steepest Increasing Trends",
subtitle="Of countries with significant trends (p < 0.05)",
x = "Year",
y = "Suicides per 100k")
Insights
Steepest decreasing trends:
### Now those with the steepest decreasing trend
top12_decreasing <- head(country_year_sig_trends$country, 12)
country_year %>%
filter(country %in% top12_decreasing) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = country)) +
geom_point() +
geom_smooth(method = "lm") +
facet_wrap(~ country) +
theme(legend.position = "none") +
labs(title="12 Steepest Decreasing Trends",
subtitle="Of countries with significant trends (p < 0.05)",
x = "Year",
y = "Suicides per 100k")
Insights
data %>%
group_by(continent, sex) %>%
summarize(n = n(),
suicides = sum(as.numeric(suicides_no)),
population = sum(as.numeric(population)),
suicide_per_100k = (suicides / population) * 100000) %>%
ggplot(aes(x = continent, y = suicide_per_100k, fill = sex)) +
geom_bar(stat = "identity", position = "dodge") +
geom_hline(yintercept = global_average, linetype = 2, color = "grey35", size = 1) +
labs(title = "Gender Disparity, by Continent",
x = "Continent",
y = "Suicides per 100k",
fill = "Sex") +
coord_flip()
Insights
country_long <- data %>%
group_by(country, continent) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
mutate(sex = "OVERALL")
### by country, continent, sex
sex_country_long <- data %>%
group_by(country, continent, sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000)
sex_country_wide <- sex_country_long %>%
spread(sex, suicide_per_100k) %>%
arrange(Male - Female)
sex_country_wide$country <- factor(sex_country_wide$country,
ordered = T,
levels = sex_country_wide$country)
sex_country_long$country <- factor(sex_country_long$country,
ordered = T,
levels = sex_country_wide$country) # using the same order
### this graph shows us how the disparity between deaths varies across gender for every country
# it also has the overall blended death rate - generally countries with a higher death rate have a higher disparity
# this is because, if suicide is more likely in a country, the disparity between men and women is amplified
ggplot(sex_country_wide, aes(y = country, color = sex)) +
geom_dumbbell(aes(x=Female, xend=Male), color = "grey", size = 1) +
geom_point(data = sex_country_long, aes(x = suicide_per_100k), size = 3) +
geom_point(data = country_long, aes(x = suicide_per_100k)) +
geom_vline(xintercept = global_average, linetype = 2, color = "grey35", size = 1) +
theme(axis.text.y = element_text(size = 8),
legend.position = c(0.85, 0.2)) +
scale_x_continuous(breaks = seq(0, 80, 10)) +
labs(title = "Gender Disparity, by Continent & Country",
subtitle = "Ordered by difference in deaths per 100k.",
x = "Suicides per 100k",
y = "Country",
color = "Sex")
country_gender_prop <- sex_country_wide %>%
mutate(Male_Proportion = Male / (Female + Male)) %>%
arrange(Male_Proportion)
sex_country_long$country <- factor(sex_country_long$country,
ordered = T,
levels = country_gender_prop$country)
ggplot(sex_country_long, aes(y = suicide_per_100k, x = country, fill = sex)) +
geom_bar(position = "fill", stat = "identity") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proportions of suicides that are Male & Female, by Country",
x = "Country",
y = "Suicides per 100k",
fill = "Sex") +
coord_flip()
Insights
data %>%
group_by(continent, age) %>%
summarize(n = n(),
suicides = sum(as.numeric(suicides_no)),
population = sum(as.numeric(population)),
suicide_per_100k = (suicides / population) * 100000) %>%
ggplot(aes(x = continent, y = suicide_per_100k, fill = age)) +
geom_bar(stat = "identity", position = "dodge") +
geom_hline(yintercept = global_average, linetype = 2, color = "grey35", size = 1) +
labs(title = "Age Disparity, by Continent",
x = "Continent",
y = "Suicides per 100k",
fill = "Age")
Insights
It depends on the country - for almost every country, there is a high correlation between year
and gdp per capita
, i.e. as time goes on, gdp per capita linearly increases.
country_year_gdp <- data %>%
group_by(country, year) %>%
summarize(gdp_per_capita = mean(gdp_per_capita))
country_year_gdp_corr <- country_year_gdp %>%
ungroup() %>%
group_by(country) %>%
summarize(year_gdp_correlation = cor(year, gdp_per_capita))
I calculated the pearson correlations between ‘year’ and ‘GDP per capita’ within each country, then summarized the results:
The mean correlation was 0.878, indicating a very strong positive linear relationship.
This basically means that looking within a country and asking “does an increase in weath (per person) have an effect suicide rate” is pretty similar to asking “does a countries suicide rate increase as time progresses”.
This was answered earlier in (2.5.2) - it depends on the country! Some countries are increasing with time, most are decreasing.
Instead, I ask a slightly different question below.
Instead of looking at trends within countries, here I take every country and calculate their mean GDP (per capita) across all the years in which data is available. I then measure how this relates to the countries suicide rate across all those years.
The end result is one data point per country, intended to give a general idea of the wealth of a country and its suicide rate.
country_mean_gdp <- data %>%
group_by(country, continent) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000,
gdp_per_capita = mean(gdp_per_capita))
ggplot(country_mean_gdp, aes(x = gdp_per_capita, y = suicide_per_100k, col = continent)) +
geom_point() +
scale_x_continuous(labels=scales::dollar_format(prefix="$"), breaks = seq(0, 70000, 10000)) +
labs(title = "Correlation between GDP (per capita) and Suicides per 100k",
subtitle = "Plot containing every country",
x = "GDP (per capita)",
y = "Suicides per 100k",
col = "Continent")
There are quite a few high leverage & residual countries that could have a significant impact on the fit of my regression line (e.g. Lithuania, top left). I’ll identify and exclude these using Cooks Distance, excluding those countries with a CooksD value of greater than 4/n.
I assess the statistics of this model (with outliers removed) below.
model1 <- lm(suicide_per_100k ~ gdp_per_capita, data = country_mean_gdp)
gdp_suicide_no_outliers <- model1 %>%
augment() %>%
arrange(desc(.cooksd)) %>%
filter(.cooksd < 4/nrow(.)) %>% # removes 5/93 countries
inner_join(country_mean_gdp, by = c("suicide_per_100k", "gdp_per_capita")) %>%
select(country, continent, gdp_per_capita, suicide_per_100k)
model2 <- lm(suicide_per_100k ~ gdp_per_capita, data = gdp_suicide_no_outliers)
summary(model2)
##
## Call:
## lm(formula = suicide_per_100k ~ gdp_per_capita, data = gdp_suicide_no_outliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11.769 -5.145 -1.724 3.227 20.221
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.772e+00 1.119e+00 7.839 1.12e-11 ***
## gdp_per_capita 1.115e-04 5.015e-05 2.223 0.0288 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.331 on 86 degrees of freedom
## Multiple R-squared: 0.05436, Adjusted R-squared: 0.04337
## F-statistic: 4.944 on 1 and 86 DF, p-value: 0.02881
The p-value of the model is 0.0288 < 0.05. This means we can reject the hypothesis that a countries GDP (per capita) has no association with it’s rate of suicide (per 100k).
The r-squared is 0.0544, so GDP (per capita) explains very little of the variance in suicide rate overall.
What does all this mean?
There is a weak but significant positive linear relationship - richer countries are associated with higher rates of suicide, but this is a weak relationship which can be seen from the graph below.
ggplot(gdp_suicide_no_outliers, aes(x = gdp_per_capita, y = suicide_per_100k, col = continent)) +
geom_point() +
geom_smooth(method = "lm", aes(group = 1)) +
scale_x_continuous(labels=scales::dollar_format(prefix="$"), breaks = seq(0, 70000, 10000)) +
labs(title = "Correlation between GDP (per capita) and Suicides per 100k",
subtitle = "Plot with high CooksD countries removed (5/93 total)",
x = "GDP (per capita)",
y = "Suicides per 100k",
col = "Continent") +
theme(legend.position = "none")
This line of best fit is represented by the equation below, where:
\[ Suicides = 8.7718 + 0.1115*GDP \]
This means that, at a country level and over the time frame of this analysis (1985 - 2015), an increase of GDP (per capita) by $8,967 was associated with 1 additional suicide, per 100k people, per year.
With continuous data, if you have someones age in a given year, you have their generation. The graph below demonstrates how this works for this dataset really well, and is equivalent to the graph of age across time, shown in (2.4).
data %>%
group_by(generation, age, year) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = factor(generation, ordered = F))) +
geom_point() +
geom_line() +
facet_grid(age ~ ., scales = "free_y") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = NULL) +
labs(title = "Relationship between Generation, Age & Year",
x = "Year",
y = "Suicides per 100k",
col = "Generation") +
theme(legend.position = "bottom")
However, because of the overlap of different age categories, trying to interpret the trend of generation suicide rates over time creates problems.
Compare the rates below to the plot above - large spikes occur at the same time that different age bands begin/stop being classified as from a certain generation.
Note, for example, the supposed spike in suicide rate for G.I. generation in 1991, where those aged ‘55 - 75’ suddenly stop being classified as from this generation.
generation_rate <- data %>%
group_by(generation, year) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = factor(generation, ordered = F))) +
geom_point() +
geom_line() +
facet_grid(generation ~ ., scales = "free_y") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = NULL) +
labs(title = "Suicides per 100k, by Generation",
x = "Year",
y = "Suicides per 100k") +
theme(legend.position = "none")
generation_population <- data %>%
group_by(generation, year) %>%
summarize(population = sum(as.numeric(population))) %>%
ggplot(aes(x = year, y = population / 1000000, col = factor(generation, ordered = F))) +
geom_point() +
geom_line() +
facet_grid(generation ~ ., scales = "free_y") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = NULL) +
labs(title = "Population, by Generation",
x = "Year",
y = "Population (Millions)",
col = "Generation") +
theme(legend.position = "none")
grid.arrange(generation_rate, generation_population, ncol = 2)
This is probably a problem with how the dataset was created - it looks like the generation variable was created after the data was summarized (by country, year, age, sex) and just appended onto the end. This shouldn’t be possible, because not everyone in a given age band & year will be of one generation.
This shows why the ‘spikes’ in generation across time are pretty meaningless and I would recommend to others not to use the variable, as it can will lead to wrong conclusions.
I’ll filter out data from 1985 only and look at what happens in the 3 decades following.
Here i’m interested in the 5% highest risk (suicides/100k) demographics between 1986 and 2015.
I define a demographic as a year in a particular country, for some combination of sex & age. e.g. ‘United Kingdom, 2010, Female, 15 - 24’ would be a single demographic/point on the jitter plot below.
In order for a demographic to be in the top 5% for historic suicide rates, it would require a suicide rate exceeding 50.7 (per 100k) in that year.
demographic_most <- data %>%
mutate(suicides_per_100k = suicides_no * 100000 / population) %>%
arrange(desc(suicides_per_100k)) %>%
filter(year != 1985) %>%
head(n = round(nrow(.) * 5 / 100))
demographic_most$time <- ifelse(demographic_most$year <= 1995, "1986 - 1995",
ifelse(demographic_most$year <= 2005, "1996 - 2005",
"2006 - 2015"))
ggplot(demographic_most, aes(x = age, fill = sex)) +
geom_bar() +
labs(title = "5% Most At-Risk Instances in History",
subtitle = "Volumes by Decade, Age & Sex",
x = "Age",
y = "Number of Demographics",
fill = "Sex") +
facet_wrap(~ time) +
scale_y_continuous(breaks = seq(0, 300, 20))
set.seed(1)
ggplot(demographic_most, aes(x = age, y = suicides_per_100k, col = sex)) +
geom_jitter(alpha = 0.5) +
labs(title = "5% Most At-Risk Instances in History",
subtitle = "Instances by Decade, Age, & Sex",
x = "Age",
y = "Suicides per 100k",
col = "Sex") +
facet_wrap(~ time) +
scale_y_continuous(breaks = seq(50, 300, 10))
Insights
Two of the most consistently at-risk demographics seem to be men in South Korea & Hungary, which I will visualize below.
data %>%
filter(country %in% c('Republic of Korea', 'Hungary'), sex == "Male") %>%
group_by(country, age, year) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = country)) +
geom_line() +
geom_point() +
facet_wrap(~ age) +
geom_hline(yintercept = min(demographic_most$suicides_per_100k)) +
theme(legend.position = "bottom") +
scale_y_continuous(breaks = seq(0, 220, 40)) +
labs(title = "Male Age-Group Trends in Hungary & South Korea",
subtitle = "Black reference line indicates where the demographic enters the 'top 5% in history'",
x = "Year",
y = "Suicides per 100k",
col = "Country")
Two very different trends emerge. Hungary is obviously moving in a positive direction, whereas South Korea appears to be coming out of somewhat of a crisis.
For South Korea, mens rates in the 75+ category increased from 26.2 (per 100k) in 1992, to a peak of 185 (per 100k) in 2011 - an increase of more than 600%. Men aged 55-74 see a similar increase.
This was highlighted by my statistical analysis in (2.5.2), which identified South Korea as the steepest increasing country, and Hungary as the 4th steepest decreasing country overall.
I think it would be useful to compare a few countries that people might think of as similar to the UK (culturally, legally, economically).
data_filtered <- data %>%
filter(country %in% c("United Kingdom",
"Ireland",
"United States",
"France",
"Denmark"))
data_filtered %>%
group_by(country, year) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = country)) +
geom_point(alpha = 0.5) +
geom_smooth(se = F, span = 0.2) +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F) +
labs(title = "UK, Ireland, US, France & Denmark",
subtitle = "Suicides per 100k population, 1985 - 2015",
x = "Year",
y = "Suicides per 100k",
col = "Country")
Insights
data_filtered %>%
group_by(country, sex, year) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = country)) +
geom_point(alpha = 0.5) +
geom_smooth(se = F, span = 0.2) +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F) +
facet_wrap(~ sex, scales = "free_y", nrow = 2) +
labs(title = "UK, Ireland, US, France & Denmark",
subtitle = "Suicides per 100k population, 1985 - 2015",
x = "Year",
y = "Suicides per 100k",
col = "Country")
Insights
For the purposes of these visualisations, i’m really more interested in data from recent years (France, for example, has changed a lot), so i’ll restrict the timeframe to 2010 onwards.
t1 <- data_filtered %>%
filter(year >= 2010) %>%
group_by(sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000)
global_male_proportion <- t1$suicide_per_100k[2] / sum(t1$suicide_per_100k)
t2 <- data_filtered %>%
filter(year >= 2010, continent == "Europe") %>%
group_by(sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000)
european_male_proportion <- t2$suicide_per_100k[2] / sum(t2$suicide_per_100k)
data_filtered %>%
filter(year >= 2010) %>%
group_by(country, sex) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = country, y = suicide_per_100k, fill = sex)) +
geom_bar(position = "fill", stat = "identity") +
geom_hline(yintercept = global_male_proportion) +
geom_hline(yintercept = european_male_proportion, col = "blue") +
scale_y_continuous(labels = scales::percent) +
labs(title = "Proportion of suicides that were Male & Female",
subtitle = "2010 - 2015 only, with reference lines for Europe (blue) & Globally (black)",
x = "Country",
y = "",
fill = "Sex")
Insights
data_filtered %>%
filter(year >= 2010) %>%
group_by(country, age) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = country, y = suicide_per_100k, fill = age)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Age ",
subtitle = "2010 - 2015 only",
x = "Country",
y = "Suicides per 100k",
fill = "Age")
Insights
data_filtered %>%
filter(year >= 2010) %>%
group_by(country, sex, age) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = age, y = suicide_per_100k, fill = country)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(~ sex, scales = "free_x") +
labs(title = "Age Disparity, by Country",
subtitle = "2010 - 2015 only",
x = "Age",
y = "Suicides per 100k",
fill = "Country") +
coord_flip() +
theme(legend.position = "bottom")
Insights
There is a big concern in my country (UK) regarding mental health problems and suicide for young to middle-aged men. Here i’m going to restrict the analysis to just:
I’ll basically be observing whether concerning trends are present. I think having other countries here for comparison will be useful and will help provide perspective in the analysis.
data_filtered %>%
filter(age %in% c("15-24", "25-34", "35-54"), sex == "Male") %>%
group_by(country, year) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = country)) +
geom_point(alpha = 0.5) +
geom_smooth(se = F, span = 0.2) +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F) +
labs(title = "UK, Ireland, US, France & Denmark",
subtitle = "Suicides per 100k population, 1985 - 2015",
x = "Year",
y = "Suicides per 100k",
col = "Country")
Insights
data_filtered %>%
filter(age %in% c("15-24", "25-34", "35-54"), sex == "Male") %>%
group_by(country, age, year) %>%
summarize(suicide_per_100k = (sum(as.numeric(suicides_no)) / sum(as.numeric(population))) * 100000) %>%
ggplot(aes(x = year, y = suicide_per_100k, col = country)) +
geom_point(alpha = 0.5) +
geom_smooth(se = F, span = 0.2) +
facet_wrap(~ age, nrow = 3, scales = "free_y") +
scale_x_continuous(breaks = seq(1985, 2015, 5), minor_breaks = F) +
labs(title = "UK, Ireland, US, France & Denmark",
subtitle = "Suicides per 100k population, 1985 - 2015",
x = "Year",
y = "Suicides per 100k",
col = "Country")
Insights
```
This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
You can also embed plots, for example:
Note that the echo = FALSE
parameter was added to the code chunk to prevent printing of the R code that generated the plot.