Suicide Rates Investigation

Suicide prevention

Yenting, Liu (s3750625)

Last updated: JUN 2nd, 2019

Introduction

Problem Statement

Data Preparation

getwd()
## [1] "/Users/qmoa_liu/Downloads"
suicide <- read_csv("/Users/qmoa_liu/Downloads/master.csv")

# Factor the variables
suicide$sex=suicide$sex %>% factor(levels = c("male","female"),labels = c("male","female"))
suicide$age=suicide$age %>% factor(levels = c("15-24 years","25-34 years",
                                              "35-54 years","55-74 years","75+ years"),
                                   labels =c("15-24 years","25-34 years",
                                             "35-54 years","55-74 years","75+ years"),
                                   ordered = T )
suicide$year=suicide$year %>% factor(
  levels =c("1985","1986","1987","1988","1989","1990","1991","1992","1993","1994",
            "1995","1996","1997","1998","1999","2000","2001","2002","2003","2004",
            "2005","2006","2007","2008","2009","2010","2011","2012","2013","2014"
            ,"2015","2016"),
  labels = c("1985","1986","1987","1988","1989","1990","1991","1992","1993","1994",
             "1995","1996","1997","1998","1999","2000","2001","2002","2003","2004",
             "2005","2006","2007","2008","2009","2010","2011","2012","2013","2014",
             "2015","2016"),
  ordered = T )

# Missing value
suicide$`suicides/100k pop` %>% is.na() %>% sum()
## [1] 0
# Filter outliers
boxplot(suicide$`suicides/100k pop`)

outliers=boxplot(suicide$`suicides/100k pop`, plot=FALSE)$out
min(outliers)
## [1] 40.19
suicide_clean=suicide %>% filter(`suicides/100k pop`<min(outliers))

Decsriptive Statistics

# A quick summary and the numbers of suicide incidents in particular year.
table1=suicide_clean$`suicides/100k pop` %>% summary()
table1
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.710   4.990   8.547  13.280  40.160
table2=suicide_clean %>% group_by(year) %>% summarise(mean(`suicides/100k pop`))
table2

Data Visualisation

# The change of year (Diagram 1)
plot(table2,main = "The change of suicide number in the whole segment",
                  ylab="The mean of suicides/100k pop ",
                  legend=rownames(table2))

# Suicide vs gdp per capita (Diagram 2-1)
hist(suicide_clean$`suicides/100k pop`,
     main = "The distribution of suicides per 100k people",xlab = "Number of suicides per 100k people")

# Diagram 2-2
hist(suicide_clean$`gdp_per_capita ($)`,
     main = "The distribution of GDP per capita ($)",xlab = "Amount of GDP per capita")

# Diagram 3
plot(suicide_clean$`gdp_per_capita ($)`~ suicide_clean$`suicides/100k pop`,data = suicide_clean,main="Scatter plot between GDP per capita with suicide per 100k people",xlab = "Number of suicides per 100k people",ylab ="Amount of GDP per capita" )

# Use Box-cox to transform the dataset (Original data) (Diagram 4)
boxcox_suicide=BoxCox(suicide$`suicides/100k pop`,lambda = "auto")
boxcox_gdp_per_cap=BoxCox(suicide$`gdp_per_capita ($)`,lambda = "auto")
hist(boxcox_suicide,
     main = "The distribution of suicides per 100k people (Transformed)",xlab = "Number of suicides per 100k people")

hist(boxcox_gdp_per_cap,
     main = "The distribution of GDP per capita ($) (Transformed)",xlab = "Amount of GDP per capita")

plot(BoxCox(suicide$`suicides/100k pop`,lambda = "auto")~BoxCox(suicide$`gdp_per_capita ($)`,lambda = "auto"),data = suicide,main="Scatter plot between GDP per capita with suicide per 100k people  (Transformed)")

# Analyze who is less possible to suicide (Diagram 5)
non_suicide=suicide_clean %>% filter(suicides_no==0)
table(non_suicide$age)
## 
## 15-24 years 25-34 years 35-54 years 55-74 years   75+ years 
##         511         451         370         559         922
table(non_suicide$sex)
## 
##   male female 
##   1624   2657
table3=table(non_suicide$age,non_suicide$sex) %>% prop.table(margin = 2)
table3 %>%barplot(main = "Non-suicide group",
                  ylab="Proportion within gender",
                  ylim=c(0,.5),legend=rownames(table3),
                  beside=TRUE,
                  args.legend=c(x="top",horiz=T,title="Age"),xlab="Gender")

Hypothesis Testing for the association between gender and age for non-suicide group

# Recall the end of last page by filtering data with suicide equal to 0
chi=chisq.test(non_suicide$age,non_suicide$sex)
chi
## 
##  Pearson's Chi-squared test
## 
## data:  non_suicide$age and non_suicide$sex
## X-squared = 17.427, df = 4, p-value = 0.001596

Hypothesis Testing for the difference of suicide number in genders

# Is gender a factor of the commitment of suicide?
leveneTest(suicide_clean$`suicides/100k pop`~suicide_clean$sex,
                         data = suicide_clean)
t.test(suicide_clean$`suicides/100k pop`~suicide_clean$sex,data = suicide_clean,var.equal=F)
## 
##  Welch Two Sample t-test
## 
## data:  suicide_clean$`suicides/100k pop` by suicide_clean$sex
## t = 64.174, df = 17961, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  7.176116 7.628291
## sample estimates:
##   mean in group male mean in group female 
##            12.522013             5.119809

Hypothesis Testing for the correlation between suicide number with GDP per capita

# Suicide vs gdp per cap correlation
model1 <- lm(suicide_clean$`suicides/100k pop` ~ suicide_clean$`gdp_per_capita ($)`
             , data =suicide_clean)
model1 %>% summary()
## 
## Call:
## lm(formula = suicide_clean$`suicides/100k pop` ~ suicide_clean$`gdp_per_capita ($)`, 
##     data = suicide_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.303  -7.512  -3.526   4.714  32.404 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)
## (Intercept)                        7.646e+00  7.987e-02   95.74   <2e-16
## suicide_clean$`gdp_per_capita ($)` 5.268e-05  3.111e-06   16.94   <2e-16
##                                       
## (Intercept)                        ***
## suicide_clean$`gdp_per_capita ($)` ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.561 on 25772 degrees of freedom
## Multiple R-squared:  0.01101,    Adjusted R-squared:  0.01097 
## F-statistic: 286.8 on 1 and 25772 DF,  p-value: < 2.2e-16

Conclusion & Discussion

References