Yenting, Liu (s3750625)
Last updated: JUN 2nd, 2019
getwd()## [1] "/Users/qmoa_liu/Downloads"
suicide <- read_csv("/Users/qmoa_liu/Downloads/master.csv")
# Factor the variables
suicide$sex=suicide$sex %>% factor(levels = c("male","female"),labels = c("male","female"))
suicide$age=suicide$age %>% factor(levels = c("15-24 years","25-34 years",
"35-54 years","55-74 years","75+ years"),
labels =c("15-24 years","25-34 years",
"35-54 years","55-74 years","75+ years"),
ordered = T )
suicide$year=suicide$year %>% factor(
levels =c("1985","1986","1987","1988","1989","1990","1991","1992","1993","1994",
"1995","1996","1997","1998","1999","2000","2001","2002","2003","2004",
"2005","2006","2007","2008","2009","2010","2011","2012","2013","2014"
,"2015","2016"),
labels = c("1985","1986","1987","1988","1989","1990","1991","1992","1993","1994",
"1995","1996","1997","1998","1999","2000","2001","2002","2003","2004",
"2005","2006","2007","2008","2009","2010","2011","2012","2013","2014",
"2015","2016"),
ordered = T )
# Missing value
suicide$`suicides/100k pop` %>% is.na() %>% sum()## [1] 0
# Filter outliers
boxplot(suicide$`suicides/100k pop`)outliers=boxplot(suicide$`suicides/100k pop`, plot=FALSE)$out
min(outliers)## [1] 40.19
suicide_clean=suicide %>% filter(`suicides/100k pop`<min(outliers))# A quick summary and the numbers of suicide incidents in particular year.
table1=suicide_clean$`suicides/100k pop` %>% summary()
table1## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.710 4.990 8.547 13.280 40.160
table2=suicide_clean %>% group_by(year) %>% summarise(mean(`suicides/100k pop`))
table2# The change of year (Diagram 1)
plot(table2,main = "The change of suicide number in the whole segment",
ylab="The mean of suicides/100k pop ",
legend=rownames(table2))# Suicide vs gdp per capita (Diagram 2-1)
hist(suicide_clean$`suicides/100k pop`,
main = "The distribution of suicides per 100k people",xlab = "Number of suicides per 100k people")# Diagram 2-2
hist(suicide_clean$`gdp_per_capita ($)`,
main = "The distribution of GDP per capita ($)",xlab = "Amount of GDP per capita")# Diagram 3
plot(suicide_clean$`gdp_per_capita ($)`~ suicide_clean$`suicides/100k pop`,data = suicide_clean,main="Scatter plot between GDP per capita with suicide per 100k people",xlab = "Number of suicides per 100k people",ylab ="Amount of GDP per capita" )# Use Box-cox to transform the dataset (Original data) (Diagram 4)
boxcox_suicide=BoxCox(suicide$`suicides/100k pop`,lambda = "auto")
boxcox_gdp_per_cap=BoxCox(suicide$`gdp_per_capita ($)`,lambda = "auto")
hist(boxcox_suicide,
main = "The distribution of suicides per 100k people (Transformed)",xlab = "Number of suicides per 100k people")hist(boxcox_gdp_per_cap,
main = "The distribution of GDP per capita ($) (Transformed)",xlab = "Amount of GDP per capita")plot(BoxCox(suicide$`suicides/100k pop`,lambda = "auto")~BoxCox(suicide$`gdp_per_capita ($)`,lambda = "auto"),data = suicide,main="Scatter plot between GDP per capita with suicide per 100k people (Transformed)")# Analyze who is less possible to suicide (Diagram 5)
non_suicide=suicide_clean %>% filter(suicides_no==0)
table(non_suicide$age)##
## 15-24 years 25-34 years 35-54 years 55-74 years 75+ years
## 511 451 370 559 922
table(non_suicide$sex)##
## male female
## 1624 2657
table3=table(non_suicide$age,non_suicide$sex) %>% prop.table(margin = 2)
table3 %>%barplot(main = "Non-suicide group",
ylab="Proportion within gender",
ylim=c(0,.5),legend=rownames(table3),
beside=TRUE,
args.legend=c(x="top",horiz=T,title="Age"),xlab="Gender")# Recall the end of last page by filtering data with suicide equal to 0
chi=chisq.test(non_suicide$age,non_suicide$sex)
chi##
## Pearson's Chi-squared test
##
## data: non_suicide$age and non_suicide$sex
## X-squared = 17.427, df = 4, p-value = 0.001596
# Is gender a factor of the commitment of suicide?
leveneTest(suicide_clean$`suicides/100k pop`~suicide_clean$sex,
data = suicide_clean)t.test(suicide_clean$`suicides/100k pop`~suicide_clean$sex,data = suicide_clean,var.equal=F)##
## Welch Two Sample t-test
##
## data: suicide_clean$`suicides/100k pop` by suicide_clean$sex
## t = 64.174, df = 17961, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 7.176116 7.628291
## sample estimates:
## mean in group male mean in group female
## 12.522013 5.119809
# Suicide vs gdp per cap correlation
model1 <- lm(suicide_clean$`suicides/100k pop` ~ suicide_clean$`gdp_per_capita ($)`
, data =suicide_clean)
model1 %>% summary()##
## Call:
## lm(formula = suicide_clean$`suicides/100k pop` ~ suicide_clean$`gdp_per_capita ($)`,
## data = suicide_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14.303 -7.512 -3.526 4.714 32.404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.646e+00 7.987e-02 95.74 <2e-16
## suicide_clean$`gdp_per_capita ($)` 5.268e-05 3.111e-06 16.94 <2e-16
##
## (Intercept) ***
## suicide_clean$`gdp_per_capita ($)` ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.561 on 25772 degrees of freedom
## Multiple R-squared: 0.01101, Adjusted R-squared: 0.01097
## F-statistic: 286.8 on 1 and 25772 DF, p-value: < 2.2e-16