# Load the suicide dataset from the course repository.
# stringsAsFactors = TRUE keeps the text columns (sex, method, age.group,
# method2) as factors on every R version; the per-level counts printed by
# summary(data) depend on that, and R >= 4.0 would otherwise read them as
# plain character vectors.
data <- read.csv(
  url("https://raw.githubusercontent.com/hovig/MSDS_CUNY/master/Suicide.csv"),
  stringsAsFactors = TRUE
)
Introduction:
Suicide is a phenomenon that many communities and scientific studies seek to assess: they examine its risks, try to understand its warning signs, and analyze its rates and the factors driving them. Group deaths (such as those linked to the Blue Whale challenge) are part of this trend.
Keep in mind that suicide typically results from the interaction of many factors, for example: mental illness, marital breakdown, employment or financial hardship, deteriorating physical health, a major loss, a lack of social support, or addictive illness.
This study presents statistics on the Suicide.csv dataset (available at https://github.com/hovig/MSDS_CUNY/blob/master/Suicide.csv), looking primarily at trends and variations by sex, age, method and frequency. We will address the following questions:
What does this dataset contain?
What can the means tell us?
How can we examine the overall suicide data in more detail?
What are the rates of male vs. female suicide?
Which age range is in the greatest crisis, accounting for the most deaths?
What can we conclude from all these stats?
And many other questions will be answered while we analyze our dataset.
# Preview the first ten rows to see the columns and some typical values.
head(data, n = 10)
## X Freq sex method age age.group method2
## 1 1 4 male poison 10 10-20 poison
## 2 2 0 male cookgas 10 10-20 gas
## 3 3 0 male toxicgas 10 10-20 gas
## 4 4 247 male hang 10 10-20 hang
## 5 5 1 male drown 10 10-20 drown
## 6 6 17 male gun 10 10-20 gun
## 7 7 1 male knife 10 10-20 knife
## 8 8 6 male jump 10 10-20 jump
## 9 9 0 male other 10 10-20 other
## 10 10 348 male poison 15 10-20 poison
# Per-column overview: min/quartiles/median/mean/max for the numeric columns
# and per-level counts for the factor columns.
summary(object = data)
## X Freq sex method
## Min. : 1.00 Min. : 0.00 female:153 cookgas: 34
## 1st Qu.: 77.25 1st Qu.: 10.25 male :153 drown : 34
## Median :153.50 Median : 59.00 gun : 34
## Mean :153.50 Mean : 173.80 hang : 34
## 3rd Qu.:229.75 3rd Qu.: 178.75 jump : 34
## Max. :306.00 Max. :1381.00 knife : 34
## (Other):102
## age age.group method2
## Min. :10 10-20:54 gas :68
## 1st Qu.:30 25-35:54 drown :34
## Median :50 40-50:54 gun :34
## Mean :50 55-65:54 hang :34
## 3rd Qu.:70 70-90:90 jump :34
## Max. :90 knife :34
## (Other):68
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Whole-dataset summary: row count plus mean, median and first quartile of
# Freq and age, each rounded to three decimals.
# Columns are referenced by bare name (data masking) instead of data$..., so
# every statistic is computed on the data frame that flows through the pipe.
data %>%
  summarise(
    dataset_count = n(),
    mean_Freq = round(mean(Freq), 3),
    mean_age = round(mean(age), 3),
    median_Freq = round(median(Freq), 3),
    median_age = round(median(age), 3),
    quartile_Freq = round(quantile(Freq, 0.25), 3),
    quartile_age = round(quantile(age, 0.25), 3)
  )
## dataset_count mean_Freq mean_age median_Freq median_age quartile_Freq
## 1 306 173.797 50 59 50 10.25
## quartile_age
## 1 30
# Male rows only, keeping the frequency, age and grouped-method columns.
# subset() evaluates its condition and `select` inside `data`, so bare column
# names are used instead of data$....
ds_male <- subset(data, sex == "male", select = c(Freq, age, method2))

# Same rows with sex-specific column names.
ds_m <- rename(
  ds_male,
  Male_Frequency = Freq,
  Male_Age = age,
  Male_Method = method2
)

# Display-only preview: zero frequencies are shown as "None". The result is
# printed, not assigned, so ds_m itself keeps its numeric frequencies.
ds_m %>%
  mutate(
    Male_Frequency = replace(Male_Frequency, Male_Frequency == 0, "None")
  ) %>%
  head()
## Male_Frequency Male_Age Male_Method
## 1 4 10 poison
## 2 None 10 gas
## 3 None 10 gas
## 4 247 10 hang
## 5 1 10 drown
## 6 17 10 gun
library(dplyr)
# Summary of the male subset: row count plus mean, median and first quartile
# of Freq and age, each rounded to three decimals.
# Bare column names (data masking) replace ds_male$..., so the statistics are
# computed on the data frame that flows through the pipe.
ds_male %>%
  summarise(
    Male_dataset_count = n(),
    mean_Freq = round(mean(Freq), 3),
    mean_age = round(mean(age), 3),
    median_Freq = round(median(Freq), 3),
    median_age = round(median(age), 3),
    quartile_Freq = round(quantile(Freq, 0.25), 3),
    quartile_age = round(quantile(age, 0.25), 3)
  )
## Male_dataset_count mean_Freq mean_age median_Freq median_age
## 1 153 221.039 50 77 50
## quartile_Freq quartile_age
## 1 18 30
# Female rows only, keeping the frequency, age and grouped-method columns.
# subset() evaluates its condition and `select` inside `data`, so bare column
# names are used instead of data$....
ds_female <- subset(data, sex == "female", select = c(Freq, age, method2))

# Same rows with sex-specific column names.
ds_f <- rename(
  ds_female,
  Female_Frequency = Freq,
  Female_Age = age,
  Female_Method2 = method2
)

# Display-only preview: zero frequencies are shown as "None". The result is
# printed, not assigned, so ds_f itself keeps its numeric frequencies.
ds_f %>%
  mutate(
    Female_Frequency = replace(Female_Frequency, Female_Frequency == 0, "None")
  ) %>%
  head()
## Female_Frequency Female_Age Female_Method2
## 1 28 10 poison
## 2 None 10 gas
## 3 3 10 gas
## 4 20 10 hang
## 5 None 10 drown
## 6 1 10 gun
library(dplyr)
# Summary of the female subset: row count plus mean, median and first quartile
# of Freq and age, each rounded to three decimals.
# Bare column names (data masking) replace ds_female$..., so the statistics
# are computed on the data frame that flows through the pipe.
ds_female %>%
  summarise(
    Female_dataset_count = n(),
    mean_Freq = round(mean(Freq), 3),
    mean_age = round(mean(age), 3),
    median_Freq = round(median(Freq), 3),
    median_age = round(median(age), 3),
    quartile_Freq = round(quantile(Freq, 0.25), 3),
    quartile_age = round(quantile(age, 0.25), 3)
  )
## Female_dataset_count mean_Freq mean_age median_Freq median_age
## 1 153 126.556 50 29 50
## quartile_Freq quartile_age
## 1 7 30
# Sanity check: the male and female subsets together must account for every
# row of the full dataset.
split_total <- length(ds_m$Male_Age) + length(ds_f$Female_Age)
if (split_total == nrow(data)) {
  print(paste("The sum of Male_age and Female_age is ", split_total))
} else {
  print(paste("Received wrong count: ", nrow(data)))
}
## [1] "The sum of Male_age and Female_age is 306"
library(ggplot2)
library(lattice)
library(plot3D)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
# Scatter of Freq against age, one point per dataset row.
# aes() takes bare column names that ggplot2 looks up in `data`; writing
# data$age inside aes() bypasses that lookup and puts "data$age" in the axis
# titles.
ggplot(data, aes(x = age, y = Freq)) +
  geom_point()
# Distribution of Freq at each age value for the male rows, one box per age.
# The formula uses bare column names, resolved through the `data` argument.
boxplot(
  Freq ~ age,
  data = ds_male,
  main = toupper("Male Age vs. Male Frequency"),
  font.main = 3,
  cex.main = 1.2,
  xlab = "Male Age",
  ylab = "Male Frequency",
  font.lab = 3,
  col = "darkblue"
)

# Same view for the female rows.
boxplot(
  Freq ~ age,
  data = ds_female,
  main = toupper("Female Age vs. Female Frequency"),
  font.main = 3,
  cex.main = 1.2,
  xlab = "Female Age",
  ylab = "Female Frequency",
  font.lab = 3,
  col = "darkred"
)
# Histogram (counts, not densities) of the Freq column over all rows.
hist(
  data$Freq,
  main = "Overall Frequency",
  xlab = "Frequency",
  ylab = "Scale",
  col = "red",
  freq = TRUE
)

# Histogram (counts, not densities) of the age column over all rows.
hist(
  data$age,
  main = "Overall Age",
  xlab = "Age",
  ylab = "Frequency",
  col = "lightgreen",
  freq = TRUE
)
# Age columns of the male and female subsets, drawn against each other as
# connected points.
# NOTE(review): the upper y-limit is the number of male rows, not an age
# value - confirm this range is intended.
female_age <- ds_female$age
male_age <- ds_male$age
plot(
  x = male_age,
  y = female_age,
  type = "o",
  pch = "o",
  lty = 1,
  col = "blue",
  ylim = c(0, length(male_age))
)
# Overlay the male (red) and female (blue) Freq series, in row order, on one
# chart.
maleFreq <- ds_m$Male_Frequency
femaleFreq <- ds_f$Female_Frequency

# A single vector is drawn against its index, so the x axis is the row
# position and the y axis is the frequency; the axis titles say so.
# ylim spans both series so the female line cannot be clipped by a range
# derived from the male values alone.
plot(
  maleFreq,
  type = "o",
  col = "red",
  xlab = "Row index",
  ylab = "Frequency",
  ylim = range(maleFreq, femaleFreq, na.rm = TRUE),
  main = "Male vs. Female Frequency"
)
lines(femaleFreq, type = "o", col = "blue")
# Interactive 3D scatter of Freq (x), age (y) and method2 (z), with markers
# coloured by Freq.
p <- plot_ly(
  data,
  # Formulas with bare column names are evaluated inside `data`.
  x = ~Freq,
  y = ~age,
  z = ~method2,
  marker = list(
    # The colour must be a formula over a column of `data`; a bare name such
    # as `mpg` is looked up in the calling environment instead.
    color = ~Freq,
    colorscale = c("blue", "red"),
    showscale = TRUE
  )
) %>%
  add_markers() %>%
  layout(
    scene = list(
      xaxis = list(title = "Frequency"),
      yaxis = list(title = "Age"),
      zaxis = list(title = "Method")
    ),
    annotations = list(
      # xref/yref = "paper" are normalised plot coordinates, so the position
      # has to stay close to the 0-1 range for the label to be visible.
      x = 1.13,
      y = 1.05,
      text = "Suicide Dataset",
      xref = "paper",
      yref = "paper",
      showarrow = FALSE
    )
  )
p
Conclusion:
# Number of dataset rows whose grouped method is "gas".
# NOTE(review): this counts rows; it does not add up the Freq column.
print(paste("suicide_with_gas = ", sum(data$method2 == "gas", na.rm = TRUE)))
## [1] "suicide_with_gas = 68"
# Number of dataset rows for males in the 70-90 age group.
# NOTE(review): this counts rows; it does not add up the Freq column.
print(paste(
  "male_death_in_70-90_age_group = ",
  sum(data$age.group == "70-90" & data$sex == "male", na.rm = TRUE)
))
## [1] "male_death_in_70-90_age_group = 45"
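We notice that gas is the most used method for suicide, with a total of 68, while the other methods account for 34 or fewer each. We used the method2 column, in which gas combines toxicgas and cookgas. The age group with the most deaths is 70-90: it has 90 suicide cases (50% male), while the remaining age groups have 54 each. Note that these figures are counts of dataset rows (as returned by nrow() and summary()), not sums of the Freq column, so they reflect how many rows fall in each category rather than the number of recorded suicides.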
We can’t determine how long this pattern has been going on, but we can clearly see that males commit suicide at a higher rate than females (avg_male_freq > avg_female_freq), given that Male_dataset_count and Female_dataset_count are both equal to 153.
# One-row comparison of mean Freq and row counts for the two sexes.
data.frame(
  avg_male_freq = mean(ds_male$Freq),
  avg_female_freq = mean(ds_female$Freq),
  Male_dataset_count = nrow(ds_m),
  Female_dataset_count = nrow(ds_f)
)
## avg_male_freq avg_female_freq Male_dataset_count Female_dataset_count
## 1 221.0392 126.5556 153 153