# Load the dataset and keep it in `data`.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0.0 read.csv()
# leaves text columns as character by default, whereas the str() output
# recorded below shows factors and the later plot() calls on age and
# generation need factor columns to draw one box per level.
data <- read.csv("master.csv", stringsAsFactors = TRUE)
# Check whether the data contains any NA values
any(is.na(data))
## [1] TRUE
# Look at the structure of the data
str(data)
## 'data.frame': 27820 obs. of 12 variables:
## $ country : Factor w/ 101 levels "Albania","Antigua and Barbuda",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 1 2 2 1 1 1 2 1 ...
## $ age : Factor w/ 6 levels "15-24 years",..: 1 3 1 6 2 6 3 2 5 4 ...
## $ suicides_no : int 21 16 14 1 9 1 6 4 1 0 ...
## $ population : int 312900 308000 289700 21800 274300 35600 278800 257200 137500 311000 ...
## $ suicides.100k.pop : num 6.71 5.19 4.83 4.59 3.28 2.81 2.15 1.56 0.73 0 ...
## $ country.year : Factor w/ 2321 levels "Albania1987",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ HDI.for.year : num NA NA NA NA NA NA NA NA NA NA ...
## $ gdp_for_year.... : Factor w/ 2321 levels "1,002,219,052,968",..: 727 727 727 727 727 727 727 727 727 727 ...
## $ gdp_per_capita....: int 796 796 796 796 796 796 796 796 796 796 ...
## $ generation : Factor w/ 6 levels "Boomers","G.I. Generation",..: 3 6 3 2 1 2 6 1 2 3 ...
# Remove the HDI.for.year column, which is not needed for the analysis.
# The column is selected by name instead of by position (it is column 9 in
# the str() output above) so the step keeps targeting the right column even
# if the column order of the CSV changes.
data1 <- data[names(data) != "HDI.for.year"]
# Look at the structure of data1
str(data1)
## 'data.frame': 27820 obs. of 11 variables:
## $ country : Factor w/ 101 levels "Albania","Antigua and Barbuda",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
## $ sex : Factor w/ 2 levels "female","male": 2 2 1 2 2 1 1 1 2 1 ...
## $ age : Factor w/ 6 levels "15-24 years",..: 1 3 1 6 2 6 3 2 5 4 ...
## $ suicides_no : int 21 16 14 1 9 1 6 4 1 0 ...
## $ population : int 312900 308000 289700 21800 274300 35600 278800 257200 137500 311000 ...
## $ suicides.100k.pop : num 6.71 5.19 4.83 4.59 3.28 2.81 2.15 1.56 0.73 0 ...
## $ country.year : Factor w/ 2321 levels "Albania1987",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ gdp_for_year.... : Factor w/ 2321 levels "1,002,219,052,968",..: 727 727 727 727 727 727 727 727 727 727 ...
## $ gdp_per_capita....: int 796 796 796 796 796 796 796 796 796 796 ...
## $ generation : Factor w/ 6 levels "Boomers","G.I. Generation",..: 3 6 3 2 1 2 6 1 2 3 ...
# Check again for NA values: none remain once HDI.for.year is dropped
any(is.na(data1))
## [1] FALSE
# Summary of every column in the dataset
summary(data1)
## country year sex age
## Austria : 382 Min. :1985 female:13910 15-24 years:4642
## Iceland : 382 1st Qu.:1995 male :13910 25-34 years:4642
## Mauritius : 382 Median :2002 35-54 years:4642
## Netherlands: 382 Mean :2001 5-14 years :4610
## Argentina : 372 3rd Qu.:2008 55-74 years:4642
## Belgium : 372 Max. :2016 75+ years :4642
## (Other) :25548
## suicides_no population suicides.100k.pop
## Min. : 0.0 Min. : 278 Min. : 0.00
## 1st Qu.: 3.0 1st Qu.: 97498 1st Qu.: 0.92
## Median : 25.0 Median : 430150 Median : 5.99
## Mean : 242.6 Mean : 1844794 Mean : 12.82
## 3rd Qu.: 131.0 3rd Qu.: 1486143 3rd Qu.: 16.62
## Max. :22338.0 Max. :43805214 Max. :224.97
##
## country.year gdp_for_year.... gdp_per_capita....
## Albania1987: 12 1,002,219,052,968: 12 Min. : 251
## Albania1988: 12 1,011,797,457,139: 12 1st Qu.: 3447
## Albania1989: 12 1,016,418,229 : 12 Median : 9372
## Albania1992: 12 1,018,847,043,277: 12 Mean : 16866
## Albania1993: 12 1,022,191,296 : 12 3rd Qu.: 24874
## Albania1994: 12 1,023,196,003,075: 12 Max. :126352
## (Other) :27748 (Other) :27748
## generation
## Boomers :4990
## G.I. Generation:2744
## Generation X :6408
## Generation Z :1470
## Millenials :5844
## Silent :6364
##
# Range (minimum and maximum) of each quantitative column
range(data1[["year"]])
## [1] 1985 2016
range(data1[["suicides_no"]])
## [1] 0 22338
range(data1[["population"]])
## [1] 278 43805214
range(data1[["suicides.100k.pop"]])
## [1] 0.00 224.97
range(data1[["gdp_per_capita...."]])
## [1] 251 126352
# Standard deviation of each quantitative column
sd(data1[["year"]])
## [1] 8.469055
sd(data1[["suicides_no"]])
## [1] 902.0479
sd(data1[["population"]])
## [1] 3911779
sd(data1[["suicides.100k.pop"]])
## [1] 18.96151
sd(data1[["gdp_per_capita...."]])
## [1] 18887.58
# Plotting the data
library(ggplot2)
# Box plot of suicide numbers by sex, with the individual observations
# jittered on top and both layers coloured by sex.
# Written with ggplot() and explicit geoms because qplot() is deprecated in
# current ggplot2 releases; the layers and labels are the ones qplot() built.
# Author's reading: the male observations are more scattered than the female ones.
ggplot(data1, aes(x = sex, y = suicides_no, fill = sex)) +
  geom_boxplot() +
  geom_jitter() +
  labs(title = "Suicide numbers Vs Sex", x = "", y = "Suicide numbers")

# Scatterplot of year vs suicide numbers.
# Author's reading: the years 1990-1995 and 2000-2005 show more suicide numbers.
with(
  data1,
  plot(year, suicides_no,
       main = "Suicide number Vs Year", xlab = "Year", ylab = "Suicide numbers")
)

# Scatterplot of population vs suicides per 100k population.
# Author's reading: the points are densest for populations between 0 and 100000.
with(
  data1,
  plot(population, suicides.100k.pop,
       main = "population Vs Suicides.100k.pop",
       xlab = "Population", ylab = "Suicides.100k.pop")
)

# Scatterplot of GDP per capita vs suicide numbers.
# Author's reading: lower GDP per capita goes with more suicide numbers than
# higher GDP per capita.
with(
  data1,
  plot(gdp_per_capita...., suicides_no,
       main = "GDP/Capita Vs Suicide",
       xlab = "GDP/Capita", ylab = "Suicide number")
)

# Plot of age group vs suicide numbers (age is a factor, so plot() draws one
# box per age group).
# Author's reading: suicide numbers are highest for 35-54 years and lowest
# for 5-14 years.
with(
  data1,
  plot(age, suicides_no,
       main = "Age Vs Suicide", xlab = "Age", ylab = "Suicide number")
)

# Plot of generation vs suicide numbers (generation is a factor, so plot()
# draws one box per generation rather than a scatterplot).
# Author's reading: Boomers have the highest suicide numbers and Generation Z
# the lowest; Silent also has quite a lot.
with(
  data1,
  plot(generation, suicides_no,
       main = "Generation Vs Suicide",
       xlab = "Generation", ylab = "Suicide number")
)

# One-sample t-test on GDP per capita
# Hypothesis:
# H0 = mean GDP per capita is less than or equal to 500
# H1 = mean GDP per capita is greater than 500
# Assuming a confidence level of 95%.
# Result: the p-value is < 0.05, so the null hypothesis is rejected; the mean
# GDP per capita in this dataset is greater than 500.
# Author's note on GDP vs suicide numbers: suicide numbers are not only tied
# to lower GDP, they occur at higher GDP too.
boxplot(data1$gdp_per_capita....)

t.test(data1$gdp_per_capita...., alternative = "greater", mu = 500)
##
## One Sample t-test
##
## data: data1$gdp_per_capita....
## t = 144.53, df = 27819, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 500
## 95 percent confidence interval:
## 16680.2 Inf
## sample estimates:
## mean of x
## 16866.46
# Two-sample (Welch) t-test: suicide numbers by sex
# Hypothesis:
# H0 = mean suicide number for males is equal to the mean for females
# H1 = mean suicide number for males is not equal to the mean for females
# Assuming a confidence level of 95%.
# Result: the p-value is < 0.05, so the null hypothesis is rejected and H1 is
# accepted: the mean suicide number differs between males and females.
boxplot(data1$suicides_no ~ data1$sex)

# Arguments are spelled out in full (conf.level, var.equal) instead of relying
# on partial matching, and FALSE is used instead of F. `paired` is not passed:
# the two groups are independent, and the formula method of t.test() rejects a
# `paired` argument in current R versions.
t.test(data1$suicides_no ~ data1$sex, mu = 0, alternative = "two.sided",
       conf.level = 0.95, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: data1$suicides_no by data1$sex
## t = -24.379, df = 15985, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -281.8989 -239.9415
## sample estimates:
## mean in group female mean in group male
## 112.1143 373.0345
# Two-sample (Welch) t-test after data wrangling: suicide numbers in the
# years after 2005 vs the years up to and including 2005
# Hypothesis:
# H0 = mean suicide number for year > 2005 is less than or equal to the mean
#      for year <= 2005
# H1 = mean suicide number for year > 2005 is greater than the mean for
#      year <= 2005
# Assuming a confidence level of 95%: reject H0 only if the p-value is < 0.05.
#
# The two periods are made of different rows, so the samples are independent
# and a paired test does not apply. The test also has to compare suicides_no
# between the periods; running it on the two logical indicators themselves
# only compares the share of rows falling in each period.

# Logical indicators marking the period each row belongs to
data1$below2005 <- data1$year <= 2005
data1$above2005 <- data1$year > 2005

# Period label used only to group the box plot
period <- factor(data1$above2005, levels = c(FALSE, TRUE),
                 labels = c("year <= 2005", "year > 2005"))
boxplot(data1$suicides_no ~ period, xlab = "Period", ylab = "Suicide numbers")

# One-sided test matching H1: first sample (after 2005) greater than second
t.test(data1$suicides_no[data1$above2005], data1$suicides_no[data1$below2005],
       mu = 0, alternative = "greater", conf.level = 0.95, var.equal = FALSE)
## NOTE: the output previously recorded here came from a paired t-test on the
## logical columns below2005 and above2005, not on suicide numbers, so it is
## not kept. Re-run the script to record the result of the test above.
# ANOVA: suicide numbers across generations
# Step 1 - Levene's test for homogeneity of variance
# Hypothesis:
# H0 = the variance of suicide numbers is equal in all generations
# H1 = the variance of suicide numbers is not equal in all generations
# Assuming a confidence level of 95%.
# Result: the p-value is < 0.05, so the null hypothesis is rejected and H1 is
# accepted: the variance of suicide numbers differs between generations.
# NOTE(review): classical ANOVA assumes equal variances; given this result a
# Welch-type test such as oneway.test(..., var.equal = FALSE) may be more
# appropriate - confirm with the analysis owner.
library(car)
## Loading required package: carData
# Box plot of suicide numbers per generation. Author's reading: Boomers,
# Silent and Generation X seem to have a higher suicidal tendency.
boxplot(data1$suicides_no ~ data1$generation)

leveneTest(data1$suicides_no ~ data1$generation)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 5 101.88 < 2.2e-16 ***
## 27814
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 2 - one-way ANOVA
# H0 = all generation means are equal
# H1 = not all generation means are equal
# Fit a linear model of suicide numbers on generation for the analysis of variance
analysis <- lm(data1$suicides_no ~ data1$generation)
# The ANOVA table of the fitted model lists degrees of freedom, sums of squares
# and mean squares (variability between generations and variability within
# generations). Here the F value is > 1 and the p-value is < 0.05, so the null
# hypothesis is rejected.
anova(analysis)
## Analysis of Variance Table
##
## Response: data1$suicides_no
## Df Sum Sq Mean Sq F value Pr(>F)
## data1$generation 5 4.3585e+08 87169838 109.21 < 2.2e-16 ***
## Residuals 27814 2.2200e+10 798167
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnostic plots of the fitted model: which = 1 is residuals vs fitted
# values, which = 2 is the normal Q-Q plot of the residuals
plot(analysis, which = 1)

plot(analysis, which = 2)

# Histogram of the standardized residuals. Author's reading: the residuals are
# skewed rather than symmetric.
# NOTE(review): the original note read "positive left hand skewness" - confirm
# the direction of the skew from the histogram.
resids <- rstandard(analysis)
hist(resids)

# The null hypothesis was rejected, so suicide numbers differ between
# generations. To find out which generations differ, the pairwise differences
# in means are computed with Tukey's HSD test on the aov() fit of the model.
# Pairs with an adjusted p-value of 0.0000000, i.e. clearly different from one
# another:
# G.I. Generation-Boomers, Generation X-Boomers, Generation Z-Boomers, Millenials-Boomers,
# Silent-Boomers, Generation Z-G.I. Generation, Generation Z-Generation X, Millenials-Generation X,
# Silent-Generation Z, Silent-Millenials
# Millenials-G.I. Generation, Silent-G.I. Generation and Millenials-Generation Z
# are also below 0.05; Generation X-G.I. Generation and Silent-Generation X are not.
TukeyHSD(aov(analysis))
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = analysis)
##
## $`data1$generation`
## diff lwr upr p adj
## G.I. Generation-Boomers -271.95189 -332.459015 -211.44477 0.0000000
## Generation X-Boomers -218.61361 -266.680887 -170.54633 0.0000000
## Generation Z-Boomers -446.99482 -522.548231 -371.44141 0.0000000
## Millenials-Boomers -351.13162 -400.203852 -302.05939 0.0000000
## Silent-Boomers -177.84289 -225.982857 -129.70291 0.0000000
## Generation X-G.I. Generation 53.33828 -4.745037 111.42161 0.0931382
## Generation Z-G.I. Generation -175.04293 -257.332215 -92.75365 0.0000000
## Millenials-G.I. Generation -79.17973 -138.097406 -20.26206 0.0017854
## Silent-G.I. Generation 94.10901 35.965514 152.25250 0.0000584
## Generation Z-Generation X -228.38121 -302.007826 -154.75460 0.0000000
## Millenials-Generation X -132.51802 -178.568466 -86.46757 0.0000000
## Silent-Generation X 40.77072 -4.284991 85.82643 0.1024686
## Millenials-Generation Z 95.86320 21.576605 170.14979 0.0032202
## Silent-Generation Z 269.15194 195.477848 342.82602 0.0000000
## Silent-Millenials 173.28874 127.162417 219.41506 0.0000000
# Chi-square test
# Research question: is the suicide number associated with gender?
# H0 = the two variables are independent
# H1 = the two variables are related
# Generate the frequency table. If the values in all cells are the same, the
# design is balanced.
# NOTE(review): suicides_no is a count and every distinct value becomes its
# own column here, which gives a very sparse table (see the approximation
# warning below) - confirm that this is the intended test.
cam <- table(data1$sex, data1$suicides_no)
# Run the test once and keep the result so its degrees of freedom can be
# reused for the critical value below.
# The p-value is extremely low, so there is a lot of tilt to the stacks (the
# counts are not likely to be from chance).
chisq_result <- chisq.test(cam)
chisq_result
## Warning in chisq.test(cam): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: cam
## X-squared = 3235.7, df = 2083, p-value < 2.2e-16
# Critical value at the 95% level, using the degrees of freedom reported by
# the test instead of a number copied from its printed output.
# Chi-square value >= critical value, so the null hypothesis is rejected,
# which means accepting that the two variables are related.
qchisq(0.95, df = unname(chisq_result$parameter))
## [1] 2190.291