MiniProject1.utf8.md

# Load the suicide dataset into `data`.
# stringsAsFactors = TRUE is set explicitly: the analysis below relies on
# factor columns (see the str() output), and read.csv() no longer converts
# strings to factors by default since R 4.0.0.
data <- read.csv("master.csv", stringsAsFactors = TRUE)
# Check whether the data contain any missing values (nothing is removed here)
anyNA(data)

## [1] TRUE

# Inspect the structure of the data
str(data)

## 'data.frame':    27820 obs. of  12 variables:
##  $ country           : Factor w/ 101 levels "Albania","Antigua and Barbuda",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year              : int  1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
##  $ sex               : Factor w/ 2 levels "female","male": 2 2 1 2 2 1 1 1 2 1 ...
##  $ age               : Factor w/ 6 levels "15-24 years",..: 1 3 1 6 2 6 3 2 5 4 ...
##  $ suicides_no       : int  21 16 14 1 9 1 6 4 1 0 ...
##  $ population        : int  312900 308000 289700 21800 274300 35600 278800 257200 137500 311000 ...
##  $ suicides.100k.pop : num  6.71 5.19 4.83 4.59 3.28 2.81 2.15 1.56 0.73 0 ...
##  $ country.year      : Factor w/ 2321 levels "Albania1987",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ HDI.for.year      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ gdp_for_year....  : Factor w/ 2321 levels "1,002,219,052,968",..: 727 727 727 727 727 727 727 727 727 727 ...
##  $ gdp_per_capita....: int  796 796 796 796 796 796 796 796 796 796 ...
##  $ generation        : Factor w/ 6 levels "Boomers","G.I. Generation",..: 3 6 3 2 1 2 6 1 2 3 ...

# Drop the HDI column, which is not needed for the analysis (its first values
# are all NA in the str() output above). The column is selected by name rather
# than by position so this step keeps working if the column order changes.
data1 <- data[, names(data) != "HDI.for.year", drop = FALSE]
# Inspect the structure of data1
str(data1)

## 'data.frame':    27820 obs. of  11 variables:
##  $ country           : Factor w/ 101 levels "Albania","Antigua and Barbuda",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ year              : int  1987 1987 1987 1987 1987 1987 1987 1987 1987 1987 ...
##  $ sex               : Factor w/ 2 levels "female","male": 2 2 1 2 2 1 1 1 2 1 ...
##  $ age               : Factor w/ 6 levels "15-24 years",..: 1 3 1 6 2 6 3 2 5 4 ...
##  $ suicides_no       : int  21 16 14 1 9 1 6 4 1 0 ...
##  $ population        : int  312900 308000 289700 21800 274300 35600 278800 257200 137500 311000 ...
##  $ suicides.100k.pop : num  6.71 5.19 4.83 4.59 3.28 2.81 2.15 1.56 0.73 0 ...
##  $ country.year      : Factor w/ 2321 levels "Albania1987",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ gdp_for_year....  : Factor w/ 2321 levels "1,002,219,052,968",..: 727 727 727 727 727 727 727 727 727 727 ...
##  $ gdp_per_capita....: int  796 796 796 796 796 796 796 796 796 796 ...
##  $ generation        : Factor w/ 6 levels "Boomers","G.I. Generation",..: 3 6 3 2 1 2 6 1 2 3 ...

# Confirm that data1 contains no missing values (none are found)
anyNA(data1)

## [1] FALSE

# Summary statistics for every column of data1
summary(data1)

##         country           year          sex                 age      
##  Austria    :  382   Min.   :1985   female:13910   15-24 years:4642  
##  Iceland    :  382   1st Qu.:1995   male  :13910   25-34 years:4642  
##  Mauritius  :  382   Median :2002                  35-54 years:4642  
##  Netherlands:  382   Mean   :2001                  5-14 years :4610  
##  Argentina  :  372   3rd Qu.:2008                  55-74 years:4642  
##  Belgium    :  372   Max.   :2016                  75+ years  :4642  
##  (Other)    :25548                                                   
##   suicides_no        population       suicides.100k.pop
##  Min.   :    0.0   Min.   :     278   Min.   :  0.00   
##  1st Qu.:    3.0   1st Qu.:   97498   1st Qu.:  0.92   
##  Median :   25.0   Median :  430150   Median :  5.99   
##  Mean   :  242.6   Mean   : 1844794   Mean   : 12.82   
##  3rd Qu.:  131.0   3rd Qu.: 1486143   3rd Qu.: 16.62   
##  Max.   :22338.0   Max.   :43805214   Max.   :224.97   
##                                                        
##       country.year            gdp_for_year.... gdp_per_capita....
##  Albania1987:   12   1,002,219,052,968:   12   Min.   :   251    
##  Albania1988:   12   1,011,797,457,139:   12   1st Qu.:  3447    
##  Albania1989:   12   1,016,418,229    :   12   Median :  9372    
##  Albania1992:   12   1,018,847,043,277:   12   Mean   : 16866    
##  Albania1993:   12   1,022,191,296    :   12   3rd Qu.: 24874    
##  Albania1994:   12   1,023,196,003,075:   12   Max.   :126352    
##  (Other)    :27748   (Other)          :27748                     
##            generation  
##  Boomers        :4990  
##  G.I. Generation:2744  
##  Generation X   :6408  
##  Generation Z   :1470  
##  Millenials     :5844  
##  Silent         :6364  
##

# Range (minimum and maximum) of each quantitative variable
range(data1$year)

## [1] 1985 2016

range(data1$suicides_no)

## [1]     0 22338

range(data1$population)

## [1]      278 43805214

range(data1$suicides.100k.pop)

## [1]   0.00 224.97

range(data1$gdp_per_capita....)

## [1]    251 126352

# Standard deviation of each quantitative variable
sd(data1$year)

## [1] 8.469055

sd(data1$suicides_no)

## [1] 902.0479

sd(data1$population)

## [1] 3911779

sd(data1$suicides.100k.pop)

## [1] 18.96151

sd(data1$gdp_per_capita....)

## [1] 18887.58

# Plotting the data
library(ggplot2)
# Box plot of suicide numbers by sex with the individual observations jittered
# on top; the male observations are clearly more scattered than the female ones.
# Built from ggplot() layers because qplot() is deprecated in current ggplot2.
ggplot(data1, aes(x = sex, y = suicides_no, fill = sex)) +
  geom_boxplot() +
  geom_jitter() +
  labs(title = "Suicide numbers Vs Sex", x = "", y = "Suicide numbers")

# Scatter plot of suicide numbers by year: the years 1990-1995 and 2000-2005
# show the higher suicide numbers.
with(data1, plot(
  x = year, y = suicides_no,
  main = "Suicide number Vs Year", xlab = "Year", ylab = "Suicide numbers"
))

# Scatter plot of suicides per 100k population against population: the points
# are densest for populations between 0 and 100000.
with(data1, plot(
  x = population, y = suicides.100k.pop,
  main = "population Vs Suicides.100k.pop",
  xlab = "Population", ylab = "Suicides.100k.pop"
))

# Scatter plot of suicide numbers against GDP per capita: lower GDP per capita
# goes with more suicides than higher GDP per capita.
with(data1, plot(
  x = gdp_per_capita...., y = suicides_no,
  main = "GDP/Capita Vs Suicide", xlab = "GDP/Capita", ylab = "Suicide number"
))

# Suicide numbers by age group (age is a factor, so plot() draws one box per
# level): the maximum is in the 35-54 group and the least in the 5-14 group.
with(data1, plot(
  x = age, y = suicides_no,
  main = "Age Vs Suicide", xlab = "Age", ylab = "Suicide number"
))

# Suicide numbers by generation (again one box per factor level): Boomers are
# highest and Generation Z lowest; Silent also has quite a lot of suicides.
with(data1, plot(
  x = generation, y = suicides_no,
  main = "Generation Vs Suicide", xlab = "Generation", ylab = "Suicide number"
))

# One-sample t-test
# Hypotheses:
#   H0: mean GDP per capita is less than or equal to 500
#   H1: mean GDP per capita is greater than 500
# Significance level 0.05 (95% confidence).
# The p-value is below 0.05, so H0 is rejected: the mean GDP per capita in this
# dataset is greater than 500. For the GDP-vs-suicide comparison this means
# suicide numbers do not depend only on low GDP but occur at higher GDP too.
boxplot(data1$gdp_per_capita....)

# The first argument is written as data1$... on purpose: t.test() prints that
# expression as the data name in its report.
t.test(
  data1$gdp_per_capita....,
  alternative = "greater",
  mu = 500
)

## 
##  One Sample t-test
## 
## data:  data1$gdp_per_capita....
## t = 144.53, df = 27819, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 500
## 95 percent confidence interval:
##  16680.2     Inf
## sample estimates:
## mean of x 
##  16866.46

# Two-sample t-test
# Hypotheses:
#   H0: the mean suicide number for males equals the mean for females
#   H1: the mean suicide number for males differs from the mean for females
# Significance level 0.05 (95% confidence).
# The p-value is below 0.05, so H0 is rejected in favour of H1: the mean
# suicide number for males is not equal to the mean for females.
boxplot(data1$suicides_no ~ data1$sex)

# Welch test (var.equal = FALSE). Argument names are spelled out in full
# rather than relying on partial matching (conf, var.eq), and FALSE replaces
# the reassignable shorthand F. `paired` is not passed: FALSE is already the
# default, and recent R releases reject `paired` in the formula interface.
t.test(data1$suicides_no ~ data1$sex,
  mu = 0, alternative = "two.sided", conf.level = 0.95, var.equal = FALSE
)

## 
##  Welch Two Sample t-test
## 
## data:  data1$suicides_no by data1$sex
## t = -24.379, df = 15985, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -281.8989 -239.9415
## sample estimates:
## mean in group female   mean in group male 
##             112.1143             373.0345

# Paired-sample t-test (with data wrangling)
# Hypotheses:
#   H0: the mean suicide number per country after 2005 is less than or equal
#       to the mean up to and including 2005
#   H1: the mean suicide number per country is greater after 2005
# Significance level 0.05 (95% confidence): reject H0 if the p-value is < 0.05.
#
# The earlier version of this step passed the two logical period indicators
# themselves to boxplot() and t.test(), so it only compared the share of rows
# in each period and never looked at suicides_no. The test now pairs, for each
# country, its mean suicide number in the two periods, and uses a one-sided
# alternative to match H1.
# NOTE: the recorded output below was produced by the earlier version and must
# be regenerated by re-running the script.
data1$below2005 <- data1$year <= 2005
data1$above2005 <- data1$year > 2005

# Mean suicide number per country and period; the columns are named "FALSE"
# (year <= 2005) and "TRUE" (year > 2005) after the values of above2005.
period_means <- tapply(
  data1$suicides_no,
  list(data1$country, data1$above2005),
  mean
)
# A paired test needs both measurements, so keep only the countries observed
# in both periods (tapply() yields NA for an empty country/period cell).
period_means <- period_means[complete.cases(period_means), , drop = FALSE]
mean_up_to_2005 <- period_means[, "FALSE"]
mean_after_2005 <- period_means[, "TRUE"]

boxplot(
  list("<= 2005" = mean_up_to_2005, "> 2005" = mean_after_2005),
  ylab = "Mean suicide number per country"
)

t.test(mean_after_2005, mean_up_to_2005,
  mu = 0, alternative = "greater", conf.level = 0.95, paired = TRUE
)

## 
##  Paired t-test
## 
## data:  data1$below2005 and data1$above2005
## t = 48.853, df = 27819, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.2698150 0.2923705
## sample estimates:
## mean of the differences 
##               0.2810927

# One-way ANOVA of suicide numbers across generations
# First, Levene's test for homogeneity of variance:
#   H0: the variance of suicide numbers is equal in all generations
#   H1: the variances are not all equal
# Significance level 0.05 (95% confidence).
# The p-value is below 0.05, so H0 is rejected: the variance of suicide
# numbers is not equal across generations.
library(car)

## Loading required package: carData

# Distribution of suicide numbers per generation; Boomers, Silent and
# Generation X appear to have the higher suicide numbers.
boxplot(data1$suicides_no ~ data1$generation)

# Levene's test of equal variances across generations
leveneTest(data1$suicides_no ~ data1$generation)

## Levene's Test for Homogeneity of Variance (center = median)
##          Df F value    Pr(>F)    
## group     5  101.88 < 2.2e-16 ***
##       27814                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# ANOVA hypotheses:
#   H0: the mean suicide number is the same in all generations
#   H1: the means are not all equal
# Fit a linear model of suicide number on generation for the analysis of variance
analysis <- lm(data1$suicides_no ~ data1$generation)
# The ANOVA table of the fitted model shows the degrees of freedom, sums of
# squares and mean squares (variability between generations and variability
# within generations). The F value is greater than 1 and the p-value is below
# 0.05, so the null hypothesis is rejected.
anova(analysis)

## Analysis of Variance Table
## 
## Response: data1$suicides_no
##                     Df     Sum Sq  Mean Sq F value    Pr(>F)    
## data1$generation     5 4.3585e+08 87169838  109.21 < 2.2e-16 ***
## Residuals        27814 2.2200e+10   798167                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Plot the residual diagnostics of the fitted model: panels 1 and 2 of
# plot.lm() (residuals vs fitted values, then the normal Q-Q plot)
invisible(lapply(c(1, 2), function(panel) plot(analysis, which = panel)))

# Histogram of the standardized residuals; the distribution is positively
# skewed, with the bulk of the values on the left-hand side
resids <- rstandard(analysis)
hist(resids)

# The null hypothesis was rejected, so suicide numbers differ between
# generations, but the ANOVA does not say which generations differ. Tukey's
# HSD test on the aov() fit compares every pair of generation means.
# The following pairs have an adjusted p-value of 0.0000000 and so differ
# from one another:
#   G.I. Generation-Boomers, Generation X-Boomers, Generation Z-Boomers,
#   Millenials-Boomers, Silent-Boomers, Generation Z-G.I. Generation,
#   Generation Z-Generation X, Millenials-Generation X, Silent-Generation Z,
#   Silent-Millenials
# In the recorded output only Generation X-G.I. Generation and
# Silent-Generation X have adjusted p-values above 0.05.
TukeyHSD(aov(analysis))

##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = analysis)
## 
## $`data1$generation`
##                                    diff         lwr        upr     p adj
## G.I. Generation-Boomers      -271.95189 -332.459015 -211.44477 0.0000000
## Generation X-Boomers         -218.61361 -266.680887 -170.54633 0.0000000
## Generation Z-Boomers         -446.99482 -522.548231 -371.44141 0.0000000
## Millenials-Boomers           -351.13162 -400.203852 -302.05939 0.0000000
## Silent-Boomers               -177.84289 -225.982857 -129.70291 0.0000000
## Generation X-G.I. Generation   53.33828   -4.745037  111.42161 0.0931382
## Generation Z-G.I. Generation -175.04293 -257.332215  -92.75365 0.0000000
## Millenials-G.I. Generation    -79.17973 -138.097406  -20.26206 0.0017854
## Silent-G.I. Generation         94.10901   35.965514  152.25250 0.0000584
## Generation Z-Generation X    -228.38121 -302.007826 -154.75460 0.0000000
## Millenials-Generation X      -132.51802 -178.568466  -86.46757 0.0000000
## Silent-Generation X            40.77072   -4.284991   85.82643 0.1024686
## Millenials-Generation Z        95.86320   21.576605  170.14979 0.0032202
## Silent-Generation Z           269.15194  195.477848  342.82602 0.0000000
## Silent-Millenials             173.28874  127.162417  219.41506 0.0000000

# Chi-squared test
# Research question: is the suicide number associated with sex?
#   H0: the two variables are independent
#   H1: the two variables are related

# Build the frequency table of sex against suicide number. If the values in
# all cells are the same, the design is balanced.
# NOTE(review): suicides_no is a count, so table() creates one column per
# distinct count value. The recorded test has df = 2083 and R warns that the
# chi-squared approximation may be incorrect, so treat the result with
# caution - consider binning or aggregating the counts first.
cam <- table(data1$sex, data1$suicides_no)

# Run the chi-squared test.
# The p-value is extremely low, so there is a lot of tilt to the stacks
# (the counts are not likely to be from chance).
chisq.test(cam)

## Warning in chisq.test(cam): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  cam
## X-squared = 3235.7, df = 2083, p-value < 2.2e-16

# Critical value of the chi-squared distribution at the 5% significance level.
# The degrees of freedom are derived from the contingency table as
# (rows - 1) * (columns - 1) instead of being hard-coded as 2083, so the value
# stays correct if the table changes.
# If the chi-squared statistic is >= the critical value, the null hypothesis
# is rejected, i.e. the two variables are taken to be related.
qchisq(0.95, df = (nrow(cam) - 1) * (ncol(cam) - 1))

## [1] 2190.291

MiniProject1.R

arnabchakraboty

2020-02-12