library(dplyr)
library(data.table)
library(purrr)
library(ggplot2)
library(vcd)
library(reshape)
library(boot)
library(gplots)
library(car)
library(HH)The first step is loading the data:
load("be3_gss")The data used for this Inference Project, is an extract of the General Social Survey (GSS) which provides indicators across the Us in providing data sources for students learning statistical reasoning. Basically, our data consist of measure constants like attitudes, behaviors and attributes of American society.
Through further exploratory analysis, we can’t make causal conclusions since there can be variables which are not computerized from the data collected (i.e.: missing daily activities, hobbies, etc.) and they should include a deeper evaluation and understanding. Our further analysis doesn’t exactly mean that is bad, since base on the research questions we are simply stablishing relationships from our data and we can make an observational study with methods such as inference. The results then, can be generalized to the entire population.
Globally, there are several reasons in people to think why a person may commit a suicide. One of those are: Incurable diseases, bankrupt, dishonored family and tired of living; moreover, these reasons could have any relationship if a person owns a fire gun in their houses. Based on these reasons, how likely is the way of thinking on American society that a person should commit a suicide, given their economic status?
After going through the attributes, we have selected: age; sex; marital status; financial situation; unemployment; whether owning a gun and total family income in dollars.
gss.data <- gss
i <- sapply(gss.data, is.factor)
gss.data[i] <- lapply(gss.data[i], as.character)
gss.data$suicide1[gss.data$suicide1=="Yes"] <- "INCURABLE DISEASE"
gss.data$suicide2[gss.data$suicide2=="Yes"] <- "BANKRUPT"
gss.data$suicide3[gss.data$suicide3=="Yes"] <- "DISHONORED FAMILY"
gss.data$suicide4[gss.data$suicide4=="Yes"] <- "TIRED LIVING"
gss.data.t <- gss.data %>% dplyr::select(age, sex, marital, class, finalter, owngun, unemp, coninc, suicide1, suicide2, suicide3, suicide4)
gss.data.t <- melt(gss.data.t, id=c("age", "sex", "marital", "class", "finalter","owngun", "unemp", "coninc"))
gss.data.t <- gss.data.t[,-9]
gss.data.t$value <- as.character(gss.data.t$value)
gss.data.t <- subset(gss.data.t, is.na(value)==FALSE & value != "No")
# changing names
names(gss.data.t)[names(gss.data.t) == "value"] <- "suicide"hist(gss.data.t$age, col = "red", xlab="Age", main="Age Frequencies - GSS 2014")ggplot(data = gss.data.t, aes(x = suicide, y = age, fill = sex)) +
geom_boxplot() + labs(title = "Age by Suicide Thinking", x = "Suicide Type", y="Age") + theme(plot.title = element_text(hjust = 0.5))ggplot(data = gss.data.t, aes(x = suicide, y = coninc)) +
geom_boxplot() + labs(title = "Total Family Income Suicide Types", x = "Suicide Type", y="Family Income") + theme(plot.title = element_text(hjust = 0.5))mosaic( ~ suicide + finalter + unemp , data= gss.data.t,highlighting="finalter",
highlighting_fill=c("green", "yellow", "red"), labeling_args =
list(set_varnames = c(finalter = "Financial status",suicide="Suicide Type", unemp = "unemployed?" )))gss.data.chi.fin <- xtabs(~finalter + suicide, data=gss.data.t)
chisq.test(gss.data.chi.fin)##
## Pearson's Chi-squared test
##
## data: gss.data.chi.fin
## X-squared = 12.311, df = 6, p-value = 0.05539
gss.data.chi.marital <- xtabs(~marital + suicide, data=gss.data.t)
chisq.test(gss.data.chi.marital)##
## Pearson's Chi-squared test
##
## data: gss.data.chi.marital
## X-squared = 133.99, df = 12, p-value < 2.2e-16
gss.data.chi.unemp <- xtabs(~unemp + suicide, data=gss.data.t)
chisq.test(gss.data.chi.unemp)##
## Pearson's Chi-squared test
##
## data: gss.data.chi.unemp
## X-squared = 0.059428, df = 3, p-value = 0.9962
gss.data.chi.owngun <- xtabs(~owngun + suicide, data=gss.data.t)
chisq.test(gss.data.chi.owngun)##
## Pearson's Chi-squared test
##
## data: gss.data.chi.owngun
## X-squared = 26.523, df = 6, p-value = 0.0001778
keeps <- c("coninc")
gss.data.inc <- gss.data.t[keeps]
gss.data.inc <- na.omit(gss.data.inc, cols="coninc")
keeps2 <- c("coninc", "suicide", "owngun", "marital")
gss.data.anova <- gss.data.t[keeps2]
gss.data.anova <- gss.data.anova[complete.cases(gss.data.anova),]
attach(gss.data.anova)
suicide.group.means <- aggregate(coninc, by=list(suicide), FUN=mean, na.rm=TRUE )
print(suicide.group.means)## Group.1 x
## 1 BANKRUPT 53886.43
## 2 DISHONORED FAMILY 54294.17
## 3 INCURABLE DISEASE 49211.92
## 4 TIRED LIVING 50684.16
plotmeans(coninc ~ suicide, digits=2, mean.labels = T,
main="Familiar Income means by Suicide Thinking (95% confidence intervals)",
xlab = "Suicide Type", ylab = "Salary means") boxplot(gss.data.anova$coninc ~ gss.data.anova$suicide, main="Salary Income by Suicide Thinking", ylab="Salary Income", xlab="Suicide Thinking", col=rainbow(7))suicide.group.sd <- aggregate(coninc, by=list(suicide), FUN=sd, na.rm=TRUE)
print(suicide.group.sd)## Group.1 x
## 1 BANKRUPT 42139.63
## 2 DISHONORED FAMILY 42526.45
## 3 INCURABLE DISEASE 38252.49
## 4 TIRED LIVING 41463.16
suicide.anova <- aov(coninc ~ suicide)
summary(suicide.anova)## Df Sum Sq Mean Sq F value Pr(>F)
## suicide 3 4.604e+10 1.535e+10 9.693 2.19e-06 ***
## Residuals 11872 1.880e+13 1.583e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
tuk<- TukeyHSD(suicide.anova)
print(tuk)## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = coninc ~ suicide)
##
## $suicide
## diff lwr upr
## DISHONORED FAMILY-BANKRUPT 407.7419 -3565.596 4381.07941
## INCURABLE DISEASE-BANKRUPT -4674.5092 -7755.210 -1593.80838
## TIRED LIVING-BANKRUPT -3202.2754 -6784.368 379.81769
## INCURABLE DISEASE-DISHONORED FAMILY -5082.2512 -8124.995 -2039.50782
## TIRED LIVING-DISHONORED FAMILY -3610.0173 -7159.519 -60.51593
## TIRED LIVING-INCURABLE DISEASE 1472.2339 -1038.198 3982.66551
## p adj
## DISHONORED FAMILY-BANKRUPT 0.9935846
## INCURABLE DISEASE-BANKRUPT 0.0005632
## TIRED LIVING-BANKRUPT 0.0987536
## INCURABLE DISEASE-DISHONORED FAMILY 0.0001052
## TIRED LIVING-DISHONORED FAMILY 0.0444650
## TIRED LIVING-INCURABLE DISEASE 0.4333675
a <- aggregate(coninc, by=list(marital, suicide), FUN=mean)
print(a)## Group.1 Group.2 x
## 1 Divorced BANKRUPT 41475.55
## 2 Married BANKRUPT 71637.12
## 3 Never Married BANKRUPT 40385.74
## 4 Separated BANKRUPT 41036.69
## 5 Widowed BANKRUPT 33104.33
## 6 Divorced DISHONORED FAMILY 42187.09
## 7 Married DISHONORED FAMILY 71998.71
## 8 Never Married DISHONORED FAMILY 40707.51
## 9 Separated DISHONORED FAMILY 41094.82
## 10 Widowed DISHONORED FAMILY 35295.43
## 11 Divorced INCURABLE DISEASE 36564.55
## 12 Married INCURABLE DISEASE 63106.71
## 13 Never Married INCURABLE DISEASE 37150.65
## 14 Separated INCURABLE DISEASE 33834.25
## 15 Widowed INCURABLE DISEASE 27147.02
## 16 Divorced TIRED LIVING 39375.86
## 17 Married TIRED LIVING 66571.86
## 18 Never Married TIRED LIVING 38787.36
## 19 Separated TIRED LIVING 38179.04
## 20 Widowed TIRED LIVING 29148.97
b <- aggregate(coninc, by=list(marital, suicide), FUN=sd)
print(b)## Group.1 Group.2 x
## 1 Divorced BANKRUPT 30522.90
## 2 Married BANKRUPT 44362.35
## 3 Never Married BANKRUPT 35517.33
## 4 Separated BANKRUPT 34607.60
## 5 Widowed BANKRUPT 39590.50
## 6 Divorced DISHONORED FAMILY 31907.43
## 7 Married DISHONORED FAMILY 44249.34
## 8 Never Married DISHONORED FAMILY 36693.59
## 9 Separated DISHONORED FAMILY 34903.72
## 10 Widowed DISHONORED FAMILY 39421.98
## 11 Divorced INCURABLE DISEASE 29938.14
## 12 Married INCURABLE DISEASE 39481.49
## 13 Never Married INCURABLE DISEASE 32961.99
## 14 Separated INCURABLE DISEASE 30792.84
## 15 Widowed INCURABLE DISEASE 27432.16
## 16 Divorced TIRED LIVING 30905.25
## 17 Married TIRED LIVING 44297.44
## 18 Never Married TIRED LIVING 35339.35
## 19 Separated TIRED LIVING 33731.16
## 20 Widowed TIRED LIVING 33761.39
suicide.anova3 <- aov(coninc ~ suicide*marital )
summary(suicide.anova3)## Df Sum Sq Mean Sq F value Pr(>F)
## suicide 3 4.604e+10 1.535e+10 11.171 2.57e-07 ***
## marital 4 2.499e+12 6.248e+11 454.787 < 2e-16 ***
## suicide:marital 12 1.059e+10 8.823e+08 0.642 0.808
## Residuals 11856 1.629e+13 1.374e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
interaction.plot(suicide, marital, coninc, type="b", col=c("red","blue","green", "orange"), pch=c(20,10),main = "Interaction between Suicide Type and Marital Status")interaction2wt(coninc ~ suicide*marital*owngun, rot=c(90,0), main.in = "Interaction plot: Income = Suicide x Marital x Gun")