#Get rid of scientific notation
options(scipen = 999)
#loading NY Suicide Data Set
suicide <- read.csv(file = "Vital_Statistics_Suicide.csv")
#renaming year column (weird character in front of year)
colnames(suicide)[1]<- "Year"
#removing not stated from Race.or.Ethnicity
suicide2 <- subset(suicide, Race.or.Ethnicity!= "Not Stated")
#renaming Black Non Hispanic to Black in Race.or.Ethnicity
suicide2$Race.or.Ethnicity <- ifelse(suicide2$Race.or.Ethnicity == "Black Non Hispanic", "Black", suicide2$Race.or.Ethnicity)
#renaming Other Non Hispanic to Other in Race.or.Ethnicity
suicide2$Race.or.Ethnicity <- ifelse(suicide2$Race.or.Ethnicity == "Other Non Hispanic", "Other", suicide2$Race.or.Ethnicity)
#renaming White Other Non Hispanic to White in Race.or.Ethnicity
suicide2$Race.or.Ethnicity <- ifelse(suicide2$Race.or.Ethnicity == "White Non Hispanic", "White", suicide2$Race.or.Ethnicity)
#Just the totals from each of the age groups based on Year (used for ANOVA test)
suicide3 <- subset(suicide2, Age.Group == "Total")
#Summed up the results between the two regions
suicide3 <- aggregate(suicide3$Suicide.Deaths~suicide3$Year + suicide3$Race.or.Ethnicity,FUN = sum, na.rm = TRUE)
#removing data that says total, <1, 1-9 from Age.Group column
suicide2 <- subset(suicide2, Age.Group != "Total" & Age.Group != "<1" & Age.Group != "1-9")
#loading Depression Income Set
income <- read.csv(file = "depression_income.csv")
# Re-coding income labels
income$income <- ifelse(income$income == "Not classified", NA, income$income)
income$income <- ifelse(income$income == "High income", "high", ifelse (income$income == "Upper middle income", "upper-mid", ifelse (income$income == "Low income", "low", income$income)))
income$income <- ifelse(income$income =="High income: OECD", "OECD", ifelse (income$income == "Lower middle income", "lower-mid", income$income))## Year Region Race.or.Ethnicity Sex
## Min. :2003 Length:1296 Length:1296 Length:1296
## 1st Qu.:2007 Class :character Class :character Class :character
## Median :2011 Mode :character Mode :character Mode :character
## Mean :2011
## 3rd Qu.:2015
## Max. :2017
## Age.Group Firearm.Deaths Alcohol.Related.Deaths Suicide.Deaths
## Length:1296 Min. : 0.00 Length:1296 Min. : 0.00
## Class :character 1st Qu.: 0.00 Class :character 1st Qu.: 0.00
## Mode :character Median : 0.00 Mode :character Median : 3.00
## Mean : 8.61 Mean : 17.21
## 3rd Qu.: 5.00 3rd Qu.: 19.00
## Max. :130.00 Max. :223.00
#line plot for average suicide deaths per year and total suicide deaths per year
year_summary <- suicide2 %>%
group_by(Year) %>%
summarize(`Total Suicide Deaths` = sum(Suicide.Deaths),
`Average Suicide Deaths` = mean(Suicide.Deaths))## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(year_summary, aes(x = Year, y = `Average Suicide Deaths`)) +
geom_point() +
geom_path() +
ylim(0,40) +
theme_minimal()ggplot(year_summary, aes(x = Year, y = `Total Suicide Deaths`)) +
geom_point() +
geom_path() +
theme_minimal()#tables of mean number of suicide deaths
aggregate(suicide2$Suicide.Deaths~suicide2$Race.or.Ethnicity,FUN = mean)## suicide2$Race.or.Ethnicity suicide2$Suicide.Deaths
## 1 Black 1.496575
## 2 Hispanic 32.946296
## 3 Other 2.000000
## 4 White 11.849530
## suicide2$Race.or.Ethnicity suicide2$Year suicide2$Suicide.Deaths
## 1 Black 2003 0.03846154
## 2 Hispanic 2003 34.61111111
## 3 Other 2003 0.00000000
## 4 White 2003 0.18181818
## 5 Black 2004 0.00000000
## 6 Hispanic 2004 34.80555556
## 7 Other 2004 0.00000000
## 8 White 2004 0.00000000
## 9 Black 2005 0.00000000
## 10 Hispanic 2005 36.44444444
## 11 Other 2005 0.00000000
## 12 White 2005 0.11111111
## 13 Black 2006 0.00000000
## 14 Hispanic 2006 36.22222222
## 15 Other 2006 0.00000000
## 16 White 2006 0.05882353
## 17 Black 2007 0.07692308
## 18 Hispanic 2007 38.55555556
## 19 Other 2007 0.00000000
## 20 White 2007 0.07142857
## 21 Black 2008 0.00000000
## 22 Hispanic 2008 38.44444444
## 23 Other 2008 0.00000000
## 24 White 2008 0.11764706
## 25 Black 2009 0.00000000
## 26 Hispanic 2009 38.63888889
## 27 Other 2009 0.00000000
## 28 White 2009 0.06666667
## 29 Black 2010 0.05263158
## 30 Hispanic 2010 41.86111111
## 31 White 2010 0.00000000
## 32 Black 2011 0.00000000
## 33 Hispanic 2011 44.75000000
## 34 Other 2011 0.50000000
## 35 White 2011 0.23809524
## 36 Black 2012 0.00000000
## 37 Hispanic 2012 45.94444444
## 38 Other 2012 0.00000000
## 39 White 2012 0.06666667
## 40 Black 2013 0.10000000
## 41 Hispanic 2013 45.25000000
## 42 Other 2013 0.00000000
## 43 White 2013 0.05882353
## 44 Black 2014 0.00000000
## 45 Hispanic 2014 45.63888889
## 46 Other 2014 0.00000000
## 47 White 2014 0.33333333
## 48 Black 2015 3.83333333
## 49 Hispanic 2015 4.16666667
## 50 Other 2015 2.69444444
## 51 White 2015 35.19444444
## 52 Black 2016 3.97222222
## 53 Hispanic 2016 4.36111111
## 54 Other 2016 2.69444444
## 55 White 2016 34.66666667
## 56 Black 2017 4.22222222
## 57 Hispanic 2017 4.50000000
## 58 Other 2017 2.63888889
## 59 White 2017 34.50000000
#box plot
boxplot(suicide2$Suicide.Deaths~suicide2$Race.or.Ethnicity,col=c("grey25", "grey50", "grey75", "grey100"), ylab = "NUMBER OF SUICIDE DEATHS", xlab = "RACE/ETHNICITY")#Histogram to find a high number of suicides
hist(suicide2$Suicide.Deaths, xlab = "NUMBER OF SUICIDE DEATHS", ylab = "FREQUENCY") Based on the mean number of suicide deaths per race/ethnic group and then suicides deaths per race/ethnic group per year it appears as though Hispanic people have higher numbers of suicide deaths. Based on the box plot, it appears as though Hispanics have the highest number of suicide deaths on average. White appears to be the second largest, with Other and Black being very low. The histogram shows that the distribution of suicide deaths is not normally distributed. Based on the line plot for total suicide deaths, total suicide deaths are increasing every year. Based on the line plot for average suicide deaths, the average suicide deaths were increasing from 2003 to 2012 and then started decreasing.
Null hypothesis: There is no difference in the mean number suicide deaths between different years and race/ethnicities.
Alternative hypothesis: There is a difference in the mean number of suicide deaths between different years and race/ethnicities.
#ANOVA Tests
ANOVA1 <- aov(suicide3$`suicide3$Suicide.Deaths`~ as.factor(suicide3$`suicide3$Year`) + suicide3$`suicide3$Race.or.Ethnicity`)
summary(ANOVA1)## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(suicide3$`suicide3$Year`) 14 142513 10179 0.052 1
## suicide3$`suicide3$Race.or.Ethnicity` 3 13606920 4535640 23.292 0.00000000581
## Residuals 41 7983818 194727
##
## as.factor(suicide3$`suicide3$Year`)
## suicide3$`suicide3$Race.or.Ethnicity` ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
ANOVA2 <- aov(suicide3$`suicide3$Suicide.Deaths`~ suicide3$`suicide3$Race.or.Ethnicity`)
summary(ANOVA2)## Df Sum Sq Mean Sq F value
## suicide3$`suicide3$Race.or.Ethnicity` 3 13655399 4551800 30.99
## Residuals 55 8077852 146870
## Pr(>F)
## suicide3$`suicide3$Race.or.Ethnicity` 0.00000000000726 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = suicide3$`suicide3$Suicide.Deaths` ~ suicide3$`suicide3$Race.or.Ethnicity`)
##
## $`suicide3$`suicide3$Race.or.Ethnicity``
## diff lwr upr p adj
## Hispanic-Black 1157.266667 786.5212 1528.0122 0.0000000
## Other-Black -8.552381 -385.8603 368.7555 0.9999213
## White-Black 222.733333 -148.0122 593.4788 0.3917449
## Other-Hispanic -1165.819048 -1543.1269 -788.5112 0.0000000
## White-Hispanic -934.533333 -1305.2788 -563.7878 0.0000001
## White-Other 231.285714 -146.0222 608.5936 0.3738476
Interpretation: Hispanics had the highest number of suicide deaths out of all of the racial/ethnic groups. The Hispanic group had a higher number of of suicide deaths that was significantly higher than the Black, White and Other racial groups based on the p-value of effectively 0. There was no significant difference between the White, Black, and Other racial groups based on the adjusted P-values. There was no effect of year on mean number of suicide deaths (p value = 1)
Null: The data is approximately normally distributed. Alternative: The data is not approximately normally distributed.
#Density plot
ggplot(suicide3, aes(x=`suicide3$Suicide.Deaths`)) +
geom_density() +
theme_minimal() +
labs(x = "Suicide Deaths Per Year",
y = "Density")##
## Shapiro-Wilk normality test
##
## data: suicide3$`suicide3$Suicide.Deaths`
## W = 0.62083, p-value = 0.00000000004386
Interpretation: Based on the p-value of 0.00000000004386, we reject the null hypothesis. We can conclude that the number of suicide deaths is not normally distributed.
Null: Hispanic and Non-Hispanic people have the same variance regarding suicide deaths Alternative: Hispanic and Non-Hispanic people do not have the same variance regarding suicide deaths
#Creating the x and y for the Fisher's F test and Kolmogrov and Smirnov Test
hispanic <- subset(suicide3,suicide3$`suicide3$Race.or.Ethnicity`== "Hispanic")
nonhispanic <- subset(suicide3, suicide3$`suicide3$Race.or.Ethnicity` == "Black" | suicide3$`suicide3$Race.or.Ethnicity` == "White" | suicide3$`suicide3$Race.or.Ethnicity` == "Other")
#Fisher F test
var.test(hispanic$`suicide3$Suicide.Deaths`,nonhispanic$`suicide3$Suicide.Deaths`,alternative = c("two.sided"))##
## F test to compare two variances
##
## data: hispanic$`suicide3$Suicide.Deaths` and nonhispanic$`suicide3$Suicide.Deaths`
## F = 3.0136, num df = 14, denom df = 43, p-value = 0.00547
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.377670 8.021705
## sample estimates:
## ratio of variances
## 3.013621
Interpretation: I re-coded the race/ethnicity groups to be either hispanic or non-hispanic to preform the Fisher F test. We reject the null hypothesis that hispanic and non-hispanic groups have the same variance regarding suicide deaths based on the p-value of 0.00547.
Null: Hispanic and Non-Hispanic people have the same distribution regarding suicide deaths Alternative: Hispanic and Non-Hispanic people do not have the same distribution regarding suicide deaths
#Kolmogrov and Smirnov Test
ks.test(hispanic$`suicide3$Suicide.Deaths`, nonhispanic$`suicide3$Suicide.Deaths`, alternative = c("two.sided"))## Warning in ks.test(hispanic$`suicide3$Suicide.Deaths`,
## nonhispanic$`suicide3$Suicide.Deaths`, : cannot compute exact p-value with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: hispanic$`suicide3$Suicide.Deaths` and nonhispanic$`suicide3$Suicide.Deaths`
## D = 0.90909, p-value = 0.00000001866
## alternative hypothesis: two-sided
Interpretation: Based on the p-value of 0.00000001866, we can conclude that Hispanic and Non Hispanic people do not have the same distribution regarding suicide deaths.Because the number of suicide deaths are not normally distributed and because there is an unequal distribution of the variance between hispanic and non-hispanic groups an ANOVA is not an appropriate test even though it was preformed above. A more appropriate test would be a Krustal-Wallis based on these factors.
Null: There is no difference in the median number of suicide deaths across the 4 ethnic groups. Alternative: There is a difference in the median number of suicide deaths across the 4 ethnic groups.
#Kruskal-Wallis test
kruskal.test(suicide3$`suicide3$Suicide.Deaths`~ suicide3$`suicide3$Race.or.Ethnicity`)##
## Kruskal-Wallis rank sum test
##
## data: suicide3$`suicide3$Suicide.Deaths` by suicide3$`suicide3$Race.or.Ethnicity`
## Kruskal-Wallis chi-squared = 36.655, df = 3, p-value = 0.00000005444
Interpretation: Based on the p-value of 0.00000005444, we can reject the null hypothesis. We can conclude that there is a difference in the median number of suicide deaths across the 4 ethnic groups. Similarly to the ANOVA results, there is a difference in the distribution of the number of suicide deaths among the 4 ethnic groups.
Null: The frequency of a high number of suicide deaths (>=1000) is equal among the 4 ethnic groups. Alternative: The frequency of a high number of suicide deaths (>=1000) is not equal among the 4 ethnic groups.
#Creating a column for high and low number of suicide deaths
suicide3$Suicide.hl <- ifelse(suicide3$`suicide3$Suicide.Deaths` >= 1000, "high", "low")
#Table for the number of times you see high frequency or low frequency for the 4 ethnic groups.
suicide.tab <- table(suicide3$`suicide3$Race.or.Ethnicity`, suicide3$Suicide.hl)
suicide.tab##
## high low
## Black 0 15
## Hispanic 12 3
## Other 0 14
## White 3 12
## Warning in chisq.test(suicide.tab): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: suicide.tab
## X-squared = 33.684, df = 3, p-value = 0.000000231
## Warning in chisq.test(x, ...): Chi-squared approximation may be incorrect
## Dimension Value high low
## 1 Black Residuals -2.6185724 2.6185724
## 2 Black p values 0.0706390 0.0706390
## 3 Hispanic Residuals 5.6212021 -5.6212021
## 4 Hispanic p values 0.0000000 0.0000000
## 5 Other Residuals -2.5015147 2.5015147
## 6 Other p values 0.0989310 0.0989310
## 7 White Residuals -0.5586288 0.5586288
## 8 White p values 1.0000000 1.0000000
Interpretation: We reject the null hypothesis based on the p-value of 2.31e-07. We can conclude that there is a significant difference between the high and low frequencies of suicide deaths between the 4 ethnic groups. The observed suicide frequency for black people was not significantly different than what was expected (p = 0.07).The observed suicide frequency for people classified as other was not significantly different than what was expected (p = 0.09). The observed suicide frequency for white people was not significantly different than what was expected (p = 1.00). In contrast, the observed suicide frequency for hispanic people was significantly higher than what was expected (p < 0.00). However, because 3 out of 8 cells of the table have N < 5, a more appropriate test would be the Fisher Exact Test below.
Null: The frequency of a high number of suicide deaths (>=1000) is equal among the 4 ethnic groups. Alternative: The frequency of a high number of suicide deaths (>=1000) is not equal among the 4 ethnic groups.
##
## Fisher's Exact Test for Count Data
##
## data: suicide.tab
## p-value = 0.00000007613
## alternative hypothesis: two.sided
##
## Pairwise comparisons using Fisher's exact test for count data
##
## data: suicide.tab
##
## Black Hispanic Other
## Hispanic 0.00003156 - -
## Other 1.00000000 0.00003156 -
## White 0.26896552 0.00562866 0.269
##
## P value adjustment method: BH
Interpretation: Based on the p-value of 7.613e-08, we can reject the null hypothesis and conclude that the frequency of a high number of suicides is not equal among the 4 ethnic groups. The observed suicide frequency for black people compared to hispanic people was significantly different than what was expected (p = 0.00003156).The observed suicide frequency for people classified as other was not significantly different than for people classified as black (p = 1.00). The observed suicide frequency for white people was not significantly different than what was expected compared to other groups except the hispanic group based on the observed p-values (p = 1.00). In contrast, the observed suicide frequency for hispanic people was significantly higher than what was expected for all of the groups based on the p-values observed.
#Hispanic versus non-Hispanic column
suicide3$Hispanic <- ifelse (suicide3$`suicide3$Race.or.Ethnicity` == "Hispanic", "Hispanic", "Non-Hispanic")
#Hispanic verus non-Hispanic table
hispanic.tab <- table(suicide3$Hispanic,suicide3$Suicide.hl)
hispanic.tab##
## high low
## Hispanic 12 3
## Non-Hispanic 3 41
##
## high low
## Hispanic 0.20338983 0.05084746
## Non-Hispanic 0.05084746 0.69491525
## Warning in prop.test(hispanic.tab): Chi-squared approximation may be incorrect
##
## 2-sample test for equality of proportions with continuity correction
##
## data: hispanic.tab
## X-squared = 27.856, df = 1, p-value = 0.0000001307
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.4714308 0.9922056
## sample estimates:
## prop 1 prop 2
## 0.80000000 0.06818182
## [1] 4
#To get the odds ratio
(prop.table(hispanic.tab)[1,1]/prop.table(hispanic.tab)[1,2])/(prop.table(hispanic.tab)[2,1]/prop.table(hispanic.tab)[2,2])## [1] 54.66667
Interpretation: There is not an equal proportion of high frequency suicide deaths between hispanic and non-hispanic ethnic groups. The relative risk of a high frequency of suicides in a given year is 300% higher for hispanics than non-hispanic. The odds ratio is saying that the odds of a high frequency suicide in a given year attributed to hispanics is 5468% higher than it is for non-hispanics.
Null: There is no linear relationship between suicide deaths and alcohol deaths. Alternative: There is a linear relationship between suicide deaths and alcohol deaths.
#Changing char to numeric
suicide2$Alcohol.Related.Deaths = as.numeric(suicide2$Alcohol.Related.Deaths)
#Non-parametric Linear Correlation test
cor.test(y = suicide2$Suicide.Deaths, x = suicide2$Alcohol.Related.Deaths, method = "spearman")## Warning in cor.test.default(y = suicide2$Suicide.Deaths, x =
## suicide2$Alcohol.Related.Deaths, : Cannot compute exact p-value with ties
##
## Spearman's rank correlation rho
##
## data: suicide2$Alcohol.Related.Deaths and suicide2$Suicide.Deaths
## S = 65795272, p-value < 0.00000000000000022
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8186443
#Making graph
ggplot(suicide2, aes(x = Alcohol.Related.Deaths, y = Suicide.Deaths))+
geom_point(shape=1)+
geom_smooth(method = lm,
se = FALSE)## `geom_smooth()` using formula 'y ~ x'
Interpretation: We reject the null hypothesis that there is no relationship between suicide deaths and alcohol deaths based on the p-value of < 0.00000000000000022. We can conclude that there is a significant positive relationship between suicide deaths and alcohol deaths. According to the Spearman’s row (rho), as it is above a .75 it suggests that there is a strong positive linear relationship.
## country iso3c year prevalence
## Length:6468 Length:6468 Min. :1990 Min. : 931
## Class :character Class :character 1st Qu.:1997 1st Qu.: 73904
## Mode :character Mode :character Median :2004 Median : 277645
## Mean :2004 Mean : 4407362
## 3rd Qu.:2010 3rd Qu.: 1313348
## Max. :2017 Max. :264455593
##
## iso2c gdp_percap population birth_rate
## Length:6468 Min. : 239.7 Min. : 43972 Min. : 7.60
## Class :character 1st Qu.: 2158.3 1st Qu.: 1982691 1st Qu.:13.53
## Mode :character Median : 6474.6 Median : 7004332 Median :22.07
## Mean : 12649.5 Mean : 33833353 Mean :24.51
## 3rd Qu.: 17409.9 3rd Qu.: 19717848 3rd Qu.:34.55
## Max. :141442.2 Max. :1364270000 Max. :55.12
## NA's :2567 NA's :2224 NA's :2311
## neonat_mortal_rate region income
## Min. : 1.00 Length:6468 Length:6468
## 1st Qu.: 6.40 Class :character Class :character
## Median :15.30 Mode :character Mode :character
## Mean :19.65
## 3rd Qu.:30.00
## Max. :73.10
## NA's :2368
# Create new column for depression prevalence adjusted by population size
income$depression_prop <- income$prevalence/income$population
# This one shows that the distribution is strongly skewed right
ggplot(income) +
geom_density(aes(x = prevalence)) +
labs(title = "Density Plot of Depression Counts per Country per Year") +
theme_minimal()# When I adjust by Population size, the data is less skewed, and looks more normally distributed.
ggplot(income) +
geom_density(aes(x = depression_prop)) +
labs(title = "Density Plot of Depression Proportion adjusted by Population") +
theme_minimal()## Warning: Removed 2224 rows containing non-finite values (stat_density).
# Density Graph by Income bracket.
# Possible discussion: I notice that High income: OECD countries
ggplot(na.omit((income))) +
geom_density(aes(x = depression_prop, fill = income), alpha = 0.5) +
labs(title = "Density Plot of Depression Proportion adjusted by Population") +
theme_minimal()# Density Graph by Region
ggplot(income) +
geom_density(aes(x = depression_prop, fill = region), alpha = 0.5) +
labs(title = "Density Plot of Depression Proportion adjusted by Population",
subtitle = "Grouped by Region") +
theme_minimal()## Warning: Removed 2224 rows containing non-finite values (stat_density).
##
## Call:
## lm(formula = (income$depression_prop) ~ (income$gdp_percap))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.023153 -0.005114 -0.001871 0.004043 0.025287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.028362851588 0.000150385026 188.60 <0.0000000000000002 ***
## income$gdp_percap 0.000000248049 0.000000007392 33.56 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007356 on 3899 degrees of freedom
## (2567 observations deleted due to missingness)
## Multiple R-squared: 0.2241, Adjusted R-squared: 0.2239
## F-statistic: 1126 on 1 and 3899 DF, p-value: < 0.00000000000000022
This Density Plot of Depression counts per country per year shows that the distribution is strongly skewed right. When I adjust by Population size, the data is less skewed, and looks more normally distributed in the Density Plot of Depression Proportion adjusted by Population. The Density Plot of Depression Proportion adjusted by Population, looks at depression prevalence adjusted by population and grouped by income level. Based on this graph, I can see that high income countries have higher population adjusted depression prevalence than the other categories. The Density Plot of Depression Proportion adjusted by Population by region has North America with the highest depression prevalence.
Null: There is no relationship between GDP per capita and age adjusted depression prevalence Alternative: There is a relationship GDP per capita and age adjusted depression prevalence
##
## Call:
## lm(formula = (income$depression_prop) ~ (income$gdp_percap))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.023153 -0.005114 -0.001871 0.004043 0.025287
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.028362851588 0.000150385026 188.60 <0.0000000000000002 ***
## income$gdp_percap 0.000000248049 0.000000007392 33.56 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.007356 on 3899 degrees of freedom
## (2567 observations deleted due to missingness)
## Multiple R-squared: 0.2241, Adjusted R-squared: 0.2239
## F-statistic: 1126 on 1 and 3899 DF, p-value: < 0.00000000000000022
Based on the p-value of < 2.2e-16, we can reject the null hypothesis. We can conclude that there is a relationship between GDP per capita and age adjusted depression prevalence based on the p-value of p-value of < 2.2e-16. Based on the adjusted r-squared, that is effectively 0.2239, GDP per capita explains 22.39 % of the variance in depression prevalence.
Null: The residuals from our linear model of GDP per Capita on Depression Proportion is approximately normally distributed. Alternative: The residuals from our linear model of GDP per Capita on Depression Proportion is not approximately normally distributed.
##
## Shapiro-Wilk normality test
##
## data: resid(mod2)
## W = 0.93934, p-value < 0.00000000000000022
Based on the p-value of < 2.2e-16, and a significance level alpha = 0.5, we would reject our null hypothesis that the residuals are normally distributed. This indicates that our model does not capture all the variance in the depression prevalence proportion. Because of this a Kruskal Wallis is more appropriate than an ANOVA test.
Null: There is no difference in the median depression prevalence by population across the income categories. Alternative: There is a difference in the median depression prevalence by population across the income categories.
##
## Kruskal-Wallis rank sum test
##
## data: income$depression_prop by income$income
## Kruskal-Wallis chi-squared = 1663.9, df = 4, p-value <
## 0.00000000000000022
#box plot
boxplot(income$depression_prop ~ income$income, xlab = "COUNTRY INCOME CATEGORY", ylab = "POPULATION ADJUSTED DEPRESSSION PREVALENCE")Interpretation: Based on the p value of < 0.00000000000000022, we can reject the null hypothesis and conclude that there is a difference in the median depression prevalence adjusted by population across the income categories. The box plot shows that low income countries have the lowest depression prevalence adjusted by population, but high income OECD countries (examples: USA, UK, CANADA) have the highest depression prevalence adjusted by population.
Null: There is no difference in neo-natal mortality, gdp per capita, or year in the ability to predict a popluation adjusted depression prevalence above the median of 0.02945099. Alternative: There is a difference in neo-natal mortality, gdp per capita, or year in the ability to predict a population adjusted depression prevalence above the median of 0.02945099.
## [1] 0.02945099
#Making a new column
income$preval01 <- ifelse (income$depression_prop > median(income$depression_prop, na.rm = TRUE), 1, 0)
#Multiple logistic regression
mod3 <- glm(income$preval01 ~ income$year + income$gdp_percap + income$neonat_mortal_rate, family = binomial(link = "logit"))
summary(mod3)##
## Call:
## glm(formula = income$preval01 ~ income$year + income$gdp_percap +
## income$neonat_mortal_rate, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.8931 -0.7660 -0.5527 0.7474 2.0514
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 42.086094194 11.564852581 3.639
## income$year -0.021620916 0.005777254 -3.742
## income$gdp_percap 0.000157756 0.000008318 18.966
## income$neonat_mortal_rate -0.013992663 0.003793449 -3.689
## Pr(>|z|)
## (Intercept) 0.000274 ***
## income$year 0.000182 ***
## income$gdp_percap < 0.0000000000000002 ***
## income$neonat_mortal_rate 0.000225 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5341.1 on 3852 degrees of freedom
## Residual deviance: 3716.9 on 3849 degrees of freedom
## (2615 observations deleted due to missingness)
## AIC: 3724.9
##
## Number of Fisher Scoring iterations: 6
boxplot(income$gdp_percap ~ income$preval01, xlab = "DEPRESSION PREVALENCE ABOVE MEDIAN", ylab = "GDP PER CAPITA")boxplot(income$neonat_mortal_rate ~ income$preval01, xlab = "DEPRESSION PREVALENCE ABOVE MEDIAN", ylab = "NEONATAL MORTALITY")plot(income$depression_prop ~ income$year, xlab = "YEAR", ylab = "DEPRESSION PREVALENCE SCORE")
abline(lm(income$depression_prop ~ income$year), col = "red") Interpretation: There is a significant effect of increasing years on population adjusted depression prevalence based on the p-value of 0.000182. The red trend line on the scatterplot indicates a very slight increase in population adjusted depression prevalence with increasing year. For GDP per capita there is a significant effect on population adjusted depression prevalence based on the p-values of < 2e-16; this is supported by the box plot. There is a significant effect of decreasing neonatal mortality rates on popluation adjusted depression prevalence based on the p-values of 0.000225 and this is also evidenced by the box plot.