library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(mosaic)
## Registered S3 method overwritten by 'mosaic':
## method from
## fortify.SpatialPolygonsDataFrame ggplot2
##
## The 'mosaic' package masks several functions from core packages in order to add
## additional features. The original behavior of these functions should not be affected by this.
##
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
##
## count, do, tally
## The following object is masked from 'package:Matrix':
##
## mean
## The following object is masked from 'package:ggplot2':
##
## stat
## The following object is masked from 'package:openintro':
##
## dotPlot
## The following objects are masked from 'package:stats':
##
## binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
## quantile, sd, t.test, var
## The following objects are masked from 'package:base':
##
## max, mean, min, prod, range, sample, sum
library(dplyr)
library(ggplot2)
library(infer)
##
## Attaching package: 'infer'
## The following objects are masked from 'package:mosaic':
##
## prop_test, t_test
"air" <- read.csv("~/Documents/Big Data UNLV:Touro/R/Data/pollution_death_country.csv")
Biren Katyal Module 4: Data Analysis
Original Hypotheses/Research Question: (I realize I did not phrase these very well) Hypotheses: H0: Air pollution-attributable death rate does not decrease in the majority of regions over time HA: Air pollution-attributable death rate decreases in the majority of regions over time
Question: Does air pollution attributable death rate decrease over time in the majority of regions?
Hypotheses: H0: Air pollution-attributable death rate does not decrease in the majority of regions over time HA: Air pollution-attributable death rate decreases in the majority of regions over time
Question: Does air pollution attributable death rate decrease over time in the majority of regions?
For the data analysis portion of this project, I decided to look at my overall research question first with a simple linear regression model, without filtering any data. I found that based on the p-value being less than 0.05, we reject the null hypothesis. General air pollution-attributable death rate does decrease globally over time. I also made an associated scatterplot with a linear regression line and a histogram. However, this is a very vague result, as there are many countries, regions, and diseases that this is accounting for. Thus, I did a few exploratory models and plots. I performed a multiple linear regression using region (ParentLocation) and disease type (Dim2) respectively. I also created associated scatterplots for each of those models, using those categorical variables as the colour dimension.
Additionally, I created a subset of the data, only looking at death rate from stroke, to see if taking smaller subsets of the data could make it easier to model/visualize. I created a scatterplot and model based on this subset, which did appear to be statistically significant.
The associated data and charts are seen below
Original Hypothesis
ggplot(air, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
labs(x = "Year", y = "Estimated Death Rate per 100,000", title = "Estimated Death Rate from Respiratory Diseases from 2010-2019 by Region", caption = "Data taken from WHO GHE estimates, colour indicates the WHO global region")
## `geom_smooth()` using formula = 'y ~ x'
ggplot(air, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
geom_histogram(stat = "identity") +
labs(x = "Year", y = "Estimated Death Rate per 100,000", title = "Estimated Death Rate from Respiratory Diseases from 2010-2019 by Region", caption = "Data taken from WHO GHE estimates, colour indicates the WHO global region")
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`
model1 <- lm(Period~FactValueNumeric, data = air)
summary(model1)
##
## Call:
## lm(formula = Period ~ FactValueNumeric, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5736 -2.5439 0.3479 2.4524 5.1483
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.015e+03 1.924e-02 104701.954 < 2e-16 ***
## FactValueNumeric -1.958e-03 2.863e-04 -6.839 8.13e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.87 on 32398 degrees of freedom
## Multiple R-squared: 0.001441, Adjusted R-squared: 0.001411
## F-statistic: 46.77 on 1 and 32398 DF, p-value: 8.13e-12
Exploratory Analysis
model2 <- lm(Period~FactValueNumeric + ParentLocation, data = air)
summary(model2)
##
## Call:
## lm(formula = Period ~ FactValueNumeric + ParentLocation, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6422 -2.5292 0.3359 2.4683 5.1764
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.015e+03 3.681e-02 54732.641 < 2e-16
## FactValueNumeric -2.220e-03 3.049e-04 -7.283 3.35e-13
## ParentLocationAmericas -1.052e-01 5.069e-02 -2.076 0.0379
## ParentLocationEastern Mediterranean -5.147e-02 5.859e-02 -0.878 0.3797
## ParentLocationEurope -9.881e-02 4.574e-02 -2.160 0.0308
## ParentLocationSouth-East Asia -3.627e-02 7.183e-02 -0.505 0.6136
## ParentLocationWestern Pacific -4.135e-02 5.644e-02 -0.733 0.4637
##
## (Intercept) ***
## FactValueNumeric ***
## ParentLocationAmericas *
## ParentLocationEastern Mediterranean
## ParentLocationEurope *
## ParentLocationSouth-East Asia
## ParentLocationWestern Pacific
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.87 on 32393 degrees of freedom
## Multiple R-squared: 0.001635, Adjusted R-squared: 0.00145
## F-statistic: 8.839 on 6 and 32393 DF, p-value: 1.176e-09
model3 <- lm(Period~FactValueNumeric + Dim2, data = air)
summary(model3)
##
## Call:
## lm(formula = Period ~ FactValueNumeric + Dim2, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8582 -2.5068 0.1519 2.4914 5.3668
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 2.015e+03 4.031e-02 49979.339
## FactValueNumeric -3.388e-03 3.764e-04 -9.000
## Dim2ALL CAUSES 2.920e-01 6.404e-02 4.560
## Dim2Chronic obstructive pulmonary disease -5.249e-02 5.552e-02 -0.945
## Dim2Ischaemic heart disease 6.270e-02 5.565e-02 1.127
## Dim2Stroke 1.542e-03 5.521e-02 0.028
## Dim2Trachea, bronchus, lung cancers -8.058e-02 5.594e-02 -1.441
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## FactValueNumeric < 2e-16 ***
## Dim2ALL CAUSES 5.14e-06 ***
## Dim2Chronic obstructive pulmonary disease 0.344
## Dim2Ischaemic heart disease 0.260
## Dim2Stroke 0.978
## Dim2Trachea, bronchus, lung cancers 0.150
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.869 on 32393 degrees of freedom
## Multiple R-squared: 0.002494, Adjusted R-squared: 0.002309
## F-statistic: 13.5 on 6 and 32393 DF, p-value: 2.334e-15
ggplot(air, aes(x = Period, y = FactValueNumeric, colour = Dim2)) +
geom_point() +
geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
air2 <- air %>%
filter(Dim2 == "Stroke")
ggplot(air2, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
geom_point() +
geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
model4 <- lm(Period~FactValueNumeric, data = air2)
summary(model4)
##
## Call:
## lm(formula = Period ~ FactValueNumeric, data = air2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7021 -2.5734 0.2683 2.4064 5.1394
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.015e+03 5.845e-02 34469.566 < 2e-16 ***
## FactValueNumeric -7.834e-03 1.607e-03 -4.875 1.12e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.867 on 5398 degrees of freedom
## Multiple R-squared: 0.004383, Adjusted R-squared: 0.004198
## F-statistic: 23.76 on 1 and 5398 DF, p-value: 1.122e-06
ggplot(air, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
geom_histogram(stat = "identity")
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`