library(openintro)  
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(mosaic)  
## Registered S3 method overwritten by 'mosaic':
##   method                           from   
##   fortify.SpatialPolygonsDataFrame ggplot2
## 
## The 'mosaic' package masks several functions from core packages in order to add 
## additional features.  The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following object is masked from 'package:ggplot2':
## 
##     stat
## The following object is masked from 'package:openintro':
## 
##     dotPlot
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cor.test, cov, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum
library(dplyr)  
library(ggplot2)
library(infer)
## 
## Attaching package: 'infer'
## The following objects are masked from 'package:mosaic':
## 
##     prop_test, t_test
"air" <- read.csv("~/Documents/Big Data UNLV:Touro/R/Data/pollution_death_country.csv")

Biren Katyal Module 4: Data Analysis

Original Hypotheses/Research Question: (I realize I did not phrase these very well) Hypotheses: H0: Air pollution-attributable death rate does not decrease in the majority of regions over time HA: Air pollution-attributable death rate decreases in the majority of regions over time

Question: Does air pollution attributable death rate decrease over time in the majority of regions?

Hypotheses: H0: Air pollution-attributable death rate does not decrease in the majority of regions over time HA: Air pollution-attributable death rate decreases in the majority of regions over time

Question: Does air pollution attributable death rate decrease over time in the majority of regions?

For the data analysis portion of this project, I decided to look at my overall research question first with a simple linear regression model, without filtering any data. I found that based on the p-value being less than 0.05, we reject the null hypothesis. General air pollution-attributable death rate does decrease globally over time. I also made an associated scatterplot with a linear regression line and a histogram. However, this is a very vague result, as there are many countries, regions, and diseases that this is accounting for. Thus, I did a few exploratory models and plots. I performed a multiple linear regression using region (ParentLocation) and disease type (Dim2) respectively. I also created associated scatterplots for each of those models, using those categorical variables as the colour dimension.

Additionally, I created a subset of the data, only looking at death rate from stroke, to see if taking smaller subsets of the data could make it easier to model/visualize. I created a scatterplot and model based on this subset, which did appear to be statistically significant.

The associated data and charts are seen below

Original Hypothesis

ggplot(air, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  labs(x = "Year", y = "Estimated Death Rate per 100,000", title = "Estimated Death Rate from Respiratory Diseases from 2010-2019 by Region", caption = "Data taken from WHO GHE estimates, colour indicates the WHO global region")
## `geom_smooth()` using formula = 'y ~ x'

ggplot(air, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
  geom_histogram(stat = "identity") +
  labs(x = "Year", y = "Estimated Death Rate per 100,000", title = "Estimated Death Rate from Respiratory Diseases from 2010-2019 by Region", caption = "Data taken from WHO GHE estimates, colour indicates the WHO global region")
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`

model1 <- lm(Period~FactValueNumeric, data = air)
summary(model1)
## 
## Call:
## lm(formula = Period ~ FactValueNumeric, data = air)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5736 -2.5439  0.3479  2.4524  5.1483 
## 
## Coefficients:
##                    Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)       2.015e+03  1.924e-02 104701.954  < 2e-16 ***
## FactValueNumeric -1.958e-03  2.863e-04     -6.839 8.13e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.87 on 32398 degrees of freedom
## Multiple R-squared:  0.001441,   Adjusted R-squared:  0.001411 
## F-statistic: 46.77 on 1 and 32398 DF,  p-value: 8.13e-12

Exploratory Analysis

model2 <- lm(Period~FactValueNumeric + ParentLocation, data = air)
summary(model2)
## 
## Call:
## lm(formula = Period ~ FactValueNumeric + ParentLocation, data = air)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6422 -2.5292  0.3359  2.4683  5.1764 
## 
## Coefficients:
##                                       Estimate Std. Error   t value Pr(>|t|)
## (Intercept)                          2.015e+03  3.681e-02 54732.641  < 2e-16
## FactValueNumeric                    -2.220e-03  3.049e-04    -7.283 3.35e-13
## ParentLocationAmericas              -1.052e-01  5.069e-02    -2.076   0.0379
## ParentLocationEastern Mediterranean -5.147e-02  5.859e-02    -0.878   0.3797
## ParentLocationEurope                -9.881e-02  4.574e-02    -2.160   0.0308
## ParentLocationSouth-East Asia       -3.627e-02  7.183e-02    -0.505   0.6136
## ParentLocationWestern Pacific       -4.135e-02  5.644e-02    -0.733   0.4637
##                                        
## (Intercept)                         ***
## FactValueNumeric                    ***
## ParentLocationAmericas              *  
## ParentLocationEastern Mediterranean    
## ParentLocationEurope                *  
## ParentLocationSouth-East Asia          
## ParentLocationWestern Pacific          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.87 on 32393 degrees of freedom
## Multiple R-squared:  0.001635,   Adjusted R-squared:  0.00145 
## F-statistic: 8.839 on 6 and 32393 DF,  p-value: 1.176e-09
model3 <- lm(Period~FactValueNumeric + Dim2, data = air)
summary(model3)
## 
## Call:
## lm(formula = Period ~ FactValueNumeric + Dim2, data = air)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8582 -2.5068  0.1519  2.4914  5.3668 
## 
## Coefficients:
##                                             Estimate Std. Error   t value
## (Intercept)                                2.015e+03  4.031e-02 49979.339
## FactValueNumeric                          -3.388e-03  3.764e-04    -9.000
## Dim2ALL CAUSES                             2.920e-01  6.404e-02     4.560
## Dim2Chronic obstructive pulmonary disease -5.249e-02  5.552e-02    -0.945
## Dim2Ischaemic heart disease                6.270e-02  5.565e-02     1.127
## Dim2Stroke                                 1.542e-03  5.521e-02     0.028
## Dim2Trachea, bronchus, lung cancers       -8.058e-02  5.594e-02    -1.441
##                                           Pr(>|t|)    
## (Intercept)                                < 2e-16 ***
## FactValueNumeric                           < 2e-16 ***
## Dim2ALL CAUSES                            5.14e-06 ***
## Dim2Chronic obstructive pulmonary disease    0.344    
## Dim2Ischaemic heart disease                  0.260    
## Dim2Stroke                                   0.978    
## Dim2Trachea, bronchus, lung cancers          0.150    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.869 on 32393 degrees of freedom
## Multiple R-squared:  0.002494,   Adjusted R-squared:  0.002309 
## F-statistic:  13.5 on 6 and 32393 DF,  p-value: 2.334e-15
ggplot(air, aes(x = Period, y = FactValueNumeric, colour = Dim2)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

air2 <- air %>%
  filter(Dim2 == "Stroke")
  
ggplot(air2, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
    geom_point() +
    geom_smooth(method = lm, se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

model4 <- lm(Period~FactValueNumeric, data = air2)
summary(model4)
## 
## Call:
## lm(formula = Period ~ FactValueNumeric, data = air2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.7021 -2.5734  0.2683  2.4064  5.1394 
## 
## Coefficients:
##                    Estimate Std. Error   t value Pr(>|t|)    
## (Intercept)       2.015e+03  5.845e-02 34469.566  < 2e-16 ***
## FactValueNumeric -7.834e-03  1.607e-03    -4.875 1.12e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.867 on 5398 degrees of freedom
## Multiple R-squared:  0.004383,   Adjusted R-squared:  0.004198 
## F-statistic: 23.76 on 1 and 5398 DF,  p-value: 1.122e-06
ggplot(air, aes(x = Period, y = FactValueNumeric, colour = ParentLocation)) +
  geom_histogram(stat = "identity")
## Warning in geom_histogram(stat = "identity"): Ignoring unknown parameters:
## `binwidth`, `bins`, and `pad`