Initial data exploration

data %>% filter(!is.na(Age.Adjusted.Rate)) %>% filter(Gender=='') %>%
  ggplot(aes(Value, Age.Adjusted.Rate, color = State.x)) +
  geom_point(size=3) +
  #stat_smooth(method="lm")+
  #xlab("") +
  theme_bw() +
  theme(axis.title.x = element_text(face="bold", colour="#990000", size=20),
        axis.text.x  = element_text(angle=90, vjust=0.5, size=12),
        axis.title.y = element_text(face="bold", colour="#990000", size=20))
## Warning: package 'bindrcpp' was built under R version 3.3.3

data %>% filter(!is.na(Age.Adjusted.Rate)) %>% filter(Gender=='') %>%
  ggplot(aes(Value, Age.Adjusted.Rate)) + #, color = State.x
  geom_point(size=3) +
  stat_smooth(method="lm") +
  xlab("Avg PM2.5 content\nµg/m^3") + ylab("Age Adjusted Mortality Rate\n(Deaths per 100,000 people)") +
  theme_bw() +
  theme(axis.title.x = element_text(face="bold", colour="#990000", size=20),
        axis.title.y = element_text(face="bold", colour="#990000", size=20))

stats <- data %>% filter(!is.na(Age.Adjusted.Rate)) %>% filter(Gender=='')
x <- as.numeric(stats$Value)
y <- as.numeric(stats$Age.Adjusted.Rate)

cor.test(y,x)
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 4.6039, df = 530, p-value = 5.197e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1129692 0.2764970
## sample estimates:
##       cor 
## 0.1960961
lm <- lm(y~x)
lm
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     192.169        4.293
summary(lm)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -109.242  -36.181   -3.817   25.225  175.283 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 192.1691     8.1737  23.511  < 2e-16 ***
## x             4.2934     0.9326   4.604  5.2e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.98 on 530 degrees of freedom
## Multiple R-squared:  0.03845,    Adjusted R-squared:  0.03664 
## F-statistic:  21.2 on 1 and 530 DF,  p-value: 5.197e-06

Data exploration: subsetting the database

#Default among whole dataset
datastat <- data %>% filter(!is.na(Age.Adjusted.Rate)) %>% filter(Gender!='') %>% filter(Race!='') %>% filter(ICD.Sub.Chapter!='')
View(datastat)
datastat_mean <- tapply(datastat$Age.Adjusted.Rate, datastat$County.Code, sum)
datastat_meanF <- data.frame(key=names(datastat_mean), value=datastat_mean)
colnames(datastat_meanF) <- c("County.Code","Age.Adjusted.Rate")
View(datastat_meanF)

datastat_F <- merge(datastat_meanF, PM2.5, by="County.Code") 
View(datastat_F)

test <- datastat_F 
x <- as.numeric(test$Value)
y <- as.numeric(test$Age.Adjusted.Rate)

cor.test(y,x)
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 7.0868, df = 482, p-value = 4.902e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2241963 0.3857552
## sample estimates:
##       cor 
## 0.3071875
lm <- lm(y~x)
lm
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      183.89        40.43
summary(lm)
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -910.40 -192.56  -84.74  163.27  880.50 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  183.892     50.844   3.617  0.00033 ***
## x             40.435      5.706   7.087  4.9e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 281.2 on 482 degrees of freedom
## Multiple R-squared:  0.09436,    Adjusted R-squared:  0.09249 
## F-statistic: 50.22 on 1 and 482 DF,  p-value: 4.902e-12
for (gender in c("Male", "Female")) {
  print(paste("Gender: ", gender))
  
  datastat_temp <- datastat %>% filter(Gender==gender)
  datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
  datastat_meanF <- data.frame(key=names(datastat_mean), value=datastat_mean)
  colnames(datastat_meanF) <- c("County.Code","Age.Adjusted.Rate")
  
  datastat_F <- merge(datastat_meanF, PM2.5, by="County.Code") 
  
  test <- datastat_F
  x <- as.numeric(test$Value)
  y <- as.numeric(test$Age.Adjusted.Rate)
  
  print(cor.test(y,x))
  lm <- lm(y~x)
  print(lm)
  print(summary(lm))
}
## [1] "Gender:  Male"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 6.2978, df = 480, p-value = 6.829e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1916808 0.3567767
## sample estimates:
##       cor 
## 0.2762655 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      135.80        21.82  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -464.0 -117.0  -61.3  103.3  669.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  135.797     30.930   4.391 1.39e-05 ***
## x             21.823      3.465   6.298 6.83e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 169.2 on 480 degrees of freedom
## Multiple R-squared:  0.07632,    Adjusted R-squared:  0.0744 
## F-statistic: 39.66 on 1 and 480 DF,  p-value: 6.829e-10
## 
## [1] "Gender:  Female"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 8.1051, df = 447, p-value = 5.093e-15
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2745087 0.4360537
## sample estimates:
##       cor 
## 0.3579569 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##       49.37        20.38  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -236.09  -78.78  -27.93   83.77  329.74 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   49.374     22.393   2.205    0.028 *  
## x             20.377      2.514   8.105 5.09e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 110.6 on 447 degrees of freedom
## Multiple R-squared:  0.1281, Adjusted R-squared:  0.1262 
## F-statistic: 65.69 on 1 and 447 DF,  p-value: 5.093e-15
for (icd in c("Other forms of heart disease", "Cerebrovascular diseases", "Hypertensive diseases", "Ischaemic heart diseases", "Diseases of arteries, arterioles and capillaries", "Chronic rheumatic heart diseases", "Pulmonary heart disease and diseases of pulmonary circulation", "Diseases of veins, lymphatic vessels and lymph nodes, not elsewhere classified")) {
  print(paste("ICD: ", icd))
  
  datastat_temp <- datastat %>% filter(ICD.Sub.Chapter==icd)
  datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
  datastat_meanF <- data.frame(key=names(datastat_mean), value=datastat_mean)
  colnames(datastat_meanF) <- c("County.Code","Age.Adjusted.Rate")
  
  datastat_F <- merge(datastat_meanF, PM2.5, by="County.Code") 
  
  test <- datastat_F
  x <- as.numeric(test$Value)
  y <- as.numeric(test$Age.Adjusted.Rate)
  
  #print(cor.test(y,x))
  lm <- lm(y~x)
  print(lm)
  #print(summary(lm))
}
## [1] "ICD:  Other forms of heart disease"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      66.884        9.931  
## 
## [1] "ICD:  Cerebrovascular diseases"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      46.304        6.105  
## 
## [1] "ICD:  Hypertensive diseases"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      13.669        6.506  
## 
## [1] "ICD:  Ischaemic heart diseases"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      139.64        15.92  
## 
## [1] "ICD:  Diseases of arteries, arterioles and capillaries"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##    16.45737      0.01768  
## 
## [1] "ICD:  Chronic rheumatic heart diseases"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      2.7551      -0.1453  
## 
## [1] "ICD:  Pulmonary heart disease and diseases of pulmonary circulation"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      0.1636       0.8150  
## 
## [1] "ICD:  Diseases of veins, lymphatic vessels and lymph nodes, not elsewhere classified"
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     2.90920     -0.01964
unique(datastat$X2013.Urbanization)
## [1] Small Metro              NonCore (non-metro)     
## [3] Large Central Metro      Medium Metro            
## [5] Micropolitan (non-metro) Large Fringe Metro      
## 6 Levels: Large Central Metro Large Fringe Metro ... Small Metro
for (urban in c("Small Metro", "NonCore (non-metro)", "Large Central Metro", "Medium Metro", "Micropolitan (non-metro)", "Large Fringe Metro")) {
  print(paste("Urban: ", urban))
  
  datastat_temp <- datastat %>% filter(X2013.Urbanization==urban)
  datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
  datastat_meanF <- data.frame(key=names(datastat_mean), value=datastat_mean)
  colnames(datastat_meanF) <- c("County.Code","Age.Adjusted.Rate")
  
  datastat_F <- merge(datastat_meanF, PM2.5, by="County.Code") 
  
  test <- datastat_F
  x <- as.numeric(test$Value)
  y <- as.numeric(test$Age.Adjusted.Rate)
  
  print(cor.test(y,x))
  lm <- lm(y~x)
  print(lm)
  print(summary(lm))
}
## [1] "Urban:  Small Metro"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = -0.040232, df = 98, p-value = 0.968
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2003223  0.1925077
## sample estimates:
##          cor 
## -0.004064053 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##    439.2188      -0.3257  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -293.45 -117.85  -46.35   48.59  960.78 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 439.2188    71.8970   6.109 2.03e-08 ***
## x            -0.3257     8.0951  -0.040    0.968    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 203 on 98 degrees of freedom
## Multiple R-squared:  1.652e-05,  Adjusted R-squared:  -0.01019 
## F-statistic: 0.001619 on 1 and 98 DF,  p-value: 0.968
## 
## [1] "Urban:  NonCore (non-metro)"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = -1.0736, df = 25, p-value = 0.2933
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5463612  0.1848239
## sample estimates:
##       cor 
## -0.209935 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     340.321       -7.401  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -206.53  -75.19  -12.35   44.33  296.79 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  340.321     59.126   5.756 5.35e-06 ***
## x             -7.401      6.894  -1.074    0.293    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 112.1 on 25 degrees of freedom
## Multiple R-squared:  0.04407,    Adjusted R-squared:  0.005836 
## F-statistic: 1.153 on 1 and 25 DF,  p-value: 0.2933
## 
## [1] "Urban:  Large Central Metro"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 3.6725, df = 63, p-value = 0.0004972
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1961062 0.6021486
## sample estimates:
##       cor 
## 0.4199204 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      229.69        72.61  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -673.71 -117.14   27.77  167.71  478.88 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   229.69     194.40   1.182 0.241840    
## x              72.61      19.77   3.672 0.000497 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 246.5 on 63 degrees of freedom
## Multiple R-squared:  0.1763, Adjusted R-squared:  0.1633 
## F-statistic: 13.49 on 1 and 63 DF,  p-value: 0.0004972
## 
## [1] "Urban:  Medium Metro"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 2.517, df = 135, p-value = 0.01301
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0456239 0.3664236
## sample estimates:
##       cor 
## 0.2117193 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      333.42        26.14  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -408.9 -188.8 -109.7  193.8  859.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   333.42      94.17   3.541 0.000548 ***
## x              26.14      10.38   2.517 0.013005 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 267.2 on 135 degrees of freedom
## Multiple R-squared:  0.04483,    Adjusted R-squared:  0.03775 
## F-statistic: 6.335 on 1 and 135 DF,  p-value: 0.01301
## 
## [1] "Urban:  Micropolitan (non-metro)"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 3.7397, df = 61, p-value = 0.000409
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2061644 0.6139414
## sample estimates:
##       cor 
## 0.4318694 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##       57.47        34.08  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -219.26  -94.88  -33.86  103.04  435.65 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   57.467     70.602   0.814 0.418838    
## x             34.079      9.113   3.740 0.000409 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 141.5 on 61 degrees of freedom
## Multiple R-squared:  0.1865, Adjusted R-squared:  0.1732 
## F-statistic: 13.99 on 1 and 61 DF,  p-value: 0.000409
## 
## [1] "Urban:  Large Fringe Metro"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 3.4401, df = 90, p-value = 0.0008834
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1462942 0.5100985
## sample estimates:
##       cor 
## 0.3408971 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      145.82        44.47  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -408.28 -174.49  -70.32  166.16  516.00 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   145.82     114.57   1.273 0.206401    
## x              44.47      12.93   3.440 0.000883 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 225.4 on 90 degrees of freedom
## Multiple R-squared:  0.1162, Adjusted R-squared:  0.1064 
## F-statistic: 11.83 on 1 and 90 DF,  p-value: 0.0008834
unique(datastat$Race)
## [1] White                            Black or African American       
## [3] Asian or Pacific Islander        American Indian or Alaska Native
## 5 Levels:  American Indian or Alaska Native ... White
for (race in c("White", "Black or African American", "Asian or Pacific Islander")) { # insufficient data, "American Indian or Alaska Native"
  print(paste("Race: ", race))
  
  datastat_temp <- datastat %>% filter(Race==race)
  datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
  datastat_meanF <- data.frame(key=names(datastat_mean), value=datastat_mean)
  colnames(datastat_meanF) <- c("County.Code","Age.Adjusted.Rate")
  
  datastat_F <- merge(datastat_meanF, PM2.5, by="County.Code") 
  
  test <- datastat_F
  x <- as.numeric(test$Value)
  y <- as.numeric(test$Age.Adjusted.Rate)
  
  print(cor.test(y,x))
  lm <- lm(y~x)
  print(lm)
  print(summary(lm))
}
## [1] "Race:  White"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 5.9641, df = 481, p-value = 4.769e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1773405 0.3435878
## sample estimates:
##       cor 
## 0.2624103 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      269.66        12.35  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -395.11  -56.15   11.47   63.31  343.67 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   269.66      18.47  14.603  < 2e-16 ***
## x              12.35       2.07   5.964 4.77e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 101.4 on 481 degrees of freedom
## Multiple R-squared:  0.06886,    Adjusted R-squared:  0.06692 
## F-statistic: 35.57 on 1 and 481 DF,  p-value: 4.769e-09
## 
## [1] "Race:  Black or African American"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 3.1782, df = 167, p-value = 0.001766
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09114659 0.37621709
## sample estimates:
##       cor 
## 0.2388206 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##      210.97        21.17  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -399.79 -123.60   14.63  135.38  451.05 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   210.97      62.97   3.351 0.000997 ***
## x              21.17       6.66   3.178 0.001766 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 170.2 on 167 degrees of freedom
## Multiple R-squared:  0.05704,    Adjusted R-squared:  0.05139 
## F-statistic:  10.1 on 1 and 167 DF,  p-value: 0.001766
## 
## [1] "Race:  Asian or Pacific Islander"
## 
##  Pearson's product-moment correlation
## 
## data:  y and x
## t = 1.1722, df = 37, p-value = 0.2486
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1343040  0.4763104
## sample estimates:
##       cor 
## 0.1892318 
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     109.738        7.213  
## 
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -145.146  -75.221    6.402   59.725  259.613 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  109.738     62.114   1.767   0.0855 .
## x              7.213      6.153   1.172   0.2486  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 98.79 on 37 degrees of freedom
## Multiple R-squared:  0.03581,    Adjusted R-squared:  0.009749 
## F-statistic: 1.374 on 1 and 37 DF,  p-value: 0.2486

Graphing important relationships

#Large Central Metro (1)
datastat_temp <- datastat %>% filter(X2013.Urbanization=="Large Central Metro")
datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
datastat_LCM <- data.frame(key=names(datastat_mean), value=datastat_mean)
colnames(datastat_LCM) <- c("County.Code","Age.Adjusted.Rate")
datastat_LCM_F <- merge(datastat_LCM, PM2.5, by="County.Code") 
View(datastat_LCM_F)

#Graph _LCM

datastat_LCM_F %>% filter(!is.na(Age.Adjusted.Rate)) %>%
  ggplot(aes(Value, Age.Adjusted.Rate)) + #, color = State.x
  geom_point(size=3) +
  stat_smooth(method="lm") +
  xlab("Avg PM2.5 content\nµg/m^3") + ylab("Age Adjusted Mortality Rate\n(Deaths per 100,000 people)") +
  theme_bw() +
  theme(axis.title.x = element_text(face="bold", colour="#990000", size=20),
        axis.title.y = element_text(face="bold", colour="#990000", size=20)) + 
  ggtitle("Large Central Metro")

#Micropolitan (2)
datastat_temp <- datastat %>% filter(X2013.Urbanization=="Micropolitan (non-metro)")
datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
datastat_LCM <- data.frame(key=names(datastat_mean), value=datastat_mean)
colnames(datastat_LCM) <- c("County.Code","Age.Adjusted.Rate")
datastat_LCM_F <- merge(datastat_LCM, PM2.5, by="County.Code") 
#View(datastat_LCM_F)

#Graph _micro

datastat_LCM_F %>% filter(!is.na(Age.Adjusted.Rate)) %>%
  ggplot(aes(Value, Age.Adjusted.Rate)) + #, color = State.x
  geom_point(size=3) +
  stat_smooth(method="lm") +
  xlab("Avg PM2.5 content\nµg/m^3") + ylab("Age Adjusted Mortality Rate\n(Deaths per 100,000 people)") +
  theme_bw() +
  theme(axis.title.x = element_text(face="bold", colour="#990000", size=20),
        axis.title.y = element_text(face="bold", colour="#990000", size=20)) +
  ggtitle("Micropolitan")

#Ischaemic Heart Diseases (3)
datastat_temp <- datastat %>% filter(ICD.Sub.Chapter=="Ischaemic heart diseases")
datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
datastat_LCM <- data.frame(key=names(datastat_mean), value=datastat_mean)
colnames(datastat_LCM) <- c("County.Code","Age.Adjusted.Rate")
datastat_LCM_F <- merge(datastat_LCM, PM2.5, by="County.Code") 

#Graph _micro

datastat_LCM_F %>% filter(!is.na(Age.Adjusted.Rate)) %>%
  ggplot(aes(Value, Age.Adjusted.Rate)) + #, color = State.x
  geom_point(size=3) +
  stat_smooth(method="lm") +
  xlab("Avg PM2.5 content\nµg/m^3") + ylab("Age Adjusted Mortality Rate\n(Deaths per 100,000 people)") +
  theme_bw() +
  theme(axis.title.x = element_text(face="bold", colour="#990000", size=20),
        axis.title.y = element_text(face="bold", colour="#990000", size=20)) +
  ggtitle("Ischaemic Heart Diseases")

#Large Fringe Metro (4)
datastat_temp <- datastat %>% filter(X2013.Urbanization=="Large Fringe Metro")
datastat_mean <- tapply(datastat_temp$Age.Adjusted.Rate, datastat_temp$County.Code, sum)
datastat_LCM <- data.frame(key=names(datastat_mean), value=datastat_mean)
colnames(datastat_LCM) <- c("County.Code","Age.Adjusted.Rate")
datastat_LCM_F <- merge(datastat_LCM, PM2.5, by="County.Code") 

#Graph _LCM

datastat_LCM_F %>% filter(!is.na(Age.Adjusted.Rate)) %>%
  ggplot(aes(Value, Age.Adjusted.Rate)) + #, color = State.x
  geom_point(size=3) +
  stat_smooth(method="lm") +
  xlab("Avg PM2.5 content\nµg/m^3") + ylab("Age Adjusted Mortality Rate\n(Deaths per 100,000 people)") +
  theme_bw() +
  theme(axis.title.x = element_text(face="bold", colour="#990000", size=20),
        axis.title.y = element_text(face="bold", colour="#990000", size=20)) + 
  ggtitle("Large Fringe Metro")

Unused exploration methods

#ANOVAS ONLY tell you about interaction effects - are these means different from each other and what interaction effects lead to there? It doesn't control for this.

model1 <- aov(as.numeric(Age.Adjusted.Rate) ~ Race*ICD.Sub.Chapter*Gender, data=datastat)
summary(model1)
model2 <- update(model1, . ~ . - Race:ICD.Sub.Chapter:Gender)
#Anova(model1, model2, type="III")
anova(model1, model2)
#this code's also pretty broken tbh


##lm is the proper way to do this, the output shows the effect on mortality (+/- magnitude) of each individual variable.

model3 <- lm(as.numeric(Age.Adjusted.Rate) ~ Race + ICD.Sub.Chapter + Gender + X2013.Urbanization, data=datastat)
summary(model3)

table(datastat$Race) #Black or African American
table(datastat$Gender) #Female
table(datastat$ICD.Sub.Chapter) #Cerebrovascular diseases
table(datastat$X2013.Urbanization) #Large Central Metro


datastat %>% filter(X2013.Urbanization=='Small Metro') %>% #filter(Race=='Black or African American') %>%
  ggplot(aes(Value, Age.Adjusted.Rate)) + #, color = State.x
  geom_point(size=3) +
  stat_smooth(method="lm") +
  # facet_wrap(~ICD.Sub.Chapter) +
  xlab("Avg PM2.5 content\nµg/m^3") + ylab("Age Adjusted Mortality Rate\n(Deaths per 100,000 people)") +
  theme_bw() +
  theme(axis.title.x = element_text(face="bold", colour="#990000", size=20),
        axis.title.y = element_text(face="bold", colour="#990000", size=20))

unique(data$Race)

temp <- datastat %>% filter(!is.na(Age.Adjusted.Rate)) %>% filter(Gender=='') 
View(datastat)