Libraries

library(haven);
library(sas7bdat);
library(plyr);
library(dplyr);
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr);
library(survival);
library(car);
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(muhaz);
library(survminer);
## Loading required package: ggplot2
## Loading required package: ggpubr
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
## 
##     mutate
library(ggplot2);
library(eha)
library(foreign)
library(survival)
library(car)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
## 
##     dotchart
library(sjmisc)
## Warning: package 'sjmisc' was built under R version 4.0.3
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
## 
## Attaching package: 'sjmisc'
## The following object is masked from 'package:tidyr':
## 
##     replace_na

Loading PM2.5 data, merging, and removing single years

PM2010 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2010.csv");
PM2011 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2011.csv");
PM2012 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2012.csv");
PM2013 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2013.csv");
PM2014 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2014.csv");
PM2015 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2015.csv");
PM25ALL <- rbind(PM2010,PM2011,PM2012,PM2013,PM2014,PM2015);
rm(PM2010,PM2011,PM2012,PM2013,PM2014,PM2015)

Loading Mortality Data, merging, and removing single years

Mor2010 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2010.sas7bdat");
Mor2011 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2011.sas7bdat");
Mor2012 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2012.sas7bdat");
Mor2013 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2013.sas7bdat");
Mor2014 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2014.sas7bdat");
Mor2015 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2015.sas7bdat");
MorALL <- rbind(Mor2010,Mor2011,Mor2012,Mor2013,Mor2014,Mor2015);
rm(Mor2010,Mor2011,Mor2012,Mor2013,Mor2014,Mor2015)

Loading a county / county number list in order to merge the data

County = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/TXCounties.csv")

Adding to County to Fips codes in mortality data in order to merge the data sets (the county name may be redundant now, but that was a good way to check if it worked correctly)

MorALL$RESIDCOUNTY_CODE <- as.numeric(MorALL$RESIDCOUNTY_CODE);
County$ï..Number <- as.numeric(County$ï..Number);
MorALL <- merge(MorALL, County, by.x = "RESIDCOUNTY_CODE", by.y = "ï..Number", all.x = TRUE, all.y = FALSE)
rm(County)

Recoding dates (strptime did not work on all dates, so, I had to take it apart as string and recombine it so I can merge the data)

PM25ALL$NewDateM <- substr(PM25ALL$Date, start = 1, stop = 2);
PM25ALL$NewDateD <- substr(PM25ALL$Date, start = 4, stop = 5);
PM25ALL$NewDateY <- substr(PM25ALL$Date, start = 7, stop = 10);
PM25ALL$Date <- paste(PM25ALL$NewDateY,PM25ALL$NewDateM,PM25ALL$NewDateD, sep = "")

Creating averages per county for fine dust emissions per day

PM25ALL <- PM25ALL %>%
    dplyr::group_by(Date, COUNTY_CODE) %>% 
    dplyr::mutate(
                  PM25Aver = mean(Daily.Mean.PM2.5.Concentration)) %>% 
  dplyr::ungroup()

Merging mortality data with PM2.5 averages

PM25Sub <- PM25ALL[,c(1,17,18,24)]
names(MorALL)[names(MorALL) == "DATEOFDEATH"]<-"Date"
names(PM25Sub)[names(PM25Sub) == "COUNTY_CODE"]<-"Fips"
MorALL <- MorALL %>% semi_join(PM25Sub, by = "Fips") #reducing to only data I have PM2.5 values for
PM25Sub$Merge <- paste(PM25Sub$Date,PM25Sub$Fips, sep = "")
MorALL$Merge <- paste(MorALL$Date,MorALL$Fips, sep = "")
PM25Sub<-PM25Sub[!duplicated(PM25Sub), ]
FromDustToDust <- MorALL
FromDustToDust$PM25Aver <- 0
MorALL$Merge <- as.numeric(MorALL$Merge)
PM25Sub$Merge <- as.numeric(PM25Sub$Merge)
FromDustToDust <- merge(x = MorALL, y = PM25Sub, by = c("Merge"), all.x = TRUE, all.y = FALSE)

Removing some lines due to missing data

FromDustToDust$AGE <- as.numeric(FromDustToDust$AGE)
FromDustToDust$SEX <- as.numeric(FromDustToDust$SEX)
FromDustToDust$DEATHRESULTOFINJURY <- as.numeric(FromDustToDust$DEATHRESULTOFINJURY)
FromDustToDust$PrematureDeath <- as.numeric(FromDustToDust$AGE<65) #Creating Dummy Variable for premature death (below 65 = 1; 65Plus = 0)
FromDustToDust <- subset(FromDustToDust, complete.cases (PM25Aver,AGE))
FromDustToDust <- subset(FromDustToDust, AGE < 999)
FromDustToDust <- subset(FromDustToDust, DEATHRESULTOFINJURY < 9) #missing data
FromDustToDust <- subset(FromDustToDust, DEATHRESULTOFINJURY > 1) #injuries
Urbanicity <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/NCHSurbruralcodes2013.sas7bdat");
TXUrbanicity <- subset(Urbanicity, ST_ABBREV == 'TX')
rm(Urbanicity)
TXUrbanicity$CODE2013 <- as.numeric(TXUrbanicity$CODE2013);
FromDustToDustHW2 <- merge(x = FromDustToDust, y = TXUrbanicity, by.x = "Fips.x", by.y = "ctyfips", all.x = TRUE)
rm(TXUrbanicity)
#FromDustToDust$One <- 1 #dummy variable for survfit
#names(FromDustToDustHW2)[names(FromDustToDust) == "DEATHRESULTOFINJURY"]<-"Injury"

Creating a dummy variable for respiratory diseases as part of cause of death (ICD codes starting with J indicate respiratory conditions as underlaying factor)

FromDustToDustHW2$MULTIPLECAUSES <- as.character(FromDustToDust$MULTIPLECAUSES)
FromDustToDustHW2$RespiratoryDisease <- 0 # for whatever reason, I cannot run the next line without having the default negative value there ;-)
FromDustToDustHW2$RespiratoryDisease[grep('J', FromDustToDustHW2$MULTIPLECAUSES)] <- 1

The Grouping Variable are SEX (1 = male, 2 = female)

My hypothesis is that females have a higher survival probability than males.

The hypothesis seemed to be supported.

fit1<-glm(PrematureDeath~PM25Aver, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #pollution average
fit2<-glm(PrematureDeath~PM25Aver + RespiratoryDisease, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #plus Respiratory Disease
fit3<-glm(PrematureDeath~PM25Aver + RespiratoryDisease + SEX, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #plus sex
fit4<-glm(PrematureDeath~PM25Aver + RespiratoryDisease + SEX + CODE2013, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #plus urbanicity 2013
fit5<-glm(PrematureDeath~PM25Aver + SEX, data = FromDustToDustHW2, na.action = na.omit, family = binomial) # fit3 w/o Respiratory Disease
fit6<-glm(PrematureDeath~PM25Aver + SEX + CODE2013, data = FromDustToDustHW2, na.action = na.omit, family = binomial) # fit4 w/o Respiratory Disease
AIC(fit1,fit2,fit3,fit4,fit5,fit6)
##      df      AIC
## fit1  2 674615.3
## fit2  3 674593.0
## fit3  4 666566.2
## fit4  5 665905.6
## fit5  3 666588.2
## fit6  4 665928.0
summary(fit1)
## 
## Call:
## glm(formula = PrematureDeath ~ PM25Aver, family = binomial, data = FromDustToDustHW2, 
##     na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0078  -0.8239  -0.8191   1.5706   1.5967  
## 
## Coefficients:
##              Estimate Std. Error  z value Pr(>|z|)    
## (Intercept) -0.947786   0.006808 -139.214  < 2e-16 ***
## PM25Aver     0.004108   0.000646    6.359 2.03e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 674652  on 562526  degrees of freedom
## Residual deviance: 674611  on 562525  degrees of freedom
## AIC: 674615
## 
## Number of Fisher Scoring iterations: 4
summary(fit2)
## 
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + RespiratoryDisease, 
##     family = binomial, data = FromDustToDustHW2, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9987  -0.8253  -0.8192   1.5685   1.6068  
## 
## Coefficients:
##                     Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)        -0.938294   0.007073 -132.659  < 2e-16 ***
## PM25Aver            0.004102   0.000646    6.350 2.15e-10 ***
## RespiratoryDisease -0.031836   0.006462   -4.927 8.36e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 674652  on 562526  degrees of freedom
## Residual deviance: 674587  on 562524  degrees of freedom
## AIC: 674593
## 
## Number of Fisher Scoring iterations: 4
summary(fit3)
## 
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + RespiratoryDisease + 
##     SEX, family = binomial, data = FromDustToDustHW2, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0981  -0.9085  -0.7303   1.4587   1.7309  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.1536930  0.0111974 -13.726  < 2e-16 ***
## PM25Aver            0.0040509  0.0006506   6.226 4.78e-10 ***
## RespiratoryDisease -0.0318469  0.0065080  -4.894 9.91e-07 ***
## SEX                -0.5314086  0.0059696 -89.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 674652  on 562526  degrees of freedom
## Residual deviance: 666558  on 562523  degrees of freedom
## AIC: 666566
## 
## Number of Fisher Scoring iterations: 4
summary(fit4)
## 
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + RespiratoryDisease + 
##     SEX + CODE2013, family = binomial, data = FromDustToDustHW2, 
##     na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9898  -0.8825  -0.7437   1.4383   1.8717  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.012955   0.012488  -1.037      0.3    
## PM25Aver            0.002869   0.000655   4.380 1.19e-05 ***
## RespiratoryDisease -0.032127   0.006512  -4.934 8.07e-07 ***
## SEX                -0.533526   0.005974 -89.309  < 2e-16 ***
## CODE2013           -0.075654   0.002964 -25.525  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 674652  on 562526  degrees of freedom
## Residual deviance: 665896  on 562522  degrees of freedom
## AIC: 665906
## 
## Number of Fisher Scoring iterations: 4
summary(fit5)
## 
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + SEX, family = binomial, 
##     data = FromDustToDustHW2, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1076  -0.9093  -0.7291   1.4607   1.7213  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.1631899  0.0110282 -14.798  < 2e-16 ***
## PM25Aver     0.0040568  0.0006506   6.235 4.51e-10 ***
## SEX         -0.5314080  0.0059695 -89.021  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 674652  on 562526  degrees of freedom
## Residual deviance: 666582  on 562524  degrees of freedom
## AIC: 666588
## 
## Number of Fisher Scoring iterations: 4
summary(fit6)
## 
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + SEX + CODE2013, family = binomial, 
##     data = FromDustToDustHW2, na.action = na.omit)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9863  -0.8804  -0.7434   1.4407   1.8627  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.022578   0.012335  -1.830   0.0672 .  
## PM25Aver     0.002875   0.000655   4.389 1.14e-05 ***
## SEX         -0.533525   0.005974 -89.311  < 2e-16 ***
## CODE2013    -0.075631   0.002964 -25.518  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 674652  on 562526  degrees of freedom
## Residual deviance: 665920  on 562523  degrees of freedom
## AIC: 665928
## 
## Number of Fisher Scoring iterations: 4
logit2prob <- function(logit){
  odds <- exp(logit)
  prob <- odds / (1 + odds)
  return(prob)
}
logit2prob(coef(fit1))
## (Intercept)    PM25Aver 
##   0.2793304   0.5010270
logit2prob(coef(fit2))
##        (Intercept)           PM25Aver RespiratoryDisease 
##          0.2812450          0.5010256          0.4920417
logit2prob(coef(fit3))
##        (Intercept)           PM25Aver RespiratoryDisease                SEX 
##          0.4616522          0.5010127          0.4920389          0.3701884
logit2prob(coef(fit4))
##        (Intercept)           PM25Aver RespiratoryDisease                SEX 
##          0.4967613          0.5007173          0.4919690          0.3696949 
##           CODE2013 
##          0.4810954
logit2prob(coef(fit5))
## (Intercept)    PM25Aver         SEX 
##   0.4592928   0.5010142   0.3701886
logit2prob(coef(fit6))
## (Intercept)    PM25Aver         SEX    CODE2013 
##   0.4943559   0.5007188   0.3696951   0.4811013
plot(fit4)