Libraries
library(haven);
library(sas7bdat);
library(plyr);
library(dplyr);
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr);
library(survival);
library(car);
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(muhaz);
library(survminer);
## Loading required package: ggplot2
## Loading required package: ggpubr
##
## Attaching package: 'ggpubr'
## The following object is masked from 'package:plyr':
##
## mutate
library(ggplot2);
library(eha)
library(foreign)
library(survival)
library(car)
library(survey)
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
library(sjmisc)
## Warning: package 'sjmisc' was built under R version 4.0.3
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:tidyr':
##
## replace_na
Loading PM2.5 data, merging, and removing single years
PM2010 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2010.csv");
PM2011 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2011.csv");
PM2012 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2012.csv");
PM2013 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2013.csv");
PM2014 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2014.csv");
PM2015 = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/pm25TX2015.csv");
PM25ALL <- rbind(PM2010,PM2011,PM2012,PM2013,PM2014,PM2015);
rm(PM2010,PM2011,PM2012,PM2013,PM2014,PM2015)
Loading Mortality Data, merging, and removing single years
Mor2010 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2010.sas7bdat");
Mor2011 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2011.sas7bdat");
Mor2012 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2012.sas7bdat");
Mor2013 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2013.sas7bdat");
Mor2014 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2014.sas7bdat");
Mor2015 <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/death2015.sas7bdat");
MorALL <- rbind(Mor2010,Mor2011,Mor2012,Mor2013,Mor2014,Mor2015);
rm(Mor2010,Mor2011,Mor2012,Mor2013,Mor2014,Mor2015)
Loading a county / county number list in order to merge the data
County = read.csv("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/TXCounties.csv")
Adding to County to Fips codes in mortality data in order to merge the data sets (the county name may be redundant now, but that was a good way to check if it worked correctly)
MorALL$RESIDCOUNTY_CODE <- as.numeric(MorALL$RESIDCOUNTY_CODE);
County$ï..Number <- as.numeric(County$ï..Number);
MorALL <- merge(MorALL, County, by.x = "RESIDCOUNTY_CODE", by.y = "ï..Number", all.x = TRUE, all.y = FALSE)
rm(County)
Recoding dates (strptime did not work on all dates, so, I had to take it apart as string and recombine it so I can merge the data)
PM25ALL$NewDateM <- substr(PM25ALL$Date, start = 1, stop = 2);
PM25ALL$NewDateD <- substr(PM25ALL$Date, start = 4, stop = 5);
PM25ALL$NewDateY <- substr(PM25ALL$Date, start = 7, stop = 10);
PM25ALL$Date <- paste(PM25ALL$NewDateY,PM25ALL$NewDateM,PM25ALL$NewDateD, sep = "")
Creating averages per county for fine dust emissions per day
PM25ALL <- PM25ALL %>%
dplyr::group_by(Date, COUNTY_CODE) %>%
dplyr::mutate(
PM25Aver = mean(Daily.Mean.PM2.5.Concentration)) %>%
dplyr::ungroup()
Merging mortality data with PM2.5 averages
PM25Sub <- PM25ALL[,c(1,17,18,24)]
names(MorALL)[names(MorALL) == "DATEOFDEATH"]<-"Date"
names(PM25Sub)[names(PM25Sub) == "COUNTY_CODE"]<-"Fips"
MorALL <- MorALL %>% semi_join(PM25Sub, by = "Fips") #reducing to only data I have PM2.5 values for
PM25Sub$Merge <- paste(PM25Sub$Date,PM25Sub$Fips, sep = "")
MorALL$Merge <- paste(MorALL$Date,MorALL$Fips, sep = "")
PM25Sub<-PM25Sub[!duplicated(PM25Sub), ]
FromDustToDust <- MorALL
FromDustToDust$PM25Aver <- 0
MorALL$Merge <- as.numeric(MorALL$Merge)
PM25Sub$Merge <- as.numeric(PM25Sub$Merge)
FromDustToDust <- merge(x = MorALL, y = PM25Sub, by = c("Merge"), all.x = TRUE, all.y = FALSE)
Removing some lines due to missing data
FromDustToDust$AGE <- as.numeric(FromDustToDust$AGE)
FromDustToDust$SEX <- as.numeric(FromDustToDust$SEX)
FromDustToDust$DEATHRESULTOFINJURY <- as.numeric(FromDustToDust$DEATHRESULTOFINJURY)
FromDustToDust$PrematureDeath <- as.numeric(FromDustToDust$AGE<65) #Creating Dummy Variable for premature death (below 65 = 1; 65Plus = 0)
FromDustToDust <- subset(FromDustToDust, complete.cases (PM25Aver,AGE))
FromDustToDust <- subset(FromDustToDust, AGE < 999)
FromDustToDust <- subset(FromDustToDust, DEATHRESULTOFINJURY < 9) #missing data
FromDustToDust <- subset(FromDustToDust, DEATHRESULTOFINJURY > 1) #injuries
Urbanicity <- read_sas("C:/Users/tobik/OneDrive/Documents/Air Pollution Research/data/NCHSurbruralcodes2013.sas7bdat");
TXUrbanicity <- subset(Urbanicity, ST_ABBREV == 'TX')
rm(Urbanicity)
TXUrbanicity$CODE2013 <- as.numeric(TXUrbanicity$CODE2013);
FromDustToDustHW2 <- merge(x = FromDustToDust, y = TXUrbanicity, by.x = "Fips.x", by.y = "ctyfips", all.x = TRUE)
rm(TXUrbanicity)
#FromDustToDust$One <- 1 #dummy variable for survfit
#names(FromDustToDustHW2)[names(FromDustToDust) == "DEATHRESULTOFINJURY"]<-"Injury"
Creating a dummy variable for respiratory diseases as part of cause of death (ICD codes starting with J indicate respiratory conditions as underlaying factor)
FromDustToDustHW2$MULTIPLECAUSES <- as.character(FromDustToDust$MULTIPLECAUSES)
FromDustToDustHW2$RespiratoryDisease <- 0 # for whatever reason, I cannot run the next line without having the default negative value there ;-)
FromDustToDustHW2$RespiratoryDisease[grep('J', FromDustToDustHW2$MULTIPLECAUSES)] <- 1
The Grouping Variable are SEX (1 = male, 2 = female)
My hypothesis is that females have a higher survival probability than males.
The hypothesis seemed to be supported.
fit1<-glm(PrematureDeath~PM25Aver, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #pollution average
fit2<-glm(PrematureDeath~PM25Aver + RespiratoryDisease, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #plus Respiratory Disease
fit3<-glm(PrematureDeath~PM25Aver + RespiratoryDisease + SEX, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #plus sex
fit4<-glm(PrematureDeath~PM25Aver + RespiratoryDisease + SEX + CODE2013, data = FromDustToDustHW2, na.action = na.omit, family = binomial) #plus urbanicity 2013
fit5<-glm(PrematureDeath~PM25Aver + SEX, data = FromDustToDustHW2, na.action = na.omit, family = binomial) # fit3 w/o Respiratory Disease
fit6<-glm(PrematureDeath~PM25Aver + SEX + CODE2013, data = FromDustToDustHW2, na.action = na.omit, family = binomial) # fit4 w/o Respiratory Disease
AIC(fit1,fit2,fit3,fit4,fit5,fit6)
## df AIC
## fit1 2 674615.3
## fit2 3 674593.0
## fit3 4 666566.2
## fit4 5 665905.6
## fit5 3 666588.2
## fit6 4 665928.0
summary(fit1)
##
## Call:
## glm(formula = PrematureDeath ~ PM25Aver, family = binomial, data = FromDustToDustHW2,
## na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0078 -0.8239 -0.8191 1.5706 1.5967
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.947786 0.006808 -139.214 < 2e-16 ***
## PM25Aver 0.004108 0.000646 6.359 2.03e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674652 on 562526 degrees of freedom
## Residual deviance: 674611 on 562525 degrees of freedom
## AIC: 674615
##
## Number of Fisher Scoring iterations: 4
summary(fit2)
##
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + RespiratoryDisease,
## family = binomial, data = FromDustToDustHW2, na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9987 -0.8253 -0.8192 1.5685 1.6068
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.938294 0.007073 -132.659 < 2e-16 ***
## PM25Aver 0.004102 0.000646 6.350 2.15e-10 ***
## RespiratoryDisease -0.031836 0.006462 -4.927 8.36e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674652 on 562526 degrees of freedom
## Residual deviance: 674587 on 562524 degrees of freedom
## AIC: 674593
##
## Number of Fisher Scoring iterations: 4
summary(fit3)
##
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + RespiratoryDisease +
## SEX, family = binomial, data = FromDustToDustHW2, na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0981 -0.9085 -0.7303 1.4587 1.7309
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1536930 0.0111974 -13.726 < 2e-16 ***
## PM25Aver 0.0040509 0.0006506 6.226 4.78e-10 ***
## RespiratoryDisease -0.0318469 0.0065080 -4.894 9.91e-07 ***
## SEX -0.5314086 0.0059696 -89.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674652 on 562526 degrees of freedom
## Residual deviance: 666558 on 562523 degrees of freedom
## AIC: 666566
##
## Number of Fisher Scoring iterations: 4
summary(fit4)
##
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + RespiratoryDisease +
## SEX + CODE2013, family = binomial, data = FromDustToDustHW2,
## na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9898 -0.8825 -0.7437 1.4383 1.8717
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.012955 0.012488 -1.037 0.3
## PM25Aver 0.002869 0.000655 4.380 1.19e-05 ***
## RespiratoryDisease -0.032127 0.006512 -4.934 8.07e-07 ***
## SEX -0.533526 0.005974 -89.309 < 2e-16 ***
## CODE2013 -0.075654 0.002964 -25.525 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674652 on 562526 degrees of freedom
## Residual deviance: 665896 on 562522 degrees of freedom
## AIC: 665906
##
## Number of Fisher Scoring iterations: 4
summary(fit5)
##
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + SEX, family = binomial,
## data = FromDustToDustHW2, na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1076 -0.9093 -0.7291 1.4607 1.7213
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1631899 0.0110282 -14.798 < 2e-16 ***
## PM25Aver 0.0040568 0.0006506 6.235 4.51e-10 ***
## SEX -0.5314080 0.0059695 -89.021 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674652 on 562526 degrees of freedom
## Residual deviance: 666582 on 562524 degrees of freedom
## AIC: 666588
##
## Number of Fisher Scoring iterations: 4
summary(fit6)
##
## Call:
## glm(formula = PrematureDeath ~ PM25Aver + SEX + CODE2013, family = binomial,
## data = FromDustToDustHW2, na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9863 -0.8804 -0.7434 1.4407 1.8627
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.022578 0.012335 -1.830 0.0672 .
## PM25Aver 0.002875 0.000655 4.389 1.14e-05 ***
## SEX -0.533525 0.005974 -89.311 < 2e-16 ***
## CODE2013 -0.075631 0.002964 -25.518 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 674652 on 562526 degrees of freedom
## Residual deviance: 665920 on 562523 degrees of freedom
## AIC: 665928
##
## Number of Fisher Scoring iterations: 4
logit2prob <- function(logit){
odds <- exp(logit)
prob <- odds / (1 + odds)
return(prob)
}
logit2prob(coef(fit1))
## (Intercept) PM25Aver
## 0.2793304 0.5010270
logit2prob(coef(fit2))
## (Intercept) PM25Aver RespiratoryDisease
## 0.2812450 0.5010256 0.4920417
logit2prob(coef(fit3))
## (Intercept) PM25Aver RespiratoryDisease SEX
## 0.4616522 0.5010127 0.4920389 0.3701884
logit2prob(coef(fit4))
## (Intercept) PM25Aver RespiratoryDisease SEX
## 0.4967613 0.5007173 0.4919690 0.3696949
## CODE2013
## 0.4810954
logit2prob(coef(fit5))
## (Intercept) PM25Aver SEX
## 0.4592928 0.5010142 0.3701886
logit2prob(coef(fit6))
## (Intercept) PM25Aver SEX CODE2013
## 0.4943559 0.5007188 0.3696951 0.4811013
plot(fit4)