Project 1 Part 3

Effects on Leading Causes of Death in NY

Recipes for the Design of Experiments

Felipe Ortiz

RPI

October 11th 2016 V1

1 Setting

In this example we will be working with a world real data to demonstrate cleaning the data, and DOE of 2 or more factors.

The data set was obtained from one of them many links provided under the 100 interesting data sets site. It is a Data.Gov dataset. It is a large dataset containing the causes of death in NY.

For our purposes we will only be using a total of 5 variables. 4 Independent variables, and a response variable.

System under test

What we will be focusing on is testing if the variables chosen had an effect on the count of deaths in NY

Factors and Levels

The factors that were chosen were: “Year”, “Ethnicity”, “Sex”, “Cause.of.Death” The factors and their levels can be seen below

levels(Data$Year)
## [1] "2007" "2008" "2009" "2010" "2011"
levels(Data$Ethnicity)
## [1] "ASIAN & PACIFIC ISLANDER" "HISPANIC"                
## [3] "NON-HISPANIC BLACK"       "NON-HISPANIC WHITE"
levels(Data$Sex)
## [1] "FEMALE" "MALE"
levels(Data$Cause.of.Death)
##  [1] "ACCIDENTS EXCEPT DRUG POISONING"                 
##  [2] "ALZHEIMERS DISEASE"                              
##  [3] "ANEMIAS"                                         
##  [4] "AORTIC ANEURYSM AND DISSECTION"                  
##  [5] "ASSAULT (HOMICIDE)"                              
##  [6] "ATHEROSCLEROSIS"                                 
##  [7] "BENIGN AND UNCERTAIN NEOPLASMS"                  
##  [8] "CARDIOVASCULAR DISORDERS IN PERINATAL PERIOD"    
##  [9] "CEREBROVASCULAR DISEASE"                         
## [10] "CHOLELITHIASIS AND DISORDERS OF GALLBLADDER"     
## [11] "CHRONIC LIVER DISEASE AND CIRRHOSIS"             
## [12] "CHRONIC LOWER RESPIRATORY DISEASES"              
## [13] "CONGENITAL MALFORMATIONS,DEFORMATIONS"           
## [14] "DIABETES MELLITUS"                               
## [15] "DISEASES OF HEART"                               
## [16] "ESSENTIAL HYPERTENSION AND RENAL DISEASES"       
## [17] "HUMAN IMMUNODEFICIENCY VIRUS DISEASE"            
## [18] "INFLUENZA AND PNEUMONIA"                         
## [19] "INTENTIONAL SELF-HARM (SUICIDE)"                 
## [20] "MALIGNANT NEOPLASMS"                             
## [21] "MENTAL DISORDERS DUE TO USE OF ALCOHOL"          
## [22] "NEPHRITIS, NEPHROTIC SYNDROME AND NEPHROSIS"     
## [23] "PARKINSONS DISEASE"                              
## [24] "PEPTIC ULCER"                                    
## [25] "PNEUMONITIS DUE TO SOLIDS AND LIQUIDS"           
## [26] "PREGNANCY, CHILDBIRTH AND THE PUERPERIUM"        
## [27] "PSYCH. SUBSTANCE USE & ACCIDENTAL DRUG POISONING"
## [28] "RESPIRATORY DISTRESS OF NEWBORN"                 
## [29] "SEPTICEMIA"                                      
## [30] "SHORT GESTATION/LBW"                             
## [31] "TUBERCULOSIS"                                    
## [32] "VIRAL HEPATITIS"

As can be seen, the “Cause.of.Death” factor has a large number of levels. For the sake of simplicity lets say we are only interested in the levels that have the 5 highest average “Count”. That is to say, we only want to keep the levels of “Cause.of.Death” that had the highest avg death count. In this case we can subset the data as seen below, and have a dataset with less levels to work with without discarding data for the specific levels.

#Find highest 5 values
tail(sort(z),5)
## [1]  199.025  206.375  290.725 1639.925 2408.125
#lowest value is 199.025
#These will be the Levels we will subset
LevelsWanted <- x[which(z >= 199)]
Data2 <- Data[Data$Cause.of.Death %in% LevelsWanted,]
Data2[,1:4] <- lapply(Data2[,1:4], factor)

With this we shrink our data from almost 4000 observations to 800, but the bigger change is the decrease in factor levels. The results from this subset are not biased because all the data point for the levels that will be analyzed are still present. That is to say, the experiment will still test all the levels being analyzed against eachother.

levels(Data2$Year)
## [1] "2007" "2008" "2009" "2010" "2011"
levels(Data2$Ethnicity)
## [1] "ASIAN & PACIFIC ISLANDER" "HISPANIC"                
## [3] "NON-HISPANIC BLACK"       "NON-HISPANIC WHITE"
levels(Data2$Sex)
## [1] "FEMALE" "MALE"
levels(Data2$Cause.of.Death)
## [1] "CHRONIC LOWER RESPIRATORY DISEASES"
## [2] "DIABETES MELLITUS"                 
## [3] "DISEASES OF HEART"                 
## [4] "INFLUENZA AND PNEUMONIA"           
## [5] "MALIGNANT NEOPLASMS"

Response variable

The response variable that is being used is “Counts” or the death count in NY due. This is a discrete variable that has a wide range. The first step should be to get a feel for the data, this can be seen below.

summary(Data2$Count)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    28.0   188.8   375.5   948.8  1185.0  7050.0

The Data

Part of the subset data can be seen below. Although these steps are simple, they are an important part of setting up the higher level analysis.

head(Data2, n=10)
##    Year          Ethnicity    Sex                     Cause.of.Death Count
## 2  2010 NON-HISPANIC BLACK   MALE            INFLUENZA AND PNEUMONIA   201
## 4  2010 NON-HISPANIC BLACK   MALE                MALIGNANT NEOPLASMS  1540
## 20 2010 NON-HISPANIC WHITE FEMALE CHRONIC LOWER RESPIRATORY DISEASES   502
## 22 2010 NON-HISPANIC WHITE FEMALE                  DIABETES MELLITUS   258
## 23 2010 NON-HISPANIC WHITE FEMALE                  DISEASES OF HEART  5351
## 26 2010 NON-HISPANIC WHITE FEMALE            INFLUENZA AND PNEUMONIA   707
## 28 2010 NON-HISPANIC WHITE FEMALE                MALIGNANT NEOPLASMS  3438
## 46 2010 NON-HISPANIC WHITE   MALE CHRONIC LOWER RESPIRATORY DISEASES   388
## 48 2010 NON-HISPANIC WHITE   MALE                  DIABETES MELLITUS   245
## 49 2010 NON-HISPANIC WHITE   MALE                  DISEASES OF HEART  4495
summary(Data2, n=10)
##    Year                        Ethnicity       Sex     
##  2007:160   ASIAN & PACIFIC ISLANDER:200   FEMALE:400  
##  2008:160   HISPANIC                :200   MALE  :400  
##  2009:160   NON-HISPANIC BLACK      :200               
##  2010:160   NON-HISPANIC WHITE      :200               
##  2011:160                                              
##                                                        
##                             Cause.of.Death     Count       
##  CHRONIC LOWER RESPIRATORY DISEASES:160    Min.   :  28.0  
##  DIABETES MELLITUS                 :160    1st Qu.: 188.8  
##  DISEASES OF HEART                 :160    Median : 375.5  
##  INFLUENZA AND PNEUMONIA           :160    Mean   : 948.8  
##  MALIGNANT NEOPLASMS               :160    3rd Qu.:1185.2  
##                                            Max.   :7050.0

2. (Experimental) Design

How will the experiment be organized and conducted to test the hypothesis?

Proper Experimental design is an important part of getting valid results. In this case we will be using factorial design to test various hypotheses. Our null hypotheses are that each of the individual factors does not have an effect on the response variable, as well as this we are interested in an interaction effects. Thus for all data collected, all the factors must also be collected.

What is the rationale for this design?

In this case since we are using real world data that has already been collected, we are limited on our input to the design. We do however see that the response variable is Death Count, and that the data collectors believed that it varied by “Year”, “Ethnicity”, “Sex”, and “Cause.of.Death”. Or in other words it was possibly influenced by those factors.

Other factors that may have been selected would include “age”, “Medical History”, and “Region”.

Randomize: What is the Randomization Scheme?

Randomized design is used to control for effects from extraneous variables, or variables that have an effect but are not being studied. The assumption is that on average these variables will affect the response variable equally so any significant differences should come from the independent variable(s) being tested.

Again, since we weren’t present for the collection of this data we cannot assure that this data was collected randomly. However it does state that it is the entire population. This is likely not the case, however it is the majority of NY City. As such we can assume that extraneous variables will balance out.

Block: Did you use blocking in the design?

Blocking is used to eliminate the influence of extraneous variables, however since we are assuming that these will average out there was no blocking done outside of the factors themselves.

Relevant theory (ISYE 6020)

Relevant theory has been discussed within each section. We have arrived at the conclusion that because the data is an overwhelming majority of the population the effect of extraneous variables that randomization and blocking try to remove are averaged out.

In terms of replication, due to the nature of the data there wasn’t a viable way to do replications. Replication is used to ensure that the results represent the full population. It is useful for finding the variability within the groups. A possible way to replications in this experiment would be to drop year as a factor, and have a year of data be a replication. In this case however, although unlikely, there may be a need for blocking due to effects that change year by year.

3. (Statistical) Analysis

This example will demonstrate how to do the following: Compute the main effects of all four factors Compute the 2-way interactions between the factors Compute Analysis of Variance for all main effects and the two factor interactions

(Exploratory Data Analysis) Graphics and descriptive summary

It is important to begin with getting a feel for the data. We started this in section 1 by looking at some values. Now we can plot a histogram and a boxplot

hist(Data2$Count)

boxplot(Data2$Count, main = "Boxplot of Count")

As can be seen, we may have some outliers in out data. It is important however not to remove them without knowing how they affect the results.

Lets look at the boxplot of each factor. They will help visually represent the significance of a factor.

boxplot(Data2$Count ~ Data2$Year, main = "Box Plots of Death Counts by Year", xlab = "Year", ylab = "Counts")

boxplot(Data2$Count ~ Data2$Ethnicity, main = "Box Plots of Death Counts by Ethnicity", xlab = "Ethnicity", ylab = "Counts", cex.axis = 0.7)

boxplot(Data2$Count ~ Data2$Sex, main = "Box Plots of Death Counts by Sex", xlab = "Sex", ylab = "Counts")

boxplot(Data2$Count ~ Data2$Cause.of.Death, main = "Box Plots of Death Counts by Cause.of.Death", xlab = "Cause of Death", ylab = "Counts", cex.axis = 0.4)

As is shown by these boxplots, not all the factors have a significant effect on the counts. The year factor is very constant for every level, this probably means it does not have a strong main effect. The ethnicity boxplots show an almost linear change with the level. Just like with years, the sex factor shows little change. Although the female range does go higher than the male, and has more outliers. This might be reason to look into the data. Cause of death however shows the highest effect, it has the most change between the different median of the boxplots. These effects will further be explored with the use of ANOVA.

Testing

m1 = aov(Data2$Count ~ Data2$Year)
summary(m1)
##              Df    Sum Sq Mean Sq F value Pr(>F)
## Data2$Year    4 9.500e+05  237496   0.126  0.973
## Residuals   795 1.493e+09 1877470
m2 = aov(Data2$Count ~ Data2$Ethnicity)
summary(m2)
##                  Df    Sum Sq   Mean Sq F value Pr(>F)    
## Data2$Ethnicity   3 3.715e+08 123820521   87.84 <2e-16 ***
## Residuals       796 1.122e+09   1409644                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m3 = aov(Data2$Count ~ Data2$Sex)
summary(m3)
##              Df    Sum Sq Mean Sq F value Pr(>F)
## Data2$Sex     1 2.307e+06 2307382   1.235  0.267
## Residuals   798 1.491e+09 1868711
m4 = aov(Data2$Count ~ Data2$Cause.of.Death)
summary(m4)
##                       Df    Sum Sq   Mean Sq F value Pr(>F)    
## Data2$Cause.of.Death   4 664592536 166148134   159.3 <2e-16 ***
## Residuals            795 828945894   1042699                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As we can see from the results of the ANOVA and the P-Values, for Year and Sex the null hypothesis can probably not be discarded. That is to say, they do not have a significant effect on the death count. While we can discard the null for Ethnicity and Cause of Death. If we look at the SS, Ethnicity has a larger main effect than the main effect of Cause of Death.

Now we can move forward to the Interaction effects. It is important to also include the factors which did not have a large main effect because the interaction effect may still be significant. All the interactions will be tested

m5 = aov(Data2$Count ~ Data2$Year*Data2$Ethnicity)
summary(m5)
##                             Df    Sum Sq   Mean Sq F value Pr(>F)    
## Data2$Year                   4 9.500e+05    237496   0.166  0.956    
## Data2$Ethnicity              3 3.715e+08 123820521  86.328 <2e-16 ***
## Data2$Year:Data2$Ethnicity  12 2.376e+06    198018   0.138  1.000    
## Residuals                  780 1.119e+09   1434296                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m6 = aov(Data2$Count ~ Data2$Year*Data2$Sex)
summary(m6)
##                       Df    Sum Sq Mean Sq F value Pr(>F)
## Data2$Year             4 9.500e+05  237496   0.126  0.973
## Data2$Sex              1 2.307e+06 2307382   1.223  0.269
## Data2$Year:Data2$Sex   4 1.184e+05   29588   0.016  1.000
## Residuals            790 1.490e+09 1886282
m7 = aov(Data2$Count ~ Data2$Year*Data2$Cause.of.Death)
summary(m7)
##                                  Df    Sum Sq   Mean Sq F value Pr(>F)    
## Data2$Year                        4    949985    237496   0.224  0.925    
## Data2$Cause.of.Death              4 664592536 166148134 156.846 <2e-16 ***
## Data2$Year:Data2$Cause.of.Death  16   7032681    439543   0.415  0.979    
## Residuals                       775 820963228   1059307                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From these results we can confidently say that Year does not have an effect on the Death Count

m8 = aov(Data2$Count ~ Data2$Ethnicity*Data2$Sex)
summary(m8)
##                            Df    Sum Sq   Mean Sq F value Pr(>F)    
## Data2$Ethnicity             3 3.715e+08 123820521  87.842 <2e-16 ***
## Data2$Sex                   1 2.307e+06   2307382   1.637  0.201    
## Data2$Ethnicity:Data2$Sex   3 3.376e+06   1125254   0.798  0.495    
## Residuals                 792 1.116e+09   1409588                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
m9 = aov(Data2$Count ~ Data2$Ethnicity*Data2$Cause.of.Death)
summary(m9)
##                                       Df    Sum Sq   Mean Sq F value
## Data2$Ethnicity                        3 371461563 123820521  2620.6
## Data2$Cause.of.Death                   4 664592536 166148134  3516.4
## Data2$Ethnicity:Data2$Cause.of.Death  12 420629954  35052496   741.9
## Residuals                            780  36854376     47249        
##                                      Pr(>F)    
## Data2$Ethnicity                      <2e-16 ***
## Data2$Cause.of.Death                 <2e-16 ***
## Data2$Ethnicity:Data2$Cause.of.Death <2e-16 ***
## Residuals                                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here we see that The interaction between Ethnicity and Cause of death is also significant.

m10 = aov(Data2$Count ~ Data2$Sex*Data2$Cause.of.Death)
summary(m10)
##                                 Df    Sum Sq   Mean Sq F value Pr(>F)    
## Data2$Sex                        1   2307382   2307382   2.216  0.137    
## Data2$Cause.of.Death             4 664592536 166148134 159.556 <2e-16 ***
## Data2$Sex:Data2$Cause.of.Death   4   4000519   1000130   0.960  0.428    
## Residuals                      790 822637993   1041314                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here we also see that sex probably has no effect on the death rate.

qqnorm(residuals(m2))
qqline(residuals(m2))

qqnorm(residuals(m4))
qqline(residuals(m4))

qqnorm(residuals(m9))
qqline(residuals(m9))

As can be seen none of the residuals are normal. This means that it violates the homogeneity of variances and thus the ANOVA results are not valid.

The next step at this point would be to manipulate the data. This can involve removing outliers, transforming the data (for example a log or exponential transformation), or moving on to a different test.

4. References to the literature

http://qsel.columbia.edu/formhub.R/demo/RemoveOutliers.html

http://stattrek.com/statistics/dictionary.aspx?definition=Completely%20randomized%20design

Design and Analysis of Experiments, 8th Edition Douglas C. Montgomery

5. Appendices

Appendix II: Full Code

library(ggplot2)
library(plyr)
rm(list=ls())
New_York_City_Leading_Causes_of_Death <- read.csv("C:/Users/T420ortizf2/Documents/CollegeWork/Fall 2016/DesignOfExperiments/Assignment1/Assignment1/New_York_City_Leading_Causes_of_Death.csv", stringsAsFactors=FALSE)

#3840 obs 6 variables

# Select Variables of interest
variables <- c("Year", "Ethnicity", "Sex", "Cause.of.Death", "Count")
Data <- New_York_City_Leading_Causes_of_Death[variables]

#Turn appropriate variables into factors (In this case all)
Data[,1:4] <- lapply(Data[,1:4], factor)
#Check class change
class(Data$Ethnicity)
Data[,1:4] <- lapply(Data[,1:4], factor)

#Want to subset data to include factors that have the highest Count average
x<-levels(Data$Cause.of.Death)
z <- rep(0, times = length(x))

#Use for loop to find avg Count by "Cause.ofDeath" factor
for(i in 1:length(x)){
  z[i] <- mean(subset(Data$Count, Data$Cause.of.Death == x[i])) 
}

#Find highest 5 values
tail(sort(z),5)
#lowest value is 199.025
#These will be the Levels we will subset
LevelsWanted <- x[which(z >= 199)]
Data2 <- Data[Data$Cause.of.Death %in% LevelsWanted,]
Data2[,1:4] <- lapply(Data2[,1:4], factor)

levels(Data2$Year)
levels(Data2$Ethnicity)
levels(Data2$Sex)
levels(Data2$Cause.of.Death)


boxplot(Data2$Count)
qplot(Data2$Count, main = "Histogram of Death Counts")


boxplot(Data2$Count)

boxplot(Data2$Count ~ Data2$Year, main = "Box Plots of Death Counts by Year", xlab = "Year", ylab = "Counts")
boxplot(Data2$Count ~ Data2$Ethnicity, main = "Box Plots of Death Counts by Ethnicity", xlab = "Ethnicity", ylab = "Counts", cex.axis = 0.7)
boxplot(Data2$Count ~ Data2$Sex, main = "Box Plots of Death Counts by Sex", xlab = "Sex", ylab = "Counts")
boxplot(Data2$Count ~ Data2$Cause.of.Death, main = "Box Plots of Death Counts by Cause.of.Death", xlab = "Cause of Death", ylab = "Counts", cex.axis = 0.4)

m1 = aov(Data2$Count ~ Data2$Year)
summary(m1)

m2 = aov(Data2$Count ~ Data2$Ethnicity)
summary(m2)

m3 = aov(Data2$Count ~ Data2$Sex)
summary(m3)

m4 = aov(Data2$Count ~ Data2$Cause.of.Death)
summary(m4)


m5 = aov(Data2$Count ~ Data2$Year*Data2$Ethnicity)
summary(m5)
m6 = aov(Data2$Count ~ Data2$Year*Data2$Sex)
summary(m6)
m7 = aov(Data2$Count ~ Data2$Year*Data2$Cause.of.Death)
summary(m7)

m8 = aov(Data2$Count ~ Data2$Ethnicity*Data2$Sex)
summary(m8)
m9 = aov(Data2$Count ~ Data2$Ethnicity*Data2$Cause.of.Death)
summary(m9)

m10 = aov(Data2$Count ~ Data2$Sex*Data2$Cause.of.Death)
summary(m10)