DOE Project1

Project - Part 3

Benjamin Byeon

ISYE 4330

Rensselaer Polytechnic Institute - Troy, NY

V1.10.8.16

1. Settings

Shown below is the read.csv to load the data into R. I store the data, temporarily, as ‘data’. From the 100+ interesting data set, I selected a small data that contained N=30 observations. The data looks into the different factors that affect the air quality in the Californian Metropolitan Area.

The data was copied and pasted into Excel and saved as a .cs

#The model analyzes what affects the air quality in the California Metropolitan Area.
data = read.csv("airquality.csv", header = TRUE, sep=",")
data
##    airq    vala  rain coas     dens
## 1   104  2734.4 12.63  yes  1815.86
## 2    85  2479.2 47.14  yes   804.86
## 3   127  4845.0 42.77  yes  1907.86
## 4   145 19733.8 33.18   no  1876.08
## 5    84  4093.6 34.55  yes   340.93
## 6   135  1849.8 14.81   no   335.52
## 7    88  4179.4 45.94  yes   315.78
## 8   118  2525.3 39.25   no   360.39
## 9    74  1899.2 42.36  yes 12957.50
## 10  104 15257.1 12.63  yes  1728.19
## 11   64  1219.0 59.76  yes   620.96
## 12   75   992.9 53.90  yes   529.62
## 13  131 15120.8 42.37  yes  5397.47
## 14  129  9189.9 42.48  yes  1356.04
## 15   84  1596.9 37.18  yes   276.44
## 16  165  4157.3 36.14  yes   787.47
## 17   80  1185.2 12.63  yes   318.63
## 18   59  3817.7 18.69  yes  1255.04
## 19  110  1686.2 35.35   no   750.28
## 20  120  1322.0 35.08  yes   325.36
## 21  118  3476.2 43.05  yes   916.78
## 22  120  1123.8 68.13  yes   271.59
## 23  120  1151.6 35.35   no   645.83
## 24   59  2896.3 18.69  yes   819.23
## 25   74  5608.6 42.36  yes  2649.07
## 26  124  3700.0 29.51   no  9642.86
## 27   69  1395.5 42.92  yes  1105.55
## 28  118  3022.8 41.32   no   910.79
## 29  129  1515.4 31.22   no   379.58
## 30  129  1878.9 30.95   no   455.92

System under Test

The data set helps determine which factors affect the air quality in the California region. There are 5 variables and a total of N=30 observations.

Below are the definitions of the variables:

aiq: The indication of the air quality. A lower value is preferred.

vala: The value added of companies. The units are in USD thousands.

rain: The amount of rain fall, measured in inches.

coas: Whether if the region is located in the coastal area or not.

dens: The population density per square mile.

A sixth variable was omitted because of my preference. This sixth variable measured the median income.

#Display the descriptive statistics of "data_raw".
summary(data)
##       airq            vala              rain        coas   
##  Min.   : 59.0   Min.   :  992.9   Min.   :12.63   no : 9  
##  1st Qu.: 81.0   1st Qu.: 1535.8   1st Qu.:31.02   yes:21  
##  Median :114.0   Median : 2629.8   Median :36.66           
##  Mean   :104.7   Mean   : 4188.5   Mean   :36.08           
##  3rd Qu.:126.2   3rd Qu.: 4141.4   3rd Qu.:42.70           
##  Max.   :165.0   Max.   :19733.8   Max.   :68.13           
##       dens        
##  Min.   :  271.6  
##  1st Qu.:  365.2  
##  Median :  796.2  
##  Mean   : 1728.6  
##  3rd Qu.: 1635.2  
##  Max.   :12957.5
#Display the names found.
names(data)
## [1] "airq" "vala" "rain" "coas" "dens"
head(data)
##   airq    vala  rain coas    dens
## 1  104  2734.4 12.63  yes 1815.86
## 2   85  2479.2 47.14  yes  804.86
## 3  127  4845.0 42.77  yes 1907.86
## 4  145 19733.8 33.18   no 1876.08
## 5   84  4093.6 34.55  yes  340.93
## 6  135  1849.8 14.81   no  335.52

Factors and Levels

The data set contains 4 factors and several levels.

#Display the structure. 
str(data)
## 'data.frame':    30 obs. of  5 variables:
##  $ airq: int  104 85 127 145 84 135 88 118 74 104 ...
##  $ vala: num  2734 2479 4845 19734 4094 ...
##  $ rain: num  12.6 47.1 42.8 33.2 34.5 ...
##  $ coas: Factor w/ 2 levels "no","yes": 2 2 2 1 2 1 2 1 2 2 ...
##  $ dens: num  1816 805 1908 1876 341 ...

The issue with the data was that not all of the factors were ‘factor’. Some were integers and NUM. Therefore, I converted the non-factors into a factor using as.factor

#data$airq = as.factor(data$airq)
#data$airq
 #[1] 104 85  127 145 84  135 88  118 74  104 64  75  131 129 84  165 80  59  #110 120
#[21] 118 120 120 59  74  124 69  118 129 129
#20 Levels: 59 64 69 74 75 80 84 85 88 104 110 118 120 124 127 129 131 135 ... #165

#data$vala = as.factor(data$vala)
#data$vala

# [1] 2734.4  2479.2  4845    19733.8 4093.6  1849.8  4179.4  2525.3  1899.2  #15257.1
#[11] 1219    992.9   15120.8 9189.9  1596.9  4157.3  1185.2  3817.7  1686.2  #1322   
#[21] 3476.2  1123.8  1151.6  2896.3  5608.6  3700    1395.5  3022.8  1515.4  #1878.9 
#30 Levels: 992.9 1123.8 1151.6 1185.2 1219 1322 1395.5 1515.4 1596.9 ... 19733.8

#data$rain = as.factor(data$rain)
#data$rain

#[1] 12.63 47.14 42.77 33.18 34.55 14.81 45.94 39.25 42.36 12.63 59.76 53.9  #42.37
#[14] 42.48 37.18 36.14 12.63 18.69 35.35 35.08 43.05 68.13 35.35 18.69 42.36 #29.51
#[27] 42.92 41.32 31.22 30.95
#25 Levels: 12.63 14.81 18.69 29.51 30.95 31.22 33.18 34.55 35.08 35.35 ... 68.13

#data$dens = as.factor(data$dens)
#data$dens
# [1] 1815.86 804.86  1907.86 1876.08 340.93  335.52  315.78  360.39  12957.5 #1728.19
#[11] 620.96  529.62  5397.47 1356.04 276.44  787.47  318.63  1255.04 750.28  325.36 
#[21] 916.78  271.59  645.83  819.23  2649.07 9642.86 1105.55 910.79  379.58  #30 Levels: 271.59 276.44 315.78 318.63 325.36 335.52 340.93 360.39 379.58 ... 12957.5

Then, I checked for the levels of each factor.

#Check the levels of each 4 factors 
levels(data$vala)
## NULL
levels(data$rain)
## NULL
levels(data$coas)
## [1] "no"  "yes"
levels(data$dens)
## NULL

To summarize, there were 4 factors to test on the airq:

vala: 30 levels

rain: 25 levels

coas: 2 levels

dens: 30 levels

Continuous Variables

The data had 4 continuous variables:

airq: the air quality

vala: the value added of companies

rain: the amount of rain fall

dens: the population density

Response Variable(s)

The objective of the experiment was to determine which factors affect the air quality in California. Therefore, the airq variable represents the response variable.

airq: the air quality

The Data: How is it organized and what does it look like?

The data consists of N=30 observations with 5 variables. The year in which it was collected was 1972 in the United States, specifically the Californian Metropolitan Region. The different factors are used to examine the effects on air quality.

Here are the organization and views of the data:

head(data)
##   airq    vala  rain coas    dens
## 1  104  2734.4 12.63  yes 1815.86
## 2   85  2479.2 47.14  yes  804.86
## 3  127  4845.0 42.77  yes 1907.86
## 4  145 19733.8 33.18   no 1876.08
## 5   84  4093.6 34.55  yes  340.93
## 6  135  1849.8 14.81   no  335.52
tail(data)
##    airq   vala  rain coas    dens
## 25   74 5608.6 42.36  yes 2649.07
## 26  124 3700.0 29.51   no 9642.86
## 27   69 1395.5 42.92  yes 1105.55
## 28  118 3022.8 41.32   no  910.79
## 29  129 1515.4 31.22   no  379.58
## 30  129 1878.9 30.95   no  455.92
summary(data)
##       airq            vala              rain        coas   
##  Min.   : 59.0   Min.   :  992.9   Min.   :12.63   no : 9  
##  1st Qu.: 81.0   1st Qu.: 1535.8   1st Qu.:31.02   yes:21  
##  Median :114.0   Median : 2629.8   Median :36.66           
##  Mean   :104.7   Mean   : 4188.5   Mean   :36.08           
##  3rd Qu.:126.2   3rd Qu.: 4141.4   3rd Qu.:42.70           
##  Max.   :165.0   Max.   :19733.8   Max.   :68.13           
##       dens        
##  Min.   :  271.6  
##  1st Qu.:  365.2  
##  Median :  796.2  
##  Mean   : 1728.6  
##  3rd Qu.: 1635.2  
##  Max.   :12957.5

2. (Experimental) Design

How will the experiment be organized and conducted to test the hypothesis?

The experiment will be organized in the following way. I will use 4 factors, each with multiple levels, to test the effects on the response variable (air quality). I will use statistical analysis to study the factors’ effect on air quality and their interaction effects.

The hypothesis to test will be looking at the p-values. I will set the null hypothesis to be that the air quality is affected by the value added of companies, the amount of rainfall, the location whether it is in the coast, and the population density, and not affected by the 2fi (2 factor interaction).

What is the rationale for this design?

The rationale for this design was limited on the basis of the original experimentor’s design. I adapted the data set to test my experiment, but the original factors chosen and procedures for this was created by the experimentor. Therefore, I had less flexibility to design the experiment the way I wanted to because it was already set. I chose the 4 factors based mostly on my preference and my best guess on which of them looked more relevant to the response variable.

Randomize: What is the Randomization Scheme?

From the data set information given, I could only assume that the data would random because it would have been collected at random times of the day and the year.

In terms of my experiment, I randomized the orders of the data:

index = sample(1:nrow(data), 30, replace=FALSE)
caliairq = data[index,]
caliairq
##    airq    vala  rain coas     dens
## 5    84  4093.6 34.55  yes   340.93
## 24   59  2896.3 18.69  yes   819.23
## 20  120  1322.0 35.08  yes   325.36
## 26  124  3700.0 29.51   no  9642.86
## 9    74  1899.2 42.36  yes 12957.50
## 18   59  3817.7 18.69  yes  1255.04
## 12   75   992.9 53.90  yes   529.62
## 11   64  1219.0 59.76  yes   620.96
## 21  118  3476.2 43.05  yes   916.78
## 2    85  2479.2 47.14  yes   804.86
## 8   118  2525.3 39.25   no   360.39
## 23  120  1151.6 35.35   no   645.83
## 19  110  1686.2 35.35   no   750.28
## 28  118  3022.8 41.32   no   910.79
## 22  120  1123.8 68.13  yes   271.59
## 1   104  2734.4 12.63  yes  1815.86
## 17   80  1185.2 12.63  yes   318.63
## 16  165  4157.3 36.14  yes   787.47
## 27   69  1395.5 42.92  yes  1105.55
## 7    88  4179.4 45.94  yes   315.78
## 14  129  9189.9 42.48  yes  1356.04
## 3   127  4845.0 42.77  yes  1907.86
## 30  129  1878.9 30.95   no   455.92
## 10  104 15257.1 12.63  yes  1728.19
## 4   145 19733.8 33.18   no  1876.08
## 29  129  1515.4 31.22   no   379.58
## 13  131 15120.8 42.37  yes  5397.47
## 15   84  1596.9 37.18  yes   276.44
## 25   74  5608.6 42.36  yes  2649.07
## 6   135  1849.8 14.81   no   335.52
#Display the randomized Descriptive Statistics of caliairq
summary(caliairq)
##       airq            vala              rain        coas   
##  Min.   : 59.0   Min.   :  992.9   Min.   :12.63   no : 9  
##  1st Qu.: 81.0   1st Qu.: 1535.8   1st Qu.:31.02   yes:21  
##  Median :114.0   Median : 2629.8   Median :36.66           
##  Mean   :104.7   Mean   : 4188.5   Mean   :36.08           
##  3rd Qu.:126.2   3rd Qu.: 4141.4   3rd Qu.:42.70           
##  Max.   :165.0   Max.   :19733.8   Max.   :68.13           
##       dens        
##  Min.   :  271.6  
##  1st Qu.:  365.2  
##  Median :  796.2  
##  Mean   : 1728.6  
##  3rd Qu.: 1635.2  
##  Max.   :12957.5
#Head and Tail of the data
head(caliairq)
##    airq   vala  rain coas     dens
## 5    84 4093.6 34.55  yes   340.93
## 24   59 2896.3 18.69  yes   819.23
## 20  120 1322.0 35.08  yes   325.36
## 26  124 3700.0 29.51   no  9642.86
## 9    74 1899.2 42.36  yes 12957.50
## 18   59 3817.7 18.69  yes  1255.04
tail(caliairq)
##    airq    vala  rain coas    dens
## 4   145 19733.8 33.18   no 1876.08
## 29  129  1515.4 31.22   no  379.58
## 13  131 15120.8 42.37  yes 5397.47
## 15   84  1596.9 37.18  yes  276.44
## 25   74  5608.6 42.36  yes 2649.07
## 6   135  1849.8 14.81   no  335.52

Replicate: Are there replicates and/or repeated measures?

Due to the simplification of the data set, I cannot verify if there were any replicates and repeated measures. However, it is safe to say that there are none.

Block: Did you use blocking in the design?

There is no indication that blocking was used in the original experiment. Personally, I did not use blocking in my design.

3. (Statistical) Analysis

(Exploratory Data Analysis) Graphics and descriptive summary

To get an idea of the general overview of the data, I plotted a histogram and the summary is shown below using the randomized data:

summary(caliairq)
##       airq            vala              rain        coas   
##  Min.   : 59.0   Min.   :  992.9   Min.   :12.63   no : 9  
##  1st Qu.: 81.0   1st Qu.: 1535.8   1st Qu.:31.02   yes:21  
##  Median :114.0   Median : 2629.8   Median :36.66           
##  Mean   :104.7   Mean   : 4188.5   Mean   :36.08           
##  3rd Qu.:126.2   3rd Qu.: 4141.4   3rd Qu.:42.70           
##  Max.   :165.0   Max.   :19733.8   Max.   :68.13           
##       dens        
##  Min.   :  271.6  
##  1st Qu.:  365.2  
##  Median :  796.2  
##  Mean   : 1728.6  
##  3rd Qu.: 1635.2  
##  Max.   :12957.5

The histogram is fairly evenly distributed.

Main Effects - Boxplot

To analyze the 4 main effects to the airq (response variable), I used box plots to get a visual representation of the factors.

ME based on boxplots: (take the difference in means) I took the difference in the means for each level, using the boxplot.

See the means in the boxplot.

me = -42, 0, 42, -20, 100, 50, -40, 30, -10, -30,40, -10, -40, 0, 10, -60, 10, 10,-100, -40, -50, 0, -40, 40

See the means in the boxplot. me = -40, -80, 60, 0, -20, 70, -40, -10, -40, 0, 40, -50, -5,-5, -70, 40, -40, -5, 10, 10, -60

See the means in the boxplot mean = 120, 80 me= -40

See the means in the boxplot me = -40, 10, -10, -40, 20, -20, -60, 40, 10,0,-50 -10, 60, -10, 50, -80, -20, 60, 0, -70,-10, 40,0 , 50, -30, 10,-10, -40

There are many main effects due to the number of levels each factor has.

According to the boxplot, there is limited information that I could get due to the size of the observation. However, I can analyze the caliairq$coas. From the boxplot, I could tell that ‘no’ had a higher mean than ‘yes’, which meant that not being located in the coastal region may have an effect on the air quality.

Testing

In order to check the statistical significance and analysis, I used ANOVA. I will use this as one of the tools to determine whether to accept or reject my null hypothesis and understand the relationship of variance to randomness.

Main Effects - Computation and ANOVA

To compute the main effects, I would have split the data into subsets over the levels of each factor. Then, the graph of the me would have a line with two points. I would use the slope of lines to get the me.

However, the data was integer heavy and limited my ability to split the data into subsets and compute the me perfectly.

Here is the me of what I could do based on the given data.

If it were not for the data, I would have : ex:

Split into subsets of 2 groups by levels:

yes = subset(data, data$x == “yes”)

no = subset (data, data$x == “no”)

#plot(c(1,2),c(mean(yes$rv), mean(no$rv)),type ='1', xlim=c(0.5,2.5), xaxt='n')
#ME for vala
me1 = plot(mean(caliairq$airq), mean(caliairq$vala))

#ME for rain
me2 = plot(mean(caliairq$airq), mean(caliairq$rain))

#me for coas
#for coastal: we have yes/no
yes = subset(data, data$coas == "yes") #21
no = subset(data, data$coas == "no") #9
me3 = plot(mean(9), mean(21))

#me3 = plot(c(1,2), c(mean(no),mean(yes)), type ='1', xlim=c(0.5,2.5),xaxt="n")

#me for dens
me4 = plot(mean(caliairq$airq), mean(caliairq$dens))

To get the me, I used the slopes. If the data was clean with clear levels, then I would have taken the two points on the slope to get the me.

me1 = 0

me2 = 0

me3 = 0

me4 = 0

Essentially, the slope were all 0, which meant that no me existed. There were no main effects in the experiment. I will accept this because from our boxplot, we see that there are too many levels. The data is not proper and I believe that it is justifiable to say the me is 0 and assume the difference in variation was due to randomness not me.

The main effects were computed by the difference in the averages. Typically, we should have two points and subtract the mean of them. However, the data is heavily integer-based and even after as.factor there are many levels. Therefore, there was a limit to how I could plot the ME and compute it.

Done here was a representation of how I would have done it, if the data was more supportive. Computationally wise, I would take the averages of the factor of interest and subtract the two.

The main effects would look into the averages of each and compute the difference of treatments. However, this study focuses more on the main effects via boxplot and its mean, as shown above.

factor1 = aov(caliairq$airq~caliairq$vala)
anova(factor1)
## Analysis of Variance Table
## 
## Response: caliairq$airq
##               Df  Sum Sq Mean Sq F value  Pr(>F)  
## caliairq$vala  1  2411.3 2411.27  3.3143 0.07938 .
## Residuals     28 20371.0  727.54                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
factor2 = aov(caliairq$airq~caliairq$rain)
anova(factor2)
## Analysis of Variance Table
## 
## Response: caliairq$airq
##               Df  Sum Sq Mean Sq F value Pr(>F)
## caliairq$rain  1    15.7   15.67  0.0193 0.8906
## Residuals     28 22766.6  813.09
factor3 = aov(caliairq$airq~caliairq$coas)
anova(factor3)
## Analysis of Variance Table
## 
## Response: caliairq$airq
##               Df  Sum Sq Mean Sq F value   Pr(>F)   
## caliairq$coas  1  5473.7  5473.7  8.8548 0.005965 **
## Residuals     28 17308.6   618.2                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
factor4 = aov(caliairq$airq~caliairq$dens)
anova(factor4)
## Analysis of Variance Table
## 
## Response: caliairq$airq
##               Df  Sum Sq Mean Sq F value Pr(>F)
## caliairq$dens  1    34.5   34.50  0.0425 0.8382
## Residuals     28 22747.8  812.42

I used ANOVA to analyze the main effects. According to the results of the ANOVA, we can reject the null hypothesis only on the basis of the p-values. From a general statistical point of view, all the p-values > 0.05, except for the coastal region factor. Since the null hypothesis was that all of the factors would affect the air quality, we can still reject the null hypothesis because only coastal region affected the air quality. The alpha value was set by my me since 0.05 is widely used to practice hypothesis tests. The variance in for value of company added, amount of rainfall, and population density did not affect the air quality. However, the p-value does not explain everything because there could be randomness that may have played in the results.

#Effects for all 4 factors
me = aov(airq ~., data=caliairq)
anova(me)
## Analysis of Variance Table
## 
## Response: airq
##           Df  Sum Sq Mean Sq F value   Pr(>F)   
## vala       1  2411.3  2411.3  4.2133 0.050716 . 
## rain       1    11.4    11.4  0.0200 0.888652   
## coas       1  5852.1  5852.1 10.2256 0.003737 **
## dens       1   199.9   199.9  0.3492 0.559846   
## Residuals 25 14307.6   572.3                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

This just quickly looks at the effects when measuring all 4 factors.

Interaction Effects- Computation and ANOVA

Below, I used ANOVA for the interaction effects, or 2fi.

To compute the ie, I would subtract the two lines that should form. In other words, the magnitude of the lines would be subtracted to get the ie. For example: interaction effect = (x,y) of line 1 - (x,y) of line 2.

A biggest problem of the data set was that there was no interaction effect. Therefore, the interaction.plot, as one would soon be able to see below, has nothing.

f12 = aov(caliairq$airq~caliairq$vala*caliairq$rain)
summary(f12)
##                             Df Sum Sq Mean Sq F value Pr(>F)  
## caliairq$vala                1   2411  2411.3   3.227 0.0841 .
## caliairq$rain                1     11    11.4   0.015 0.9024  
## caliairq$vala:caliairq$rain  1    931   931.1   1.246 0.2745  
## Residuals                   26  19428   747.2                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value > 0.05, so there is no interaction effect between valaxrain. Also, the graph justifies the statement well.

valaxrain = 0

f13 = aov(caliairq$airq~caliairq$vala*caliairq$coas)
summary(f13)
##                             Df Sum Sq Mean Sq F value  Pr(>F)   
## caliairq$vala                1   2411    2411   4.319 0.04769 * 
## caliairq$coas                1   5548    5548   9.938 0.00405 **
## caliairq$vala:caliairq$coas  1    309     309   0.553 0.46384   
## Residuals                   26  14514     558                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value > 0.05, so there is no interaction effect between valaxcoas. Also, the graph justifies the statement well.

valaxcoas = 0

f14 = aov(caliairq$airq~caliairq$vala*caliairq$dens)
summary(f14)
##                             Df Sum Sq Mean Sq F value Pr(>F)  
## caliairq$vala                1   2411  2411.3   3.145 0.0879 .
## caliairq$dens                1    191   190.8   0.249 0.6221  
## caliairq$vala:caliairq$dens  1    244   244.2   0.318 0.5774  
## Residuals                   26  19936   766.8                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value > 0.05, so there is no interaction effect between valaxdens. Also, the graph justifies the statement well.

valaxdens = 0

f23 = aov(caliairq$airq~caliairq$rain*caliairq$coas)
summary(f23)
##                             Df Sum Sq Mean Sq F value  Pr(>F)   
## caliairq$rain                1     16      16   0.024 0.87755   
## caliairq$coas                1   5556    5556   8.584 0.00697 **
## caliairq$rain:caliairq$coas  1    381     381   0.589 0.44957   
## Residuals                   26  16829     647                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value > 0.05, so there is no interaction effect between rainxcoas. Also, the graph justifies the statement well.

rainxcoas = 0

##                             Df Sum Sq Mean Sq F value Pr(>F)
## caliairq$rain                1     16    15.7   0.018  0.894
## caliairq$dens                1     34    34.1   0.040  0.844
## caliairq$rain:caliairq$dens  1    413   412.5   0.481  0.494
## Residuals                   26  22320   858.5

The p-value > 0.05, so there is no interaction effect between rainxdens. Also, the graph justifies the statement well.

rainxdens = 0

f34 = aov(caliairq$airq~caliairq$coas*caliairq$dens)
summary(f34)
##                             Df Sum Sq Mean Sq F value  Pr(>F)   
## caliairq$coas                1   5474    5474   8.247 0.00802 **
## caliairq$dens                1     30      30   0.045 0.83303   
## caliairq$coas:caliairq$dens  1     21      21   0.031 0.86065   
## Residuals                   26  17258     664                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value > 0.05, so there is no interaction effect between coasxdens. Also, the graph justifies the statement well. coasxdens = 0

From the graphical visuals, we could see that the factors are independent factors. Due to independent factors, an interaction effect does not exist. If it were to have existed, then I would have comptued the differences in averages, as we did in class and as explained above.

Also, the interaction.plots do not look clean due to the size of the data.

Therefore, there was no statistically significant interaction amongst the factors.

Check for Normality

To check for normality, I used qqplots and qqlines.

The qqline is closely fit to the qqnorm plot. Therefore, the normality check passed.

Summary: Overall, the experiment did not turn out to be neat due to the size of the data. Based on the p-values, we learned that the air quality was not affected as much from values added from companies, rain fall, coastal region, and population density. The coastal region had some effect, but randomness could have been the effect as well. Therefore, none of the factors studied and their interaction effects had an influence on the air qualtiy in the Californian Metropolitan Area. The differences in the variation could have contributed to the air quality. Further studies and more in-depth analysis would be required.

In terms of the model, I would say that this is a mixed effects model, assuming the general population pertains only to the State of California Cities. If not, the model was more close to a fixed effects model because the variables were selected and cannot represent the general population. Therefore, depending on one’s view of the data, it could be a mixed or fixed effects.

For future projects, I should use that is easy to work with.

Overall, the analysis made sense and the factorial design was well executed.

4. References to the Literature

  1. Montgomery, Douglas C.. Design and Analysis of Experiments, 8th Edition, PDF
  2. http://stackoverflow.com/questions/tagged/r

5. Appendices

  1. https://cran.r-project.org/web/packages/Ecdat/Ecdat.pdf
  2. https://vincentarelbundock.github.io/Rdatasets/doc/Ecdat/Airq.html
#Benjamin Byeon
#10/8/16
#Section 1
#The model analyzes what affects the air quality in the California Metropolitan Area.

#Load the data into R.
#data = read.csv("airquality.csv", header = TRUE, sep=",")

#Display the descriptive statistics of "data_raw".
#summary(data)
#head(data)

#Display the names found.
#names(data)

#Display the structure. 
#str(data)

#Change to a int and num to Factor
#data$airq = as.factor(data$airq)
#data$airq

#data$vala = as.factor(data$vala)
#data$vala

#data$rain = as.factor(data$rain)
#data$rain

#data$dens = as.factor(data$dens)
#data$dens

#Check the levels of each 4 factors 
#levels(data$vala)
#levels(data$rain)
#levels(data$coas)
#levels(data$dens)

#Randomize the data
#index = sample(1:nrow(data), 30, replace=FALSE)
#caliairq = data[index,]
#caliairq

#Display the randomized Descriptive Statistics of caliairq
#summary(caliairq)

#Head and Tail of the data
#head(caliairq)
#tail(caliairq)

#Histogram
#hist(caliairq$airq)

#Boxplots
#boxplot(caliairq$airq~caliairq$vala, main = "Comparing Air Quality to Values Added of Companies", xlab="USD", ylab="Air Quality")
#boxplot(caliairq$airq~caliairq$rain, main = "Comparing Air Quality to Rain Fall", xlab="Rain Fall", ylab="Air Quality")
#boxplot(caliairq$airq~caliairq$coas, main = "Comparing Air Quality to Region near Coast", xlab="Yes/No", ylab="Air Quality")
#boxplot(caliairq$airq~caliairq$dens, main = "Comparing Air Quality to Density", xlab="Density", ylab="Air Quality")

#factor1 = aov(caliairq$airq~caliairq$vala)
#anova(factor1)

#factor2 = aov(caliairq$airq~caliairq$rain)
#anova(factor2)

#factor3 = aov(caliairq$airq~caliairq$coas)
#anova(factor3)

#factor4 = aov(caliairq$airq~caliairq$dens)
#anova(factor4)

#Effects for all 4 factors
#me = aov(airq ~., data=caliairq)
#anova(me)

#Interaction Effects
#f12 = aov(caliairq$airq~caliairq$vala*caliairq$rain)
#summary(f12)
#interaction.plot(caliairq$rain,caliairq$vala, caliairq$airq)

#f13 = aov(caliairq$airq~caliairq$vala*caliairq$coas)
#summary(f13)
#interaction.plot(caliairq$coas,caliairq$vala, caliairq$airq)

#f14 = aov(caliairq$airq~caliairq$vala*caliairq$dens)
#summary(f14)
#interaction.plot(caliairq$dens,caliairq$vala, caliairq$airq)

#f23 = aov(caliairq$airq~caliairq$rain*caliairq$coas)
#summary(f23)
#interaction.plot(caliairq$coas,caliairq$rain, caliairq$airq)

#f24 = aov(caliairq$airq~caliairq$rain*caliairq$dens)
#summary(f24)
#interaction.plot(caliairq$dens,caliairq$rain, caliairq$airq)

#f34 = aov(caliairq$airq~caliairq$coas*caliairq$dens)
#summary(f34)
#interaction.plot(caliairq$dens,caliairq$coas, caliairq$airq)

#Check for Normality
#qqnorm(residuals(factor1))
#qqline(residuals(factor1))

#qqnorm(residuals(f23))
#qqline(residuals(f23))