1. Setting

This project will be using the same dataset as was used in the fractional factorial project. To recap, this is the Housing dataset from the Ecdat package. Housing looks at the sales prices of houses in Windsor. It is a dataframe consiting of a cross-section from 1987 with 546 observations. The factors we will be focusing on can be seen below:

Two-Level Independent Variable : driveway - Does the house has a driveway ? Two-Level Independent Variable : fullbase - Does the house has a full finished basement ? Three-Level Independent Variable: stories - Stories number of stories excluding basement Three-Level Independent Variable: bathrms - Number of full bathrooms

library("Ecdat")
## Loading required package: Ecfun
## 
## Attaching package: 'Ecfun'
## The following object is masked from 'package:base':
## 
##     sign
## 
## Attaching package: 'Ecdat'
## The following object is masked from 'package:datasets':
## 
##     Orange
data("Housing")

Below is the first 6 rows of data:

head(Housing)
##   price lotsize bedrooms bathrms stories driveway recroom fullbase gashw
## 1 42000    5850        3       1       2      yes      no      yes    no
## 2 38500    4000        2       1       1      yes      no       no    no
## 3 49500    3060        3       1       1      yes      no       no    no
## 4 60500    6650        3       1       2      yes     yes       no    no
## 5 61000    6360        2       1       1      yes      no       no    no
## 6 66000    4160        3       1       1      yes     yes      yes    no
##   airco garagepl prefarea
## 1    no        1       no
## 2    no        0       no
## 3    no        0       no
## 4    no        0       no
## 5    no        0       no
## 6   yes        0       no

Below is the summary of the Unemployment dataframe:

summary(Housing)
##      price           lotsize         bedrooms        bathrms     
##  Min.   : 25000   Min.   : 1650   Min.   :1.000   Min.   :1.000  
##  1st Qu.: 49125   1st Qu.: 3600   1st Qu.:2.000   1st Qu.:1.000  
##  Median : 62000   Median : 4600   Median :3.000   Median :1.000  
##  Mean   : 68122   Mean   : 5150   Mean   :2.965   Mean   :1.286  
##  3rd Qu.: 82000   3rd Qu.: 6360   3rd Qu.:3.000   3rd Qu.:2.000  
##  Max.   :190000   Max.   :16200   Max.   :6.000   Max.   :4.000  
##     stories      driveway  recroom   fullbase  gashw     airco    
##  Min.   :1.000   no : 77   no :449   no :355   no :521   no :373  
##  1st Qu.:1.000   yes:469   yes: 97   yes:191   yes: 25   yes:173  
##  Median :2.000                                                    
##  Mean   :1.808                                                    
##  3rd Qu.:2.000                                                    
##  Max.   :4.000                                                    
##     garagepl      prefarea 
##  Min.   :0.0000   no :418  
##  1st Qu.:0.0000   yes:128  
##  Median :0.0000            
##  Mean   :0.6923            
##  3rd Qu.:1.0000            
##  Max.   :3.0000

Factors and Levels

For this project we needed 2 two-level factors and 2 three-factor factors. As you can see below, the two “three-level” factors that I chose were originally four-level factors.

In order to get a dataframe that would fulfill the requirements of this project, I created two subsets. First, I removed all the observations with 4 stories and then all the observations with 4 bathrooms, to finally end up with the dataframe I will use: Housing3.

Housing2 = subset(Housing, stories != "4")
Housing3 = subset(Housing2, bathrms != "4")

Now when we set these variables to factors and check the levels we should be able to see the updated amount of levels. During this step for some reason R keeps the amount of levels the same as before so it is important to use the “droplevels” function.

Housing3$bathrms <- as.factor(Housing3$bathrms)
Housing3$stories <- as.factor(Housing3$stories)
levels(droplevels(Housing3$bathrms))
## [1] "1" "2" "3"
levels(droplevels(Housing3$stories))
## [1] "1" "2" "3"

Conitinuos Variables

There are two continuos variables in this dataframe; the sale price of the house in dollars, price, and the lot size of the property in square feet, lotsize.

Response Variable

The response variable for this dataframe is the sale price of the house measured in dollars, price.

The dataframe is now going to be altered again so that it is condensed into only response variable price, and the independent variables bathrms, stories, driveway, fullbase.

Housing3$lotsize <- NULL
Housing3$bedrooms <- NULL
Housing3$recroom <- NULL
Housing3$gashw <- NULL
Housing3$airco <- NULL
Housing3$garagepl <- NULL
Housing3$prefarea <- NULL
head(droplevels(Housing3))
##   price bathrms stories driveway fullbase
## 1 42000       1       2      yes      yes
## 2 38500       1       1      yes       no
## 3 49500       1       1      yes       no
## 4 60500       1       2      yes       no
## 5 61000       1       1      yes       no
## 6 66000       1       1      yes      yes
summary(droplevels(Housing3))
##      price        bathrms stories driveway  fullbase 
##  Min.   : 25000   1:390   1:227   no : 77   no :317  
##  1st Qu.: 48000   2:106   2:238   yes:428   yes:188  
##  Median : 60000   3:  9   3: 40                      
##  Mean   : 65292                                      
##  3rd Qu.: 77500                                      
##  Max.   :190000
str(droplevels(Housing3))
## 'data.frame':    505 obs. of  5 variables:
##  $ price   : num  42000 38500 49500 60500 61000 66000 66000 69000 83800 90000 ...
##  $ bathrms : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 2 1 1 2 ...
##  $ stories : Factor w/ 3 levels "1","2","3": 2 1 1 2 1 1 2 3 1 1 ...
##  $ driveway: Factor w/ 2 levels "no","yes": 2 2 2 2 2 2 2 2 2 2 ...
##  $ fullbase: Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 2 1 2 2 ...

In order to use these factors for a factorial design, we needed to define the factors with levels of high and low. For the 2 three-level factors they are already set this way, but the 2 two-level factors need to be set below to have their levels altered so that yes = 1 and no = 0.

# Find length of frame
l = nrow(Housing3) 

# Create four new frames for writing numbers instead of factors 
drivewaynum = data.frame(l)
fullbasenum = data.frame(l)
storiesnum = data.frame(l)
bathrmsnum = data.frame(l)

## Replacement loop
for (i in 1:l){
  
  # driveway replace yes with 1 and no with 0
  if (Housing3$driveway[i] == "yes"){
    drivewaynum[i,1] <- 1
  }  else {
    drivewaynum[i,1] <- 0
  }
  
  # fullbase replace yes with 1 and no with 0
  if (Housing3$fullbase[i] == "yes"){
    fullbasenum[i,1] <- 1
  }  else {
    fullbasenum[i,1] <- 0
  }
  
  #stories 1 = 0, 2 = 2, 3 = 2
  if (Housing3$stories[i] == "1"){
    storiesnum[i,1] <- 0
  }
  
  if (Housing3$stories[i] == "2") {
    storiesnum[i,1] <- 1
  } 
  if (Housing3$stories[i] == "3") {
    storiesnum[i,1] <- 2
  }

  #bathrms 1 = 0, 2 = 1, 3 = 2
  if (Housing3$bathrms[i] == "1"){
    bathrmsnum[i,1] <- 0
  }
  
  if (Housing3$bathrms[i] == "2") {
    bathrmsnum[i,1] <- 1
  } 
  if (Housing3$bathrms[i] == "3") {
    bathrmsnum[i,1] <- 2
  }
}

# Create numbered data frame out of column vectors and response variable 
Housing4 <- cbind(drivewaynum,fullbasenum,storiesnum,bathrmsnum,Housing3$price)

# Title columns of data frame appropriately
colnames(Housing4) <- c("driveway","full","stories","bathrooms","price")
head(Housing4)
##   driveway full stories bathrooms price
## 1        1    1       1         0 42000
## 2        1    0       0         0 38500
## 3        1    0       0         0 49500
## 4        1    0       1         0 60500
## 5        1    0       0         0 61000
## 6        1    1       0         0 66000
str(Housing4)
## 'data.frame':    505 obs. of  5 variables:
##  $ driveway : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ full     : num  1 0 0 0 0 1 1 0 1 1 ...
##  $ stories  : num  1 0 0 1 0 0 1 2 0 0 ...
##  $ bathrooms: num  0 0 0 0 0 0 1 0 0 1 ...
##  $ price    : num  42000 38500 49500 60500 61000 66000 66000 69000 83800 90000 ...
summary(Housing4)
##     driveway           full           stories         bathrooms     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.8475   Mean   :0.3723   Mean   :0.6297   Mean   :0.2455  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :2.0000   Max.   :2.0000  
##      price       
##  Min.   : 25000  
##  1st Qu.: 48000  
##  Median : 60000  
##  Mean   : 65292  
##  3rd Qu.: 77500  
##  Max.   :190000

2. Experimental Design

Design

We will use a linear model in order to understand whether the variation in driveway, full, stories, and bathrooms effects the variation in the price of a house.
Null Hypothesis: No factor with have an effect on the price of a house.
Alternate Hypothesis: At least one factor will have an effect on the price of a house.
We will use a Response Surface Method in order to estimate residuals and coefficients. We will use an ANOA to analyze main effects of driveway, full, stories, and bathrooms.

Rational

Response Surface Methods are used for the goal of optimization, or to find the factor level combinations that will give us the best result. In this experiment we are looking to find what combination of driveway, full, stories, and bathrooms will give us the maximum price of a house.

Randomization Scheme

There was not a randomization scheme used in this recipe

Replicates and Repeated Measures

There were not replicates or repeated measures in this data set.

Blocking

Originally the housing data set had 12 factors

3. Statistical Analysis

Exploratory Data Analysis) Graphics and Descriptive Summary

Below you can see the summary of our data again, as well as boxplots of the response variable, price against the four IVs:

summary(Housing4)
##     driveway           full           stories         bathrooms     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:1.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.0000   Median :1.0000   Median :0.0000  
##  Mean   :0.8475   Mean   :0.3723   Mean   :0.6297   Mean   :0.2455  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :2.0000   Max.   :2.0000  
##      price       
##  Min.   : 25000  
##  1st Qu.: 48000  
##  Median : 60000  
##  Mean   : 65292  
##  3rd Qu.: 77500  
##  Max.   :190000

Below are boxplots to show the difference between levels of each factors on price.

boxplot(Housing4$price ~ Housing4$full + Housing4$driveway + Housing4$stories + Housing4$bathrooms, xlab= "fullbasement + driveway + stories + bathrooms",ylab="price of house", main="Histogram of Price and four IVs")

Below are some boxplots of the response variable against the four IV:

boxplot(Housing4$price~Housing4$driveway, main="Affect of Driveway on House Price", ylab="House Price", xlab="1= Driveway, 0 = No Driveway")

There is a noticable difference above in the median house price between factor levels.

boxplot(Housing4$price~Housing4$full, main="Affect of Full Basement on House Price", ylab="House Price", xlab="1= Full Basement, 0 = No Full Basement")

There is a noticible increase in the median price of a house with a full basement verses a house without a full basement.

boxplot(Housing4$price~Housing4$stories, main="Affect of Number of Stories on House Price", ylab="House Price", xlab="0 = 1 Story, 1 = Two Stories 2 = Three Stories")

The difference in prices between one and two stories is slight, but noticiable. The increase in house price when the numbers of stories increases to three stories is quite large.

boxplot(Housing4$price~Housing4$bathrooms, main="Affect of Number of Bathrooms on House Price", ylab="House Price", xlab="0 = 1 Bathrooms, 1 = Two Bathrooms 2 = Three Bathrooms")

It is clear to see the the median House Price increases as the amount of bathrooms increases.

Testing

library("rsm")
house.rsm = rsm(price ~ SO(driveway,full, stories, bathrooms), data=Housing4)
## Warning in rsm(price ~ SO(driveway, full, stories, bathrooms), data = Housing4): Some coefficients are aliased - cannot use 'rsm' methods.
##   Returning an 'lm' object.
summary(house.rsm)
## 
## Call:
## rsm(formula = price ~ SO(driveway, full, stories, bathrooms), 
##     data = Housing4)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -52871 -11403  -1741   8652  91259 
## 
## Coefficients: (2 not defined because of singularities)
##                                                            Estimate
## (Intercept)                                                40696.64
## FO(driveway, full, stories, bathrooms)driveway             12706.62
## FO(driveway, full, stories, bathrooms)full                 15225.42
## FO(driveway, full, stories, bathrooms)stories                533.42
## FO(driveway, full, stories, bathrooms)bathrooms             7189.88
## TWI(driveway, full, stories, bathrooms)driveway:full        1618.92
## TWI(driveway, full, stories, bathrooms)driveway:stories     -878.32
## TWI(driveway, full, stories, bathrooms)driveway:bathrooms  18140.16
## TWI(driveway, full, stories, bathrooms)full:stories        -2453.94
## TWI(driveway, full, stories, bathrooms)full:bathrooms     -10517.06
## TWI(driveway, full, stories, bathrooms)stories:bathrooms    3263.53
## PQ(driveway, full, stories, bathrooms)driveway^2                 NA
## PQ(driveway, full, stories, bathrooms)full^2                     NA
## PQ(driveway, full, stories, bathrooms)stories^2             4292.57
## PQ(driveway, full, stories, bathrooms)bathrooms^2             53.17
##                                                           Std. Error
## (Intercept)                                                  3461.62
## FO(driveway, full, stories, bathrooms)driveway               3799.19
## FO(driveway, full, stories, bathrooms)full                   5847.83
## FO(driveway, full, stories, bathrooms)stories                5518.50
## FO(driveway, full, stories, bathrooms)bathrooms              8634.27
## TWI(driveway, full, stories, bathrooms)driveway:full         5679.11
## TWI(driveway, full, stories, bathrooms)driveway:stories      4670.09
## TWI(driveway, full, stories, bathrooms)driveway:bathrooms    5591.54
## TWI(driveway, full, stories, bathrooms)full:stories          3525.06
## TWI(driveway, full, stories, bathrooms)full:bathrooms        4444.10
## TWI(driveway, full, stories, bathrooms)stories:bathrooms     3961.84
## PQ(driveway, full, stories, bathrooms)driveway^2                  NA
## PQ(driveway, full, stories, bathrooms)full^2                      NA
## PQ(driveway, full, stories, bathrooms)stories^2              2362.55
## PQ(driveway, full, stories, bathrooms)bathrooms^2            3999.28
##                                                           t value Pr(>|t|)
## (Intercept)                                                11.757  < 2e-16
## FO(driveway, full, stories, bathrooms)driveway              3.345 0.000887
## FO(driveway, full, stories, bathrooms)full                  2.604 0.009504
## FO(driveway, full, stories, bathrooms)stories               0.097 0.923036
## FO(driveway, full, stories, bathrooms)bathrooms             0.833 0.405410
## TWI(driveway, full, stories, bathrooms)driveway:full        0.285 0.775714
## TWI(driveway, full, stories, bathrooms)driveway:stories    -0.188 0.850896
## TWI(driveway, full, stories, bathrooms)driveway:bathrooms   3.244 0.001258
## TWI(driveway, full, stories, bathrooms)full:stories        -0.696 0.486669
## TWI(driveway, full, stories, bathrooms)full:bathrooms      -2.367 0.018342
## TWI(driveway, full, stories, bathrooms)stories:bathrooms    0.824 0.410485
## PQ(driveway, full, stories, bathrooms)driveway^2               NA       NA
## PQ(driveway, full, stories, bathrooms)full^2                   NA       NA
## PQ(driveway, full, stories, bathrooms)stories^2             1.817 0.069836
## PQ(driveway, full, stories, bathrooms)bathrooms^2           0.013 0.989399
##                                                              
## (Intercept)                                               ***
## FO(driveway, full, stories, bathrooms)driveway            ***
## FO(driveway, full, stories, bathrooms)full                ** 
## FO(driveway, full, stories, bathrooms)stories                
## FO(driveway, full, stories, bathrooms)bathrooms              
## TWI(driveway, full, stories, bathrooms)driveway:full         
## TWI(driveway, full, stories, bathrooms)driveway:stories      
## TWI(driveway, full, stories, bathrooms)driveway:bathrooms ** 
## TWI(driveway, full, stories, bathrooms)full:stories          
## TWI(driveway, full, stories, bathrooms)full:bathrooms     *  
## TWI(driveway, full, stories, bathrooms)stories:bathrooms     
## PQ(driveway, full, stories, bathrooms)driveway^2             
## PQ(driveway, full, stories, bathrooms)full^2                 
## PQ(driveway, full, stories, bathrooms)stories^2           .  
## PQ(driveway, full, stories, bathrooms)bathrooms^2            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19990 on 492 degrees of freedom
## Multiple R-squared:  0.3786, Adjusted R-squared:  0.3635 
## F-statistic: 24.99 on 12 and 492 DF,  p-value: < 2.2e-16

Based on the values from the RSM model, we can reject the null hypothesis for first-order driveway, first-order full, and two-way interaction between driveway and bathrooms.
These contour plots ahow the price in response to two variables per plot:

#Contour Plots
par(mfrow=c(2,3))
contour(house.rsm, ~driveway + full + stories + bathrooms, image=TRUE, at=summary(house.rsm$canonical$xs))
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading

These are perspective plots of the interactions:

#Perspective Plots
par(mfrow=c(1,1))

persp(house.rsm, ~ driveway + full, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter

persp(house.rsm, ~ driveway + stories, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter

persp(house.rsm, ~ driveway + bathrooms, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter

persp(house.rsm, ~ full + stories, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter

persp(house.rsm, ~ full + bathrooms, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter

persp(house.rsm, ~ stories + bathrooms, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)
## Warning in predict.lm(lmobj, newdata = newdata): prediction from a rank-
## deficient fit may be misleading
## Warning in persp.default(dat$x, dat$y, dat$z, zlim = dat$zlim, theta =
## theta, : "image" is not a graphical parameter
## Warning in persp.default(dat$x, dat$y, dat$z, xlab = dat$labs[1], ylab =
## dat$labs[2], : "image" is not a graphical parameter
## Warning in title(sub = dat$labs[5], ...): "image" is not a graphical
## parameter

Model Adequecy Checking

Shapiro-Wilk Test

We will use the Shapiro-Wilk test to test for normalcy. For this test the null-hypothesis is that the population is normally distributed. Therefore, if the p-value we get is less that alpha = .05, we will reject the null hypothesis and say that there is evidence that the data is not normally distributed.

#Normalcy Test
shapiro.test(Housing4$price)
## 
##  Shapiro-Wilk normality test
## 
## data:  Housing4$price
## W = 0.90553, p-value < 2.2e-16
shapiro.test(1/(Housing4$price))
## 
##  Shapiro-Wilk normality test
## 
## data:  1/(Housing4$price)
## W = 0.95496, p-value = 2.724e-11
shapiro.test((Housing4$price)^2)
## 
##  Shapiro-Wilk normality test
## 
## data:  (Housing4$price)^2
## W = 0.72113, p-value < 2.2e-16
shapiro.test(sqrt(Housing4$price))
## 
##  Shapiro-Wilk normality test
## 
## data:  sqrt(Housing4$price)
## W = 0.96574, p-value = 1.816e-09
shapiro.test(log(Housing4$price))
## 
##  Shapiro-Wilk normality test
## 
## data:  log(Housing4$price)
## W = 0.99474, p-value = 0.08166
shapiro.test(log10(Housing4$price))
## 
##  Shapiro-Wilk normality test
## 
## data:  log10(Housing4$price)
## W = 0.99474, p-value = 0.08166

The results from these tests are that we fail to reject the null hypothesis for the log and log10 transformations. We cannot say that these two transformation will not result is a normal distribution.

We can also use a Q-Q plot to check for normalcy:

#Model Adequecy
qqnorm(residuals(house.rsm), main="Normal Q-Q Plot", ylab="Price")
qqline(residuals(house.rsm))

plot(fitted(house.rsm),residuals(house.rsm))

As you can see, the Q-Q plot is not perfect and the residuals plot has a lot of linearity and is not random or sporadic.

My Conclusions

Due to the RSM analysis and the plots above, I fail to reject the null hypothesis. This data is not normally distributed and since that is one of the assumptions when completing at RSM analysis I cannot reject the null hypothesis.

4. R Code

#Read in unemployment data
library("Ecdat")
data(Housing)

#Summary of data
head(Housing)
summary(Housing)

#Make subsets to delete all rows with 4 stories or 4 bathrooms 
Housing2 = subset(Housing, stories != "4")
Housing3 = subset(Housing2, bathrms != "4")

#Make bathrms and stories factors 
Housing3$bathrms <- as.factor(Housing3$bathrms)
Housing3$stories <- as.factor(Housing3$stories)

#Drop old levels 
levels(droplevels(Housing3$bathrms))
levels(droplevels(Housing3$stories))

#condense to have only 4 IV and the respnse 
Housing3$lotsize <- NULL
Housing3$bedrooms <- NULL
Housing3$recroom <- NULL
Housing3$gashw <- NULL
Housing3$airco <- NULL
Housing3$garagepl <- NULL
Housing3$prefarea <- NULL

#Look at newly condensed dataframe
head(droplevels(Housing3))
summary(droplevels(Housing3))
str(droplevels(Housing3))

# Find length of frame
l = nrow(Housing3) 

# Create four new frames for writing numbers instead of factors 
drivewaynum = data.frame(l)
fullbasenum = data.frame(l)
storiesnum = data.frame(l)
bathrmsnum = data.frame(l)

## Replacement loop
for (i in 1:l){
  
  # driveway replace yes with 1 and no with 0
  if (Housing3$driveway[i] == "yes"){
    drivewaynum[i,1] <- 1
  }  else {
    drivewaynum[i,1] <- 0
  }
  
  # fullbase replace yes with 1 and no with 0
  if (Housing3$fullbase[i] == "yes"){
    fullbasenum[i,1] <- 1
  }  else {
    fullbasenum[i,1] <- 0
  }
  
  #stories 1 = 0, 2 = 1, 3 = 2
  if (Housing3$stories[i] == "1"){
    storiesnum[i,1] <- 0
  }
  
  if (Housing3$stories[i] == "2") {
    storiesnum[i,1] <- 1
  } 
  if (Housing3$stories[i] == "3") {
    storiesnum[i,1] <- 2
  }
  
  #bathrms 1 = 0, 2 = 1, 3 = 2
  if (Housing3$bathrms[i] == "1"){
    bathrmsnum[i,1] <- 0
  }
  
  if (Housing3$bathrms[i] == "2") {
    bathrmsnum[i,1] <- 1
  } 
  if (Housing3$bathrms[i] == "3") {
    bathrmsnum[i,1] <- 2
  }
}



# Create numbered data frame out of column vectors and response variable 
Housing4 <- cbind(drivewaynum,fullbasenum,storiesnum,bathrmsnum,Housing3$price)

# Title columns of data frame appropriately
colnames(Housing4) <- c("driveway","full","stories","bathrooms","price")
head(Housing4)

#Summary of new dataframe
head(Housing4)
str(Housing4)
summary(Housing4)

#Boxplot of price and four IVs
boxplot(Housing4$price ~ Housing4$full + Housing4$driveway +
          Housing4$stories + Housing4$bathrooms, xlab= "fullbasement + 
        driveway + stories + bathrooms",ylab="price of house", 
        main="Histogram of Price and four IVs")

#Install FrF2
install.packages("FrF2")
library("FrF2")

#64 Full Factorial Runs
f = FrF2(64,6)
print(f)

#FrF2
b <- FrF2(8,6,res3 = T) 
print(b) 
#Alias structure
aliasprint(b) 


#Boxplots of Four IVS and response price
boxplot(Housing4$price~Housing4$driveway, 
        main="Affect of Driveway on House Price", 
        ylab="House Price", xlab="1= Driveway, 0 = No Driveway")
boxplot(Housing4$price~Housing4$full, 
        main="Affect of Full Basement on House Price", 
        ylab="House Price", xlab="1= Full Basement, 0 = No Full Basement")
boxplot(Housing4$price~Housing4$stories, 
        main="Affect of Number of Stories on House Price", 
        ylab="House Price", xlab="0 = 1 Story, 1 = Two Stories 2 = Three Stories")
boxplot(Housing4$price~Housing4$bathrooms, 
        main="Affect of Number of Bathrooms on House Price", 
        ylab="House Price", xlab="0 = 1 Bathrooms, 1 = Two Bathrooms 2 = Three Bathrooms")


#Interaction Effects and Main Effects Model
model1 = lm(Housing4$price ~ (Housing4$driveway + Housing4$full + Housing4$stories + Housing4$bathrooms)^2)
anova(model1)

#Main Effects Model
model2 = lm(Housing4$price ~ (Housing4$driveway + Housing4$full + Housing4$stories + Housing4$bathrooms))
anova(model2)

# Model Adequacy Checking
qqnorm(residuals(model2), main = "Normal Q-Q Plot")
qqline(residuals(model2))
plot(fitted(model2),residuals(model2))

str(Housing4)
#RSM
install.packages("rsm")
library("rsm")
house.rsm = rsm(price ~ SO(driveway,full, stories, bathrooms), data=Housing4)
summary(house.rsm)

#Contour Plots
par(mfrow=c(2,3))
contour(house.rsm, ~driveway + full + stories + bathrooms, image=TRUE, at=summary(house.rsm$canonical$xs))

#Perspective Plots
par(mfrow=c(1,1))

persp(house.rsm, ~ driveway + full, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)

persp(house.rsm, ~ driveway + stories, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)

persp(house.rsm, ~ driveway + bathrooms, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)

persp(house.rsm, ~ full + stories, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)

persp(house.rsm, ~ full + bathrooms, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)

persp(house.rsm, ~ stories + bathrooms, image = TRUE,at = c(summary(house.rsm)$canonical$xs, Block="B2"),contour="colors",zlab="Price in Dollars",theta=30)

#Steepest
steepest(house.rsm)
canonical.path(house.rsm, dist = seq(-5, 5, by = 0.5))

#Normalcy Test
shapiro.test(Housing4$price)
shapiro.test(1/(Housing4$price))
shapiro.test((Housing4$price)^2)
shapiro.test(sqrt(Housing4$price))
shapiro.test(log(Housing4$price))
shapiro.test(log10(Housing4$price))

#Model Adequecy
qqnorm(residuals(house.rsm), main="Normal Q-Q Plot", ylab="Price")
qqline(residuals(house.rsm))

plot(fitted(house.rsm),residuals(house.rsm))