# ANOVA WORKFLOW - The Nuts and Bolts
library(ggplot2)
library(plyr)
# Data Transformation Section ----
# Cube transformation: raises every element of `x` to the third power.
# `x` is a numeric vector; the result has the same length as `x`.
Cube.Tns <- function(x) {
  x^3
}
# Square transformation: raises every element of `x` to the second power.
# `x` is a numeric vector; the result has the same length as `x`.
Square.Tns <- function(x) {
  x^2
}
# Identity transformation: returns `x` unchanged, so the untransformed data
# can be passed through the same interface as the other *.Tns functions.
Raw.Tns <- function(x) {
  x
}
# Square-root transformation: applies sqrt() element-wise to `x`.
# `x` is a numeric vector; the result has the same length as `x`.
Sqrt.Tns <- function(x) {
  sqrt(x)
}
# Base-10 log transformation with a small additive offset.
#
# x:      numeric vector to transform.
# offset: constant added to `x` before taking the log so that zeros do not
#         map to -Inf. Defaults to 0.00001, the value that was previously
#         hard-coded, so existing calls behave exactly as before.
#
# Returns log10(x + offset), a numeric vector the same length as `x`.
Log.Tns <- function(x, offset = 0.00001) {
  log10(x + offset)
}
# Negative reciprocal square-root transformation: -1 / sqrt(x), element-wise.
# The leading minus sign keeps larger positive inputs mapped to larger outputs.
RecipRoot.Tns <- function(x) {
  root <- sqrt(x)
  -1 / root
}
# Negative reciprocal transformation: -1 / x, element-wise.
# The leading minus sign keeps larger positive inputs mapped to larger outputs.
Recip.Tns <- function(x) {
  -1 / x
}
# Negative inverse-square transformation: -1 / x^2, element-wise.
# `x` is a numeric vector; the result has the same length as `x`.
InvSquare.Tns <- function(x) {
  -1 / (x^2)
}

# The data() call was previously fused onto the closing brace of the function
# above, which R cannot parse; it is now its own statement.
data("faithful") # data import
# Preview the first rows of the data set (recorded console output follows
# each call as `##` comments).
head(faithful)
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
# Per-column summary statistics (min, quartiles, mean, max).
summary(faithful)
## eruptions waiting
## Min. :1.600 Min. :43.0
## 1st Qu.:2.163 1st Qu.:58.0
## Median :4.000 Median :76.0
## Mean :3.488 Mean :70.9
## 3rd Qu.:4.454 3rd Qu.:82.0
## Max. :5.100 Max. :96.0
# Structure of the object: a data frame with two numeric columns.
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
# The following histograms provide insight into the normality of the data.
hist(faithful$eruptions) # Eruptions histogram
# Figure: Histogram of Eruption Length and Waiting Times from faithful data set
hist(faithful$waiting) # Waiting Time histogram
# Figure: Histogram of Eruption Length and Waiting Times from faithful data set
# The following two QQ plots give a better assessment of the normality of the
# data.
# Normal QQ plot of eruption length, with the reference line added on top.
# The figure captions below were previously fused onto the qqline() calls
# without a comment marker, which made the lines unparseable.
qqnorm(faithful$eruptions)
qqline(faithful$eruptions)
# Figure: QQ Plot of Eruption Length from the faithful data set

# Normal QQ plot of waiting time, with the reference line added on top.
qqnorm(faithful$waiting)
qqline(faithful$waiting)
# Figure: QQ Plot of Waiting Times from the faithful data set
# The following two boxplots also give an assessment of the data distribution.
boxplot(faithful$eruptions) # Eruptions boxplot
# Figure: Boxplot of Eruption Length from faithful data set
boxplot(faithful$waiting) # Waiting Times boxplot
# Figure: Boxplot of Waiting Times from faithful data set
# Transformations of these data do not improve the normality of the data
# according to the QQ plots. Therefore, the following linear regression model
# will use the untransformed data.
# Fit a simple linear regression of eruption length on waiting time using the
# untransformed data. The columns are named in the formula and the data frame
# is supplied via `data`, so the coefficient is labelled `waiting` and
# predict() can be called with new data.
LRerupt <- lm(eruptions ~ waiting, data = faithful) # Generates object
plot(LRerupt) # assumptions all met
# Figure: Diagnostic plots for the untransformed Eruption data
# Figure: Diagnostic plots for the untransformed Eruption data
# Figure: Diagnostic plots for the untransformed Eruption data
# Figure: Diagnostic plots for the untransformed Eruption data
summary(LRerupt) # significant p-value
##
## Call:
## lm(formula = eruptions ~ waiting, data = faithful)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.29917 -0.37689 0.03508 0.34909 1.19329
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.874016 0.160143 -11.70 <2e-16 ***
## waiting 0.075628 0.002219 34.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4965 on 270 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8108
## F-statistic: 1162 on 1 and 270 DF, p-value: < 2.2e-16
# Scatter plot of the untransformed data with a fitted straight line.
# `faithful` only has the columns `eruptions` and `waiting`; the previous
# mapping to eruptionsLOG / waitingLOG referred to columns that do not exist
# in the data frame and failed when the plot was drawn.
# NOTE(review): with eruptions on x and waiting on y, geom_smooth() fits
# waiting ~ eruptions, the reverse of LRerupt (eruptions ~ waiting). The axes
# are kept as the labels specify; confirm which orientation is wanted.
p <- ggplot(faithful, aes(eruptions, waiting)) +
  geom_point()
p +
  geom_smooth(method = "lm", se = FALSE) +
  labs(x = "Eruptions") +
  labs(y = "Waiting Times")