data("faithful") # loads the Old Faithful dataset
str(faithful) # describes the structure of the Old Faithful data
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
summary(faithful) # summarizes each column of the data
## eruptions waiting
## Min. :1.600 Min. :43.0
## 1st Qu.:2.163 1st Qu.:58.0
## Median :4.000 Median :76.0
## Mean :3.488 Mean :70.9
## 3rd Qu.:4.454 3rd Qu.:82.0
## Max. :5.100 Max. :96.0
head(faithful) # gives the first few rows of the data; should have two columns
## eruptions waiting
## 1 3.600 79
## 2 1.800 54
## 3 3.333 74
## 4 2.283 62
## 5 4.533 85
## 6 2.883 55
#Loading necessary packages
library(ggplot2)
library(plyr)
# Potential transformation functions ----
# Each helper takes a numeric vector `x` and returns the transformed values
# element-wise. The reciprocal-style transforms carry a leading minus sign,
# which makes them increasing in `x` for x > 0.

# Cube transform: x^3.
Cube.Tns <- function(x) {
  x^3
}

# Square transform: x^2.
Square.Tns <- function(x) {
  x^2
}

# Identity transform: returns `x` unchanged.
Raw.Tns <- function(x) {
  x
}

# Square-root transform: sqrt(x).
Sqrt.Tns <- function(x) {
  sqrt(x)
}

# Base-10 log transform: log10(x + offset).
# `offset` is added before taking the log so that x = 0 yields a finite value.
# It defaults to 0.00001; pass `offset = 0` for a plain log10.
Log.Tns <- function(x, offset = 0.00001) {
  log10(x + offset)
}

# Negative reciprocal square root: -1 / sqrt(x).
RecipRoot.Tns <- function(x) {
  -1 / sqrt(x)
}

# Negative reciprocal: -1 / x.
Recip.Tns <- function(x) {
  -1 / (x)
}
# Negative inverse-square transform: returns -1 / x^2 element-wise for a
# numeric vector `x`.
InvSquare.Tns <- function(x) {
  -1 / (x^2)
}

# Figure 1: Histogram of waiting time from the Faithful data set.
hist(faithful$waiting)
# Figure 2: Histogram of eruption length from the Faithful data set.
hist(faithful$eruptions)

# Figure 3: Q-Q plot for waiting length.
# qqnorm() draws the normal Q-Q plot; qqline() adds a reference line to it.
qqnorm(faithful$waiting)
qqline(faithful$waiting)

# Figure 4: Q-Q plot for eruption time.
qqnorm(faithful$eruptions)
qqline(faithful$eruptions)
# Neither variable looks normally distributed in the Q-Q plots, so I am going
# to apply a log transformation to see whether that makes them look any
# better. Woo!
# Add log-transformed copies of both variables as new columns of `faithful`
# and re-draw the normal Q-Q plots on the transformed scale.

# Figure 5: Q-Q plot of logarithmic transformation of waiting time.
faithful$waitingLOG <- Log.Tns(faithful$waiting)
qqnorm(faithful$waitingLOG)
qqline(faithful$waitingLOG)

# Figure 6: Q-Q plot of logarithmic transformation of eruption time.
faithful$eruptionsLOG <- Log.Tns(faithful$eruptions)
qqnorm(faithful$eruptionsLOG)
qqline(faithful$eruptionsLOG)
# Is that better? Debatable!
# Linear model with log-transformed waiting time as the response and
# log-transformed eruption time as the predictor.
LinMod <- lm(waitingLOG ~ eruptionsLOG, data = faithful)

# Figures 7-10: Diagnostic plots for linear model of waiting time by eruptions.
# plot() on an lm object draws the standard set of diagnostic plots in turn.
plot(LinMod)
# Figure 11: Final scatter plot with transformed data and a best-fit line.
# NOTE(review): geom_smooth() fits y on x here (eruptionsLOG on waitingLOG),
# which is the reverse direction of LinMod (waitingLOG ~ eruptionsLOG) --
# confirm which direction is intended.
FinalGraph <- ggplot(faithful, aes(x = waitingLOG, y = eruptionsLOG))
FinalGraph +
  geom_point(shape = 1) + # use hollow circles
  geom_smooth(method = lm) + # linear regression line, shading 95% confidence level
  theme_classic() +
  labs(
    title = "Eruption duration by waiting time",
    x = "Logarithmic transformation of waiting time (minutes)",
    y = "Logarithmic transformation of eruption time (minutes)"
  ) # labels axes and gives title