library(LearnEDAfunctions)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(ggplot2)
ht.wtdata<-read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vT6fX2d7mVNdr-D4ECC1fpmZcq_CFTXfnnRjr5nUHz8_PdSkj8eXfobI7lCj82UAbyBY6uadn12wJmG/pub?output=csv")
head(ht.wtdata)
## Height Weight
## 1 66.0 140
## 2 72.0 145
## 3 73.5 160
## 4 73.0 190
## 5 69.0 155
## 6 73.0 165
ggplot(ht.wtdata, aes(Height, Weight)) +
geom_point() +
labs(title = "Relationship Between Height & Weight", x="Height", y="Weight")
lm_fit<-lm(Weight~Height, data=ht.wtdata)
summary(lm_fit)
##
## Call:
## lm(formula = Weight ~ Height, data = ht.wtdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.499 -11.339 -1.132 8.615 53.134
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -204.7408 29.1597 -7.021 4.02e-10 ***
## Height 5.0918 0.4237 12.016 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.79 on 90 degrees of freedom
## Multiple R-squared: 0.616, Adjusted R-squared: 0.6117
## F-statistic: 144.4 on 1 and 90 DF, p-value: < 2.2e-16
ggplot(ht.wtdata, aes(Height, Weight)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE) +
labs(title="Height & Weight with Least Squares Regression Line")
## `geom_smooth()` using formula = 'y ~ x'
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:LearnEDAfunctions':
##
## farms
## The following object is masked from 'package:dplyr':
##
## select
rlm_fit<-rlm(Weight~Height, data=ht.wtdata)
summary(rlm_fit)
##
## Call: rlm(formula = Weight ~ Height, data = ht.wtdata)
## Residuals:
## Min 1Q Median 3Q Max
## -30.9756 -10.8907 -0.7816 9.0850 53.8304
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -202.3228 28.2656 -7.1579
## Height 5.0485 0.4108 12.2908
##
## Residual standard error: 14.34 on 90 degrees of freedom
ggplot(ht.wtdata, aes(Height, Weight)) +
geom_point() +
geom_smooth(method = "lm", se=FALSE) +
geom_abline(intercept = coef(rlm_fit)[1], slope=coef(rlm_fit)[2], color = "red") +
labs(title="Least Squares & Resistant Fit Lines")
## `geom_smooth()` using formula = 'y ~ x'
ht.wtdata$ls_resid<-resid(lm_fit)
plot(ht.wtdata$Height, ht.wtdata$ls_resid, main="Least Squares Residuals", ylab="Residuals", xlab="Height")
abline(h=0)
ht.wtdata$rlm_resid<-resid(rlm_fit)
plot(ht.wtdata$Height, ht.wtdata$rlm_resid, main="Resistant Residuals", ylab="Residuals", xlab="Height")
abline(h=0)
The least-squares regression line is greater influenced by outliers in a
dataset, and the resistant line is a more consistent fit that better
represents the trend in the data. The lines are very similar in this
case, but there are a few extremes in the weight data, and thus, the
resistant line gives a slightly better representation of the
relationship between the two variables.
head(pop.england)
## YEAR POPULATION
## 1 1801 8.89
## 2 1811 10.16
## 3 1821 12.00
## 4 1831 13.90
## 5 1841 15.91
## 6 1851 17.93
ggplot(pop.england, aes(YEAR, POPULATION)) +
geom_point()
ggplot(pop.england, aes(YEAR, POPULATION)) +
geom_point() + geom_smooth(method="lm", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
r_model<-line(pop.england$YEAR, pop.england$POPULATION)
print(r_model)
##
## Call:
## line(pop.england$YEAR, pop.england$POPULATION)
##
## Coefficients:
## [1] -476.1686 0.2674
plot(pop.england$YEAR, pop.england$POPULATION, xlab="YEAR", ylab="POPULATION", main="POP.ENGLAND")
abline(r_model, lwd=2)
For the following problems (from Tukey, EDA), (a) straighten plot using transformations applied on summary points (b) fit a line to the transformed plots and plot the residuals (c) summarize all results
These datasets are stored as tukey.26a, tukey.26b, and tukey.26c in the LearnEDA package.
26a) Measurement of a certain impurity in DDT; change of scale factor with temperature. (21, 248) translates as: “At 21 degrees C, the rate of crystallization (in microns per 5 minutes) is 24.8 times the log of this particular impurity.
head(tukey.26a)
## temp rate
## 1 21 248
## 2 22 308
## 3 23 388
## 4 24 465
## 5 25 569
## 6 26 678
ggplot(tukey.26a, aes(temp, rate)) +
geom_point() + labs(title = "Rates of Crystallization", x="Temperature", y="Rate")
ggplot(tukey.26a, aes(temp, rate)) +
geom_point() + labs(title = "Rates of Crystallization", x="Temperature", y="Rate") +
geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
summpoints<-data.frame(x=c(23,28,32), y=c(388,959,202))
ggplot(tukey.26a, aes(temp, rate)) +
geom_point() + labs(title = "Rates of Crystallization", x="Temperature", y="Rate") +
geom_point(data=summpoints, aes(x,y), size=4)
straighten<-function(sp, px, py){
sp$tx<-with(sp, (x^px-1)/px)
sp$ty<-with(sp, (y^py-1)/py)
sp$slope[1]<-with(sp, diff(ty[1:2])/diff(tx[1:2]))
sp$slope[2]<-with(sp, diff(ty[2:3])/diff(tx[2:3]))
sp$half.slope.ratio<-with(sp,slope[2]/slope[1])
sp$slope[3]<-NA
sp$half.slope.ratio[2:3]<-NA
row.names(sp)<-c("Left", "Center","Right")
sp
}
straighten(summpoints, 1, 1)
## x y tx ty slope half.slope.ratio
## Left 23 388 22 387 114.20 -1.65718
## Center 28 959 27 958 -189.25 NA
## Right 32 202 31 201 NA NA
straighten(summpoints, .5, 1)
## x y tx ty slope half.slope.ratio
## Left 23 388 7.591663 387 575.9868 -1.798632
## Center 28 959 8.583005 958 -1035.9883 NA
## Right 32 202 9.313708 201 NA NA
straighten(summpoints, .001, 1)
## x y tx ty slope half.slope.ratio
## Left 23 388 3.140415 387 2893.374 -1.952683
## Center 28 959 3.337762 958 -5649.843 NA
## Right 32 202 3.471749 201 NA NA
straighten(summpoints, .001, .001)
## x y tx ty slope half.slope.ratio
## Left 23 388 3.140415 5.978807 4.614743 -2.534539
## Center 28 959 3.337762 6.889515 -11.696248 NA
## Right 32 202 3.471749 5.322382 NA NA
straighten(summpoints, .001, -1)
## x y tx ty slope half.slope.ratio
## Left 23 388 3.140415 0.9974227 0.007775964 -3.750698
## Center 28 959 3.337762 0.9989572 -0.029165295 NA
## Right 32 202 3.471749 0.9950495 NA NA
straighten(summpoints, -.33, -1)
## x y tx ty slope half.slope.ratio
## Left 23 388 1.953551 0.9974227 0.02267507 -3.961773
## Center 28 959 2.021227 0.9989572 -0.08983348 NA
## Right 32 202 2.064727 0.9950495 NA NA
tukey.26a<-mutate(tukey.26a, new.x=rate^.001,
new.y=-1/temp)
ggplot(tukey.26a, aes(new.x, new.y)) +
geom_point()
tukey.26a<-tukey.26a%>%
mutate(x1=temp^.5, y1=1/rate)
ggplot(tukey.26a, aes(x1,y1)) +
geom_point() + labs(x="sqrt(temp)", y="1/rate")
model<-lm(y1~x1, data=tukey.26a)
summary(model)
##
## Call:
## lm(formula = y1 ~ x1, data = tukey.26a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0028403 -0.0014452 -0.0004257 0.0012592 0.0046688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.008549 0.008049 -1.062 0.309
## x1 0.002349 0.001535 1.531 0.152
##
## Residual standard error: 0.002219 on 12 degrees of freedom
## Multiple R-squared: 0.1633, Adjusted R-squared: 0.09362
## F-statistic: 2.343 on 1 and 12 DF, p-value: 0.1518
ggplot(tukey.26a, aes(x1,y1)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title="Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'
tukey.26a$resid<-residuals(model)
ggplot(tukey.26a, aes(x=fitted(model), y=resid)) +
geom_point() +
geom_hline(yintercept = 0) +
labs(title="Residual Plot for Transformed Model",
x="Fitted Values", y="Residuals")
26b) Demand deposits in post-office savings accounts in Switzerland. (37, 458) translates as: “In 1937, there were 458 million francs in post-office savings account.”
head(tukey.26b)
## year deposits
## 1 37 458
## 2 38 498
## 3 39 523
## 4 40 643
## 5 41 707
## 6 42 787
ggplot(tukey.26b, aes(year, deposits)) +
geom_point() + labs(title = "Demand Deposits in Post Office Savings Accounts", x="Year", y="Deposits")
ggplot(tukey.26b, aes(year, deposits)) +
geom_point() + labs(title = "Demand Deposits in Post Office Savings Accounts", x="Year", y="Deposits") +
geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
summpoints<-data.frame(x=c(41,51,61), y=c(746,1260,2710))
ggplot(tukey.26b, aes(year, deposits)) +
geom_point() + labs(title = "Demand Deposits in Post Office Savings Accounts", x="Year", y="Deposits") +
geom_point(data=summpoints, aes(x,y), size=4)
straighten(summpoints, 1, 1)
## x y tx ty slope half.slope.ratio
## Left 41 746 40 745 51.4 2.821012
## Center 51 1260 50 1259 145.0 NA
## Right 61 2710 60 2709 NA NA
straighten(summpoints, .5, 1)
## x y tx ty slope half.slope.ratio
## Left 41 746 10.80625 745 348.095 3.114083
## Center 51 1260 12.28286 1259 1083.997 NA
## Right 61 2710 13.62050 2709 NA NA
straighten(summpoints, .001, 1)
## x y tx ty slope half.slope.ratio
## Left 41 746 3.720476 745 2346.073 3.438032
## Center 51 1260 3.939565 1259 8065.876 NA
## Right 61 2710 4.119335 2709 NA NA
straighten(summpoints, .001, .001)
## x y tx ty slope half.slope.ratio
## Left 41 746 3.720476 6.636651 2.408871 1.781858
## Center 51 1260 3.939565 7.164409 4.292265 NA
## Right 61 2710 4.119335 7.936029 NA NA
straighten(summpoints, .001, -1)
## x y tx ty slope half.slope.ratio
## Left 41 746 3.720476 0.9986595 0.002495929 0.9464103
## Center 51 1260 3.939565 0.9992063 0.002362173 NA
## Right 61 2710 4.119335 0.9996310 NA NA
straighten(summpoints, -.33, -1)
## x y tx ty slope half.slope.ratio
## Left 41 746 2.140554 0.9986595 0.008844167 1.010803
## Center 51 1260 2.202384 0.9992063 0.008939711 NA
## Right 61 2710 2.249885 0.9996310 NA NA
tukey.26b<-mutate(tukey.26b, new.x=year^.001,
new.y=-1/deposits)
ggplot(tukey.26b, aes(new.x, new.y)) +
geom_point()
tukey.26b<-tukey.26b%>%
mutate(x1=year^.5, y1=1/deposits)
ggplot(tukey.26b, aes(x1,y1)) +
geom_point() + labs(x="sqrt(year)", y="1/deposits")
model<-lm(y1~x1, data=tukey.26b)
summary(model)
##
## Call:
## lm(formula = y1 ~ x1, data = tukey.26b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.529e-04 -8.385e-05 3.600e-07 4.477e-05 4.304e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.738e-03 3.590e-04 18.77 < 2e-16 ***
## x1 -8.196e-04 5.027e-05 -16.30 1.69e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0001597 on 27 degrees of freedom
## Multiple R-squared: 0.9078, Adjusted R-squared: 0.9044
## F-statistic: 265.8 on 1 and 27 DF, p-value: 1.686e-15
ggplot(tukey.26b, aes(x1,y1)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title="Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'
tukey.26b$resid<-residuals(model)
ggplot(tukey.26b, aes(x=fitted(model), y=resid)) +
geom_point() +
geom_hline(yintercept = 0) +
labs(title="Residual Plot for Transformed Model",
x="Fitted Values", y="Residuals")
26c) Revenue passenger miles on US passenger airlines. (37, 412) translates as: “In 1937, there were 412,000 revenue passenger miles on US domestic scheduled airlines.”
head(tukey.26c)
## year miles
## 1 37 412
## 2 38 480
## 3 39 683
## 4 40 1052
## 5 41 1385
## 6 42 1418
ggplot(tukey.26c, aes(year, miles)) +
geom_point() + labs(title = "Revenue Passenger Miles on US Airlines", x="Year", y="Miles")
ggplot(tukey.26c, aes(year, miles)) +
geom_point() + labs(title = "Revenue Passenger Miles on US Airlines", x="Year", y="Miles") +
geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
summpoints<-data.frame(x=c(41,49,57), y=c(1155,7406,22897))
ggplot(tukey.26c, aes(year, miles)) +
geom_point() + labs(title = "Revenue Passenger Miles on US Airlines", x="Year", y="Miles") +
geom_point(data=summpoints, aes(x,y), size=4)
straighten(summpoints, 1, 1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 40 1154 781.375 2.478163
## Center 49 7406 48 7405 1936.375 NA
## Right 57 22897 56 22896 NA NA
straighten(summpoints, .5, 1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 10.80625 1154 5236.433 2.690184
## Center 49 7406 12.00000 7405 14086.968 NA
## Right 57 22897 13.09967 22896 NA NA
straighten(summpoints, .001, 1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 3.720476 1154 34935.97 2.920404
## Center 49 7406 3.899403 7405 102027.13 NA
## Right 57 22897 4.051235 22896 NA NA
straighten(summpoints, .001, .001)
## x y tx ty slope half.slope.ratio
## Left 41 1155 3.720476 7.076779 10.468382 0.7168954
## Center 49 7406 3.899403 8.949858 7.504735 NA
## Right 57 22897 4.051235 10.089319 NA NA
straighten(summpoints, .001, -1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 3.720476 0.9991342 0.0040842008 0.1473148
## Center 49 7406 3.899403 0.9998650 0.0006016631 NA
## Right 57 22897 4.051235 0.9999563 NA NA
straighten(summpoints, -.33, -1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 2.140554 0.9991342 0.01437764 0.155577
## Center 49 7406 2.191381 0.9998650 0.00223683 NA
## Right 57 22897 2.232221 0.9999563 NA NA
tukey.26c<-mutate(tukey.26c, new.x=year^.001,
new.y=-1/miles)
ggplot(tukey.26c, aes(new.x, new.y)) +
geom_point()
tukey.26c<-tukey.26c%>%
mutate(x1=year^.5, y1=1/miles)
ggplot(tukey.26c, aes(x1,y1)) +
geom_point() + labs(x="sqrt(year)", y="1/miles")
model<-lm(y1~x1, data=tukey.26c)
summary(model)
##
## Call:
## lm(formula = y1 ~ x1, data = tukey.26c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0004642 -0.0002700 -0.0001307 0.0002275 0.0010589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0077671 0.0011222 6.922 5.98e-07 ***
## x1 -0.0010520 0.0001611 -6.529 1.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0003943 on 22 degrees of freedom
## Multiple R-squared: 0.6596, Adjusted R-squared: 0.6441
## F-statistic: 42.62 on 1 and 22 DF, p-value: 1.444e-06
ggplot(tukey.26c, aes(x1,y1)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title="Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'
tukey.26c$resid<-residuals(model)
ggplot(tukey.26c, aes(x=fitted(model), y=resid)) +
geom_point() +
geom_hline(yintercept = 0) +
labs(title="Residual Plot for Transformed Model",
x="Fitted Values", y="Residuals")