Question 1 - My Data Set: Height vs Weight Data (uvm.edu)

  1. Find some (x,y) data where you think x and y are strongly related. Make a scatter plot and find the least-squares and resistant fits. Plot the two sets of residuals. Interpret the fit and the residuals. Contrast the two methods of fitting – is it better to fit a resistant line?
library(LearnEDAfunctions)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(ggplot2)
ht.wtdata<-read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vT6fX2d7mVNdr-D4ECC1fpmZcq_CFTXfnnRjr5nUHz8_PdSkj8eXfobI7lCj82UAbyBY6uadn12wJmG/pub?output=csv")
head(ht.wtdata)
##   Height Weight
## 1   66.0    140
## 2   72.0    145
## 3   73.5    160
## 4   73.0    190
## 5   69.0    155
## 6   73.0    165
ggplot(ht.wtdata, aes(Height, Weight)) +
  geom_point() +
  labs(title = "Relationship Between Height & Weight", x="Height", y="Weight")

lm_fit<-lm(Weight~Height, data=ht.wtdata)
summary(lm_fit)
## 
## Call:
## lm(formula = Weight ~ Height, data = ht.wtdata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.499 -11.339  -1.132   8.615  53.134 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -204.7408    29.1597  -7.021 4.02e-10 ***
## Height         5.0918     0.4237  12.016  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.79 on 90 degrees of freedom
## Multiple R-squared:  0.616,  Adjusted R-squared:  0.6117 
## F-statistic: 144.4 on 1 and 90 DF,  p-value: < 2.2e-16
ggplot(ht.wtdata, aes(Height, Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE) +
  labs(title="Height & Weight with Least Squares Regression Line")
## `geom_smooth()` using formula = 'y ~ x'

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:LearnEDAfunctions':
## 
##     farms
## The following object is masked from 'package:dplyr':
## 
##     select
rlm_fit<-rlm(Weight~Height, data=ht.wtdata)
summary(rlm_fit)
## 
## Call: rlm(formula = Weight ~ Height, data = ht.wtdata)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -30.9756 -10.8907  -0.7816   9.0850  53.8304 
## 
## Coefficients:
##             Value     Std. Error t value  
## (Intercept) -202.3228   28.2656    -7.1579
## Height         5.0485    0.4108    12.2908
## 
## Residual standard error: 14.34 on 90 degrees of freedom
ggplot(ht.wtdata, aes(Height, Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se=FALSE) +
  geom_abline(intercept = coef(rlm_fit)[1], slope=coef(rlm_fit)[2], color = "red") +
  labs(title="Least Squares & Resistant Fit Lines")
## `geom_smooth()` using formula = 'y ~ x'

ht.wtdata$ls_resid<-resid(lm_fit)
plot(ht.wtdata$Height, ht.wtdata$ls_resid, main="Least Squares Residuals", ylab="Residuals", xlab="Height")
abline(h=0)

ht.wtdata$rlm_resid<-resid(rlm_fit)
plot(ht.wtdata$Height, ht.wtdata$rlm_resid, main="Resistant Residuals", ylab="Residuals", xlab="Height")
abline(h=0)

The least-squares regression line is greater influenced by outliers in a dataset, and the resistant line is a more consistent fit that better represents the trend in the data. The lines are very similar in this case, but there are a few extremes in the weight data, and thus, the resistant line gives a slightly better representation of the relationship between the two variables.

Question 2 - Population of England & Wales

  1. Fit a resistant line to the following data (population of England and Wales in millions for different years). Data is stored in the file pop.england in the LearnEDA package. Plot the residuals and summarize the results.
head(pop.england)
##   YEAR POPULATION
## 1 1801       8.89
## 2 1811      10.16
## 3 1821      12.00
## 4 1831      13.90
## 5 1841      15.91
## 6 1851      17.93
ggplot(pop.england, aes(YEAR, POPULATION)) +
  geom_point()

ggplot(pop.england, aes(YEAR, POPULATION)) +
  geom_point() + geom_smooth(method="lm", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

r_model<-line(pop.england$YEAR, pop.england$POPULATION)
print(r_model)
## 
## Call:
## line(pop.england$YEAR, pop.england$POPULATION)
## 
## Coefficients:
## [1]  -476.1686     0.2674
plot(pop.england$YEAR, pop.england$POPULATION, xlab="YEAR", ylab="POPULATION", main="POP.ENGLAND")
abline(r_model, lwd=2)

For the following problems (from Tukey, EDA), (a) straighten plot using transformations applied on summary points (b) fit a line to the transformed plots and plot the residuals (c) summarize all results

These datasets are stored as tukey.26a, tukey.26b, and tukey.26c in the LearnEDA package.

26a) Measurement of a certain impurity in DDT; change of scale factor with temperature. (21, 248) translates as: “At 21 degrees C, the rate of crystallization (in microns per 5 minutes) is 24.8 times the log of this particular impurity.

head(tukey.26a)
##   temp rate
## 1   21  248
## 2   22  308
## 3   23  388
## 4   24  465
## 5   25  569
## 6   26  678
ggplot(tukey.26a, aes(temp, rate)) +
  geom_point() + labs(title = "Rates of Crystallization", x="Temperature", y="Rate")

ggplot(tukey.26a, aes(temp, rate)) +
  geom_point() + labs(title = "Rates of Crystallization", x="Temperature", y="Rate") +
  geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summpoints<-data.frame(x=c(23,28,32), y=c(388,959,202))
ggplot(tukey.26a, aes(temp, rate)) +
  geom_point() + labs(title = "Rates of Crystallization", x="Temperature", y="Rate") +
  geom_point(data=summpoints, aes(x,y), size=4)

straighten<-function(sp, px, py){
  sp$tx<-with(sp, (x^px-1)/px)
  sp$ty<-with(sp, (y^py-1)/py)
  sp$slope[1]<-with(sp, diff(ty[1:2])/diff(tx[1:2]))
  sp$slope[2]<-with(sp, diff(ty[2:3])/diff(tx[2:3]))
  sp$half.slope.ratio<-with(sp,slope[2]/slope[1])
  sp$slope[3]<-NA
  sp$half.slope.ratio[2:3]<-NA
  row.names(sp)<-c("Left", "Center","Right")
  sp
}
straighten(summpoints, 1, 1)
##         x   y tx  ty   slope half.slope.ratio
## Left   23 388 22 387  114.20         -1.65718
## Center 28 959 27 958 -189.25               NA
## Right  32 202 31 201      NA               NA
straighten(summpoints, .5, 1)
##         x   y       tx  ty      slope half.slope.ratio
## Left   23 388 7.591663 387   575.9868        -1.798632
## Center 28 959 8.583005 958 -1035.9883               NA
## Right  32 202 9.313708 201         NA               NA
straighten(summpoints, .001, 1)
##         x   y       tx  ty     slope half.slope.ratio
## Left   23 388 3.140415 387  2893.374        -1.952683
## Center 28 959 3.337762 958 -5649.843               NA
## Right  32 202 3.471749 201        NA               NA
straighten(summpoints, .001, .001)
##         x   y       tx       ty      slope half.slope.ratio
## Left   23 388 3.140415 5.978807   4.614743        -2.534539
## Center 28 959 3.337762 6.889515 -11.696248               NA
## Right  32 202 3.471749 5.322382         NA               NA
straighten(summpoints, .001, -1)
##         x   y       tx        ty        slope half.slope.ratio
## Left   23 388 3.140415 0.9974227  0.007775964        -3.750698
## Center 28 959 3.337762 0.9989572 -0.029165295               NA
## Right  32 202 3.471749 0.9950495           NA               NA
straighten(summpoints, -.33, -1)
##         x   y       tx        ty       slope half.slope.ratio
## Left   23 388 1.953551 0.9974227  0.02267507        -3.961773
## Center 28 959 2.021227 0.9989572 -0.08983348               NA
## Right  32 202 2.064727 0.9950495          NA               NA
tukey.26a<-mutate(tukey.26a, new.x=rate^.001, 
                  new.y=-1/temp)
ggplot(tukey.26a, aes(new.x, new.y)) +
  geom_point()

tukey.26a<-tukey.26a%>%
  mutate(x1=temp^.5, y1=1/rate)
ggplot(tukey.26a, aes(x1,y1)) + 
  geom_point() + labs(x="sqrt(temp)", y="1/rate")

model<-lm(y1~x1, data=tukey.26a)
summary(model)
## 
## Call:
## lm(formula = y1 ~ x1, data = tukey.26a)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0028403 -0.0014452 -0.0004257  0.0012592  0.0046688 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.008549   0.008049  -1.062    0.309
## x1           0.002349   0.001535   1.531    0.152
## 
## Residual standard error: 0.002219 on 12 degrees of freedom
## Multiple R-squared:  0.1633, Adjusted R-squared:  0.09362 
## F-statistic: 2.343 on 1 and 12 DF,  p-value: 0.1518
ggplot(tukey.26a, aes(x1,y1)) +
  geom_point() + 
  geom_smooth(method="lm", se=FALSE) +
  labs(title="Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'

tukey.26a$resid<-residuals(model)

ggplot(tukey.26a, aes(x=fitted(model), y=resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(title="Residual Plot for Transformed Model",
       x="Fitted Values", y="Residuals")

26b) Demand deposits in post-office savings accounts in Switzerland. (37, 458) translates as: “In 1937, there were 458 million francs in post-office savings account.”

head(tukey.26b)
##   year deposits
## 1   37      458
## 2   38      498
## 3   39      523
## 4   40      643
## 5   41      707
## 6   42      787
ggplot(tukey.26b, aes(year, deposits)) +
  geom_point() + labs(title = "Demand Deposits in Post Office Savings Accounts", x="Year", y="Deposits")

ggplot(tukey.26b, aes(year, deposits)) +
  geom_point() + labs(title = "Demand Deposits in Post Office Savings Accounts", x="Year", y="Deposits") +
  geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summpoints<-data.frame(x=c(41,51,61), y=c(746,1260,2710))
ggplot(tukey.26b, aes(year, deposits)) +
  geom_point() + labs(title = "Demand Deposits in Post Office Savings Accounts", x="Year", y="Deposits") +
  geom_point(data=summpoints, aes(x,y), size=4)

straighten(summpoints, 1, 1)
##         x    y tx   ty slope half.slope.ratio
## Left   41  746 40  745  51.4         2.821012
## Center 51 1260 50 1259 145.0               NA
## Right  61 2710 60 2709    NA               NA
straighten(summpoints, .5, 1)
##         x    y       tx   ty    slope half.slope.ratio
## Left   41  746 10.80625  745  348.095         3.114083
## Center 51 1260 12.28286 1259 1083.997               NA
## Right  61 2710 13.62050 2709       NA               NA
straighten(summpoints, .001, 1)
##         x    y       tx   ty    slope half.slope.ratio
## Left   41  746 3.720476  745 2346.073         3.438032
## Center 51 1260 3.939565 1259 8065.876               NA
## Right  61 2710 4.119335 2709       NA               NA
straighten(summpoints, .001, .001)
##         x    y       tx       ty    slope half.slope.ratio
## Left   41  746 3.720476 6.636651 2.408871         1.781858
## Center 51 1260 3.939565 7.164409 4.292265               NA
## Right  61 2710 4.119335 7.936029       NA               NA
straighten(summpoints, .001, -1)
##         x    y       tx        ty       slope half.slope.ratio
## Left   41  746 3.720476 0.9986595 0.002495929        0.9464103
## Center 51 1260 3.939565 0.9992063 0.002362173               NA
## Right  61 2710 4.119335 0.9996310          NA               NA
straighten(summpoints, -.33, -1)
##         x    y       tx        ty       slope half.slope.ratio
## Left   41  746 2.140554 0.9986595 0.008844167         1.010803
## Center 51 1260 2.202384 0.9992063 0.008939711               NA
## Right  61 2710 2.249885 0.9996310          NA               NA
tukey.26b<-mutate(tukey.26b, new.x=year^.001, 
                  new.y=-1/deposits)
ggplot(tukey.26b, aes(new.x, new.y)) +
  geom_point()

tukey.26b<-tukey.26b%>%
  mutate(x1=year^.5, y1=1/deposits)
ggplot(tukey.26b, aes(x1,y1)) + 
  geom_point() + labs(x="sqrt(year)", y="1/deposits")

model<-lm(y1~x1, data=tukey.26b)
summary(model)
## 
## Call:
## lm(formula = y1 ~ x1, data = tukey.26b)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.529e-04 -8.385e-05  3.600e-07  4.477e-05  4.304e-04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.738e-03  3.590e-04   18.77  < 2e-16 ***
## x1          -8.196e-04  5.027e-05  -16.30 1.69e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0001597 on 27 degrees of freedom
## Multiple R-squared:  0.9078, Adjusted R-squared:  0.9044 
## F-statistic: 265.8 on 1 and 27 DF,  p-value: 1.686e-15
ggplot(tukey.26b, aes(x1,y1)) +
  geom_point() + 
  geom_smooth(method="lm", se=FALSE) +
  labs(title="Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'

tukey.26b$resid<-residuals(model)

ggplot(tukey.26b, aes(x=fitted(model), y=resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(title="Residual Plot for Transformed Model",
       x="Fitted Values", y="Residuals")

26c) Revenue passenger miles on US passenger airlines. (37, 412) translates as: “In 1937, there were 412,000 revenue passenger miles on US domestic scheduled airlines.”

head(tukey.26c)
##   year miles
## 1   37   412
## 2   38   480
## 3   39   683
## 4   40  1052
## 5   41  1385
## 6   42  1418
ggplot(tukey.26c, aes(year, miles)) +
  geom_point() + labs(title = "Revenue Passenger Miles on US Airlines", x="Year", y="Miles")

ggplot(tukey.26c, aes(year, miles)) +
  geom_point() + labs(title = "Revenue Passenger Miles on US Airlines", x="Year", y="Miles") +
  geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summpoints<-data.frame(x=c(41,49,57), y=c(1155,7406,22897))
ggplot(tukey.26c, aes(year, miles)) +
  geom_point() + labs(title = "Revenue Passenger Miles on US Airlines", x="Year", y="Miles") +
  geom_point(data=summpoints, aes(x,y), size=4)

straighten(summpoints, 1, 1)
##         x     y tx    ty    slope half.slope.ratio
## Left   41  1155 40  1154  781.375         2.478163
## Center 49  7406 48  7405 1936.375               NA
## Right  57 22897 56 22896       NA               NA
straighten(summpoints, .5, 1)
##         x     y       tx    ty     slope half.slope.ratio
## Left   41  1155 10.80625  1154  5236.433         2.690184
## Center 49  7406 12.00000  7405 14086.968               NA
## Right  57 22897 13.09967 22896        NA               NA
straighten(summpoints, .001, 1)
##         x     y       tx    ty     slope half.slope.ratio
## Left   41  1155 3.720476  1154  34935.97         2.920404
## Center 49  7406 3.899403  7405 102027.13               NA
## Right  57 22897 4.051235 22896        NA               NA
straighten(summpoints, .001, .001)
##         x     y       tx        ty     slope half.slope.ratio
## Left   41  1155 3.720476  7.076779 10.468382        0.7168954
## Center 49  7406 3.899403  8.949858  7.504735               NA
## Right  57 22897 4.051235 10.089319        NA               NA
straighten(summpoints, .001, -1)
##         x     y       tx        ty        slope half.slope.ratio
## Left   41  1155 3.720476 0.9991342 0.0040842008        0.1473148
## Center 49  7406 3.899403 0.9998650 0.0006016631               NA
## Right  57 22897 4.051235 0.9999563           NA               NA
straighten(summpoints, -.33, -1)
##         x     y       tx        ty      slope half.slope.ratio
## Left   41  1155 2.140554 0.9991342 0.01437764         0.155577
## Center 49  7406 2.191381 0.9998650 0.00223683               NA
## Right  57 22897 2.232221 0.9999563         NA               NA
tukey.26c<-mutate(tukey.26c, new.x=year^.001, 
                  new.y=-1/miles)
ggplot(tukey.26c, aes(new.x, new.y)) +
  geom_point()

tukey.26c<-tukey.26c%>%
  mutate(x1=year^.5, y1=1/miles)
ggplot(tukey.26c, aes(x1,y1)) + 
  geom_point() + labs(x="sqrt(year)", y="1/miles")

model<-lm(y1~x1, data=tukey.26c)
summary(model)
## 
## Call:
## lm(formula = y1 ~ x1, data = tukey.26c)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0004642 -0.0002700 -0.0001307  0.0002275  0.0010589 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0077671  0.0011222   6.922 5.98e-07 ***
## x1          -0.0010520  0.0001611  -6.529 1.44e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0003943 on 22 degrees of freedom
## Multiple R-squared:  0.6596, Adjusted R-squared:  0.6441 
## F-statistic: 42.62 on 1 and 22 DF,  p-value: 1.444e-06
ggplot(tukey.26c, aes(x1,y1)) +
  geom_point() + 
  geom_smooth(method="lm", se=FALSE) +
  labs(title="Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'

tukey.26c$resid<-residuals(model)

ggplot(tukey.26c, aes(x=fitted(model), y=resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(title="Residual Plot for Transformed Model",
       x="Fitted Values", y="Residuals")