1. Find some (x, y) data where you think x and y are strongly related. Make a scatterplot and find the least-squares and resistant fits. Plot the two sets of residuals. Interpret the fit and the residuals. Contrast the two methods of fitting – is it better to fit a resistant line?
library(LearnEDAfunctions)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: ggplot2
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(TeachingDemos)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:LearnEDAfunctions':
## 
##     farms
## 
## The following object is masked from 'package:dplyr':
## 
##     select
largest.us.cities.by.population <- read.csv("C:/Users/eclai/Downloads/largest.us.cities.by.population.csv")

ggplot(largest.us.cities.by.population, aes(census.2022, population.2023)) + geom_point() + geom_smooth(method = "lm", se = FALSE) 
## `geom_smooth()` using formula = 'y ~ x'

lm(population.2023 ~ I(census.2022), data=largest.us.cities.by.population)
## 
## Call:
## lm(formula = population.2023 ~ I(census.2022), data = largest.us.cities.by.population)
## 
## Coefficients:
##    (Intercept)  I(census.2022)  
##      1.898e+04       9.232e-01
Dat <- largest.us.cities.by.population[,c(4,5)]
rline(population.2023 ~ census.2022, Dat, iter=10)
## $a
## [1] 173818.7
## 
## $b
## [1] 0.9895414
## 
## $xC
## [1] 174687
## 
## $half.slope.ratio
## [1] 0.9902253
## 
## $residual
##   [1] -7.940625e+05 -8.062537e+04 -1.055889e+05 -1.205455e+04  5.589280e+04
##   [6] -5.733335e+04  5.534922e+04  2.212661e+03 -3.115862e+04  1.228527e+04
##  [11]  2.148889e+04  4.798339e+04 -7.043194e+04  1.052040e+04  1.712692e+04
##  [16] -7.610853e+03 -5.923383e+03 -1.461565e+05 -1.179522e+04  2.118508e+04
##  [21]  5.589556e+03 -2.447306e+04  1.632030e+04 -5.214122e+04 -3.904844e+03
##  [26] -4.621408e+03 -1.126720e+04 -2.723756e+04 -5.071969e+04 -1.453689e+04
##  [31] -1.178203e+03 -1.559233e+04  1.179459e+04  8.868456e+03  9.405567e+03
##  [36]  1.525496e+04  5.740626e+03  1.538072e+04 -5.049661e+03 -7.334702e+03
##  [41]  9.044979e+03 -1.238233e+03 -2.170390e+04 -2.307092e+03 -1.271074e+04
##  [46] -7.410746e+03  1.603438e+04 -3.161109e+02  1.144545e+04  1.234126e+04
##  [51] -1.044345e+03 -4.241012e+02 -1.557304e+04 -9.059699e+03 -7.635683e+02
##  [56] -1.001222e+04  1.183923e+04  6.538910e+03  1.021522e+04  5.862504e+02
##  [61]  2.128653e+03  1.024077e+04  6.935463e+03  9.433609e+02 -1.236286e+03
##  [66] -7.007807e+03 -8.818616e+03  3.523021e+03 -4.776010e+03  5.927824e+03
##  [71]  3.268570e+04  6.821512e+03  5.409792e+03  7.689213e+03 -5.401978e+03
##  [76]  1.381988e+04 -1.895353e+04  4.200813e+03  1.626465e+04 -1.342106e+03
##  [81]  2.904118e+03  7.256204e+03 -2.125449e+04  1.118545e+04 -2.707383e+03
##  [86]  4.170823e+03  3.785662e+02  6.452658e+03  4.574207e+03  4.151434e+03
##  [91] -6.214102e+03  4.040646e+03  3.522870e+04  6.391806e+03 -8.576278e+03
##  [96] -5.901765e+03  2.097816e+03  2.590422e+04  1.214016e+03  2.620915e+03
## [101]  2.680128e+04  7.101414e+03 -1.078966e+04  2.263140e+03  8.018042e+02
## [106] -5.425888e+03 -3.587225e+03 -1.005298e+04  8.472440e+03  7.133225e+03
## [111]  1.828298e+04  2.385335e+02  1.828778e+03 -3.688643e+03 -3.042184e+03
## [116]  2.283675e+02  1.111631e+04 -2.797123e+03  3.813765e+03  2.155466e+03
## [121]  6.953308e+01 -2.959359e+02  9.980259e+03 -8.035667e+02  4.158712e+03
## [126]  3.383827e+03  7.263711e+03 -3.321518e+02 -2.862852e+03 -1.148023e+03
## [131]  4.352700e+03 -4.982438e+03 -6.441524e+03  1.099852e+04  4.219634e+03
## [136] -1.944449e+02 -1.409786e+03 -1.020857e+04  2.411897e+03  7.005503e+03
## [141] -3.352037e+03 -2.972214e+03  5.025147e+03  6.694049e+03  5.713688e+03
## [146] -2.388918e+03  1.213465e+04 -8.775342e+03 -2.319375e+03  2.583143e+03
## [151]  1.752549e+03 -1.915239e+03 -2.552500e+03  1.109415e+03 -3.153545e+03
## [156] -6.687974e+02 -3.896414e+03  1.352111e+04 -7.159741e+03  7.298326e+03
## [161]  1.049799e+04 -1.061732e+03  1.879164e+04  1.536871e+04  1.123679e+04
## [166] -9.524214e+03  2.064402e+03 -4.116336e+03 -1.620738e+03 -9.360070e+03
## [171]  2.756598e+03 -4.357574e+03 -1.077555e+03  1.131236e+03 -1.154668e+03
## [176]  7.761360e+01  7.780713e+02  5.301115e+03  5.417210e+03 -1.210770e+04
## [181] -1.244890e+03 -2.406243e+03  4.002686e+03 -9.886558e+03 -6.454612e+03
## [186] -2.192876e+03 -5.281192e+03 -8.430081e+03  1.864221e+03 -7.392758e+03
## [191] -7.594048e+03  4.439673e+03  1.916826e+03  1.998656e+04 -3.323247e+03
## [196]  2.329562e+03  4.107577e+03  1.159476e+03  2.341439e+03 -8.727638e+03
## [201]  3.815461e+03 -2.763246e+03 -9.767216e+02  2.569718e+03 -2.479541e+03
## [206] -3.713237e+03 -2.834731e+03 -3.736458e+01 -3.805397e+03 -2.183604e+03
## [211] -4.837698e+03 -6.774299e+03  1.072260e+04 -3.138901e+03  2.714963e+03
## [216]  3.384600e+03  8.772289e+03 -3.711571e+03  3.061610e+03  1.023081e+03
## [221]  7.382445e+03 -4.489385e+02  2.778443e+02  7.204701e+01 -5.540391e+02
## [226]  6.962079e+02 -8.070246e+02 -2.309033e+03  5.077091e+03 -2.480536e+03
## [231] -2.926470e+03  5.179961e+02  9.040563e+03 -2.756770e+03 -3.567287e+02
## [236]  7.292895e+03  5.430166e+03  4.805088e+03 -2.172992e+03 -8.335952e+02
## [241] -2.638133e+03 -7.141345e+01  2.613730e+04  1.527292e+03  5.878921e+02
## [246]  2.988819e+03  1.605698e+04  5.198939e+03 -8.474988e+02 -1.545669e+03
## [251]  2.452843e+04  9.328663e+02  5.718052e+02  3.774987e+03 -1.657035e+03
## [256] -2.033282e+03  8.155147e+02 -1.530949e+03  2.050241e+03  3.421462e+02
## [261] -1.121262e+03 -3.905697e+03 -8.056929e+02  9.938974e+03  1.650297e+04
## [266]  9.629716e+02 -6.529220e+03 -1.876861e+03 -1.149480e+02  2.488709e+03
## [271] -2.297793e+03  6.037559e+03 -3.521136e+03  2.219617e+03 -4.893259e+03
## [276] -1.524637e+02 -4.498426e+02  6.639736e+03 -1.631629e+03 -4.299694e+03
## [281]  3.839724e+03 -3.123990e+03  3.929085e+03 -3.871943e+02 -4.738856e+03
## [286] -9.590902e+00  4.934746e+03 -1.261871e+03 -2.684170e+03  1.265263e+04
## [291] -6.718445e+03  7.600632e+03  7.854339e+03 -7.024122e+03 -1.353738e+02
## [296] -3.953085e+03 -7.043818e+02 -3.480778e+03  1.885051e+04 -2.776309e+03
## 
## $spoints.x
## [1] 115562.5 174687.0 390282.5
## 
## $spoints.y
## [1] 115866.0 175986.5 393071.0
largest.us.cities.by.population %>%
mutate(FIT = 0.92 *
(largest.us.cities.by.population$census.2022) + 18983.29,
Residual = population.2023 - FIT) %>%
ggplot(aes(census.2022, Residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red") 

largest.us.cities.by.population %>%
mutate(Fit = 173818.7 + 0.9895414 * census.2022,
Residual = population.2023 - Fit) -> largest.us.cities.by.population
ggplot(largest.us.cities.by.population, aes(census.2022, Residual)) +
geom_point() +
geom_hline(yintercept = 0)

least squares fit: y = 18983.29 + 0.92x resistant fit: y = 173818.7 + 0.9895414x

Let us interpret the line fit for our least squares fit. The slope is m = 0.92, so this means that the populations of various countries in 2023 have been generally increasing at a rate of 0.92 compared to the 2022 census populations. We see that out resistant fit has a slightly higher slope of 0.9895414. The resistant fit has an intercept of 173818.7 while the least squares fit has a slightly higher intercept of 18983.29. When looking at the residuals for the least squares fit, we see that there is no obvious pattern for these residuals except for that they are slightly increasing perhaps towards the end (except for the one outlier). We do notice a few outliers that are either way above or way below the zero line. For the resistant fit, we instantly notice that all the residuals lie below the zero line, meaning they are negative. This indicates that the resistant fit overestimates the 2023 population for the various countries. Here, especially when comparing the residuals, I would say fitting the resistant line is not better. The least squares fit is a better estimate for our strongly related x and y and we also see there is less of a pattern in the residuals compared to the resistant fit residuals.

  1. Fit a resistant line to the following data (population of England and Wales in millions for different years).
    Data is stored in the file pop.england in the LearnEDA package. Plot the residuals and summarize the results.
ggplot(pop.england, aes(YEAR, POPULATION)) + geom_point() + geom_smooth(method = "lm", se = FALSE) 
## `geom_smooth()` using formula = 'y ~ x'

rline(POPULATION ~ YEAR, pop.england, iter=10)
## $a
## [1] 22.89979
## 
## $b
## [1] 0.256625
## 
## $xC
## [1] 1866
## 
## $half.slope.ratio
## [1] 1.563365
## 
## $residual
##  [1]  2.67083333  1.37458333  0.64833333 -0.01791667 -0.57416667 -1.12041667
##  [7] -1.54666667 -1.47291667 -0.77916667 -0.31541667  0.64833333  1.62208333
## [13]  0.87583333  0.36958333
## 
## $spoints.x
## [1] 1821 1866 1911
## 
## $spoints.y
## [1] 12.00 21.39 36.07
pop.england %>%
mutate(FIT = 0.256625 *
(YEAR - 1866) + 22.89979,
Residual = POPULATION - FIT) %>%
ggplot(aes(YEAR, Residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red") 

resistant fit: y = 22.89979 + 0.256625(x - 1866)

To summarize our results, firstly, we fitted a resistant line to our data which had a slope of 0.256625. This indicates that as time went on, the population was increasing at a slow rate. When plotting the residuals, we can clearly see a pattern in the form of a line. In the first and last thirds of the residual plot (about 1800-1825 and 1900-1925), we see that the residuals are positive. This indicates that our fit was underestimating the actual data. When looking at our initial scatterplot, this makes sense as we can actually see that the points lie above our line. In the middle of our residual plot, we see a dip in the residuals and the residuals are negative which means our fit overestimated the data. Again, this makes sense when we take a look at our initial scatter plot and see that our data points fell below the line.

  1. For the following problems (from Tukey, EDA),
  1. straighten plot using transformations applied on summary points
  2. fit a line to the transformed plots and plot the residuals
  3. summarize all results
ggplot(tukey.26a,
aes(temp, rate)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summary.points <- data.frame(x = c(23, 28, 31.5),
y = c(388, 959, 185))
ggplot(tukey.26a,
aes(temp, rate)) +
geom_point() +
geom_point(data = summary.points,
aes(x, y), color="red", size = 3)

ggplot() +
geom_point(data = tukey.26a,
aes(temp, rate)) +
geom_point(data = summary.points,
aes(x, y), color="red", size = 3) +
geom_smooth(data = filter(summary.points, x < 31.5),
aes(x, y), method = "lm", color="blue") +
geom_smooth(data = filter(summary.points, x > 27),
aes(x, y), method = "lm", color="blue")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

straightening.work <- function(sp, px, py){
sp$tx <- with(sp, (x ^ px - 1) / px)
sp$ty <- with(sp, (y ^ py - 1) / py)
sp$slope[1] <- with(sp, diff(ty[1:2]) / diff(tx[1:2]))
sp$slope[2] <- with(sp, diff(ty[2:3]) / diff(tx[2:3]))
sp$half.slope.ratio <- with(sp, slope[2] / slope[1])
sp$slope[3] <- NA
sp$half.slope.ratio[2:3] <- NA
row.names(sp) <- c("Left", "Center", "Right")
sp}

straightening.work(summary.points, 1, 1)
##           x   y   tx  ty     slope half.slope.ratio
## Left   23.0 388 22.0 387  114.2000        -1.936452
## Center 28.0 959 27.0 958 -221.1429               NA
## Right  31.5 185 30.5 184        NA               NA
straightening.work(summary.points,3.5,3.5)
##           x   y       tx         ty     slope half.slope.ratio
## Left   23.0 388 16671.39  328732767  452568.6         -1.01512
## Center 28.0 959 33188.02 7803637387 -459411.6               NA
## Right  31.5 185 50120.62   24605546        NA               NA
tukey.26a <- mutate(tukey.26a,
new.x = temp ^ (3.5),
new.y = rate ^ (3.5))
ggplot(tukey.26a, aes(new.x, new.y)) +
geom_point()

fit <- rline(new.y ~ new.x,
tukey.26a, iter=5)
c(fit$a, fit$b, fit$xC)
## [1]  4.285236e+09 -8.136322e+03  1.092175e+05
tukey.26a <- mutate(tukey.26a,
Residual = fit$residual)
ggplot(tukey.26a, aes(new.x, Residual)) +
geom_point() +
geom_hline(yintercept = 0)

We firstly take a look at the scatterplot for our dataset and see that it is not linear clearly. We then calculate the three summary points which are (23, 388), (28, 959), and (31.5, 185). Now, we will try to reexpress to reduce curvature in the graph. Using the straightening.work function and experimenting with different reexpression that will get us a half-slope ratio that is close to -1, we find that for both x and y, p = 3.5. This means we will transform our dataset by transforming x to the 3.5 power and y to the 3.5 power. After plotting the transformation, we see that there is still curvature due to the extreme points in the middle of the graph. but it is somewhat better compared to in the beginning. Our new fit is: rate = 4.285236e+09 -8.136322e+03 ((temp^35) - 1.092175e+05) We then plot the residuals of this fit and see there is still a pattern meaning our transformation was not completely successful but it improved slightly

ggplot(tukey.26b,
aes(year, deposits)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summary.points <- data.frame(x = c(41.5, 51.5, 61),
y = c(747, 1230, 2643))
ggplot(tukey.26b,
aes(year, deposits)) +
geom_point() +
geom_point(data = summary.points,
aes(x, y), color="red", size = 3)

ggplot() +
geom_point(data = tukey.26b,
aes(year, deposits)) +
geom_point(data = summary.points,
aes(x, y), color="red", size = 3) +
geom_smooth(data = filter(summary.points, x < 61),
aes(x, y), method = "lm", color="blue") +
geom_smooth(data = filter(summary.points, x > 50),
aes(x, y), method = "lm", color="blue")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

straightening.work(summary.points, 1, 1)
##           x    y   tx   ty    slope half.slope.ratio
## Left   41.5  747 40.5  746  48.3000         3.079438
## Center 51.5 1230 50.5 1229 148.7368               NA
## Right  61.0 2643 60.0 2642       NA               NA
straightening.work(summary.points,0.001,-1)
##           x    y       tx        ty       slope half.slope.ratio
## Left   41.5  747 3.732642 0.9986613 0.002425645         1.054211
## Center 51.5 1230 3.949360 0.9991870 0.002557141               NA
## Right  61.0 2643 4.119335 0.9996216          NA               NA
tukey.26b <- mutate(tukey.26b,
new.x = log(year),
new.y = -1/deposits)
ggplot(tukey.26b, aes(new.x, new.y)) +
geom_point()

fit <- rline(new.y ~ new.x,
tukey.26b, iter=5)
c(fit$a, fit$b, fit$xC)
## [1] -0.0008393117  0.0024433112  3.9318256327
tukey.26b <- mutate(tukey.26b,
Residual = fit$residual)
ggplot(tukey.26b, aes(new.x, Residual)) +
geom_point() +
geom_hline(yintercept = 0)

We firstly take a look at the scatterplot for our dataset and see that it is not linear clearly. We then calculate the three summary points which are (41.5, 747), (51.5, 1230), and (61, 2643). Now, we will try to reexpress to reduce curvature in the graph. Using the straightening.work function and experimenting with different reexpression that will get us a half-slope ratio that is close to 1, we find that for x, p = 0.001 and for y, p = -1. This means we will transform our dataset by taking the log of x and transforming y to the -1 power. After plotting the transformation, we see that it is much straighter to how it was in the beginning but there is still some slight curvature in the graph Our new fit is: deposits = 0.0008393117 + 0.0024433112 (log(year) - 3.9318256327) We then plot the residuals of this fit and see there is still a pattern and there are a few outliers especially towards the beginning.

ggplot(tukey.26c,
aes(year, miles)) +
geom_point() +
geom_smooth(method = "loess", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summary.points <- data.frame(x = c(41.5, 48.5, 56.5),
y = c(1218.5, 6367, 23851))
ggplot(tukey.26c,
aes(year, miles)) +
geom_point() +
geom_point(data = summary.points,
aes(x, y), color="red", size = 3)

ggplot() +
geom_point(data = tukey.26c,
aes(year, miles)) +
geom_point(data = summary.points,
aes(x, y), color="red", size = 3) +
geom_smooth(data = filter(summary.points, x < 56),
aes(x, y), method = "lm", color="blue") +
geom_smooth(data = filter(summary.points, x > 47),
aes(x, y), method = "lm", color="blue")
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## `geom_smooth()` using formula = 'y ~ x'
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

straightening.work(summary.points, 1, 1)
##           x       y   tx      ty  slope half.slope.ratio
## Left   41.5  1218.5 40.5  1217.5  735.5         2.971448
## Center 48.5  6367.0 47.5  6366.0 2185.5               NA
## Right  56.5 23851.0 55.5 23850.0     NA               NA
straightening.work(summary.points, 3, 0.5)
##           x       y       tx        ty       slope half.slope.ratio
## Left   41.5  1218.5 23824.12  67.81404 0.006320442          1.06913
## Center 48.5  6367.0 38027.71 157.58697 0.006757374               NA
## Right  56.5 23851.0 60120.38 306.87538          NA               NA
tukey.26c <- mutate(tukey.26c,
new.x = (year^3),
new.y = sqrt(miles))
ggplot(tukey.26c, aes(new.x, new.y)) +
geom_point()

fit <- rline(new.y ~ new.x,
tukey.26c, iter=5)
c(fit$a, fit$b, fit$xC)
## [1] 8.363582e+01 1.053275e-03 1.141205e+05
tukey.26c <- mutate(tukey.26c,
Residual = fit$residual)
ggplot(tukey.26c, aes(new.x, Residual)) +
geom_point() +
geom_hline(yintercept = 0)

We firstly take a look at the scatterplot for our dataset and see that it is not linear clearly. We then calculate the three summary points which are (41.5, 1218.5), (48.5, 6367), and (56.5, 23851). Now, we will try to reexpress to reduce curvature in the graph. Using the straightening.work function and experimenting with different reexpression that will get us a half-slope ratio that is close to 1, we find that for x, p = 3 and for y, p = 0.5. This means we will transform our dataset by taking x to the third power and taking the square root of y. After plotting the transformation, we see that it is much straighter to how it was in the beginning. Our new fit is: deposits = 83.63582 + 0.001053275 ((year)^3 - 114120.5) We then plot the residuals of this fit and see there is no pattern which is immediately obvious although we do see some large negative residuals towards the right side of the graph.