Question 1

data5 <- read.csv("C:/Users/KayVog22/Downloads/data5.csv")
head(data5)
##   Height Weight
## 1   66.0    140
## 2   72.0    145
## 3   73.5    160
## 4   73.0    190
## 5   69.0    155
## 6   73.0    165
library(LearnEDAfunctions)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: ggplot2
install.packages("ggplot2")
## Warning: package 'ggplot2' is in use and will not be installed
library(ggplot2)
library(ggplot2)

ggplot(data5, aes(x = Height, y = Weight)) +
  geom_point() +
  labs(title = "Height vs Weight", x = "Height", y = "Weight")

lm_fit <- lm(Weight ~ Height, data = data5)
summary(lm_fit)
## 
## Call:
## lm(formula = Weight ~ Height, data = data5)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.774 -11.758  -1.579   8.774  51.834 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -138.9196    55.4470  -2.505   0.0153 *  
## Height         4.1956     0.7838   5.353 1.81e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.93 on 54 degrees of freedom
## Multiple R-squared:  0.3467, Adjusted R-squared:  0.3346 
## F-statistic: 28.65 on 1 and 54 DF,  p-value: 1.814e-06
ggplot(data5, aes(x = Height, y = Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Least Squares Regression Line")
## `geom_smooth()` using formula = 'y ~ x'

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:LearnEDAfunctions':
## 
##     farms
## The following object is masked from 'package:dplyr':
## 
##     select
rlm_fit <- rlm(Weight ~ Height, data = data5)
summary(rlm_fit)
## 
## Call: rlm(formula = Weight ~ Height, data = data5)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -24.0214 -10.4314  -0.9788   9.8613  52.8933 
## 
## Coefficients:
##             Value     Std. Error t value  
## (Intercept) -128.9651   52.0405    -2.4782
## Height         4.0427    0.7356     5.4955
## 
## Residual standard error: 14.98 on 54 degrees of freedom
ggplot(data5, aes(x = Height, y = Weight)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  geom_abline(intercept = coef(rlm_fit)[1], slope = coef(rlm_fit)[2], color = "green") +
  labs(title = "Least Square vs Resistant Fit")
## `geom_smooth()` using formula = 'y ~ x'

data5$ls_resid <- resid(lm_fit)
par(mfrow = c(1,2))
plot(data5$Height, data5$ls_resid, main="Least Squares Residuals", ylab="Residuals", xlab="Height")
abline(h=0)


data5$rlm_resid <- resid(rlm_fit)
plot(data5$Height, data5$rlm_resid, main="Resistant Residuals", ylab="Residuals", xlab="Height")
abline(h=0)

Both fits show a strong positive linear relationship between height and weight. The least-squares line is influenced more by outliers, while the resistant line provides a more stable fit that better represents the overall trend of the data. Because a few weights are higher than expected for their height, the resistant line gives a slightly better representation of the typical relationship in this dataset.

Question 2

library(LearnEDAfunctions)
data(pop.england)
head(pop.england)
##   YEAR POPULATION
## 1 1801       8.89
## 2 1811      10.16
## 3 1821      12.00
## 4 1831      13.90
## 5 1841      15.91
## 6 1851      17.93
ggplot(pop.england, aes(x = YEAR, y = POPULATION)) +
  geom_point() +
  labs(title = "Population", x = "Year", y = "Population")

ggplot(pop.england,
       aes(YEAR,POPULATION))+
  geom_point()+
  geom_smooth(method = "lm", se=FALSE)+
  xlab("YEAR")+ylab("TIME")
## `geom_smooth()` using formula = 'y ~ x'

resistant_model <- line(pop.england$YEAR, pop.england$POPULATION)
print(resistant_model)
## 
## Call:
## line(pop.england$YEAR, pop.england$POPULATION)
## 
## Coefficients:
## [1]  -476.1686     0.2674
plot(pop.england$YEAR, pop.england$POPULATION, main = "Scatterplot with Resistant Line")
abline(resistant_model, col = "blue", lwd = 2)

Question 3

LearnEDAfunctions::tukey.26a
##    temp rate
## 1    21  248
## 2    22  308
## 3    23  388
## 4    24  465
## 5    25  569
## 6    26  678
## 7    27  806
## 8    28  959
## 9    29  114
## 10   30  139
## 11   31  168
## 12   32  202
## 13   33  236
## 14   34  270
head(tukey.26a)
##   temp rate
## 1   21  248
## 2   22  308
## 3   23  388
## 4   24  465
## 5   25  569
## 6   26  678
data("tukey.26a")
ggplot(tukey.26a, aes(x = temp, y = rate)) +
  geom_point() +
  labs(title = "Temp vs Rate", x = "Temp", y = "Rate")

ggplot(tukey.26a, aes(x = temp, y = rate)) +
  geom_point() +
  labs(title = "Temp vs Rate", x = "Temp", y = "Rate")+
  geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summary.points <- data.frame(x=c(23, 28, 32),
                            y=c(388, 959, 202))
ggplot(tukey.26a, aes(x = temp, y = rate)) +
  geom_point() +
  labs(title = "Temp vs Rate", x = "Temp", y = "Rate")+
  geom_point(data=summary.points,
             aes(x,y), color="red", size=3)

straightening.work <- function(sp, px, py){
   sp$tx <- with(sp, (x ^ px - 1) / px)
   sp$ty <- with(sp, (y ^ py - 1) / py)
   sp$slope[1] <- with(sp, diff(ty[1:2]) / diff(tx[1:2]))
   sp$slope[2] <- with(sp, diff(ty[2:3]) / diff(tx[2:3]))
   sp$half.slope.ratio <- with(sp, slope[2] / slope[1])
   sp$slope[3] <- NA
   sp$half.slope.ratio[2:3] <- NA
   row.names(sp) <- c("Left", "Center", "Right")
   sp}
straightening.work(summary.points, 1, 1)
##         x   y tx  ty   slope half.slope.ratio
## Left   23 388 22 387  114.20         -1.65718
## Center 28 959 27 958 -189.25               NA
## Right  32 202 31 201      NA               NA
straightening.work(summary.points, 0.5, 1)
##         x   y       tx  ty      slope half.slope.ratio
## Left   23 388 7.591663 387   575.9868        -1.798632
## Center 28 959 8.583005 958 -1035.9883               NA
## Right  32 202 9.313708 201         NA               NA
straightening.work(summary.points, 0.001, 1)
##         x   y       tx  ty     slope half.slope.ratio
## Left   23 388 3.140415 387  2893.374        -1.952683
## Center 28 959 3.337762 958 -5649.843               NA
## Right  32 202 3.471749 201        NA               NA
straightening.work(summary.points, 0.001, 0.001)
##         x   y       tx       ty      slope half.slope.ratio
## Left   23 388 3.140415 5.978807   4.614743        -2.534539
## Center 28 959 3.337762 6.889515 -11.696248               NA
## Right  32 202 3.471749 5.322382         NA               NA
straightening.work(summary.points, 0.001, -1)
##         x   y       tx        ty        slope half.slope.ratio
## Left   23 388 3.140415 0.9974227  0.007775964        -3.750698
## Center 28 959 3.337762 0.9989572 -0.029165295               NA
## Right  32 202 3.471749 0.9950495           NA               NA
straightening.work(summary.points, -0.33, -1)
##         x   y       tx        ty       slope half.slope.ratio
## Left   23 388 1.953551 0.9974227  0.02267507        -3.961773
## Center 28 959 2.021227 0.9989572 -0.08983348               NA
## Right  32 202 2.064727 0.9950495          NA               NA
tukey.26a <- mutate(tukey.26a,
                    new.x = rate ^ (.001),
                    new.y = -1/temp)
ggplot(tukey.26a, aes(new.x, new.y)) +
  geom_point()

library(LearnEDAfunctions)
library(dplyr)
library(ggplot2)

data(tukey.26a)
head(tukey.26a)
##   temp rate
## 1   21  248
## 2   22  308
## 3   23  388
## 4   24  465
## 5   25  569
## 6   26  678
ggplot(tukey.26a, aes(temp, rate)) +
  geom_point() 

tukey.26a <- tukey.26a %>%
  mutate(x1 = temp^(0.5),
         y1 = 1 / rate)

ggplot(tukey.26a, aes(x1, y1)) +
  geom_point()+
  labs(x = "sqrt(temp)", y = "1 / rate")

model <- lm(y1 ~ x1, data = tukey.26a)
summary(model)
## 
## Call:
## lm(formula = y1 ~ x1, data = tukey.26a)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0028403 -0.0014452 -0.0004257  0.0012592  0.0046688 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.008549   0.008049  -1.062    0.309
## x1           0.002349   0.001535   1.531    0.152
## 
## Residual standard error: 0.002219 on 12 degrees of freedom
## Multiple R-squared:  0.1633, Adjusted R-squared:  0.09362 
## F-statistic: 2.343 on 1 and 12 DF,  p-value: 0.1518
ggplot(tukey.26a, aes(x1, y1)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'

tukey.26a$resid <- residuals(model)

ggplot(tukey.26a, aes(x = fitted(model), y = resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(title = "Residual Plot for Transformed Model",
       x = "Fitted Values", y = "Residuals")

The relationship between the square root of temp and 1/rate was the best fit.

LearnEDAfunctions::tukey.26b
##    year deposits
## 1    37      458
## 2    38      498
## 3    39      523
## 4    40      643
## 5    41      707
## 6    42      787
## 7    43      839
## 8    44      927
## 9    45     1001
## 10   46     1079
## 11   47     1007
## 12   48     1033
## 13   49     1090
## 14   50     1125
## 15   51     1212
## 16   52     1248
## 17   53     1334
## 18   54     1393
## 19   55     1443
## 20   56     1720
## 21   57     1720
## 22   58     1896
## 23   59     2050
## 24   60     2268
## 25   61     2643
## 26   62     3140
## 27   63     3353
## 28   64     3513
## 29   65     3810
head(tukey.26b)
##   year deposits
## 1   37      458
## 2   38      498
## 3   39      523
## 4   40      643
## 5   41      707
## 6   42      787
data("tukey.26b")
ggplot(tukey.26b, aes(x = year, y = deposits)) +
  geom_point() +
  labs(title = "Year vs Deposits", x = "Year", y = "Deposits")

ggplot(tukey.26b, aes(x = year, y = deposits)) +
  geom_point() +
  labs(title = "Year vs Deposits", x = "Year", y = "Deposits")+
  geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summary.points <- data.frame(x=c(41, 51, 61),
                            y=c(746, 1260, 2710))
ggplot(tukey.26b, aes(x = year, y = deposits)) +
  geom_point() +
  labs(title = "Year vs Deposits", x = "Year", y = "Deposits")+
  geom_point(data=summary.points,
             aes(x,y), color="red", size=3)

straightening.work <- function(sp, px, py){
   sp$tx <- with(sp, (x ^ px - 1) / px)
   sp$ty <- with(sp, (y ^ py - 1) / py)
   sp$slope[1] <- with(sp, diff(ty[1:2]) / diff(tx[1:2]))
   sp$slope[2] <- with(sp, diff(ty[2:3]) / diff(tx[2:3]))
   sp$half.slope.ratio <- with(sp, slope[2] / slope[1])
   sp$slope[3] <- NA
   sp$half.slope.ratio[2:3] <- NA
   row.names(sp) <- c("Left", "Center", "Right")
   sp}
straightening.work(summary.points, 1, 1)
##         x    y tx   ty slope half.slope.ratio
## Left   41  746 40  745  51.4         2.821012
## Center 51 1260 50 1259 145.0               NA
## Right  61 2710 60 2709    NA               NA
straightening.work(summary.points, 0.5, 1)
##         x    y       tx   ty    slope half.slope.ratio
## Left   41  746 10.80625  745  348.095         3.114083
## Center 51 1260 12.28286 1259 1083.997               NA
## Right  61 2710 13.62050 2709       NA               NA
straightening.work(summary.points, 0.001, 1)
##         x    y       tx   ty    slope half.slope.ratio
## Left   41  746 3.720476  745 2346.073         3.438032
## Center 51 1260 3.939565 1259 8065.876               NA
## Right  61 2710 4.119335 2709       NA               NA
straightening.work(summary.points, 0.001, 0.001)
##         x    y       tx       ty    slope half.slope.ratio
## Left   41  746 3.720476 6.636651 2.408871         1.781858
## Center 51 1260 3.939565 7.164409 4.292265               NA
## Right  61 2710 4.119335 7.936029       NA               NA
straightening.work(summary.points, 0.001, -1)
##         x    y       tx        ty       slope half.slope.ratio
## Left   41  746 3.720476 0.9986595 0.002495929        0.9464103
## Center 51 1260 3.939565 0.9992063 0.002362173               NA
## Right  61 2710 4.119335 0.9996310          NA               NA
straightening.work(summary.points, -0.33, -1)
##         x    y       tx        ty       slope half.slope.ratio
## Left   41  746 2.140554 0.9986595 0.008844167         1.010803
## Center 51 1260 2.202384 0.9992063 0.008939711               NA
## Right  61 2710 2.249885 0.9996310          NA               NA
tukey.26b <- mutate(tukey.26b,
                    new.x = year ^ (.001),
                    new.y = -1/deposits)
ggplot(tukey.26b, aes(new.x, new.y)) +
  geom_point()

ggplot(tukey.26b, aes(year, deposits)) +
  geom_point() 

tukey.26b <- tukey.26b %>%
  mutate(x1 = year^(0.5),
         y1 = 1 / deposits)

ggplot(tukey.26b, aes(x1, y1)) +
  geom_point()+
  labs(x = "sqrt(year)", y = "1 / deposits")

model <- lm(y1 ~ x1, data = tukey.26b)
summary(model)
## 
## Call:
## lm(formula = y1 ~ x1, data = tukey.26b)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.529e-04 -8.385e-05  3.600e-07  4.477e-05  4.304e-04 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.738e-03  3.590e-04   18.77  < 2e-16 ***
## x1          -8.196e-04  5.027e-05  -16.30 1.69e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0001597 on 27 degrees of freedom
## Multiple R-squared:  0.9078, Adjusted R-squared:  0.9044 
## F-statistic: 265.8 on 1 and 27 DF,  p-value: 1.686e-15
ggplot(tukey.26b, aes(x1, y1)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'

tukey.26b$resid <- residuals(model)

ggplot(tukey.26b, aes(x = fitted(model), y = resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(title = "Residual Plot for Transformed Model",
       x = "Fitted Values", y = "Residuals")

LearnEDAfunctions::tukey.26c
##    year miles
## 1    37   412
## 2    38   480
## 3    39   683
## 4    40  1052
## 5    41  1385
## 6    42  1418
## 7    43  1634
## 8    44  2178
## 9    45  3362
## 10   46  5948
## 11   47  6109
## 12   48  5981
## 13   49  6753
## 14   50  8003
## 15   51 10566
## 16   52 12528
## 17   53 14760
## 18   54 16769
## 19   55 19819
## 20   56 22362
## 21   57 25340
## 22   58 25343
## 23   59 28269
## 24   60 30514
head(tukey.26c)
##   year miles
## 1   37   412
## 2   38   480
## 3   39   683
## 4   40  1052
## 5   41  1385
## 6   42  1418
data("tukey.26c")
ggplot(tukey.26c, aes(x = year, y = miles)) +
  geom_point() +
  labs(title = "Year vs Miles", x = "Year", y = "Miles")

ggplot(tukey.26c, aes(x = year, y = miles)) +
  geom_point() +
  labs(title = "Year vs Miles", x = "Year", y = "Miles")+
  geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'

summary.points <- data.frame(x=c(41, 49, 57),
                            y=c(1155, 7406, 22897))
ggplot(tukey.26c, aes(x = year, y = miles)) +
  geom_point() +
  labs(title = "Year vs Miles", x = "Year", y = "Miles")+
  geom_point(data=summary.points,
             aes(x,y), color="red", size=3)

straightening.work <- function(sp, px, py){
   sp$tx <- with(sp, (x ^ px - 1) / px)
   sp$ty <- with(sp, (y ^ py - 1) / py)
   sp$slope[1] <- with(sp, diff(ty[1:2]) / diff(tx[1:2]))
   sp$slope[2] <- with(sp, diff(ty[2:3]) / diff(tx[2:3]))
   sp$half.slope.ratio <- with(sp, slope[2] / slope[1])
   sp$slope[3] <- NA
   sp$half.slope.ratio[2:3] <- NA
   row.names(sp) <- c("Left", "Center", "Right")
   sp}
straightening.work(summary.points, 1, 1)
##         x     y tx    ty    slope half.slope.ratio
## Left   41  1155 40  1154  781.375         2.478163
## Center 49  7406 48  7405 1936.375               NA
## Right  57 22897 56 22896       NA               NA
straightening.work(summary.points, 0.5, 1)
##         x     y       tx    ty     slope half.slope.ratio
## Left   41  1155 10.80625  1154  5236.433         2.690184
## Center 49  7406 12.00000  7405 14086.968               NA
## Right  57 22897 13.09967 22896        NA               NA
straightening.work(summary.points, 0.001, 1)
##         x     y       tx    ty     slope half.slope.ratio
## Left   41  1155 3.720476  1154  34935.97         2.920404
## Center 49  7406 3.899403  7405 102027.13               NA
## Right  57 22897 4.051235 22896        NA               NA
straightening.work(summary.points, 0.001, 0.001)
##         x     y       tx        ty     slope half.slope.ratio
## Left   41  1155 3.720476  7.076779 10.468382        0.7168954
## Center 49  7406 3.899403  8.949858  7.504735               NA
## Right  57 22897 4.051235 10.089319        NA               NA
straightening.work(summary.points, 0.001, -1)
##         x     y       tx        ty        slope half.slope.ratio
## Left   41  1155 3.720476 0.9991342 0.0040842008        0.1473148
## Center 49  7406 3.899403 0.9998650 0.0006016631               NA
## Right  57 22897 4.051235 0.9999563           NA               NA
straightening.work(summary.points, -0.33, -1)
##         x     y       tx        ty      slope half.slope.ratio
## Left   41  1155 2.140554 0.9991342 0.01437764         0.155577
## Center 49  7406 2.191381 0.9998650 0.00223683               NA
## Right  57 22897 2.232221 0.9999563         NA               NA
tukey.26c <- mutate(tukey.26c,
                    new.x = year ^ (.001),
                    new.y = -1/miles)
ggplot(tukey.26b, aes(new.x, new.y)) +
  geom_point()

ggplot(tukey.26c, aes(year, miles)) +
  geom_point() 

tukey.26c <- tukey.26c %>%
  mutate(x1 = year^(0.5),
         y1 = 1 / miles)

ggplot(tukey.26c, aes(x1, y1)) +
  geom_point()+
  labs(x = "sqrt(year)", y = "1 / miles")

model <- lm(y1 ~ x1, data = tukey.26c)
summary(model)
## 
## Call:
## lm(formula = y1 ~ x1, data = tukey.26c)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0004642 -0.0002700 -0.0001307  0.0002275  0.0010589 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0077671  0.0011222   6.922 5.98e-07 ***
## x1          -0.0010520  0.0001611  -6.529 1.44e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0003943 on 22 degrees of freedom
## Multiple R-squared:  0.6596, Adjusted R-squared:  0.6441 
## F-statistic: 42.62 on 1 and 22 DF,  p-value: 1.444e-06
ggplot(tukey.26c, aes(x1, y1)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(title = "Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'

tukey.26c$resid <- residuals(model)

ggplot(tukey.26c, aes(x = fitted(model), y = resid)) +
  geom_point() +
  geom_hline(yintercept = 0) +
  labs(title = "Residual Plot for Transformed Model",
       x = "Fitted Values", y = "Residuals")