data5 <- read.csv("C:/Users/KayVog22/Downloads/data5.csv")
head(data5)
## Height Weight
## 1 66.0 140
## 2 72.0 145
## 3 73.5 160
## 4 73.0 190
## 5 69.0 155
## 6 73.0 165
library(LearnEDAfunctions)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Loading required package: ggplot2
install.packages("ggplot2")
## Warning: package 'ggplot2' is in use and will not be installed
library(ggplot2)
library(ggplot2)
ggplot(data5, aes(x = Height, y = Weight)) +
geom_point() +
labs(title = "Height vs Weight", x = "Height", y = "Weight")
lm_fit <- lm(Weight ~ Height, data = data5)
summary(lm_fit)
##
## Call:
## lm(formula = Weight ~ Height, data = data5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.774 -11.758 -1.579 8.774 51.834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -138.9196 55.4470 -2.505 0.0153 *
## Height 4.1956 0.7838 5.353 1.81e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.93 on 54 degrees of freedom
## Multiple R-squared: 0.3467, Adjusted R-squared: 0.3346
## F-statistic: 28.65 on 1 and 54 DF, p-value: 1.814e-06
ggplot(data5, aes(x = Height, y = Weight)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Least Squares Regression Line")
## `geom_smooth()` using formula = 'y ~ x'
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:LearnEDAfunctions':
##
## farms
## The following object is masked from 'package:dplyr':
##
## select
rlm_fit <- rlm(Weight ~ Height, data = data5)
summary(rlm_fit)
##
## Call: rlm(formula = Weight ~ Height, data = data5)
## Residuals:
## Min 1Q Median 3Q Max
## -24.0214 -10.4314 -0.9788 9.8613 52.8933
##
## Coefficients:
## Value Std. Error t value
## (Intercept) -128.9651 52.0405 -2.4782
## Height 4.0427 0.7356 5.4955
##
## Residual standard error: 14.98 on 54 degrees of freedom
ggplot(data5, aes(x = Height, y = Weight)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") +
geom_abline(intercept = coef(rlm_fit)[1], slope = coef(rlm_fit)[2], color = "green") +
labs(title = "Least Square vs Resistant Fit")
## `geom_smooth()` using formula = 'y ~ x'
data5$ls_resid <- resid(lm_fit)
par(mfrow = c(1,2))
plot(data5$Height, data5$ls_resid, main="Least Squares Residuals", ylab="Residuals", xlab="Height")
abline(h=0)
data5$rlm_resid <- resid(rlm_fit)
plot(data5$Height, data5$rlm_resid, main="Resistant Residuals", ylab="Residuals", xlab="Height")
abline(h=0)
Both fits show a strong positive linear relationship between height and
weight. The least-squares line is influenced more by outliers, while the
resistant line provides a more stable fit that better represents the
overall trend of the data. Because a few weights are higher than
expected for their height, the resistant line gives a slightly better
representation of the typical relationship in this dataset.
library(LearnEDAfunctions)
data(pop.england)
head(pop.england)
## YEAR POPULATION
## 1 1801 8.89
## 2 1811 10.16
## 3 1821 12.00
## 4 1831 13.90
## 5 1841 15.91
## 6 1851 17.93
ggplot(pop.england, aes(x = YEAR, y = POPULATION)) +
geom_point() +
labs(title = "Population", x = "Year", y = "Population")
ggplot(pop.england,
aes(YEAR,POPULATION))+
geom_point()+
geom_smooth(method = "lm", se=FALSE)+
xlab("YEAR")+ylab("TIME")
## `geom_smooth()` using formula = 'y ~ x'
resistant_model <- line(pop.england$YEAR, pop.england$POPULATION)
print(resistant_model)
##
## Call:
## line(pop.england$YEAR, pop.england$POPULATION)
##
## Coefficients:
## [1] -476.1686 0.2674
plot(pop.england$YEAR, pop.england$POPULATION, main = "Scatterplot with Resistant Line")
abline(resistant_model, col = "blue", lwd = 2)
LearnEDAfunctions::tukey.26a
## temp rate
## 1 21 248
## 2 22 308
## 3 23 388
## 4 24 465
## 5 25 569
## 6 26 678
## 7 27 806
## 8 28 959
## 9 29 114
## 10 30 139
## 11 31 168
## 12 32 202
## 13 33 236
## 14 34 270
head(tukey.26a)
## temp rate
## 1 21 248
## 2 22 308
## 3 23 388
## 4 24 465
## 5 25 569
## 6 26 678
data("tukey.26a")
ggplot(tukey.26a, aes(x = temp, y = rate)) +
geom_point() +
labs(title = "Temp vs Rate", x = "Temp", y = "Rate")
ggplot(tukey.26a, aes(x = temp, y = rate)) +
geom_point() +
labs(title = "Temp vs Rate", x = "Temp", y = "Rate")+
geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
summary.points <- data.frame(x=c(23, 28, 32),
y=c(388, 959, 202))
ggplot(tukey.26a, aes(x = temp, y = rate)) +
geom_point() +
labs(title = "Temp vs Rate", x = "Temp", y = "Rate")+
geom_point(data=summary.points,
aes(x,y), color="red", size=3)
straightening.work <- function(sp, px, py){
sp$tx <- with(sp, (x ^ px - 1) / px)
sp$ty <- with(sp, (y ^ py - 1) / py)
sp$slope[1] <- with(sp, diff(ty[1:2]) / diff(tx[1:2]))
sp$slope[2] <- with(sp, diff(ty[2:3]) / diff(tx[2:3]))
sp$half.slope.ratio <- with(sp, slope[2] / slope[1])
sp$slope[3] <- NA
sp$half.slope.ratio[2:3] <- NA
row.names(sp) <- c("Left", "Center", "Right")
sp}
straightening.work(summary.points, 1, 1)
## x y tx ty slope half.slope.ratio
## Left 23 388 22 387 114.20 -1.65718
## Center 28 959 27 958 -189.25 NA
## Right 32 202 31 201 NA NA
straightening.work(summary.points, 0.5, 1)
## x y tx ty slope half.slope.ratio
## Left 23 388 7.591663 387 575.9868 -1.798632
## Center 28 959 8.583005 958 -1035.9883 NA
## Right 32 202 9.313708 201 NA NA
straightening.work(summary.points, 0.001, 1)
## x y tx ty slope half.slope.ratio
## Left 23 388 3.140415 387 2893.374 -1.952683
## Center 28 959 3.337762 958 -5649.843 NA
## Right 32 202 3.471749 201 NA NA
straightening.work(summary.points, 0.001, 0.001)
## x y tx ty slope half.slope.ratio
## Left 23 388 3.140415 5.978807 4.614743 -2.534539
## Center 28 959 3.337762 6.889515 -11.696248 NA
## Right 32 202 3.471749 5.322382 NA NA
straightening.work(summary.points, 0.001, -1)
## x y tx ty slope half.slope.ratio
## Left 23 388 3.140415 0.9974227 0.007775964 -3.750698
## Center 28 959 3.337762 0.9989572 -0.029165295 NA
## Right 32 202 3.471749 0.9950495 NA NA
straightening.work(summary.points, -0.33, -1)
## x y tx ty slope half.slope.ratio
## Left 23 388 1.953551 0.9974227 0.02267507 -3.961773
## Center 28 959 2.021227 0.9989572 -0.08983348 NA
## Right 32 202 2.064727 0.9950495 NA NA
tukey.26a <- mutate(tukey.26a,
new.x = rate ^ (.001),
new.y = -1/temp)
ggplot(tukey.26a, aes(new.x, new.y)) +
geom_point()
library(LearnEDAfunctions)
library(dplyr)
library(ggplot2)
data(tukey.26a)
head(tukey.26a)
## temp rate
## 1 21 248
## 2 22 308
## 3 23 388
## 4 24 465
## 5 25 569
## 6 26 678
ggplot(tukey.26a, aes(temp, rate)) +
geom_point()
tukey.26a <- tukey.26a %>%
mutate(x1 = temp^(0.5),
y1 = 1 / rate)
ggplot(tukey.26a, aes(x1, y1)) +
geom_point()+
labs(x = "sqrt(temp)", y = "1 / rate")
model <- lm(y1 ~ x1, data = tukey.26a)
summary(model)
##
## Call:
## lm(formula = y1 ~ x1, data = tukey.26a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0028403 -0.0014452 -0.0004257 0.0012592 0.0046688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.008549 0.008049 -1.062 0.309
## x1 0.002349 0.001535 1.531 0.152
##
## Residual standard error: 0.002219 on 12 degrees of freedom
## Multiple R-squared: 0.1633, Adjusted R-squared: 0.09362
## F-statistic: 2.343 on 1 and 12 DF, p-value: 0.1518
ggplot(tukey.26a, aes(x1, y1)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'
tukey.26a$resid <- residuals(model)
ggplot(tukey.26a, aes(x = fitted(model), y = resid)) +
geom_point() +
geom_hline(yintercept = 0) +
labs(title = "Residual Plot for Transformed Model",
x = "Fitted Values", y = "Residuals")
The relationship between the square root of temp and 1/rate was the best fit.
LearnEDAfunctions::tukey.26b
## year deposits
## 1 37 458
## 2 38 498
## 3 39 523
## 4 40 643
## 5 41 707
## 6 42 787
## 7 43 839
## 8 44 927
## 9 45 1001
## 10 46 1079
## 11 47 1007
## 12 48 1033
## 13 49 1090
## 14 50 1125
## 15 51 1212
## 16 52 1248
## 17 53 1334
## 18 54 1393
## 19 55 1443
## 20 56 1720
## 21 57 1720
## 22 58 1896
## 23 59 2050
## 24 60 2268
## 25 61 2643
## 26 62 3140
## 27 63 3353
## 28 64 3513
## 29 65 3810
head(tukey.26b)
## year deposits
## 1 37 458
## 2 38 498
## 3 39 523
## 4 40 643
## 5 41 707
## 6 42 787
data("tukey.26b")
ggplot(tukey.26b, aes(x = year, y = deposits)) +
geom_point() +
labs(title = "Year vs Deposits", x = "Year", y = "Deposits")
ggplot(tukey.26b, aes(x = year, y = deposits)) +
geom_point() +
labs(title = "Year vs Deposits", x = "Year", y = "Deposits")+
geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
summary.points <- data.frame(x=c(41, 51, 61),
y=c(746, 1260, 2710))
ggplot(tukey.26b, aes(x = year, y = deposits)) +
geom_point() +
labs(title = "Year vs Deposits", x = "Year", y = "Deposits")+
geom_point(data=summary.points,
aes(x,y), color="red", size=3)
straightening.work <- function(sp, px, py){
sp$tx <- with(sp, (x ^ px - 1) / px)
sp$ty <- with(sp, (y ^ py - 1) / py)
sp$slope[1] <- with(sp, diff(ty[1:2]) / diff(tx[1:2]))
sp$slope[2] <- with(sp, diff(ty[2:3]) / diff(tx[2:3]))
sp$half.slope.ratio <- with(sp, slope[2] / slope[1])
sp$slope[3] <- NA
sp$half.slope.ratio[2:3] <- NA
row.names(sp) <- c("Left", "Center", "Right")
sp}
straightening.work(summary.points, 1, 1)
## x y tx ty slope half.slope.ratio
## Left 41 746 40 745 51.4 2.821012
## Center 51 1260 50 1259 145.0 NA
## Right 61 2710 60 2709 NA NA
straightening.work(summary.points, 0.5, 1)
## x y tx ty slope half.slope.ratio
## Left 41 746 10.80625 745 348.095 3.114083
## Center 51 1260 12.28286 1259 1083.997 NA
## Right 61 2710 13.62050 2709 NA NA
straightening.work(summary.points, 0.001, 1)
## x y tx ty slope half.slope.ratio
## Left 41 746 3.720476 745 2346.073 3.438032
## Center 51 1260 3.939565 1259 8065.876 NA
## Right 61 2710 4.119335 2709 NA NA
straightening.work(summary.points, 0.001, 0.001)
## x y tx ty slope half.slope.ratio
## Left 41 746 3.720476 6.636651 2.408871 1.781858
## Center 51 1260 3.939565 7.164409 4.292265 NA
## Right 61 2710 4.119335 7.936029 NA NA
straightening.work(summary.points, 0.001, -1)
## x y tx ty slope half.slope.ratio
## Left 41 746 3.720476 0.9986595 0.002495929 0.9464103
## Center 51 1260 3.939565 0.9992063 0.002362173 NA
## Right 61 2710 4.119335 0.9996310 NA NA
straightening.work(summary.points, -0.33, -1)
## x y tx ty slope half.slope.ratio
## Left 41 746 2.140554 0.9986595 0.008844167 1.010803
## Center 51 1260 2.202384 0.9992063 0.008939711 NA
## Right 61 2710 2.249885 0.9996310 NA NA
tukey.26b <- mutate(tukey.26b,
new.x = year ^ (.001),
new.y = -1/deposits)
ggplot(tukey.26b, aes(new.x, new.y)) +
geom_point()
ggplot(tukey.26b, aes(year, deposits)) +
geom_point()
tukey.26b <- tukey.26b %>%
mutate(x1 = year^(0.5),
y1 = 1 / deposits)
ggplot(tukey.26b, aes(x1, y1)) +
geom_point()+
labs(x = "sqrt(year)", y = "1 / deposits")
model <- lm(y1 ~ x1, data = tukey.26b)
summary(model)
##
## Call:
## lm(formula = y1 ~ x1, data = tukey.26b)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.529e-04 -8.385e-05 3.600e-07 4.477e-05 4.304e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.738e-03 3.590e-04 18.77 < 2e-16 ***
## x1 -8.196e-04 5.027e-05 -16.30 1.69e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0001597 on 27 degrees of freedom
## Multiple R-squared: 0.9078, Adjusted R-squared: 0.9044
## F-statistic: 265.8 on 1 and 27 DF, p-value: 1.686e-15
ggplot(tukey.26b, aes(x1, y1)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'
tukey.26b$resid <- residuals(model)
ggplot(tukey.26b, aes(x = fitted(model), y = resid)) +
geom_point() +
geom_hline(yintercept = 0) +
labs(title = "Residual Plot for Transformed Model",
x = "Fitted Values", y = "Residuals")
LearnEDAfunctions::tukey.26c
## year miles
## 1 37 412
## 2 38 480
## 3 39 683
## 4 40 1052
## 5 41 1385
## 6 42 1418
## 7 43 1634
## 8 44 2178
## 9 45 3362
## 10 46 5948
## 11 47 6109
## 12 48 5981
## 13 49 6753
## 14 50 8003
## 15 51 10566
## 16 52 12528
## 17 53 14760
## 18 54 16769
## 19 55 19819
## 20 56 22362
## 21 57 25340
## 22 58 25343
## 23 59 28269
## 24 60 30514
head(tukey.26c)
## year miles
## 1 37 412
## 2 38 480
## 3 39 683
## 4 40 1052
## 5 41 1385
## 6 42 1418
data("tukey.26c")
ggplot(tukey.26c, aes(x = year, y = miles)) +
geom_point() +
labs(title = "Year vs Miles", x = "Year", y = "Miles")
ggplot(tukey.26c, aes(x = year, y = miles)) +
geom_point() +
labs(title = "Year vs Miles", x = "Year", y = "Miles")+
geom_smooth(method="loess", se=FALSE)
## `geom_smooth()` using formula = 'y ~ x'
summary.points <- data.frame(x=c(41, 49, 57),
y=c(1155, 7406, 22897))
ggplot(tukey.26c, aes(x = year, y = miles)) +
geom_point() +
labs(title = "Year vs Miles", x = "Year", y = "Miles")+
geom_point(data=summary.points,
aes(x,y), color="red", size=3)
straightening.work <- function(sp, px, py){
sp$tx <- with(sp, (x ^ px - 1) / px)
sp$ty <- with(sp, (y ^ py - 1) / py)
sp$slope[1] <- with(sp, diff(ty[1:2]) / diff(tx[1:2]))
sp$slope[2] <- with(sp, diff(ty[2:3]) / diff(tx[2:3]))
sp$half.slope.ratio <- with(sp, slope[2] / slope[1])
sp$slope[3] <- NA
sp$half.slope.ratio[2:3] <- NA
row.names(sp) <- c("Left", "Center", "Right")
sp}
straightening.work(summary.points, 1, 1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 40 1154 781.375 2.478163
## Center 49 7406 48 7405 1936.375 NA
## Right 57 22897 56 22896 NA NA
straightening.work(summary.points, 0.5, 1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 10.80625 1154 5236.433 2.690184
## Center 49 7406 12.00000 7405 14086.968 NA
## Right 57 22897 13.09967 22896 NA NA
straightening.work(summary.points, 0.001, 1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 3.720476 1154 34935.97 2.920404
## Center 49 7406 3.899403 7405 102027.13 NA
## Right 57 22897 4.051235 22896 NA NA
straightening.work(summary.points, 0.001, 0.001)
## x y tx ty slope half.slope.ratio
## Left 41 1155 3.720476 7.076779 10.468382 0.7168954
## Center 49 7406 3.899403 8.949858 7.504735 NA
## Right 57 22897 4.051235 10.089319 NA NA
straightening.work(summary.points, 0.001, -1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 3.720476 0.9991342 0.0040842008 0.1473148
## Center 49 7406 3.899403 0.9998650 0.0006016631 NA
## Right 57 22897 4.051235 0.9999563 NA NA
straightening.work(summary.points, -0.33, -1)
## x y tx ty slope half.slope.ratio
## Left 41 1155 2.140554 0.9991342 0.01437764 0.155577
## Center 49 7406 2.191381 0.9998650 0.00223683 NA
## Right 57 22897 2.232221 0.9999563 NA NA
tukey.26c <- mutate(tukey.26c,
new.x = year ^ (.001),
new.y = -1/miles)
ggplot(tukey.26b, aes(new.x, new.y)) +
geom_point()
ggplot(tukey.26c, aes(year, miles)) +
geom_point()
tukey.26c <- tukey.26c %>%
mutate(x1 = year^(0.5),
y1 = 1 / miles)
ggplot(tukey.26c, aes(x1, y1)) +
geom_point()+
labs(x = "sqrt(year)", y = "1 / miles")
model <- lm(y1 ~ x1, data = tukey.26c)
summary(model)
##
## Call:
## lm(formula = y1 ~ x1, data = tukey.26c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0004642 -0.0002700 -0.0001307 0.0002275 0.0010589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0077671 0.0011222 6.922 5.98e-07 ***
## x1 -0.0010520 0.0001611 -6.529 1.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0003943 on 22 degrees of freedom
## Multiple R-squared: 0.6596, Adjusted R-squared: 0.6441
## F-statistic: 42.62 on 1 and 22 DF, p-value: 1.444e-06
ggplot(tukey.26c, aes(x1, y1)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Fitted Line on Transformed Data")
## `geom_smooth()` using formula = 'y ~ x'
tukey.26c$resid <- residuals(model)
ggplot(tukey.26c, aes(x = fitted(model), y = resid)) +
geom_point() +
geom_hline(yintercept = 0) +
labs(title = "Residual Plot for Transformed Model",
x = "Fitted Values", y = "Residuals")