Problem 1
copier <- read.csv("/Users/caelynsobie/Downloads/MAT613/copier-maintenance.csv")
extra <- data.frame(y = c(132, 166), x = c(6, 5))
enlarged <- rbind(copier,extra)
lm <- lm(copier)
x <- copier$x
y <- copier$y
plot(x,y,pch=4)
abline(lm, col='green',lwd = 2)
summary(lm)
##
## Call:
## lm(formula = copier)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.7723 -3.7371 0.3334 6.3334 15.4039
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.5802 2.8039 -0.207 0.837
## x 15.0352 0.4831 31.123 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.914 on 43 degrees of freedom
## Multiple R-squared: 0.9575, Adjusted R-squared: 0.9565
## F-statistic: 968.7 on 1 and 43 DF, p-value: < 2.2e-16
en_model <- lm(enlarged)
X <- enlarged$x
Y <- enlarged$y
plot(X,Y,pch=4)
abline(en_model, col='blue',lwd = 2)
summary(en_model)
##
## Call:
## lm(formula = enlarged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25.980 -6.761 -2.323 3.903 88.567
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.8860 5.3672 0.351 0.727
## x 15.1094 0.9266 16.307 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.12 on 45 degrees of freedom
## Multiple R-squared: 0.8553, Adjusted R-squared: 0.852
## F-statistic: 265.9 on 1 and 45 DF, p-value: < 2.2e-16
Adding the 2 extra data points changed the intercept from -0.5802 to 1.8860. The slope changed from 15.0352 to 15.1094. So the intercept increased and the slope increased slightly. In both cases the F statistic and b1 estimate are significant based on p-value.
res <- en_model$residuals
med_res <- median(res)
m <- abs(res - med_res)
MAD_en <- (1/.6745)*median(m)
en_ui <- res/MAD_en
k <- 1.345
huber_weights_en <- numeric(length(en_ui))
for (i in seq_along(en_ui)) {
if (abs(en_ui[i]) <= k) {
huber_weights_en[i] <- 1
} else {
huber_weights_en[i] <- k / abs(en_ui[i])
}
}
print("Scaled Residuals")
## [1] "Scaled Residuals"
print(en_ui)
## 1 2 3 4 5 6
## -1.41225984 -0.27108033 -0.14165082 1.03780758 -0.58281107 -1.86434652
## 7 8 9 10 11 12
## -1.10052910 1.34953832 -1.51617008 -0.01222131 0.71515041 0.35238115
## 13 14 15 16 17 18
## 0.05341004 -1.30834959 1.12895820 -2.65551639 -0.30935922 0.96124980
## 19 20 21 22 23 24
## -3.03104529 -0.62108996 -1.33386885 -0.77603873 0.15732029 -0.40050984
## 25 26 27 28 29 30
## -0.45154836 0.50732992 -1.64559959 -0.51717971 0.39066004 0.40341967
## 31 32 33 34 35 36
## 0.02789078 1.46620820 -1.41225984 -0.47890082 -1.63283996 -0.62108996
## 37 38 39 40 41 42
## 0.99952869 -0.68488811 0.50732992 1.16723709 -0.80155799 0.22111844
## 43 44 45 46 47
## -0.59557070 -0.15441045 -0.05050020 4.60353525 10.33311885
print("Huber Weights")
## [1] "Huber Weights"
print( huber_weights_en)
## [1] 0.9523743 1.0000000 1.0000000 1.0000000 1.0000000 0.7214324 1.0000000
## [8] 0.9966371 0.8871036 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 0.5064928 1.0000000 1.0000000 0.4437413 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8173313 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 0.9173322 0.9523743 1.0000000 0.8237182
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 0.2921668 0.1301640
The outliers recieve the smallest Huber Weights. Specifically, the larger the absolute value of the scaled residual, the smaller the Huber Weight. This is because to get the huber weight, we divide by the normalized residual, so a larger residual means a smaller huber weight. C)
w_en_model <- lm(enlarged, weights = huber_weights_en)
summary(w_en_model)
##
## Call:
## lm(formula = enlarged, weights = huber_weights_en)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -17.072 -5.918 -0.497 5.115 32.524
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9235 3.1753 -0.291 0.773
## x 15.3552 0.5571 27.562 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.79 on 45 degrees of freedom
## Multiple R-squared: 0.9441, Adjusted R-squared: 0.9428
## F-statistic: 759.7 on 1 and 45 DF, p-value: < 2.2e-16
The estimates are very similar to the original least squares not with the addition of the two outliers (46 and 47). This shows that the weights make it so the regression is more resistant to outliers.
#obtain residuals
res_w_en <- w_en_model$residuals
#calculate ui with new residuals
med_res_w <- median(res_w_en)
m_w <- abs(res_w_en - med_res_w)
MAD_en_w <- (1/.6745)*median(m_w)
en_ui_w <- res_w_en/MAD_en_w
#get weights
huber_weights_en_w <- numeric(length(en_ui_w))
for (i in seq_along(en_ui_w)) {
if (abs(en_ui_w[i]) <= k) {
huber_weights_en_w[i] <- 1
} else {
huber_weights_en_w[i] <- k / abs(en_ui_w[i])
}
}
print("Scaled Residuals iteration 2")
## [1] "Scaled Residuals iteration 2"
print(en_ui_w)
## 1 2 3 4 5 6
## -1.14287578 -0.05808307 0.10017197 1.30940412 -0.28396933 -1.82503999
## 7 8 9 10 11 12
## -0.91698952 1.53529039 -1.21817120 0.25842702 0.78549497 0.39368945
## 13 14 15 16 17 18
## 0.20928300 -1.06758036 1.34321973 -2.55950700 -0.18252251 1.06052525
## 19 20 21 22 23 24
## -2.99279233 -0.40840877 -1.15053999 -0.64962345 0.28457842 -0.21633812
## 25 26 27 28 29 30
## -0.38225737 0.63490412 -1.37642625 -0.33311335 0.51812889 0.55960870
## 31 32 33 34 35 36
## 0.12632338 1.65206562 -1.14287578 -0.20867391 -1.33494644 -0.40840877
## 37 38 39 40 41 42
## 1.18496468 -0.61580784 0.63490412 1.46765917 -0.73258307 0.49197749
## 43 44 45 46 47
## -0.32544915 0.05869216 0.13398758 4.76351712 10.52698338
print("Huber Weights Iteration 2")
## [1] "Huber Weights Iteration 2"
print(huber_weights_en_w)
## [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.7369702 1.0000000
## [8] 0.8760558 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 0.5254918 1.0000000 1.0000000 0.4494131 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.9771682 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 0.8141323 1.0000000 1.0000000 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 0.9164253 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 0.2823544 0.1277669
#second iteration
w_en_model2 <- lm(enlarged, weights = huber_weights_en_w)
#obtain residuals
res_w_en2 <- w_en_model2$residuals
#calculate ui with new residuals
med_res_w2 <- median(res_w_en2)
m_w2 <- abs(res_w_en2 - med_res_w2)
MAD_en_w2 <- (1/.6745)*median(m_w2)
en_ui_w2 <- res_w_en2/MAD_en_w2
#get weights
huber_weights_en_w2 <- numeric(length(en_ui_w2))
for (i in seq_along(en_ui_w2)) {
if (abs(en_ui_w2[i]) <= k) {
huber_weights_en_w2[i] <- 1
} else {
huber_weights_en_w2[i] <- k / abs(en_ui_w2[i])
}
}
print("Scaled Residuals iteration 2")
## [1] "Scaled Residuals iteration 2"
print(en_ui_w2)
## 1 2 3 4 5 6
## -1.09721695 -0.01931019 0.14850654 1.37647843 -0.22263012 -1.85077568
## 7 8 9 10 11 12
## -0.89389702 1.57979835 -1.16499026 0.31632326 0.79073643 0.38732967
## 13 14 15 16 17 18
## 0.23403145 -1.02944364 1.39423003 -2.57529739 -0.16937532 1.07634817
## 19 20 21 22 23 24
## -3.02872586 -0.37269524 -1.12948706 -0.64055539 0.30180476 -0.18712692
## 25 26 27 28 29 30
## -0.38721375 0.65518981 -1.33280699 -0.30492193 0.53739479 0.58741650
## 31 32 33 34 35 36
## 0.13398803 1.69759337 -1.09721695 -0.15485681 -1.28278528 -0.37269524
## 37 38 39 40 41 42
## 1.22641330 -0.62280379 0.65518981 1.54429515 -0.74059881 0.55191330
## 43 44 45 46 47
## -0.27265183 0.09848483 0.16625814 4.82803715 10.65001474
print("Huber Weights Iteration 2")
## [1] "Huber Weights Iteration 2"
print(huber_weights_en_w2)
## [1] 1.0000000 1.0000000 1.0000000 0.9771312 1.0000000 0.7267223 1.0000000
## [8] 0.8513745 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 0.9646902 0.5222698 1.0000000 1.0000000 0.4440811 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 0.7922981 1.0000000 1.0000000 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 0.8709475 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 0.2785811 0.1262909
#do regression again
w_en_model3 <- lm(enlarged, weights = huber_weights_en_w2)
summary(w_en_model3)
##
## Call:
## lm(formula = enlarged, weights = huber_weights_en_w2)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -17.165 -5.871 -0.097 5.295 32.147
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.6778 3.1174 -0.538 0.593
## x 15.4436 0.5487 28.147 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9.722 on 45 degrees of freedom
## Multiple R-squared: 0.9463, Adjusted R-squared: 0.9451
## F-statistic: 792.3 on 1 and 45 DF, p-value: < 2.2e-16
Once again, the cases with the smallest huber weights (46,47,19,16,6,32) are the ones with the largest normalized residuals.
This has the lowest intercept value so far with the same estimated coefficient for x as the rest. The p-value is significant for b_1.
plot(X,Y,pch=4)
lines(X, en_model$fitted.values, col = rgb(70/255, 130/255, 180/255), lwd = 2, pch = 1) # OLS fit
lines(X, w_en_model3$fitted.values, col = rgb(95/255, 158/255, 72/255), pch = 1) # IRLS fit
legend("topleft", legend = c("OLS Fit", "IRLS Fit"),
col = c( rgb(70/255, 130/255, 180/255), rgb(95/255, 158/255, 72/255)), lwd = 2)
I do not think it is a substantial difference, especially as x gets larger, but at the intercept we can see that IRLS is lower and may be a better fit for our data. Overall I do not think it is too fdifferent.
PROBLEM 2
logistic_function <- function(X, beta0, beta1) {
1 / (1 + exp(-(beta0 + beta1 * X)))
}
log_x <- seq(0, 120, length.out = 100)
# Parameters
beta0 <- 20
beta1 <- -0.2
mean_response <- logistic_function(log_x, beta0, beta1)
plot(log_x, mean_response,type = "l", col = rgb(95/255, 158/255, 72/255), lwd = 2,
xlab = "X",
ylab = "Mean Response",
main = "Logistic Mean Response Function")
odds_125 <- exp(beta0 + beta1 * 125)
odds_126 <- exp(beta0 + beta1 * 126)
odds_ratio <- odds_126 / odds_125
exp_beta_1 <- exp(beta1)
cat("Odds at X = 125:", odds_125, "\n")
## Odds at X = 125: 0.006737947
cat("Odds at X = 126:", odds_126, "\n")
## Odds at X = 126: 0.005516564
cat("Odds Ratio:", odds_ratio, "\n")
## Odds Ratio: 0.8187308
cat("exp(beta1):", exp_beta_1, "\n")
## exp(beta1): 0.8187308
Yes, Odds Ratio = exp(beta1) as expected. PROBLEM 3
performance <- read.csv("/Users/caelynsobie/Downloads/MAT613/performance-ability.csv")
model_perform <- glm(performance, family = binomial)
beta_0 <- model_perform$coefficients[1]
beta_1 <- model_perform$coefficients[2]
print(paste("Estimated Intercept beto0:", beta_0))
## [1] "Estimated Intercept beto0: -10.3089251781477"
print(paste("Estimated Slope beta1:", beta_1))
## [1] "Estimated Slope beta1: 0.0189198293852957"
print(paste("Fitted Response Function: logit E(Yi) = (1 + exp(-(", beta_0, "+", beta_1, "*Xi))^-1"))
## [1] "Fitted Response Function: logit E(Yi) = (1 + exp(-( -10.3089251781477 + 0.0189198293852957 *Xi))^-1"
#lowess model
lowess <- loess(performance)
# Plot the original data
plot(performance$X, performance$Y)
seq <- seq(min(performance$X), max(performance$X), length.out = 100)
fitted_probs <- predict(model_perform, newdata = data.frame(X = seq), type = "response")
fitted_lowess <- predict(lowess, newdata = data.frame(X = seq), type = "response")
lines(seq, fitted_probs, lwd = 2, col = "green")
lines(seq, fitted_lowess, lwd = 2, col = "blue")
c)
exp(beta_1)
## X
## 1.0191
The estimated odds are multipled by exp(b1)= 1.0191 for any unit increase in X.
logistic_function(550,beta_0,beta_1)
## (Intercept)
## 0.5242263
(log(.7/.3) - beta_0)/beta_1
## (Intercept)
## 589.6577
PROBLEM 4
below <- data.frame(
value = c(29,42,38,40,43,40,30,42),
group = "Below_Average"
)
avg <- data.frame(
value = c(30,35,39,28,31,31,29,35,29,33),
group = "Average"
)
above <- data.frame(
value = c(26,32,21,20,23,22),
group = "Above_Average"
)
rehab <- rbind(below, avg, above)
# Create strip chart
stripchart(value ~ group,
data = rehab,
pch = 1,
xlab = "Prior Physical Fitness Status",
ylab = "Days to Recovery",
vertical = TRUE)
Below_Average = c(29,42,38,40,43,40,30,42, NA, NA)
Average = c(30,35,39,28,31,31,29,35,29,33)
Above_Average = c(26,32,21,20,23,22, NA, NA, NA, NA)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
knee <- bind_cols(
Below_Average = Below_Average,
Average = Average,
Above_Average = Above_Average
)
#fitted values
anov <- aov(value ~ group, data = rehab)
anov_fit <- anov$fitted.values
anov_fit
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
## 38 38 38 38 38 38 38 38 32 32 32 32 32 32 32 32 32 32 24 24 24 24 24 24
anov_res <- anov$residuals
sum(anov_res)
## [1] -2.642331e-14
This is approximately zero, so yes, they sum to zero.
#anova table
anov_table <- summary(anov)
anov_table
## Df Sum Sq Mean Sq F value Pr(>F)
## group 2 672 336.0 16.96 4.13e-05 ***
## Residuals 21 416 19.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSTo <- anov_table[[1]]$`Sum Sq`[1] + anov_table[[1]]$`Sum Sq`[2]
df_total <- anov_table[[1]]$`Df`[1] + anov_table[[1]]$`Df`[2]
print(paste("SSTo =", SSTo))
## [1] "SSTo = 1088"
print(paste("Total Degrees of Freedom = ", df_total))
## [1] "Total Degrees of Freedom = 23"
#H_0 : mu1 = mu2 = mu3
#H_a : not all mu_i are equal
MSTr <- anov_table[[1]]$`Mean Sq`[1]
MSE <- anov_table[[1]]$`Mean Sq`[2]
# Calculate the test statistic
test_stat <- MSTr / MSE
r <- 3
nt<- 8+10+6
df1 <- r-1
df2 <-nt - r
alpha <- 0.01
#critical value
critical_value <- pf(1-alpha, df1, df2)
print(paste("Test Statistic: ", test_stat))
## [1] "Test Statistic: 16.9615384615385"
print(paste("Critical Value: ", critical_value))
## [1] "Critical Value: 0.611735808180911"
print("Test Statistic > Critical Value. Hence conclude Ha. The mean number of days required for successful rehabilitation is not the same for the three fitness groups.")
## [1] "Test Statistic > Critical Value. Hence conclude Ha. The mean number of days required for successful rehabilitation is not the same for the three fitness groups."
#p value
p_value <- pf(test_stat, df1, df2, lower.tail = FALSE)
p_value
## [1] 4.129301e-05
This p-value < 0.01. Hence we reject the null hypothesis and accept the alternative, just as with the test statistic.
mu1 <- mean(Below_Average, na.rm = TRUE)
mu2 <- mean(Average)
mu3 <- mean(Above_Average, na.rm = TRUE)
print(c(mu1,mu2,mu3))
## [1] 38 32 24