Problem 1

copier <- read.csv("/Users/caelynsobie/Downloads/MAT613/copier-maintenance.csv")
extra <- data.frame(y = c(132, 166), x = c(6, 5))
enlarged <- rbind(copier,extra)
lm <- lm(copier)
x <- copier$x
y <- copier$y
plot(x,y,pch=4)
abline(lm, col='green',lwd = 2)

summary(lm)
## 
## Call:
## lm(formula = copier)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.7723  -3.7371   0.3334   6.3334  15.4039 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.5802     2.8039  -0.207    0.837    
## x            15.0352     0.4831  31.123   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.914 on 43 degrees of freedom
## Multiple R-squared:  0.9575, Adjusted R-squared:  0.9565 
## F-statistic: 968.7 on 1 and 43 DF,  p-value: < 2.2e-16
en_model <- lm(enlarged)
X <- enlarged$x
Y <- enlarged$y
plot(X,Y,pch=4)
abline(en_model, col='blue',lwd = 2)

summary(en_model)
## 
## Call:
## lm(formula = enlarged)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -25.980  -6.761  -2.323   3.903  88.567 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.8860     5.3672   0.351    0.727    
## x            15.1094     0.9266  16.307   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.12 on 45 degrees of freedom
## Multiple R-squared:  0.8553, Adjusted R-squared:  0.852 
## F-statistic: 265.9 on 1 and 45 DF,  p-value: < 2.2e-16

Adding the 2 extra data points changed the intercept from -0.5802 to 1.8860. The slope changed from 15.0352 to 15.1094. So the intercept increased and the slope increased slightly. In both cases the F statistic and b1 estimate are significant based on p-value.

res <- en_model$residuals
med_res <- median(res)
m <- abs(res - med_res)
MAD_en <- (1/.6745)*median(m)
en_ui <- res/MAD_en
k <- 1.345

huber_weights_en <- numeric(length(en_ui))
for (i in seq_along(en_ui)) {
  if (abs(en_ui[i]) <= k) {
  huber_weights_en[i] <- 1
  } else {
  huber_weights_en[i] <- k / abs(en_ui[i])
  }
}
print("Scaled Residuals")
## [1] "Scaled Residuals"
print(en_ui)
##           1           2           3           4           5           6 
## -1.41225984 -0.27108033 -0.14165082  1.03780758 -0.58281107 -1.86434652 
##           7           8           9          10          11          12 
## -1.10052910  1.34953832 -1.51617008 -0.01222131  0.71515041  0.35238115 
##          13          14          15          16          17          18 
##  0.05341004 -1.30834959  1.12895820 -2.65551639 -0.30935922  0.96124980 
##          19          20          21          22          23          24 
## -3.03104529 -0.62108996 -1.33386885 -0.77603873  0.15732029 -0.40050984 
##          25          26          27          28          29          30 
## -0.45154836  0.50732992 -1.64559959 -0.51717971  0.39066004  0.40341967 
##          31          32          33          34          35          36 
##  0.02789078  1.46620820 -1.41225984 -0.47890082 -1.63283996 -0.62108996 
##          37          38          39          40          41          42 
##  0.99952869 -0.68488811  0.50732992  1.16723709 -0.80155799  0.22111844 
##          43          44          45          46          47 
## -0.59557070 -0.15441045 -0.05050020  4.60353525 10.33311885
print("Huber Weights")
## [1] "Huber Weights"
print( huber_weights_en)
##  [1] 0.9523743 1.0000000 1.0000000 1.0000000 1.0000000 0.7214324 1.0000000
##  [8] 0.9966371 0.8871036 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 0.5064928 1.0000000 1.0000000 0.4437413 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.8173313 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 0.9173322 0.9523743 1.0000000 0.8237182
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 0.2921668 0.1301640

The outliers recieve the smallest Huber Weights. Specifically, the larger the absolute value of the scaled residual, the smaller the Huber Weight. This is because to get the huber weight, we divide by the normalized residual, so a larger residual means a smaller huber weight. C)

w_en_model <- lm(enlarged, weights = huber_weights_en)
summary(w_en_model)
## 
## Call:
## lm(formula = enlarged, weights = huber_weights_en)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.072  -5.918  -0.497   5.115  32.524 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.9235     3.1753  -0.291    0.773    
## x            15.3552     0.5571  27.562   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.79 on 45 degrees of freedom
## Multiple R-squared:  0.9441, Adjusted R-squared:  0.9428 
## F-statistic: 759.7 on 1 and 45 DF,  p-value: < 2.2e-16

The estimates are very similar to the original least squares not with the addition of the two outliers (46 and 47). This shows that the weights make it so the regression is more resistant to outliers.

#obtain residuals
res_w_en <- w_en_model$residuals

#calculate ui with new residuals
med_res_w <- median(res_w_en)
m_w <- abs(res_w_en - med_res_w)
MAD_en_w <- (1/.6745)*median(m_w)
en_ui_w <- res_w_en/MAD_en_w

#get weights
huber_weights_en_w <- numeric(length(en_ui_w))
for (i in seq_along(en_ui_w)) {
  if (abs(en_ui_w[i]) <= k) {
  huber_weights_en_w[i] <- 1
  } else {
  huber_weights_en_w[i] <- k / abs(en_ui_w[i])
  }
}

print("Scaled Residuals iteration 2")
## [1] "Scaled Residuals iteration 2"
print(en_ui_w)
##           1           2           3           4           5           6 
## -1.14287578 -0.05808307  0.10017197  1.30940412 -0.28396933 -1.82503999 
##           7           8           9          10          11          12 
## -0.91698952  1.53529039 -1.21817120  0.25842702  0.78549497  0.39368945 
##          13          14          15          16          17          18 
##  0.20928300 -1.06758036  1.34321973 -2.55950700 -0.18252251  1.06052525 
##          19          20          21          22          23          24 
## -2.99279233 -0.40840877 -1.15053999 -0.64962345  0.28457842 -0.21633812 
##          25          26          27          28          29          30 
## -0.38225737  0.63490412 -1.37642625 -0.33311335  0.51812889  0.55960870 
##          31          32          33          34          35          36 
##  0.12632338  1.65206562 -1.14287578 -0.20867391 -1.33494644 -0.40840877 
##          37          38          39          40          41          42 
##  1.18496468 -0.61580784  0.63490412  1.46765917 -0.73258307  0.49197749 
##          43          44          45          46          47 
## -0.32544915  0.05869216  0.13398758  4.76351712 10.52698338
print("Huber Weights Iteration 2")
## [1] "Huber Weights Iteration 2"
print(huber_weights_en_w)
##  [1] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.7369702 1.0000000
##  [8] 0.8760558 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 1.0000000 0.5254918 1.0000000 1.0000000 0.4494131 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 0.9771682 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 0.8141323 1.0000000 1.0000000 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 0.9164253 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 0.2823544 0.1277669
#second iteration
w_en_model2 <- lm(enlarged, weights = huber_weights_en_w)

#obtain residuals
res_w_en2 <- w_en_model2$residuals

#calculate ui with new residuals
med_res_w2 <- median(res_w_en2)
m_w2 <- abs(res_w_en2 - med_res_w2)
MAD_en_w2 <- (1/.6745)*median(m_w2)
en_ui_w2 <- res_w_en2/MAD_en_w2

#get weights
huber_weights_en_w2 <- numeric(length(en_ui_w2))
for (i in seq_along(en_ui_w2)) {
  if (abs(en_ui_w2[i]) <= k) {
  huber_weights_en_w2[i] <- 1
  } else {
  huber_weights_en_w2[i] <- k / abs(en_ui_w2[i])
  }
}
print("Scaled Residuals iteration 2")
## [1] "Scaled Residuals iteration 2"
print(en_ui_w2)
##           1           2           3           4           5           6 
## -1.09721695 -0.01931019  0.14850654  1.37647843 -0.22263012 -1.85077568 
##           7           8           9          10          11          12 
## -0.89389702  1.57979835 -1.16499026  0.31632326  0.79073643  0.38732967 
##          13          14          15          16          17          18 
##  0.23403145 -1.02944364  1.39423003 -2.57529739 -0.16937532  1.07634817 
##          19          20          21          22          23          24 
## -3.02872586 -0.37269524 -1.12948706 -0.64055539  0.30180476 -0.18712692 
##          25          26          27          28          29          30 
## -0.38721375  0.65518981 -1.33280699 -0.30492193  0.53739479  0.58741650 
##          31          32          33          34          35          36 
##  0.13398803  1.69759337 -1.09721695 -0.15485681 -1.28278528 -0.37269524 
##          37          38          39          40          41          42 
##  1.22641330 -0.62280379  0.65518981  1.54429515 -0.74059881  0.55191330 
##          43          44          45          46          47 
## -0.27265183  0.09848483  0.16625814  4.82803715 10.65001474
print("Huber Weights Iteration 2")
## [1] "Huber Weights Iteration 2"
print(huber_weights_en_w2)
##  [1] 1.0000000 1.0000000 1.0000000 0.9771312 1.0000000 0.7267223 1.0000000
##  [8] 0.8513745 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [15] 0.9646902 0.5222698 1.0000000 1.0000000 0.4440811 1.0000000 1.0000000
## [22] 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000 1.0000000
## [29] 1.0000000 1.0000000 1.0000000 0.7922981 1.0000000 1.0000000 1.0000000
## [36] 1.0000000 1.0000000 1.0000000 1.0000000 0.8709475 1.0000000 1.0000000
## [43] 1.0000000 1.0000000 1.0000000 0.2785811 0.1262909
#do regression again

w_en_model3 <- lm(enlarged, weights = huber_weights_en_w2)
summary(w_en_model3)
## 
## Call:
## lm(formula = enlarged, weights = huber_weights_en_w2)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.165  -5.871  -0.097   5.295  32.147 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.6778     3.1174  -0.538    0.593    
## x            15.4436     0.5487  28.147   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9.722 on 45 degrees of freedom
## Multiple R-squared:  0.9463, Adjusted R-squared:  0.9451 
## F-statistic: 792.3 on 1 and 45 DF,  p-value: < 2.2e-16

Once again, the cases with the smallest huber weights (46,47,19,16,6,32) are the ones with the largest normalized residuals.

This has the lowest intercept value so far with the same estimated coefficient for x as the rest. The p-value is significant for b_1.

plot(X,Y,pch=4)
lines(X, en_model$fitted.values, col = rgb(70/255, 130/255, 180/255), lwd = 2, pch = 1)  # OLS fit
lines(X, w_en_model3$fitted.values, col = rgb(95/255, 158/255, 72/255), pch = 1)  # IRLS fit

legend("topleft", legend = c("OLS Fit", "IRLS Fit"),
       col = c( rgb(70/255, 130/255, 180/255), rgb(95/255, 158/255, 72/255)), lwd = 2)

I do not think it is a substantial difference, especially as x gets larger, but at the intercept we can see that IRLS is lower and may be a better fit for our data. Overall I do not think it is too fdifferent.

PROBLEM 2

logistic_function <- function(X, beta0, beta1) {
  1 / (1 + exp(-(beta0 + beta1 * X)))
}

log_x <- seq(0, 120, length.out = 100)

# Parameters
beta0 <- 20
beta1 <- -0.2
mean_response <- logistic_function(log_x, beta0, beta1)

plot(log_x, mean_response,type = "l", col = rgb(95/255, 158/255, 72/255), lwd = 2,
     xlab = "X",
     ylab = "Mean Response",
     main = "Logistic Mean Response Function")

odds_125 <- exp(beta0 + beta1 * 125)
odds_126 <- exp(beta0 + beta1 * 126)
odds_ratio <- odds_126 / odds_125


exp_beta_1 <- exp(beta1)

cat("Odds at X = 125:", odds_125, "\n")
## Odds at X = 125: 0.006737947
cat("Odds at X = 126:", odds_126, "\n")
## Odds at X = 126: 0.005516564
cat("Odds Ratio:", odds_ratio, "\n")
## Odds Ratio: 0.8187308
cat("exp(beta1):", exp_beta_1, "\n")
## exp(beta1): 0.8187308

Yes, Odds Ratio = exp(beta1) as expected. PROBLEM 3

performance <- read.csv("/Users/caelynsobie/Downloads/MAT613/performance-ability.csv")
model_perform <- glm(performance,  family = binomial)
beta_0 <- model_perform$coefficients[1]  

beta_1 <- model_perform$coefficients[2]  



print(paste("Estimated Intercept beto0:", beta_0))
## [1] "Estimated Intercept beto0: -10.3089251781477"
print(paste("Estimated Slope beta1:", beta_1))
## [1] "Estimated Slope beta1: 0.0189198293852957"
print(paste("Fitted Response Function: logit E(Yi) = (1 + exp(-(", beta_0, "+", beta_1, "*Xi))^-1"))
## [1] "Fitted Response Function: logit E(Yi) = (1 + exp(-( -10.3089251781477 + 0.0189198293852957 *Xi))^-1"
#lowess model

lowess <- loess(performance)
# Plot the original data
plot(performance$X, performance$Y)

seq <- seq(min(performance$X), max(performance$X), length.out = 100)

fitted_probs <- predict(model_perform, newdata = data.frame(X = seq), type = "response")
fitted_lowess <- predict(lowess, newdata = data.frame(X = seq), type = "response")

lines(seq, fitted_probs, lwd = 2, col = "green")
lines(seq, fitted_lowess, lwd = 2, col = "blue")

c)

exp(beta_1)
##      X 
## 1.0191

The estimated odds are multipled by exp(b1)= 1.0191 for any unit increase in X.

logistic_function(550,beta_0,beta_1)
## (Intercept) 
##   0.5242263
(log(.7/.3) - beta_0)/beta_1
## (Intercept) 
##    589.6577

PROBLEM 4

below <- data.frame(
  value = c(29,42,38,40,43,40,30,42),
  group = "Below_Average"
)

avg <- data.frame(
  value = c(30,35,39,28,31,31,29,35,29,33),
  group = "Average"
)

above <- data.frame(
  value = c(26,32,21,20,23,22),
  group = "Above_Average"
)

rehab <- rbind(below, avg, above)

# Create strip chart
stripchart(value ~ group,
           data = rehab,
           pch = 1,
           xlab = "Prior Physical Fitness Status",
           ylab = "Days to Recovery",
           vertical = TRUE)

Below_Average = c(29,42,38,40,43,40,30,42, NA, NA)
Average = c(30,35,39,28,31,31,29,35,29,33)
Above_Average = c(26,32,21,20,23,22, NA, NA, NA, NA)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
knee <- bind_cols(
  Below_Average = Below_Average,
  Average = Average,
  Above_Average = Above_Average
)
#fitted values
anov <- aov(value ~ group, data = rehab)
anov_fit <- anov$fitted.values
anov_fit
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 
## 38 38 38 38 38 38 38 38 32 32 32 32 32 32 32 32 32 32 24 24 24 24 24 24
anov_res <- anov$residuals
sum(anov_res)
## [1] -2.642331e-14

This is approximately zero, so yes, they sum to zero.

#anova table

anov_table <- summary(anov)
anov_table
##             Df Sum Sq Mean Sq F value   Pr(>F)    
## group        2    672   336.0   16.96 4.13e-05 ***
## Residuals   21    416    19.8                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSTo <- anov_table[[1]]$`Sum Sq`[1] + anov_table[[1]]$`Sum Sq`[2]
df_total <- anov_table[[1]]$`Df`[1] + anov_table[[1]]$`Df`[2]
print(paste("SSTo =", SSTo))
## [1] "SSTo = 1088"
print(paste("Total Degrees of Freedom = ", df_total))
## [1] "Total Degrees of Freedom =  23"
#H_0 : mu1 = mu2 = mu3 
#H_a : not all mu_i are equal


MSTr <- anov_table[[1]]$`Mean Sq`[1]
MSE <- anov_table[[1]]$`Mean Sq`[2]

# Calculate the test statistic
test_stat <- MSTr / MSE
r <- 3
nt<- 8+10+6
  
df1 <- r-1
df2 <-nt - r

alpha <- 0.01

#critical value
critical_value <- pf(1-alpha, df1, df2)

print(paste("Test Statistic: ", test_stat))
## [1] "Test Statistic:  16.9615384615385"
print(paste("Critical Value: ", critical_value))
## [1] "Critical Value:  0.611735808180911"
print("Test Statistic > Critical Value. Hence conclude Ha. The mean number of days required for successful rehabilitation is not the same for the three fitness groups.")
## [1] "Test Statistic > Critical Value. Hence conclude Ha. The mean number of days required for successful rehabilitation is not the same for the three fitness groups."
#p value
p_value <- pf(test_stat, df1, df2, lower.tail = FALSE)
p_value
## [1] 4.129301e-05

This p-value < 0.01. Hence we reject the null hypothesis and accept the alternative, just as with the test statistic.

mu1 <- mean(Below_Average, na.rm = TRUE)
mu2 <- mean(Average)
mu3 <- mean(Above_Average, na.rm = TRUE)
print(c(mu1,mu2,mu3))
## [1] 38 32 24