Assignment_2_Regression

X <- c(1, 0, 2, 0, 3, 1, 0, 1, 2, 0)
Y <- c(16, 9, 17, 12, 22, 13, 8, 15, 19, 11)
data <- data.frame(Transfers = X, Broken = Y)
#Problem 1
# (a)
model <- lm(Broken ~ Transfers, data = data)
summary_model <- summary(model)
summary_model

## 
## Call:
## lm(formula = Broken ~ Transfers, data = data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##   -2.2   -1.2    0.3    0.8    1.8 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  10.2000     0.6633  15.377 3.18e-07 ***
## Transfers     4.0000     0.4690   8.528 2.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.483 on 8 degrees of freedom
## Multiple R-squared:  0.9009, Adjusted R-squared:  0.8885 
## F-statistic: 72.73 on 1 and 8 DF,  p-value: 2.749e-05

coefficients <- coef(model)
coefficients

## (Intercept)   Transfers 
##        10.2         4.0

#Estimated function: Y_hat= 10.2+4* X_hat
#Both R-squared and adjusted R-squared values are also pretty high

# (b)
Y_hat_1 <- predict(model, newdata = data.frame(Transfers = 1))
round(Y_hat_1, 4)

##    1 
## 14.2

#Expected broken ampules when X=1:14.2

# (c) Increase from 1 to 2 transfers
increase <- coefficients[2] * (2 - 1) 
increase

## Transfers 
##         4

# the increase in the expected number of ampules broken when there are
# 2 transfers as compared to 1 transfer:4

# (d) 
X_bar <- mean(X)
Y_bar <- mean(Y)
Y_hat_bar <- coefficients[1] + coefficients[2] * X_bar
X_bar

## [1] 1

Y_bar

## [1] 14.2

Y_hat_bar

## (Intercept) 
##        14.2

cat("Line passes through (X̄, Ȳ):", abs(Y_bar - Y_hat_bar) < 1e-10, "\n")

## Line passes through (X̄, Ȳ): TRUE

#Verified that the line passes through X bar, Y bar

# Plot
plot(X, Y, xlab = "Number of Transfers", ylab = "Broken Ampules",
     main = "Broken Ampules vs Transfers", pch = 19, col = "blue")
abline(model, col = "red", lwd = 2)

#problem 2

# (a)95% CI for beta1
ci_beta1 <- confint(model, "Transfers", level = 0.95)
ci_beta1

##              2.5 %   97.5 %
## Transfers 2.918388 5.081612

# Interpretation: We are 95% confident that each additional transfer increases
# broken ampules by between 2.918 and 5.082 units

# (b)Two-sided test
t_stat <- summary_model$coefficients["Transfers", "t value"]
p_val_two <- summary_model$coefficients["Transfers", "Pr(>|t|)"]
df <- summary_model$df[2]
critical_t <- qt(0.975, df)
cat("Conclusion:", ifelse(p_val_two < 0.05, "Reject Null - Significant linear association", 
                          "Fail to reject Null - No significant association"), "\n\n")

## Conclusion: Reject Null - Significant linear association

# (c) One-sided test for positive association
p_val_one <- p_val_two / 2
critical_t_one <- qt(0.95, df)
cat("Conclusion:", ifelse(p_val_one < 0.05, "Reject Null - Significant positive association", 
                          "Fail to reject Null - No significant positive association"), "\n\n")

## Conclusion: Reject Null - Significant positive association

# (d) 95% CI for beta0
ci_beta0 <- confint(model, "(Intercept)", level = 0.95)
ci_beta0

##               2.5 %   97.5 %
## (Intercept) 8.67037 11.72963

#We are 95% confident that the mean broken ampules with
# no transfers is between 8.670 and 11.727


# (e) Test beta0 ≤ 9.0
beta0_hat <- coefficients[1]
se_beta0 <- summary_model$coefficients["(Intercept)", "Std. Error"]
t_stat_beta0 <- (beta0_hat - 9.0) / se_beta0
p_val_beta0 <- pt(t_stat_beta0, df, lower.tail = FALSE)
critical_t_beta0 <- qt(0.975, df)
p_val_beta0

## (Intercept) 
##  0.05402227

cat("Conclusion:", ifelse(p_val_beta0 < 0.025, "Reject Null - Mean > 9.0", 
                          "Fail to reject Null - Mean ≤ 9.0"), "\n")

## Conclusion: Fail to reject Null - Mean ≤ 9.0

#Here, p value is not less than 0.025, we couldn't reject the null hypothesis

#Problem3

# (a) 99% Confidence intervals for mean response
X_new <- data.frame(Transfers = c(2, 4))
conf_intervals <- predict(model, newdata = X_new, 
                          interval = "confidence", level = 0.99)

cat("(a) 99% CONFIDENCE INTERVALS FOR MEAN BREAKAGE:\n")

## (a) 99% CONFIDENCE INTERVALS FOR MEAN BREAKAGE:

for(i in 1:nrow(X_new)) {
  cat("X =", X_new$Transfers[i], "transfers:\n")
  cat("  Predicted mean =", round(conf_intervals[i, "fit"], 4), "\n")
  cat("  99% CI: (", round(conf_intervals[i, "lwr"], 4), ",", 
      round(conf_intervals[i, "upr"], 4), ")\n")
  cat("  Interpretation: We are 99% confident that the true mean number of\n")
  cat("  broken ampules for all shipments with", X_new$Transfers[i], 
      "transfers is between", round(conf_intervals[i, "lwr"], 3), 
      "and", round(conf_intervals[i, "upr"], 3), "ampules.\n\n")
}

## X = 2 transfers:
##   Predicted mean = 18.2 
##   99% CI: ( 15.9743 , 20.4257 )
##   Interpretation: We are 99% confident that the true mean number of
##   broken ampules for all shipments with 2 transfers is between 15.974 and 20.426 ampules.
## 
## X = 4 transfers:
##   Predicted mean = 26.2 
##   99% CI: ( 21.2232 , 31.1768 )
##   Interpretation: We are 99% confident that the true mean number of
##   broken ampules for all shipments with 4 transfers is between 21.223 and 31.177 ampules.

# (b) 99% Prediction interval for single observation at X = 2
pred_interval_single <- predict(model, 
                                newdata = data.frame(Transfers = 2),
                                interval = "prediction", level = 0.99)

cat("(b) 99% PREDICTION INTERVAL FOR SINGLE SHIPMENT:\n")

## (b) 99% PREDICTION INTERVAL FOR SINGLE SHIPMENT:

cat("X = 2 transfers:\n")

## X = 2 transfers:

cat(" Predicted value =", round(pred_interval_single[1], 4), "\n")

##  Predicted value = 18.2

cat("  99% Prediction Interval: (", round(pred_interval_single[2], 4), ",", 
    round(pred_interval_single[3], 4), ")\n")

##   99% Prediction Interval: ( 12.7481 , 23.6519 )

cat(" Interpretation: We are 99% confident that a single shipment with 2 transfers\n")

##  Interpretation: We are 99% confident that a single shipment with 2 transfers

cat(" will have between", round(pred_interval_single[2], 3), "and", 
    round(pred_interval_single[3], 3), "broken ampules.\n\n")

##  will have between 12.748 and 23.652 broken ampules.

# (c) 
m <- 3
X_star <- 2
X_matrix <- model.matrix(model)
n <- nrow(X_matrix)
p <- ncol(X_matrix)
X_mean <- mean(data$Transfers)
SXX <- sum((data$Transfers - X_mean)^2)

#SE for mean of m new observations
se_mean_m <- summary_model$sigma * sqrt(1/m + (X_star - X_mean)^2 / SXX)

#Predicted value
Y_hat <- coefficients[1] + coefficients[2] * X_star

#t-value for 99% confidence
t_val <- qt(0.995, df = n - p)

#Prediction interval
lower <- Y_hat - t_val * se_mean_m
upper <- Y_hat + t_val * se_mean_m

#We predict that the AVERAGE number of broken ampules
#across 3 future shipments (each with 2 transfers) will be between
#14.92 and 21.48 ampules

#Problem 4

#(a) ANOVA table
anova_model=anova(model)
print(anova_model)

## Analysis of Variance Table
## 
## Response: Broken
##           Df Sum Sq Mean Sq F value    Pr(>F)    
## Transfers  1  160.0   160.0  72.727 2.749e-05 ***
## Residuals  8   17.6     2.2                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

SSR <- anova_model$"Sum Sq"[1]
SSE <- anova_model$"Sum Sq"[2]
SST <- SSR + SSE
dfR <- anova_model$Df[1]
dfE <- anova_model$Df[2]
dfT <- dfR + dfE
MSR <- anova_model$"Mean Sq"[1]
MSE <- anova_model$"Mean Sq"[2]
F_stat <- anova_model$"F value"[1]

cat("Additive elements: Sum of Squares and Degrees of Freedom are additive\n")

## Additive elements: Sum of Squares and Degrees of Freedom are additive

cat("Mean Squares are not additive \n\n")

## Mean Squares are not additive

# (b) F-test for linear association
alpha <- 0.05
F_critical <- qf(1 - alpha, dfR, dfE)
p_value_F <- anova_model$"Pr(>F)"[1]
p_value_F

## [1] 2.748669e-05

cat("Conclusion:", ifelse(F_stat > F_critical, 
                          "Reject H₀ - Significant linear association", 
                          "Fail to reject H₀ - No significant association"), "\n\n")

## Conclusion: Reject H₀ - Significant linear association

#Strong evidence of linear association

# (c) t-statistic and equivalence to F
t_stat <- summary_model$coefficients["Transfers", "t value"]
t_squared <- t_stat^2

cat("Difference =", round(abs(t_squared - F_stat), 6), "\n")

## Difference = 0

#Numerical equivalence demonstrated that t sqr= F

# (d) R-squared and correlation
R_squared <- summary_model$r.squared
r <- cor(X, Y)
r_squared_from_cor <- r^2

#R² and r: R² = 0.9009, r = 0.949
#Interpretation: 90.1% of the variation in broken ampules is explained by the linear relationship with number of transfers

Assignment_2_Regression

Sabuj Ganguly

2025-10-07