Regression_Assignment

#Problem 2
Ship_Route=c(1,0,2,0,3,1,0,1,2,0)
No_Ampules=c(16,9,17,12,22,13,8,15,19,11)
x <- Ship_Route
y <- No_Ampules
n <- length(x)

xbar <- mean(x)
ybar <- mean(y)

# slope and intercept
b1 <- sum((x - xbar) * (y - ybar)) / sum((x - xbar)^2)
b0 <- ybar - b1 * xbar

b0

## [1] 10.2

b1

## [1] 4

#Intercept = 10.2: When there are 0 transfer points (x=0), the estimated number of broken ampules is 10.2
#Slope = 4.0: Each additional transfer point increases the number of broken ampules by approximately 4 units

y_hat <- b0 + b1 * x
resid <- y - y_hat
s2 <- sum(resid^2) / (n - 2)
s <- sqrt(s2)

se_b1 <- s / sqrt(sum((x - xbar)^2))

# 95% CI for slope
t_crit <- qt(0.975, df = n - 2)
lower <- b1 - t_crit * se_b1
upper <- b1 + t_crit * se_b1

c(lower, upper)

## [1] 2.918388 5.081612

t_stat <- b1 / se_b1
p_value <- 2 * (1 - pt(abs(t_stat), df = n - 2))

t_stat

## [1] 8.528029

p_value

## [1] 2.748669e-05

#There is strong statistical evidence of a linear relationship between the number of transfer points and broken ampules

#problem 3
get_slope <- function(x, y, idx) {
  x_s <- x[idx]
  y_s <- y[idx]
  xbar <- mean(x_s)
  ybar <- mean(y_s)
  b1 <- sum((x_s - xbar) * (y_s - ybar)) / sum((x_s - xbar)^2)
  return(b1)
}


B <- 1000  # number of bootstrap samples
boot_b1 <- numeric(B)

set.seed(123) 
for (i in 1:B) {
  idx <- sample(1:n, n, replace = TRUE)
  boot_b1[i] <- get_slope(x, y, idx)
}

boot_CI <- quantile(boot_b1, probs = c(0.025, 0.975))
boot_CI

##     2.5%    97.5% 
## 3.209197 4.913134

xbar <- mean(x)
ybar <- mean(y)
b1 <- sum((x - xbar)*(y - ybar)) / sum((x - xbar)^2)
y_hat <- b1*(x - xbar) + mean(y)
resid <- y - y_hat
s2 <- sum(resid^2) / (n - 2)
s <- sqrt(s2)
se_b1 <- s / sqrt(sum((x - xbar)^2))
t_crit <- qt(0.975, df = n - 2)
normal_CI <- c(b1 - t_crit*se_b1, b1 + t_crit*se_b1)

round(normal_CI,3)

## [1] 2.918 5.082

round(boot_CI,3)

##  2.5% 97.5% 
## 3.209 4.913

#Bootstrap CI is narrower than Normal Theory CI
#Both intervals exclude zero, confirming the significant positive relationship
#Normal theory CI assumes errors are perfectly normally distributed
#Bootstrap makes no distributional assumptions about errors, that's why they are different

# Problem 1
x <- c(0,1,2,3,4,5,6,7,8,9)
y <- c(98,135,162,178,221,232,283,300,374,395)
n <- length(x)
p <- 2   

mean_x <- mean(x)
mean_y <- mean(y)
#sums needed for correlation
Sxx <- sum((x - mean_x)^2)
Syy <- sum((y - mean_y)^2)
Sxy <- sum((x - mean_x) * (y - mean_y))

#Correlation coefficient
r <- Sxy / sqrt(Sxx * Syy)
r

## [1] 0.9898316

# Design matrix
X <- cbind(1, x)   

# OLS 
XtX_inv <- solve( t(X) %*% X )
beta_hat <- XtX_inv %*% ( t(X) %*% y )
beta0 <- as.numeric(beta_hat[1])
beta1 <- as.numeric(beta_hat[2])
beta0

## [1] 91.56364

beta1

## [1] 32.49697

# Fitted values and residuals
y_hat <- as.numeric(X %*% beta_hat)
resid <- y - y_hat

#Residual sum of squares and estimate of sigma^2
RSS <- sum(resid^2)
s2 <- RSS / (n - p)
s <- sqrt(s2)


#Cook's distance
#Outlier checking
# Hat matrix diagonals
H <- X %*% XtX_inv %*% t(X)    
h <- diag(H)


# Cook's D:
D <- (resid^2) / (p * s2) * ( h / (1 - h)^2 )

cook_df <- data.frame(i = 1:n, x = x, y = y, yhat = round(y_hat,2),
                      resid = round(resid,3), h = round(h,4), CookD = round(D,4))
print(cook_df)

##     i x   y   yhat   resid      h  CookD
## 1   1 0  98  91.56   6.436 0.3455 0.0743
## 2   2 1 135 124.06  10.939 0.2485 0.1171
## 3   3 2 162 156.56   5.442 0.1758 0.0170
## 4   4 3 178 189.05 -11.055 0.1273 0.0454
## 5   5 4 221 221.55  -0.552 0.1030 0.0001
## 6   6 5 232 254.05 -22.048 0.1030 0.1384
## 7   7 6 283 286.55  -3.545 0.1273 0.0047
## 8   8 7 300 319.04 -19.042 0.1758 0.2086
## 9   9 8 374 351.54  22.461 0.2485 0.4934
## 10 10 9 395 384.04  10.964 0.3455 0.2155

#No influential points




#DW test
# Durbin-Watson test function
durbin_watson_test <- function(residuals) {
  n <- length(residuals)
  
  # Calculate the Durbin-Watson statistic
  numerator <- sum((residuals[2:n] - residuals[1:(n-1)])^2)
  denominator <- sum(residuals^2)
  dw_stat <- numerator / denominator
  
  return(dw_stat)
}

#Durbin-Watson statistic
dw_statistic <- durbin_watson_test(resid)

round(dw_statistic, 4)

## [1] 1.8521

d_l <- 0.879  # lower critical value
d_u <- 1.320  # upper critical value


if (dw_statistic < d_l) {
  cat("Conclusion: Reject H0 - Positive autocorrelation exists\n")
} else if (dw_statistic > d_u & dw_statistic < (4 - d_u)) {
  cat("Conclusion: Fail to reject H0 - No significant autocorrelation\n")
} else if (dw_statistic > (4 - d_l)) {
  cat("Conclusion: Reject H0 - Negative autocorrelation exists\n")
} else {
  cat("Conclusion: Test inconclusive\n")
}

## Conclusion: Fail to reject H0 - No significant autocorrelation

#No significant auto-correlation

#Constant variance test; BP test
z <- resid^2
Xaux <- cbind(1, x)
XtX_aux_inv <- solve(t(Xaux) %*% Xaux)
beta_aux <- XtX_aux_inv %*% ( t(Xaux) %*% z )
z_hat <- as.numeric(Xaux %*% beta_aux)
SSE_aux <- sum( (z - z_hat)^2 )
SST_aux <- sum( (z - mean(z))^2 )
R2_aux <- 1 - SSE_aux / SST_aux

BP_stat <- n * R2_aux
BP_stat

## [1] 2.397175

p_BP <- 1 - pchisq(BP_stat, df = 1)  #df = number of explanatory vars

p_BP

## [1] 0.1215546

#Constant variance assumption holds



#Normality assumption checking
# Anderson-Darling test
#install.packages("nortest")
library(nortest)
ad_test <- ad.test(resid)

cat("Anderson-Darling Test for Normality:\n")

## Anderson-Darling Test for Normality:

cat("A =", round(ad_test$statistic, 4), "p-value =", round(ad_test$p.value, 4), "\n")

## A = 0.2205 p-value = 0.7703

#Normality assumption holds

#BF test
# Brown-Forsythe test 
brown_forsythe_y <- function(residuals, y) {
  n <- length(residuals)
  
  sorted_indices <- order(y)
  y_sorted <- y[sorted_indices]
  resid_sorted <- residuals[sorted_indices]
  
  split_point <- floor(n/2)
  
  g1 <- resid_sorted[1:split_point]  # Low Y values group
  g2 <- resid_sorted[(split_point + 1):n]  # High Y values group
  
  
  #group medians
  med1 <- median(g1)
  med2 <- median(g2)
  
  #absolute deviations from group medians
  d1 <- abs(g1 - med1)
  d2 <- abs(g2 - med2)
  
  #means of absolute deviations
  mean_d1 <- mean(d1)
  mean_d2 <- mean(d2)
  
  
  n1 <- length(d1)
  n2 <- length(d2)
  var_d1 <- var(d1)
  var_d2 <- var(d2)
  
  #t-statistic
  mean_diff <- mean_d1 - mean_d2
  se <- sqrt(var_d1/n1 + var_d2/n2)
  t_stat <- mean_diff / se
  
  #degrees of freedom
  df_numerator <- (var_d1/n1 + var_d2/n2)^2
  df_denominator <- (var_d1/n1)^2/(n1-1) + (var_d2/n2)^2/(n2-1)
  df <- df_numerator / df_denominator
  
  #two-tailed p-value
  p_value <- 2 * (1 - pt(abs(t_stat), df = df))
  
  cat("t-statistic =", round(t_stat, 4), "\n")
  cat("Degrees of freedom =", round(df, 2), "\n")
  cat("p-value =", round(p_value, 4), "\n")
  
  # Test conclusion
  if (p_value > 0.05) {
    cat("Conclusion: Fail to reject H0:Constant variance\n")
  } else {
    cat("Conclusion: Reject H0: Non-constant variance\n")
  }
  
  return(list(
    t_statistic = t_stat,
    df = df,
    p_value = p_value,
    group1_deviations = d1,
    group2_deviations = d2,
    mean_d1 = mean_d1,
    mean_d2 = mean_d2
  ))
}


bf_y_result <- brown_forsythe_y(resid, y)

## t-statistic = -1.7688 
## Degrees of freedom = 7.11 
## p-value = 0.1196 
## Conclusion: Fail to reject H0:Constant variance

#Still holds


par(mfrow = c(2, 3))

#Scatter plot with regression line

plot(x, y, pch = 19, col = "blue", main = "Scatter Plot with Regression Line",
     xlab = "Time (x)", ylab = "Sales (y)", cex = 1.2)
abline(a = beta0, b = beta1, col = "red", lwd = 2)


# Residuals vs Fitted values

plot(y_hat, resid, pch = 19, col = "darkgreen", 
     main = "Residuals vs Fitted Values",
     xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red", lwd = 2)
lines(lowess(y_hat, resid), col = "orange", lwd = 2)

# Normal Q-Q plot of residuals

qqnorm(resid, main = "Normal Q-Q Plot of Residuals", pch = 19, col = "purple")
qqline(resid, col = "red", lwd = 2)

# Histogram of residuals with normal curve
hist(resid, breaks = 5, freq = FALSE, col = "lightblue",
     main = "Histogram of Residuals", xlab = "Residuals")
curve(dnorm(x, mean = mean(resid), sd = sd(resid)), 
      add = TRUE, col = "red", lwd = 2)
# Cook's Distance plot
plot(1:n, D, type = "h", lwd = 3, col = "red",
     main = "Cook's Distance", xlab = "Observation Index", 
     ylab = "Cook's Distance")
points(1:n, D, pch = 19, col = "darkred")

#Boxplot of Cook's Distance

par(mfrow=c(1,3))

boxplot(D, horizontal = TRUE, col = "lightblue", 
        main = "Boxplot of Cook's Distance",
        xlab = "Cook's Distance", 
        ylab = "All Observations")
#install.packages("faraway")
library(faraway)

## Warning: package 'faraway' was built under R version 4.5.1

lmfit <- lm(y ~ x)
cook <- cooks.distance(lmfit)
boxplot(cook)  # Boxplot of Cook's distance from lm object
halfnorm(cook, 3, ylab = "Cook's distance", main = "Half-normal Plot of Cook's Distance")

#Problem 4

#Brown-Forsythe test function
brown_forsythe_test <- function(residuals, x, y) {
  n <- length(residuals)
  
  grouping_methods <- list()
  
  #Split by median of x(time)
  median_x <- median(x)
  grouping_methods$median_x <- ifelse(x <= median_x, "Group1", "Group2")
  
  # Split by median of y(sales)
  median_y <- median(y)
  grouping_methods$median_y <- ifelse(y <= median_y, "Group1", "Group2")
  
  #First half vs second half (time order)
  grouping_methods$time_half <- ifelse(x <= median(x), "Group1", "Group2")
  
  #Low vs high x values (bottom 40% vs top 40%)
  low_high_cutoff <- quantile(x, probs = 0.4)
  grouping_methods$low_high_x <- ifelse(x <= low_high_cutoff, "Group1", "Group2")
  
  #Low vs high y values (bottom 40% vs top 40%)
  low_high_y_cutoff <- quantile(y, probs = 0.4)
  grouping_methods$low_high_y <- ifelse(y <= low_high_y_cutoff, "Group1", "Group2")
  
  #Random split (for comparison)
  set.seed(123)
  grouping_methods$random <- sample(rep(c("Group1", "Group2"), length.out = n))
  
  #Function to perform single BF test
  bf_test_single <- function(grouping) {
    g1 <- residuals[grouping == "Group1"]
    g2 <- residuals[grouping == "Group2"]
    
    #absolute deviations from group medians
    d1 <- abs(g1 - median(g1))
    d2 <- abs(g2 - median(g2))
    
    #testing
    n1 <- length(d1)
    n2 <- length(d2)
    mean_diff <- mean(d1) - mean(d2)
    se <- sqrt(var(d1)/n1 + var(d2)/n2)
    t_stat <- mean_diff / se
    
    # Degrees of freedom
    df_numerator <- (var(d1)/n1 + var(d2)/n2)^2
    df_denominator <- (var(d1)/n1)^2/(n1-1) + (var(d2)/n2)^2/(n2-1)
    df <- df_numerator / df_denominator
    
    #Two-tailed p-value
    p_value <- 2 * (1 - pt(abs(t_stat), df = df))
    
    return(p_value)
  }
  
  #BF test for each grouping method
  bf_results <- list()
  constant_variance_count <- 0
  
  for(method_name in names(grouping_methods)) {
    p_value <- bf_test_single(grouping_methods[[method_name]])
    bf_results[[method_name]] <- p_value
    
    if(p_value > 0.05) {
      cat("Constant variance (Fail to reject H0)\n")
      constant_variance_count <- constant_variance_count + 1
    } else {
      cat("Non-constant variance (Reject H0)\n")
    }
  }
  

  cat("Constant variance conclusions:", constant_variance_count, "out of", 
      length(grouping_methods), "groupings\n\n")
  
  return(list(
    results = bf_results,
    constant_variance_count = constant_variance_count,
    total_tests = length(grouping_methods)
  ))
}

#Brown-Forsythe test
bf_test <- brown_forsythe_test(resid, x, y)

## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance conclusions: 6 out of 6 groupings

bf_test

## $results
## $results$median_x
## [1] 0.1195849
## 
## $results$median_y
## [1] 0.1195849
## 
## $results$time_half
## [1] 0.1195849
## 
## $results$low_high_x
## [1] 0.233051
## 
## $results$low_high_y
## [1] 0.233051
## 
## $results$random
## [1] 0.1988237
## 
## 
## $constant_variance_count
## [1] 6
## 
## $total_tests
## [1] 6

#out of the selected 6 methods all of them are saying constant variance
#BF test is more robust as it does not assume normality, it is a non parametric approach
#Both BP and BF are indicating same results

Regression_Assignment_hw3

Sabuj Ganguly

2025-10-23