#Problem 2
Ship_Route=c(1,0,2,0,3,1,0,1,2,0)
No_Ampules=c(16,9,17,12,22,13,8,15,19,11)
x <- Ship_Route
y <- No_Ampules
n <- length(x)
xbar <- mean(x)
ybar <- mean(y)
# slope and intercept
b1 <- sum((x - xbar) * (y - ybar)) / sum((x - xbar)^2)
b0 <- ybar - b1 * xbar
b0
## [1] 10.2
b1
## [1] 4
#Intercept = 10.2: When there are 0 transfer points (x=0), the estimated number of broken ampules is 10.2
#Slope = 4.0: Each additional transfer point increases the number of broken ampules by approximately 4 units
y_hat <- b0 + b1 * x
resid <- y - y_hat
s2 <- sum(resid^2) / (n - 2)
s <- sqrt(s2)
se_b1 <- s / sqrt(sum((x - xbar)^2))
# 95% CI for slope
t_crit <- qt(0.975, df = n - 2)
lower <- b1 - t_crit * se_b1
upper <- b1 + t_crit * se_b1
c(lower, upper)
## [1] 2.918388 5.081612
t_stat <- b1 / se_b1
p_value <- 2 * (1 - pt(abs(t_stat), df = n - 2))
t_stat
## [1] 8.528029
p_value
## [1] 2.748669e-05
#There is strong statistical evidence of a linear relationship between the number of transfer points and broken ampules
#problem 3
get_slope <- function(x, y, idx) {
x_s <- x[idx]
y_s <- y[idx]
xbar <- mean(x_s)
ybar <- mean(y_s)
b1 <- sum((x_s - xbar) * (y_s - ybar)) / sum((x_s - xbar)^2)
return(b1)
}
B <- 1000 # number of bootstrap samples
boot_b1 <- numeric(B)
set.seed(123)
for (i in 1:B) {
idx <- sample(1:n, n, replace = TRUE)
boot_b1[i] <- get_slope(x, y, idx)
}
boot_CI <- quantile(boot_b1, probs = c(0.025, 0.975))
boot_CI
## 2.5% 97.5%
## 3.209197 4.913134
xbar <- mean(x)
ybar <- mean(y)
b1 <- sum((x - xbar)*(y - ybar)) / sum((x - xbar)^2)
y_hat <- b1*(x - xbar) + mean(y)
resid <- y - y_hat
s2 <- sum(resid^2) / (n - 2)
s <- sqrt(s2)
se_b1 <- s / sqrt(sum((x - xbar)^2))
t_crit <- qt(0.975, df = n - 2)
normal_CI <- c(b1 - t_crit*se_b1, b1 + t_crit*se_b1)
round(normal_CI,3)
## [1] 2.918 5.082
round(boot_CI,3)
## 2.5% 97.5%
## 3.209 4.913
#Bootstrap CI is narrower than Normal Theory CI
#Both intervals exclude zero, confirming the significant positive relationship
#Normal theory CI assumes errors are perfectly normally distributed
#Bootstrap makes no distributional assumptions about errors, that's why they are different
# Problem 1
x <- c(0,1,2,3,4,5,6,7,8,9)
y <- c(98,135,162,178,221,232,283,300,374,395)
n <- length(x)
p <- 2
mean_x <- mean(x)
mean_y <- mean(y)
#sums needed for correlation
Sxx <- sum((x - mean_x)^2)
Syy <- sum((y - mean_y)^2)
Sxy <- sum((x - mean_x) * (y - mean_y))
#Correlation coefficient
r <- Sxy / sqrt(Sxx * Syy)
r
## [1] 0.9898316
# Design matrix
X <- cbind(1, x)
# OLS
XtX_inv <- solve( t(X) %*% X )
beta_hat <- XtX_inv %*% ( t(X) %*% y )
beta0 <- as.numeric(beta_hat[1])
beta1 <- as.numeric(beta_hat[2])
beta0
## [1] 91.56364
beta1
## [1] 32.49697
# Fitted values and residuals
y_hat <- as.numeric(X %*% beta_hat)
resid <- y - y_hat
#Residual sum of squares and estimate of sigma^2
RSS <- sum(resid^2)
s2 <- RSS / (n - p)
s <- sqrt(s2)
#Cook's distance
#Outlier checking
# Hat matrix diagonals
H <- X %*% XtX_inv %*% t(X)
h <- diag(H)
# Cook's D:
D <- (resid^2) / (p * s2) * ( h / (1 - h)^2 )
cook_df <- data.frame(i = 1:n, x = x, y = y, yhat = round(y_hat,2),
resid = round(resid,3), h = round(h,4), CookD = round(D,4))
print(cook_df)
## i x y yhat resid h CookD
## 1 1 0 98 91.56 6.436 0.3455 0.0743
## 2 2 1 135 124.06 10.939 0.2485 0.1171
## 3 3 2 162 156.56 5.442 0.1758 0.0170
## 4 4 3 178 189.05 -11.055 0.1273 0.0454
## 5 5 4 221 221.55 -0.552 0.1030 0.0001
## 6 6 5 232 254.05 -22.048 0.1030 0.1384
## 7 7 6 283 286.55 -3.545 0.1273 0.0047
## 8 8 7 300 319.04 -19.042 0.1758 0.2086
## 9 9 8 374 351.54 22.461 0.2485 0.4934
## 10 10 9 395 384.04 10.964 0.3455 0.2155
#No influential points
#DW test
# Durbin-Watson test function
durbin_watson_test <- function(residuals) {
n <- length(residuals)
# Calculate the Durbin-Watson statistic
numerator <- sum((residuals[2:n] - residuals[1:(n-1)])^2)
denominator <- sum(residuals^2)
dw_stat <- numerator / denominator
return(dw_stat)
}
#Durbin-Watson statistic
dw_statistic <- durbin_watson_test(resid)
round(dw_statistic, 4)
## [1] 1.8521
d_l <- 0.879 # lower critical value
d_u <- 1.320 # upper critical value
if (dw_statistic < d_l) {
cat("Conclusion: Reject H0 - Positive autocorrelation exists\n")
} else if (dw_statistic > d_u & dw_statistic < (4 - d_u)) {
cat("Conclusion: Fail to reject H0 - No significant autocorrelation\n")
} else if (dw_statistic > (4 - d_l)) {
cat("Conclusion: Reject H0 - Negative autocorrelation exists\n")
} else {
cat("Conclusion: Test inconclusive\n")
}
## Conclusion: Fail to reject H0 - No significant autocorrelation
#No significant auto-correlation
#Constant variance test; BP test
z <- resid^2
Xaux <- cbind(1, x)
XtX_aux_inv <- solve(t(Xaux) %*% Xaux)
beta_aux <- XtX_aux_inv %*% ( t(Xaux) %*% z )
z_hat <- as.numeric(Xaux %*% beta_aux)
SSE_aux <- sum( (z - z_hat)^2 )
SST_aux <- sum( (z - mean(z))^2 )
R2_aux <- 1 - SSE_aux / SST_aux
BP_stat <- n * R2_aux
BP_stat
## [1] 2.397175
p_BP <- 1 - pchisq(BP_stat, df = 1) #df = number of explanatory vars
p_BP
## [1] 0.1215546
#Constant variance assumption holds
#Normality assumption checking
# Anderson-Darling test
#install.packages("nortest")
library(nortest)
ad_test <- ad.test(resid)
cat("Anderson-Darling Test for Normality:\n")
## Anderson-Darling Test for Normality:
cat("A =", round(ad_test$statistic, 4), "p-value =", round(ad_test$p.value, 4), "\n")
## A = 0.2205 p-value = 0.7703
#Normality assumption holds
#BF test
# Brown-Forsythe test
brown_forsythe_y <- function(residuals, y) {
n <- length(residuals)
sorted_indices <- order(y)
y_sorted <- y[sorted_indices]
resid_sorted <- residuals[sorted_indices]
split_point <- floor(n/2)
g1 <- resid_sorted[1:split_point] # Low Y values group
g2 <- resid_sorted[(split_point + 1):n] # High Y values group
#group medians
med1 <- median(g1)
med2 <- median(g2)
#absolute deviations from group medians
d1 <- abs(g1 - med1)
d2 <- abs(g2 - med2)
#means of absolute deviations
mean_d1 <- mean(d1)
mean_d2 <- mean(d2)
n1 <- length(d1)
n2 <- length(d2)
var_d1 <- var(d1)
var_d2 <- var(d2)
#t-statistic
mean_diff <- mean_d1 - mean_d2
se <- sqrt(var_d1/n1 + var_d2/n2)
t_stat <- mean_diff / se
#degrees of freedom
df_numerator <- (var_d1/n1 + var_d2/n2)^2
df_denominator <- (var_d1/n1)^2/(n1-1) + (var_d2/n2)^2/(n2-1)
df <- df_numerator / df_denominator
#two-tailed p-value
p_value <- 2 * (1 - pt(abs(t_stat), df = df))
cat("t-statistic =", round(t_stat, 4), "\n")
cat("Degrees of freedom =", round(df, 2), "\n")
cat("p-value =", round(p_value, 4), "\n")
# Test conclusion
if (p_value > 0.05) {
cat("Conclusion: Fail to reject H0:Constant variance\n")
} else {
cat("Conclusion: Reject H0: Non-constant variance\n")
}
return(list(
t_statistic = t_stat,
df = df,
p_value = p_value,
group1_deviations = d1,
group2_deviations = d2,
mean_d1 = mean_d1,
mean_d2 = mean_d2
))
}
bf_y_result <- brown_forsythe_y(resid, y)
## t-statistic = -1.7688
## Degrees of freedom = 7.11
## p-value = 0.1196
## Conclusion: Fail to reject H0:Constant variance
#Still holds
par(mfrow = c(2, 3))
#Scatter plot with regression line
plot(x, y, pch = 19, col = "blue", main = "Scatter Plot with Regression Line",
xlab = "Time (x)", ylab = "Sales (y)", cex = 1.2)
abline(a = beta0, b = beta1, col = "red", lwd = 2)
# Residuals vs Fitted values
plot(y_hat, resid, pch = 19, col = "darkgreen",
main = "Residuals vs Fitted Values",
xlab = "Fitted Values", ylab = "Residuals")
abline(h = 0, col = "red", lwd = 2)
lines(lowess(y_hat, resid), col = "orange", lwd = 2)
# Normal Q-Q plot of residuals
qqnorm(resid, main = "Normal Q-Q Plot of Residuals", pch = 19, col = "purple")
qqline(resid, col = "red", lwd = 2)
# Histogram of residuals with normal curve
hist(resid, breaks = 5, freq = FALSE, col = "lightblue",
main = "Histogram of Residuals", xlab = "Residuals")
curve(dnorm(x, mean = mean(resid), sd = sd(resid)),
add = TRUE, col = "red", lwd = 2)
# Cook's Distance plot
plot(1:n, D, type = "h", lwd = 3, col = "red",
main = "Cook's Distance", xlab = "Observation Index",
ylab = "Cook's Distance")
points(1:n, D, pch = 19, col = "darkred")
#Boxplot of Cook's Distance
par(mfrow=c(1,3))

boxplot(D, horizontal = TRUE, col = "lightblue",
main = "Boxplot of Cook's Distance",
xlab = "Cook's Distance",
ylab = "All Observations")
#install.packages("faraway")
library(faraway)
## Warning: package 'faraway' was built under R version 4.5.1
lmfit <- lm(y ~ x)
cook <- cooks.distance(lmfit)
boxplot(cook) # Boxplot of Cook's distance from lm object
halfnorm(cook, 3, ylab = "Cook's distance", main = "Half-normal Plot of Cook's Distance")

#Problem 4
#Brown-Forsythe test function
brown_forsythe_test <- function(residuals, x, y) {
n <- length(residuals)
grouping_methods <- list()
#Split by median of x(time)
median_x <- median(x)
grouping_methods$median_x <- ifelse(x <= median_x, "Group1", "Group2")
# Split by median of y(sales)
median_y <- median(y)
grouping_methods$median_y <- ifelse(y <= median_y, "Group1", "Group2")
#First half vs second half (time order)
grouping_methods$time_half <- ifelse(x <= median(x), "Group1", "Group2")
#Low vs high x values (bottom 40% vs top 40%)
low_high_cutoff <- quantile(x, probs = 0.4)
grouping_methods$low_high_x <- ifelse(x <= low_high_cutoff, "Group1", "Group2")
#Low vs high y values (bottom 40% vs top 40%)
low_high_y_cutoff <- quantile(y, probs = 0.4)
grouping_methods$low_high_y <- ifelse(y <= low_high_y_cutoff, "Group1", "Group2")
#Random split (for comparison)
set.seed(123)
grouping_methods$random <- sample(rep(c("Group1", "Group2"), length.out = n))
#Function to perform single BF test
bf_test_single <- function(grouping) {
g1 <- residuals[grouping == "Group1"]
g2 <- residuals[grouping == "Group2"]
#absolute deviations from group medians
d1 <- abs(g1 - median(g1))
d2 <- abs(g2 - median(g2))
#testing
n1 <- length(d1)
n2 <- length(d2)
mean_diff <- mean(d1) - mean(d2)
se <- sqrt(var(d1)/n1 + var(d2)/n2)
t_stat <- mean_diff / se
# Degrees of freedom
df_numerator <- (var(d1)/n1 + var(d2)/n2)^2
df_denominator <- (var(d1)/n1)^2/(n1-1) + (var(d2)/n2)^2/(n2-1)
df <- df_numerator / df_denominator
#Two-tailed p-value
p_value <- 2 * (1 - pt(abs(t_stat), df = df))
return(p_value)
}
#BF test for each grouping method
bf_results <- list()
constant_variance_count <- 0
for(method_name in names(grouping_methods)) {
p_value <- bf_test_single(grouping_methods[[method_name]])
bf_results[[method_name]] <- p_value
if(p_value > 0.05) {
cat("Constant variance (Fail to reject H0)\n")
constant_variance_count <- constant_variance_count + 1
} else {
cat("Non-constant variance (Reject H0)\n")
}
}
cat("Constant variance conclusions:", constant_variance_count, "out of",
length(grouping_methods), "groupings\n\n")
return(list(
results = bf_results,
constant_variance_count = constant_variance_count,
total_tests = length(grouping_methods)
))
}
#Brown-Forsythe test
bf_test <- brown_forsythe_test(resid, x, y)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance (Fail to reject H0)
## Constant variance conclusions: 6 out of 6 groupings
bf_test
## $results
## $results$median_x
## [1] 0.1195849
##
## $results$median_y
## [1] 0.1195849
##
## $results$time_half
## [1] 0.1195849
##
## $results$low_high_x
## [1] 0.233051
##
## $results$low_high_y
## [1] 0.233051
##
## $results$random
## [1] 0.1988237
##
##
## $constant_variance_count
## [1] 6
##
## $total_tests
## [1] 6
#out of the selected 6 methods all of them are saying constant variance
#BF test is more robust as it does not assume normality, it is a non parametric approach
#Both BP and BF are indicating same results