Problem 1

cosmetics <- read.csv("/Users/caelynsobie/Downloads/MAT613/cosmetics-sales.csv")
#a
#b_r = (X^TX + CI)^-1 X^T Y

X <- as.matrix((cosmetics[, c("x1", "x2", "x3")]))

XT <- t(X)
I <- diag(3)
Y <- as.matrix(cosmetics$y)
XTX <- cor(X)
XTY <- cor(X,Y)
c = c(0, 0.01)
b_r <- matrix(0, 3, 2)
for (i in seq_along(c)) {
  c_val <- c[i]
  b_r[,i] <- solve(XTX + c_val * I) %*% XTY
}

print(b_r)
##           [,1]      [,2]
## [1,] 0.4902443 0.4605616
## [2,] 0.2956866 0.3219088
## [3,] 0.1685422 0.1672796
rxx <- cor(X)
VIF <-matrix(0, 3, 2)
for (i in seq_along(c)) {
  c_val <- c[i]
  ridge_inv <- solve(rxx + c_val * I)
  vif_matrix <- ridge_inv %*% rxx %*% ridge_inv
  VIF[, i] <- diag(vif_matrix)
}
print(VIF)
##           [,1]      [,2]
## [1,] 20.072031 10.358462
## [2,] 20.716101 10.672305
## [3,]  1.217973  1.171235
#R^2 = 1- RSS/TSS
r_sqr <- matrix(0,1,2)
Y_mean <- mean(Y)
TSS <- sum((Y - Y_mean)^2)

for (i in seq_along(c)) {
  c_val <- c[i]

  beta_ridge <- solve(XTX + c_val * I) %*% XTY
  
  Y_pred <- X %*% beta_ridge
  
  RSS <- sum((Y - Y_pred)^2)
  
  r_sqr[i] <- 1 - (RSS / TSS)
}
results <- data.frame(
  Metric = c("b1", "b2", "b3", "VIF1", "VIF2", "VIF3", "R^2"),
  `c = 0` = c(b_r[, 1], VIF[, 1], r_sqr[1]),
  `c = 0.01` = c(b_r[, 2], VIF[, 2], r_sqr[2])
)
results
##   Metric      c...0   c...0.01
## 1     b1  0.4902443  0.4605616
## 2     b2  0.2956866  0.3219088
## 3     b3  0.1685422  0.1672796
## 4   VIF1 20.0720311 10.3584618
## 5   VIF2 20.7161010 10.6723049
## 6   VIF3  1.2179732  1.1712352
## 7    R^2 -2.3516267 -2.3738551
#b
tot_c <- c(0, 0.01, 0.02, 0.04, 0.06, 0.08, 0.09, 0.1)

coefficients <- matrix(0, nrow = length(tot_c), ncol = ncol(X))
colnames(coefficients) <- c("b1", "b2", "b3")

for (i in seq_along(tot_c)) {
  c_valu <- tot_c[i]
  
  beta_ridge <- solve(XTX + c_valu * I) %*% XTY
  
  coefficients[i, ] <- beta_ridge
}

coefficients_df <- as.data.frame(coefficients)
coefficients_df$c <- tot_c

plot(coefficients_df$c, coefficients_df$b1, type = "l", col = "blue", lwd = 2,
     ylim = range(coefficients_df[, 1:3]), xlab = "c", ylab = "Coefficients",
     main = "Ridge Trace Plot")
lines(coefficients_df$c, coefficients_df$b2, col = "red", lwd = 2)
lines(coefficients_df$c, coefficients_df$b3, col = "green", lwd = 2)

legend("topright", legend = c("b1", "b2", "b3"), col = c("blue", "red", "green"), lwd = 2)

Yes, near c=0, there is the most substantial changes in the Ridge Trace Plot. The slope is the largest close to zero, then evens out as c gets larger.

#c c=0.06 is a reasonable value to pick because that is approximately when all 3 b values about level out and become stable. Based off of the VIF values c=0.06 because after, the VIF values have become sufficiently small and only change moderately as c is increased further

#d
#b_k = (S_y/S_x_k)b_k*
s_y <- sd(cosmetics$y)
s_x1 <- sd(cosmetics$x1)
s_x2 <- sd(cosmetics$x2)
s_x3 <- sd(cosmetics$x3)

b_r_c <- solve(XTX + 0.06 * I) %*% XTY

u_b_k1 <- (s_y/s_x1)*b_r_c[1]
u_b_k2 <- (s_y/s_x2)*b_r_c[2]
u_b_k3 <- (s_y/s_x3)*b_r_c[3]
u_b <- c(u_b_k1, u_b_k2, u_b_k3)
x_means <- apply(cosmetics[, c("x1", "x2", "x3")], 2, mean)

u_b_0 <- mean(cosmetics$y) -sum(u_b*x_means)
print(c(u_b_0, u_b))
## [1] 1.2126330 0.8084593 0.7527150 0.6612444
ols <- lm(cosmetics$y~., data=cosmetics)
ols$coefficients
## (Intercept)          x1          x2          x3 
##   1.0232513   0.9656902   0.6291644   0.6760246
u_b
## [1] 0.8084593 0.7527150 0.6612444
u_fit = u_b_0 + u_b_k1*cosmetics$x1 + u_b_k2*cosmetics$x2 + u_b_k3*cosmetics$x3
ols_fit = fitted(ols)
u_fit - ols_fit
##            1            2            3            4            5            6 
## -0.055393088  0.066832626 -0.013154493 -0.086206279  0.069071049 -0.103386350 
##            7            8            9           10           11           12 
## -0.187897112 -0.001504526 -0.009561199  0.015509854  0.009610297 -0.039240521 
##           13           14           15           16           17           18 
##  0.181247969 -0.081649064  0.069652989  0.174185229  0.184904846 -0.003275646 
##           19           20           21           22           23           24 
##  0.029151942 -0.019168687 -0.014441524 -0.059737573  0.143630868 -0.188758925 
##           25           26           27           28           29           30 
##  0.040750855 -0.023500643 -0.117287269  0.094481998 -0.171315767 -0.022260409 
##           31           32           33           34           35           36 
##  0.042335241  0.060360340  0.017603387  0.170817193  0.036677729 -0.074377851 
##           37           38           39           40           41           42 
## -0.175886208  0.085720234  0.056869154  0.023656106  0.020304856  0.015454543 
##           43           44 
## -0.021899471 -0.138926703

The values are similar, but not exactly the same.

plot(u_fit, col="blue")
points(ols_fit, col="black")

We see the points are almost exactly the same, but not quite.

PROBLEM 3

grocery <- read.csv("/Users/caelynsobie/Downloads/grocery-retailer.csv")
#a
X_g <- as.matrix((grocery[, c("x1", "x2", "x3")]))
hat <- X_g %*% solve(t(X_g) %*% X_g) %*% t(X_g)
#rule of thumb 2*(p/n)
thres <- 2*(3/52)
leverages <- diag(hat)
print(leverages)
##  [1] 0.02150212 0.04421445 0.09132604 0.05240339 0.20106108 0.02136666
##  [7] 0.02364536 0.01648624 0.02207083 0.01785180 0.02118171 0.01791272
## [13] 0.01916796 0.03730397 0.03330967 0.20436604 0.01860270 0.03841652
## [19] 0.02447201 0.02442505 0.18869690 0.17910994 0.04147034 0.07453020
## [25] 0.05582546 0.02128138 0.02506312 0.06066355 0.01832020 0.04158947
## [31] 0.03623619 0.05802987 0.04088364 0.02514119 0.04611123 0.01572176
## [37] 0.01904253 0.02990608 0.04574096 0.03099405 0.04371057 0.10711798
## [43] 0.17044394 0.12230876 0.06403822 0.02230334 0.05981042 0.25533074
## [49] 0.02417531 0.03100908 0.06763356 0.02667372
hasOutliers <- FALSE
for (i in 1:length(leverages)) {
  if (leverages[i] > thres) {
    cat(sprintf("Leverage at index %d is high: %f\n", i, leverages[i]))
    hasOutliers <- TRUE
  }
}
## Leverage at index 5 is high: 0.201061
## Leverage at index 16 is high: 0.204366
## Leverage at index 21 is high: 0.188697
## Leverage at index 22 is high: 0.179110
## Leverage at index 43 is high: 0.170444
## Leverage at index 44 is high: 0.122309
## Leverage at index 48 is high: 0.255331
if (!hasOutliers) {
  cat("There are no outliers with respect to x.\n")
}
#b
X_new <- c(300000, 7.2, 0)
plot(grocery$x1, grocery$x2)
points(X_new[1], X_new[2], col = "red", pch = 19)

Visually, this does not appear to be an extrapolation beyond the range of the data since it is in the middle of the data cloud.

# X_new^T (X^T X)^-1 X_new
X_new_scaled <- as.numeric((X_new - colMeans(grocery[, c("x1", "x2", "x3")])) / apply(grocery[, c("x1", "x2", "x3")], 2, sd))

XTX_inv <- solve(t(X_g) %*% X_g)

hidden_extrapolation <- t(X_new_scaled) %*% XTX_inv %*% X_new_scaled

hasHidExtra <- TRUE

if (min(leverages) < hidden_extrapolation & hidden_extrapolation < max(leverages)) {
  hasHidExtra <- FALSE
  cat(c(round(hidden_extrapolation,4),"is within range of all leverage values. So there is no hidden extrapolation. \n"))
  cat(c(round(min(leverages),4), "<", round(hidden_extrapolation,4), "<", round(max(leverages),4)))
} else {
  cat(c(hidden_extrapolation, " is outside the range of  leverage values. So we have hidden extrapolation."))
}
## 0.0242 is within range of all leverage values. So there is no hidden extrapolation. 
## 0.0157 < 0.0242 < 0.2553
#c
model <- lm(grocery$y ~ X_g[,1] + X_g[,2] + X_g[,3])
outX <- c(16, 22, 43, 48)
outY <- c(10, 32, 38, 40)
out <- c(outX, outY)


dffits_values <- dffits(model)

dfbetas_values <- dfbetas(model)

cooks_distance <- cooks.distance(model)

influence_measures <- data.frame(
  DFFITS = dffits_values[c(outX, outY)],
  DFBETAS_x1 = dfbetas_values[c(outX, outY),2],
  DFBETAS_x2 = dfbetas_values[c(outX, outY),3],
  DFBETAS_x3 = dfbetas_values[c(outX, outY),4],
  Cook_Distance = cooks_distance[c(outX, outY)]
)

print(influence_measures)
##         DFFITS  DFBETAS_x1   DFBETAS_x2  DFBETAS_x3 Cook_Distance
## 16 -0.55399026 -0.05978171  0.324814936 -0.45210042  0.0768950835
## 22  0.05508583 -0.02528712 -0.018701755  0.04464552  0.0007746088
## 43  0.56165186  0.13384205  0.326179735  0.35662759  0.0792193145
## 48 -0.14684146 -0.09384229  0.009013941 -0.10221538  0.0054988670
## 10  0.45863297 -0.10440317 -0.314158747 -0.06334626  0.0493501187
## 32 -0.65107706  0.09134173 -0.570832219  0.16520616  0.0997597379
## 38  0.38551766 -0.08273847  0.208364674 -0.12700868  0.0346380312
## 40  0.39672030 -0.21205906  0.093252646 -0.11104717  0.0364991539
dffits_influence <- FALSE
for (i in seq(dffits_values)){
  if (abs(dffits_values[i]) > 1) {
    dffits_influence <- TRUE
    cat(c(out[i]), "is an influential observation")
  }
}

if (!dffits_influence){
  print("According to DFFITS there are no influential observations. This measures the influence of the ith case on the fitted value. ")
}
## [1] "According to DFFITS there are no influential observations. This measures the influence of the ith case on the fitted value. "
dfbetas_influence <- FALSE
for (i in seq(dfbetas_values)){
  if (abs(dfbetas_values[i]) > 1) {
    dfbetas_influence <- TRUE
    cat(c(out[i]), "is an influential observation")
  }
}

if (!dfbetas_influence){
  print("According to DFBETAS there are no influential observations. This measures the change in the kth estimated coefficient bk when observation Xi is removed, studentized.")
}
## [1] "According to DFBETAS there are no influential observations. This measures the change in the kth estimated coefficient bk when observation Xi is removed, studentized."
p <- 3  
n <- 52 

percentiles <- pf(cooks_distance, p, n - p)

for (i in out) {
  cat(sprintf("Observation %d: Percentile = %.4f\n", 
              i, percentiles[i]))
}
## Observation 16: Percentile = 0.0278
## Observation 22: Percentile = 0.0000
## Observation 43: Percentile = 0.0290
## Observation 48: Percentile = 0.0006
## Observation 10: Percentile = 0.0147
## Observation 32: Percentile = 0.0402
## Observation 38: Percentile = 0.0088
## Observation 40: Percentile = 0.0094
cooks_influence <- FALSE
for (i in out){
  if (abs(percentiles[i]) > .2) {
    cooks_influence <- TRUE
    cat(c(out[i]), "is an influential observation")
  }
}
if (!cooks_influence){
  print("According to Cook's Distance there are no influential observations. This measures the effect of the ith case on all n fitted values")
}
## [1] "According to Cook's Distance there are no influential observations. This measures the effect of the ith case on all n fitted values"

Thus there are no influential observations.

#d
plot(model,4)

According to this measure, we would want to consider observations 16,32, and 43 to possibly be influential.

pairs(grocery)

There appears to be a weak positive linear association between x1 and x2. The points are somewhat dispersed but show a slight upward trend, indicating a weak correlation.

There is little to no visible linear association between x1 and x3. The points appear scattered without any apparent trend, suggesting that x1 and x3 are not linearly related.

Similarly, x2 and x3 show no obvious linear association. The points are dispersed without a discernible pattern, indicating that x2 and x3 are likely uncorrelated or have a very weak association.

cor(grocery)
##            y         x1         x2         x3
## y  1.0000000 0.20766494 0.06002960 0.81057940
## x1 0.2076649 1.00000000 0.08489639 0.04565698
## x2 0.0600296 0.08489639 1.00000000 0.11337076
## x3 0.8105794 0.04565698 0.11337076 1.00000000

The correlation between x1 and x2 is 0.0849, which is very low. This indicates a very weak linear association between these two variables, supporting the idea that they are nearly uncorrelated.

The correlation between x1 and x3 is 0.0456, which is also very low, suggesting little to no linear association between x1 and x3.

The correlation between x2 and x3 is 0.1134, which is also quite low, indicating a very weak linear relationship between these two variables or no linear relationship.

There is nothing that has shown that there exists a strong linear relationship between any of the predictors, suggesting independence between predictors.

#f
library(car)
## Loading required package: carData
vif_values <- vif(model)
print(vif_values)
## X_g[, 1] X_g[, 2] X_g[, 3] 
## 1.008596 1.019598 1.014364

None of the VIF values are greater than 10 suggesting there is no multicollinearity present.

PROBLEM 5

salary <- read.csv("/Users/caelynsobie/Downloads/MAT613/employee-salary.csv")
#a
model_salary <- lm(salary$y ~ factor(salary$degree) + salary$x3 + salary$x4 )
residuals <- resid(model_salary)
fitted_values <- fitted(model_salary)

plot(fitted_values, residuals)

There is a megaphone shape, suggesting non-constant varaince

#b
library(ALSM)
## Loading required package: leaps
## Loading required package: SuppDists
g <- rep(1, 65)

g[fitted_values >= median(fitted_values)] <- 0

bftest(model_salary,g, alpha=0.01)
##       t.value      P.Value alpha df
## [1,] 4.442422 3.668798e-05  0.01 63

Our p-value 3.668798e-05 < 0.01, thus we reject the null hypothesis that the variances are equal.

#c

plot(salary$x3, abs(residuals), 
     main = "Absolute Residuals vs. X3")

plot(salary$x4, abs(residuals), 
     main = "Absolute Residuals vs. X4")

As the values of X3 increase, the absolute residuals also appear to increase in variability. This suggests that the standard deviation of the error term may be positively associated with X3 The plot suggests that the variability of the residuals (and thus the error term) increases with X3, indicating that the error variance is not constant.

As the values of X4 increase, the absolute residuals also appear to increase in variability. This suggests that the standard deviation of the error term may be positively associated with X4. The plot suggests that the variability of the residuals (and thus the error term) increases with X4, indicating that the error variance is not constant.

#d
sd_model <- lm( abs(residuals) ~ salary$x3 + salary$x4)
pred_sd <- fitted(sd_model)

weights <- 1 / (pred_sd^2)

print(weights)
##           1           2           3           4           5           6 
## 0.056292687 0.077705686 0.001532505 0.024530644 0.163067895 0.010251537 
##           7           8           9          10          11          12 
## 0.090311060 0.118907587 0.034820755 0.013179549 0.007020100 0.073711480 
##          13          14          15          16          17          18 
## 0.083172030 0.010441314 0.077705686 0.090528372 0.023024234 0.011372887 
##          19          20          21          22          23          24 
## 0.005242312 0.008537492 0.119896882 0.089023467 0.005026257 0.034323007 
##          25          26          27          28          29          30 
## 0.055657611 0.009376487 0.032596332 0.094578048 0.111415948 0.125743297 
##          31          32          33          34          35          36 
## 0.015718961 0.009792584 0.049591497 0.056938695 0.075669068 0.125033549 
##          37          38          39          40          41          42 
## 0.016351763 0.004340427 0.039776372 0.010224118 0.059753781 0.011854165 
##          43          44          45          46          47          48 
## 0.003734147 0.013888717 0.136752318 0.108216859 0.062908841 0.057595888 
##          49          50          51          52          53          54 
## 0.133575200 0.139210138 0.137564309 0.122596179 0.107368283 0.012544414 
##          55          56          57          58          59          60 
## 0.008502906 0.036729680 0.060818508 0.028140893 0.013993969 0.021181282 
##          61          62          63          64          65 
## 0.040548297 0.031689610 0.148355382 0.094114807 0.003540910
#e
w_model_salary <- lm(salary$y ~ factor(salary$degree) + salary$x3 + salary$x4 , weights = weights)

w_coeff <- coef(w_model_salary)
coeff <- coef(model_salary)

cat("Weighted coefficients:", w_coeff, "\n")
## Weighted coefficients: 29.42549 10.89964 26.68488 1.42533 1.723949
cat("Unweighted coefficients:", coeff, "\n")
## Unweighted coefficients: 31.47143 10.81195 22.63073 1.258124 1.852313

The weighted least squares (WLS) estimates of the regression coefficients are similar to the ordinary least squares (OLS) coefficients, with some minor differences. The most notable difference is in the coefficient for the second predictor, where weighting appears to have a more substantial effect.

#f
print("Unweighted Model Summary")
## [1] "Unweighted Model Summary"
summary(model_salary)
## 
## Call:
## lm(formula = salary$y ~ factor(salary$degree) + salary$x3 + salary$x4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -29.058  -3.477  -0.915   3.417  36.909 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             31.4714     2.8691  10.969 5.73e-16 ***
## factor(salary$degree)2  10.8120     3.2183   3.360  0.00136 ** 
## factor(salary$degree)3  22.6307     3.4846   6.494 1.81e-08 ***
## salary$x3                1.2581     0.2273   5.535 7.23e-07 ***
## salary$x4                1.8523     0.2276   8.137 2.86e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.14 on 60 degrees of freedom
## Multiple R-squared:  0.8633, Adjusted R-squared:  0.8542 
## F-statistic: 94.76 on 4 and 60 DF,  p-value: < 2.2e-16
print("Weighted Model Summary")
## [1] "Weighted Model Summary"
summary(w_model_salary)
## 
## Call:
## lm(formula = salary$y ~ factor(salary$degree) + salary$x3 + salary$x4, 
##     weights = weights)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.2414 -0.7531 -0.2709  0.6915  3.3246 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             29.4255     1.3617  21.610  < 2e-16 ***
## factor(salary$degree)2  10.8996     1.4918   7.307 7.50e-10 ***
## factor(salary$degree)3  26.6849     1.6686  15.992  < 2e-16 ***
## salary$x3                1.4253     0.2002   7.118 1.57e-09 ***
## salary$x4                1.7239     0.3206   5.377 1.31e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.15 on 60 degrees of freedom
## Multiple R-squared:  0.8874, Adjusted R-squared:  0.8799 
## F-statistic: 118.2 on 4 and 60 DF,  p-value: < 2.2e-16

Where the standard errors were relatively large in the original model (intercept and x1 and x2) reduced to around 1 from 2 and 3 in the weighted model. The weighted model overall reduces the standard error.

#g
new_res <- salary$y - fitted(w_model_salary)
new_sd_model <- lm( abs(new_res) ~ salary$x3 + salary$x4)

new_pred_sd <- fitted(new_sd_model)

new_weights <- 1 / (new_pred_sd^2)


new_w_model_salary <- lm(salary$y ~ factor(salary$degree) + salary$x3 + salary$x4 , weights = new_weights)

new_w_coeff <- coef(new_w_model_salary)
new_coeff <- coef(model_salary)

cat("Weighted coefficients:", new_w_coeff, "\n")
## Weighted coefficients: 29.08323 11.00753 26.81422 1.490446 1.692183
cat("Unweighted coefficients:", new_coeff, "\n")
## Unweighted coefficients: 31.47143 10.81195 22.63073 1.258124 1.852313

There is not a substantial change between the iterations in the coefficients. If there was, we would want to continue iterating until there is not a significant change.