Problem 1
cosmetics <- read.csv("/Users/caelynsobie/Downloads/MAT613/cosmetics-sales.csv")
#a
#b_r = (X^TX + CI)^-1 X^T Y
X <- as.matrix((cosmetics[, c("x1", "x2", "x3")]))
XT <- t(X)
I <- diag(3)
Y <- as.matrix(cosmetics$y)
XTX <- cor(X)
XTY <- cor(X,Y)
c = c(0, 0.01)
b_r <- matrix(0, 3, 2)
for (i in seq_along(c)) {
c_val <- c[i]
b_r[,i] <- solve(XTX + c_val * I) %*% XTY
}
print(b_r)
## [,1] [,2]
## [1,] 0.4902443 0.4605616
## [2,] 0.2956866 0.3219088
## [3,] 0.1685422 0.1672796
rxx <- cor(X)
VIF <-matrix(0, 3, 2)
for (i in seq_along(c)) {
c_val <- c[i]
ridge_inv <- solve(rxx + c_val * I)
vif_matrix <- ridge_inv %*% rxx %*% ridge_inv
VIF[, i] <- diag(vif_matrix)
}
print(VIF)
## [,1] [,2]
## [1,] 20.072031 10.358462
## [2,] 20.716101 10.672305
## [3,] 1.217973 1.171235
#R^2 = 1- RSS/TSS
r_sqr <- matrix(0,1,2)
Y_mean <- mean(Y)
TSS <- sum((Y - Y_mean)^2)
for (i in seq_along(c)) {
c_val <- c[i]
beta_ridge <- solve(XTX + c_val * I) %*% XTY
Y_pred <- X %*% beta_ridge
RSS <- sum((Y - Y_pred)^2)
r_sqr[i] <- 1 - (RSS / TSS)
}
results <- data.frame(
Metric = c("b1", "b2", "b3", "VIF1", "VIF2", "VIF3", "R^2"),
`c = 0` = c(b_r[, 1], VIF[, 1], r_sqr[1]),
`c = 0.01` = c(b_r[, 2], VIF[, 2], r_sqr[2])
)
results
## Metric c...0 c...0.01
## 1 b1 0.4902443 0.4605616
## 2 b2 0.2956866 0.3219088
## 3 b3 0.1685422 0.1672796
## 4 VIF1 20.0720311 10.3584618
## 5 VIF2 20.7161010 10.6723049
## 6 VIF3 1.2179732 1.1712352
## 7 R^2 -2.3516267 -2.3738551
#b
tot_c <- c(0, 0.01, 0.02, 0.04, 0.06, 0.08, 0.09, 0.1)
coefficients <- matrix(0, nrow = length(tot_c), ncol = ncol(X))
colnames(coefficients) <- c("b1", "b2", "b3")
for (i in seq_along(tot_c)) {
c_valu <- tot_c[i]
beta_ridge <- solve(XTX + c_valu * I) %*% XTY
coefficients[i, ] <- beta_ridge
}
coefficients_df <- as.data.frame(coefficients)
coefficients_df$c <- tot_c
plot(coefficients_df$c, coefficients_df$b1, type = "l", col = "blue", lwd = 2,
ylim = range(coefficients_df[, 1:3]), xlab = "c", ylab = "Coefficients",
main = "Ridge Trace Plot")
lines(coefficients_df$c, coefficients_df$b2, col = "red", lwd = 2)
lines(coefficients_df$c, coefficients_df$b3, col = "green", lwd = 2)
legend("topright", legend = c("b1", "b2", "b3"), col = c("blue", "red", "green"), lwd = 2)
Yes, near c=0, there is the most substantial changes in the Ridge Trace
Plot. The slope is the largest close to zero, then evens out as c gets
larger.
#c c=0.06 is a reasonable value to pick because that is approximately when all 3 b values about level out and become stable. Based off of the VIF values c=0.06 because after, the VIF values have become sufficiently small and only change moderately as c is increased further
#d
#b_k = (S_y/S_x_k)b_k*
s_y <- sd(cosmetics$y)
s_x1 <- sd(cosmetics$x1)
s_x2 <- sd(cosmetics$x2)
s_x3 <- sd(cosmetics$x3)
b_r_c <- solve(XTX + 0.06 * I) %*% XTY
u_b_k1 <- (s_y/s_x1)*b_r_c[1]
u_b_k2 <- (s_y/s_x2)*b_r_c[2]
u_b_k3 <- (s_y/s_x3)*b_r_c[3]
u_b <- c(u_b_k1, u_b_k2, u_b_k3)
x_means <- apply(cosmetics[, c("x1", "x2", "x3")], 2, mean)
u_b_0 <- mean(cosmetics$y) -sum(u_b*x_means)
print(c(u_b_0, u_b))
## [1] 1.2126330 0.8084593 0.7527150 0.6612444
ols <- lm(cosmetics$y~., data=cosmetics)
ols$coefficients
## (Intercept) x1 x2 x3
## 1.0232513 0.9656902 0.6291644 0.6760246
u_b
## [1] 0.8084593 0.7527150 0.6612444
u_fit = u_b_0 + u_b_k1*cosmetics$x1 + u_b_k2*cosmetics$x2 + u_b_k3*cosmetics$x3
ols_fit = fitted(ols)
u_fit - ols_fit
## 1 2 3 4 5 6
## -0.055393088 0.066832626 -0.013154493 -0.086206279 0.069071049 -0.103386350
## 7 8 9 10 11 12
## -0.187897112 -0.001504526 -0.009561199 0.015509854 0.009610297 -0.039240521
## 13 14 15 16 17 18
## 0.181247969 -0.081649064 0.069652989 0.174185229 0.184904846 -0.003275646
## 19 20 21 22 23 24
## 0.029151942 -0.019168687 -0.014441524 -0.059737573 0.143630868 -0.188758925
## 25 26 27 28 29 30
## 0.040750855 -0.023500643 -0.117287269 0.094481998 -0.171315767 -0.022260409
## 31 32 33 34 35 36
## 0.042335241 0.060360340 0.017603387 0.170817193 0.036677729 -0.074377851
## 37 38 39 40 41 42
## -0.175886208 0.085720234 0.056869154 0.023656106 0.020304856 0.015454543
## 43 44
## -0.021899471 -0.138926703
The values are similar, but not exactly the same.
plot(u_fit, col="blue")
points(ols_fit, col="black")
We see the points are almost exactly the same, but not quite.
PROBLEM 3
grocery <- read.csv("/Users/caelynsobie/Downloads/grocery-retailer.csv")
#a
X_g <- as.matrix((grocery[, c("x1", "x2", "x3")]))
hat <- X_g %*% solve(t(X_g) %*% X_g) %*% t(X_g)
#rule of thumb 2*(p/n)
thres <- 2*(3/52)
leverages <- diag(hat)
print(leverages)
## [1] 0.02150212 0.04421445 0.09132604 0.05240339 0.20106108 0.02136666
## [7] 0.02364536 0.01648624 0.02207083 0.01785180 0.02118171 0.01791272
## [13] 0.01916796 0.03730397 0.03330967 0.20436604 0.01860270 0.03841652
## [19] 0.02447201 0.02442505 0.18869690 0.17910994 0.04147034 0.07453020
## [25] 0.05582546 0.02128138 0.02506312 0.06066355 0.01832020 0.04158947
## [31] 0.03623619 0.05802987 0.04088364 0.02514119 0.04611123 0.01572176
## [37] 0.01904253 0.02990608 0.04574096 0.03099405 0.04371057 0.10711798
## [43] 0.17044394 0.12230876 0.06403822 0.02230334 0.05981042 0.25533074
## [49] 0.02417531 0.03100908 0.06763356 0.02667372
hasOutliers <- FALSE
for (i in 1:length(leverages)) {
if (leverages[i] > thres) {
cat(sprintf("Leverage at index %d is high: %f\n", i, leverages[i]))
hasOutliers <- TRUE
}
}
## Leverage at index 5 is high: 0.201061
## Leverage at index 16 is high: 0.204366
## Leverage at index 21 is high: 0.188697
## Leverage at index 22 is high: 0.179110
## Leverage at index 43 is high: 0.170444
## Leverage at index 44 is high: 0.122309
## Leverage at index 48 is high: 0.255331
if (!hasOutliers) {
cat("There are no outliers with respect to x.\n")
}
#b
X_new <- c(300000, 7.2, 0)
plot(grocery$x1, grocery$x2)
points(X_new[1], X_new[2], col = "red", pch = 19)
Visually, this does not appear to be an extrapolation beyond the range
of the data since it is in the middle of the data cloud.
# X_new^T (X^T X)^-1 X_new
X_new_scaled <- as.numeric((X_new - colMeans(grocery[, c("x1", "x2", "x3")])) / apply(grocery[, c("x1", "x2", "x3")], 2, sd))
XTX_inv <- solve(t(X_g) %*% X_g)
hidden_extrapolation <- t(X_new_scaled) %*% XTX_inv %*% X_new_scaled
hasHidExtra <- TRUE
if (min(leverages) < hidden_extrapolation & hidden_extrapolation < max(leverages)) {
hasHidExtra <- FALSE
cat(c(round(hidden_extrapolation,4),"is within range of all leverage values. So there is no hidden extrapolation. \n"))
cat(c(round(min(leverages),4), "<", round(hidden_extrapolation,4), "<", round(max(leverages),4)))
} else {
cat(c(hidden_extrapolation, " is outside the range of leverage values. So we have hidden extrapolation."))
}
## 0.0242 is within range of all leverage values. So there is no hidden extrapolation.
## 0.0157 < 0.0242 < 0.2553
#c
model <- lm(grocery$y ~ X_g[,1] + X_g[,2] + X_g[,3])
outX <- c(16, 22, 43, 48)
outY <- c(10, 32, 38, 40)
out <- c(outX, outY)
dffits_values <- dffits(model)
dfbetas_values <- dfbetas(model)
cooks_distance <- cooks.distance(model)
influence_measures <- data.frame(
DFFITS = dffits_values[c(outX, outY)],
DFBETAS_x1 = dfbetas_values[c(outX, outY),2],
DFBETAS_x2 = dfbetas_values[c(outX, outY),3],
DFBETAS_x3 = dfbetas_values[c(outX, outY),4],
Cook_Distance = cooks_distance[c(outX, outY)]
)
print(influence_measures)
## DFFITS DFBETAS_x1 DFBETAS_x2 DFBETAS_x3 Cook_Distance
## 16 -0.55399026 -0.05978171 0.324814936 -0.45210042 0.0768950835
## 22 0.05508583 -0.02528712 -0.018701755 0.04464552 0.0007746088
## 43 0.56165186 0.13384205 0.326179735 0.35662759 0.0792193145
## 48 -0.14684146 -0.09384229 0.009013941 -0.10221538 0.0054988670
## 10 0.45863297 -0.10440317 -0.314158747 -0.06334626 0.0493501187
## 32 -0.65107706 0.09134173 -0.570832219 0.16520616 0.0997597379
## 38 0.38551766 -0.08273847 0.208364674 -0.12700868 0.0346380312
## 40 0.39672030 -0.21205906 0.093252646 -0.11104717 0.0364991539
dffits_influence <- FALSE
for (i in seq(dffits_values)){
if (abs(dffits_values[i]) > 1) {
dffits_influence <- TRUE
cat(c(out[i]), "is an influential observation")
}
}
if (!dffits_influence){
print("According to DFFITS there are no influential observations. This measures the influence of the ith case on the fitted value. ")
}
## [1] "According to DFFITS there are no influential observations. This measures the influence of the ith case on the fitted value. "
dfbetas_influence <- FALSE
for (i in seq(dfbetas_values)){
if (abs(dfbetas_values[i]) > 1) {
dfbetas_influence <- TRUE
cat(c(out[i]), "is an influential observation")
}
}
if (!dfbetas_influence){
print("According to DFBETAS there are no influential observations. This measures the change in the kth estimated coefficient bk when observation Xi is removed, studentized.")
}
## [1] "According to DFBETAS there are no influential observations. This measures the change in the kth estimated coefficient bk when observation Xi is removed, studentized."
p <- 3
n <- 52
percentiles <- pf(cooks_distance, p, n - p)
for (i in out) {
cat(sprintf("Observation %d: Percentile = %.4f\n",
i, percentiles[i]))
}
## Observation 16: Percentile = 0.0278
## Observation 22: Percentile = 0.0000
## Observation 43: Percentile = 0.0290
## Observation 48: Percentile = 0.0006
## Observation 10: Percentile = 0.0147
## Observation 32: Percentile = 0.0402
## Observation 38: Percentile = 0.0088
## Observation 40: Percentile = 0.0094
cooks_influence <- FALSE
for (i in out){
if (abs(percentiles[i]) > .2) {
cooks_influence <- TRUE
cat(c(out[i]), "is an influential observation")
}
}
if (!cooks_influence){
print("According to Cook's Distance there are no influential observations. This measures the effect of the ith case on all n fitted values")
}
## [1] "According to Cook's Distance there are no influential observations. This measures the effect of the ith case on all n fitted values"
Thus there are no influential observations.
#d
plot(model,4)
According to this measure, we would want to consider observations 16,32,
and 43 to possibly be influential.
pairs(grocery)
There appears to be a weak positive linear association between x1 and
x2. The points are somewhat dispersed but show a slight upward trend,
indicating a weak correlation.
There is little to no visible linear association between x1 and x3. The points appear scattered without any apparent trend, suggesting that x1 and x3 are not linearly related.
Similarly, x2 and x3 show no obvious linear association. The points are dispersed without a discernible pattern, indicating that x2 and x3 are likely uncorrelated or have a very weak association.
cor(grocery)
## y x1 x2 x3
## y 1.0000000 0.20766494 0.06002960 0.81057940
## x1 0.2076649 1.00000000 0.08489639 0.04565698
## x2 0.0600296 0.08489639 1.00000000 0.11337076
## x3 0.8105794 0.04565698 0.11337076 1.00000000
The correlation between x1 and x2 is 0.0849, which is very low. This indicates a very weak linear association between these two variables, supporting the idea that they are nearly uncorrelated.
The correlation between x1 and x3 is 0.0456, which is also very low, suggesting little to no linear association between x1 and x3.
The correlation between x2 and x3 is 0.1134, which is also quite low, indicating a very weak linear relationship between these two variables or no linear relationship.
There is nothing that has shown that there exists a strong linear relationship between any of the predictors, suggesting independence between predictors.
#f
library(car)
## Loading required package: carData
vif_values <- vif(model)
print(vif_values)
## X_g[, 1] X_g[, 2] X_g[, 3]
## 1.008596 1.019598 1.014364
None of the VIF values are greater than 10 suggesting there is no multicollinearity present.
PROBLEM 5
salary <- read.csv("/Users/caelynsobie/Downloads/MAT613/employee-salary.csv")
#a
model_salary <- lm(salary$y ~ factor(salary$degree) + salary$x3 + salary$x4 )
residuals <- resid(model_salary)
fitted_values <- fitted(model_salary)
plot(fitted_values, residuals)
There is a megaphone shape, suggesting non-constant varaince
#b
library(ALSM)
## Loading required package: leaps
## Loading required package: SuppDists
g <- rep(1, 65)
g[fitted_values >= median(fitted_values)] <- 0
bftest(model_salary,g, alpha=0.01)
## t.value P.Value alpha df
## [1,] 4.442422 3.668798e-05 0.01 63
Our p-value 3.668798e-05 < 0.01, thus we reject the null hypothesis that the variances are equal.
#c
plot(salary$x3, abs(residuals),
main = "Absolute Residuals vs. X3")
plot(salary$x4, abs(residuals),
main = "Absolute Residuals vs. X4")
As the values of X3 increase, the absolute residuals also appear to
increase in variability. This suggests that the standard deviation of
the error term may be positively associated with X3 The plot suggests
that the variability of the residuals (and thus the error term)
increases with X3, indicating that the error variance is not
constant.
As the values of X4 increase, the absolute residuals also appear to increase in variability. This suggests that the standard deviation of the error term may be positively associated with X4. The plot suggests that the variability of the residuals (and thus the error term) increases with X4, indicating that the error variance is not constant.
#d
sd_model <- lm( abs(residuals) ~ salary$x3 + salary$x4)
pred_sd <- fitted(sd_model)
weights <- 1 / (pred_sd^2)
print(weights)
## 1 2 3 4 5 6
## 0.056292687 0.077705686 0.001532505 0.024530644 0.163067895 0.010251537
## 7 8 9 10 11 12
## 0.090311060 0.118907587 0.034820755 0.013179549 0.007020100 0.073711480
## 13 14 15 16 17 18
## 0.083172030 0.010441314 0.077705686 0.090528372 0.023024234 0.011372887
## 19 20 21 22 23 24
## 0.005242312 0.008537492 0.119896882 0.089023467 0.005026257 0.034323007
## 25 26 27 28 29 30
## 0.055657611 0.009376487 0.032596332 0.094578048 0.111415948 0.125743297
## 31 32 33 34 35 36
## 0.015718961 0.009792584 0.049591497 0.056938695 0.075669068 0.125033549
## 37 38 39 40 41 42
## 0.016351763 0.004340427 0.039776372 0.010224118 0.059753781 0.011854165
## 43 44 45 46 47 48
## 0.003734147 0.013888717 0.136752318 0.108216859 0.062908841 0.057595888
## 49 50 51 52 53 54
## 0.133575200 0.139210138 0.137564309 0.122596179 0.107368283 0.012544414
## 55 56 57 58 59 60
## 0.008502906 0.036729680 0.060818508 0.028140893 0.013993969 0.021181282
## 61 62 63 64 65
## 0.040548297 0.031689610 0.148355382 0.094114807 0.003540910
#e
w_model_salary <- lm(salary$y ~ factor(salary$degree) + salary$x3 + salary$x4 , weights = weights)
w_coeff <- coef(w_model_salary)
coeff <- coef(model_salary)
cat("Weighted coefficients:", w_coeff, "\n")
## Weighted coefficients: 29.42549 10.89964 26.68488 1.42533 1.723949
cat("Unweighted coefficients:", coeff, "\n")
## Unweighted coefficients: 31.47143 10.81195 22.63073 1.258124 1.852313
The weighted least squares (WLS) estimates of the regression coefficients are similar to the ordinary least squares (OLS) coefficients, with some minor differences. The most notable difference is in the coefficient for the second predictor, where weighting appears to have a more substantial effect.
#f
print("Unweighted Model Summary")
## [1] "Unweighted Model Summary"
summary(model_salary)
##
## Call:
## lm(formula = salary$y ~ factor(salary$degree) + salary$x3 + salary$x4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.058 -3.477 -0.915 3.417 36.909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.4714 2.8691 10.969 5.73e-16 ***
## factor(salary$degree)2 10.8120 3.2183 3.360 0.00136 **
## factor(salary$degree)3 22.6307 3.4846 6.494 1.81e-08 ***
## salary$x3 1.2581 0.2273 5.535 7.23e-07 ***
## salary$x4 1.8523 0.2276 8.137 2.86e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.14 on 60 degrees of freedom
## Multiple R-squared: 0.8633, Adjusted R-squared: 0.8542
## F-statistic: 94.76 on 4 and 60 DF, p-value: < 2.2e-16
print("Weighted Model Summary")
## [1] "Weighted Model Summary"
summary(w_model_salary)
##
## Call:
## lm(formula = salary$y ~ factor(salary$degree) + salary$x3 + salary$x4,
## weights = weights)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.2414 -0.7531 -0.2709 0.6915 3.3246
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.4255 1.3617 21.610 < 2e-16 ***
## factor(salary$degree)2 10.8996 1.4918 7.307 7.50e-10 ***
## factor(salary$degree)3 26.6849 1.6686 15.992 < 2e-16 ***
## salary$x3 1.4253 0.2002 7.118 1.57e-09 ***
## salary$x4 1.7239 0.3206 5.377 1.31e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.15 on 60 degrees of freedom
## Multiple R-squared: 0.8874, Adjusted R-squared: 0.8799
## F-statistic: 118.2 on 4 and 60 DF, p-value: < 2.2e-16
Where the standard errors were relatively large in the original model (intercept and x1 and x2) reduced to around 1 from 2 and 3 in the weighted model. The weighted model overall reduces the standard error.
#g
new_res <- salary$y - fitted(w_model_salary)
new_sd_model <- lm( abs(new_res) ~ salary$x3 + salary$x4)
new_pred_sd <- fitted(new_sd_model)
new_weights <- 1 / (new_pred_sd^2)
new_w_model_salary <- lm(salary$y ~ factor(salary$degree) + salary$x3 + salary$x4 , weights = new_weights)
new_w_coeff <- coef(new_w_model_salary)
new_coeff <- coef(model_salary)
cat("Weighted coefficients:", new_w_coeff, "\n")
## Weighted coefficients: 29.08323 11.00753 26.81422 1.490446 1.692183
cat("Unweighted coefficients:", new_coeff, "\n")
## Unweighted coefficients: 31.47143 10.81195 22.63073 1.258124 1.852313
There is not a substantial change between the iterations in the coefficients. If there was, we would want to continue iterating until there is not a significant change.