This is a practice of Ridge regression.

(a) Solve.

set.seed(2363)
X <- matrix(rnorm(12), 3, 4)
y <- matrix(rnorm(4), 4, 1)
w <- solve(X %*% t(X), X %*% y)

(b) Then the matrices are non-conformable.

(c)

lambda <- 5
w2 <- solve(X %*% t(X) + lambda, X %*% y)
w2
##            [,1]
## [1,] -0.3089786
## [2,] -0.2020242
## [3,]  0.5337996

No solution for (b).

(d)

train.ridge <- function(ip_data, lambda){
    X <- ip_data[[1]]
    y <- ip_data[[2]]
    w <- solve(X %*% t(X) + lambda, X %*% y)
    return(w)
}

(e)

ip_data <- list(X=X, y=y)
train.ridge(ip_data, lambda = 5)
##            [,1]
## [1,] -0.3089786
## [2,] -0.2020242
## [3,]  0.5337996

(f) Generic function.

class(ip_data) <- 'ridge'
train <- function(x, lambda) UseMethod('train')
train(ip_data, lambda = 5)
##            [,1]
## [1,] -0.3089786
## [2,] -0.2020242
## [3,]  0.5337996

(g) Estimate error.

pred_err.ridge <- function(obj, w){
    X <- obj[[1]]
    y <- obj[[2]]
    sum((y - t(X) %*% w)^2)
}
# the prediction error is the residual sum of square, which is the square of L2 norm

pred_err <- function(obj, w) UseMethod('pred_err')
pred_err(ip_data, w)
## [1] 0.217151

(h)

crossval <- function(obj, lambdas, k){
    l <- length(lambdas)
    err <- matrix(0, k, l)
    X <- obj[[1]]
    y <- obj[[2]]
    fold <- floor(length(y)/k)
    for(i in seq(1, fold*k, by=fold)){
        testingX <- X[,i:(i+fold-1)]
        testingy <- y[i:(i+fold-1)]
        testing <- list(testingX, testingy)
        class(testing) <- 'ridge'
        
        trainingX <- X[,-(i:(i+fold-1))]
        trainingy <- y[-(i:(i+fold-1))]
        training <- list(trainingX, trainingy)
        class(training) <- 'ridge'
        
        for(j in 1:l){
            wtrain <- train(training, lambdas[j])
            err[(i-1)/fold +1, j] <- pred_err(testing, wtrain)
        }
    }
    return(err)
}

(i) Load dataset.

Use read.csv because this is better than read.table.

credit <- read.csv("Credit.csv")
y <- credit$Balance
X <- t(credit[,c(2,3,4,6,7)])
my_credit <- list(X, y)
class(my_credit) <- 'ridge'

(j)

lambdas <- c(0, 0.1, 0.5, 1, 5, 10, 50, 100, 1000)
cverr <- crossval(my_credit, lambdas, k=5)
print(cverr)
##         [,1]    [,2]    [,3]    [,4]    [,5]    [,6]    [,7]    [,8]
## [1,] 2570982 2570982 2570982 2570983 2570986 2570990 2571055 2571204
## [2,] 3246570 3246568 3246559 3246549 3246463 3246357 3245551 3244642
## [3,] 2573794 2573803 2573842 2573890 2574276 2574759 2578620 2583451
## [4,] 2122583 2122580 2122568 2122554 2122435 2122288 2121158 2119860
## [5,] 2120327 2120321 2120299 2120271 2120050 2119775 2117612 2115003
##         [,9]
## [1,] 2584060
## [2,] 3243290
## [3,] 2669845
## [4,] 2113876
## [5,] 2082754

(k) Calculate mean prediction error.

mean_err <- colMeans(cverr)
mean_err
## [1] 2526851 2526851 2526850 2526849 2526842 2526834 2526799 2526832 2538765
plot(log(lambdas), mean_err)

(l) Choose the best lambda.

nbest <- which.min(mean_err)
cat("The best lambda value is", lambdas[nbest])
## The best lambda value is 50
train(my_credit, lambdas[nbest])
##                  [,1]
## Income     -7.0731575
## Limit       0.2599977
## Rating     -0.1556694
## Age        -2.7038525
## Education -13.0320932