library(gurobi)
## Loading required package: slam
timelimit = 3600 # Gurobi time limit, in seconds
train = read.csv('training_data.csv')
test = read.csv('test_data.csv')
X = model.matrix(y~., data=train)
y = train[,1]
MIQP = function(X,y,k,test_i){
m = ncol(X)-1
Q = matrix(0,(2*m+1),(2*m+1)) # Set Q to be a (2m+1) x (2m+1) matrix of all zeros
Q[(1):(m+1),(1):(m+1)] = t(X)%*%X # Assign the first m+1 rows and columns of Q to be t(X)%*%X
obj = rep(0, 2*m+1) # Set obj to be a list of (2m+1) zeros
obj[1:(m+1)] = -2*t(y)%*%X # Set the first m+1 entries of obj to the linear term -2*t(y)%*%X
M = 2
MDouble = FALSE
while (MDouble==FALSE) { # If any solved beta reaches the bound M, double M and re-solve
MDouble = TRUE
M = 2*M
Amat = matrix(0,2*m+1,2*m+1) # constraint matrix
# M constraints' lower bound
for(i in 1:m){
Amat[i,i+1] = -1 #Betas
Amat[i,i+m+1] = -M #Zs
}
# M constraints upper bound
for(i in 1:m){
Amat[i+m,i+1] = 1 #Betas
Amat[i+m,i+m+1] = -M #Zs
}
Amat[2*m+1,(m+2):(2*m+1)] = 1 # cardinality row: sum of the Zs
bvec = c(rep(0,2*m),k) # RHS
dir = c(rep('<',2*m+1)) # signs
lb = c(rep(-M,m+1),rep(0,m)) # lower bound
model = list()
model$modelsense = 'min'
model$Q = Q
model$lb = lb
model$obj = obj
model$A = Amat
model$rhs = bvec
model$sense = dir
model$vtype = c(rep('C',m+1),rep('B',m))
params = list()
params$TimeLimit = timelimit
params$outputflag = 0
qp = gurobi(model,params=params)
for (xi in qp$x[1:(m+1)]){ # note: 1:m+1 parses as (1:m)+1 and would skip the intercept
if (xi>=M){
print(xi)
MDouble = FALSE
}
if (xi<=-M){
print(xi)
MDouble = FALSE
}
}
cv.error = 0 # sum of squared prediction errors on the held-out fold
for(i in 1:nrow(test_i)){
error = (qp$x[1] + as.numeric(t(qp$x[2:(m+1)]))%*%as.numeric(test_i[i,-1]) - test_i[i,1])^2
cv.error = cv.error + error
}
}
return(cv.error)
}
We formulate the optimization for each value of k and each fold (10 folds and 10 values of k).
rerun = TRUE # TRUE loads cached cross-validation results below; set to FALSE to re-solve every MIQP
#Define the number of folds
folds = 10 #In this case, 10-fold cross validation
n = nrow(train)
set = 1:n
#setting the size of the training
nobs = round(n/folds, 0) # Number of observations per fold, rounded to an integer
if (rerun==FALSE){
klist = seq(from = 5, to = ncol(train)-1, by = 5)
cv.error.avg.list = c()
for (k in klist){
cat('############### k =',k,'#################\n')
cv.error.list = c()
for(i in 1:folds){
if(nobs<length(set)){
val = sample(set, size = nobs) # draw this fold's validation indices
}else{
val = set # last fold takes whatever indices remain
}
set = setdiff(set, val) # remove the fold's indices so folds do not overlap
#Create the train and test matrices
train_i = train[-val,]
test_i = train[val,]
X = model.matrix(y~., data=train_i)
y = train_i[,1]
#run function for MIQP problem
cv.error = MIQP(X,y,k,test_i)
cv.error.list = c(cv.error.list,cv.error)
} # for 10-fold
cv.error.avg = mean(cv.error.list)
cv.error.avg.list = c(cv.error.avg.list,cv.error.avg)
}
crossval_error_df = cbind(klist,cv.error.avg.list)
write.csv(crossval_error_df, "project3_cv_error.csv") # written alongside the script so the cached read below finds it
}else{
crossval_error_df = read.csv("project3_cv_error.csv", header=T)[,2:3]
}
crossval_error_df
## klist cv.error.avg.list
## 1 5 95.60751
## 2 10 66.07291
## 3 15 82.37168
## 4 20 74.24889
## 5 25 82.89555
## 6 30 95.48620
## 7 35 79.09778
## 8 40 86.50091
## 9 45 83.05626
## 10 50 91.89847
lowest.cv.error = min(crossval_error_df[,2])
lowest.k = crossval_error_df[which.min(crossval_error_df[,2]), 1]
cat('Best k =',lowest.k,'\n')
## Best k = 10
plot(crossval_error_df[,1], crossval_error_df[,2], type="b", col="blue",
lwd=2, xlab="k", ylab="cross validation error",
ylim=c(min(crossval_error_df[,2])-5,max(crossval_error_df[,2])+5))
abline(v=lowest.k, col="red", lwd=2, lty=2)
title("cross validation error vs k")
X = model.matrix(y~., data=train)
y = train[,1]
k = lowest.k
m = ncol(X)-1
Q = matrix(0,(2*m+1),(2*m+1)) # Set Q to be a (2m+1) x (2m+1) matrix of all zeros
Q[1:(m+1),1:(m+1)] = t(X)%*%X # Assign the first m+1 rows and columns of Q to be t(X)%*%X
obj = rep(0, 2*m+1) # Set obj to be a list of (2m+1) zeros; Beta*(m+1). z*(m)
obj[1:(m+1)] = -2*t(y)%*%X # Set the first m+1 entries of obj to the linear term -2*t(y)%*%X
M = 2
MDouble = FALSE
while (MDouble==FALSE) { # If any solved beta reaches the bound M, double M and re-solve
MDouble = TRUE
M = 2*M
Amat = matrix(0,2*m+1,2*m+1) # constraint matrix
# lower bound for M constraint
for(i in 1:m){
Amat[i,i+1] = -1 #Betas
Amat[i,i+m+1] = -M #Zs
}
# upper bound for M constraint
for(i in 1:m){
Amat[i+m,i+1] = 1 #Betas
Amat[i+m,i+m+1] = -M #Zs
}
Amat[2*m+1,(m+2):(2*m+1)] = 1 # cardinality row: sum of the Zs
bvec = c(rep(0,2*m),k) #rhs
dir = c(rep('<',2*m+1)) # signs
lb = c(rep(-M,m+1),rep(0,m)) # lower bound vector
model = list()
model$modelsense = 'min'
model$Q = Q
model$lb = lb
model$obj = obj
model$A = Amat
model$rhs = bvec
model$sense = dir
model$vtype = c(rep('C',m+1),rep('B',m)) # Beta: C, Z: B
params = list()
params$TimeLimit = timelimit
params$outputflag = 0
qp = gurobi(model,params=params)
for (xi in qp$x[1:(m+1)]){ # note: 1:m+1 parses as (1:m)+1 and would skip the intercept
if (xi>=M){
print(xi)
MDouble = FALSE
}
if (xi<=-M){
print(xi)
MDouble = FALSE
}
}
}
MIQP_pred = c()
for(i in 1:nrow(test)){
predict_y = qp$x[1] + as.numeric(t(qp$x[2:(m+1)]))%*%as.numeric(test[i,-1])
MIQP_pred = c(MIQP_pred,predict_y)
}
MIQP_pred
## [1] 6.1798588 5.0952430 3.2855953 3.7584854 -0.3329753 -5.1427368
## [7] -3.1445436 -1.2380629 1.3851109 -0.4417385 -1.6950022 2.7303503
## [13] 0.7474490 -0.9719223 -0.6868153 8.0452238 -7.9469847 3.8906397
## [19] -4.5814292 -3.2199208 -2.1621145 3.2168632 -3.1981053 0.1974073
## [25] -2.3598884 -0.4199989 -1.9125216 -3.3241859 -3.1417097 -3.5537932
## [31] -1.8084254 -0.3713430 1.8670808 5.0492789 -1.8000561 3.0942767
## [37] 4.3815431 2.6988627 1.6132886 5.9758464 -1.1973583 5.2232542
## [43] -5.8489989 -1.1446153 4.5180300 4.1877487 4.1204601 0.6148381
## [49] 1.9572325 -1.5490438
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
X = model.matrix(y~., data=train)
y = train[,1]
# 10-fold cross-validated LASSO regression using the glmnet package
# (X still includes the model.matrix intercept column; glmnet fits its own intercept)
cvfit = cv.glmnet(x=X, y=y, nfolds = 10, alpha=1)
#find best lambda
best.lambda = cvfit$lambda.min
#plot the log lambdas
plot(log(cvfit$lambda),sqrt(cvfit$cvm),
main="LASSO CV (k=10)",xlab="log(lambda)",
ylab = "RMSE",col=4,type="b",cex.lab=1.2)
abline(v=log(best.lambda),lty=2,col=2,lwd=2)
print(paste("Best Lambda is",best.lambda))
## [1] "Best Lambda is 0.0808687957895688"
# After finding the best lambda, fit a LASSO model to the entire training set using it
lassofit = glmnet(X, y, alpha=1, lambda=best.lambda) # using lambda.min rather than lambda.1se
lassofit
##
## Call: glmnet(x = X, y = y, alpha = 1, lambda = best.lambda)
##
## Df %Dev Lambda
## 1 18 87.91 0.08087
# prediction of the y values in the test set
lasso_pred = predict(lassofit, model.matrix(y~., data=test), s=best.lambda)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
MIQP_error = sum((MIQP_pred - test[,1])^2)
lasso_error = sum((lasso_pred - test[,1])^2)
error_data =as.data.frame(cbind(c('MIQP_SSE','LASSO_SSE'), c(MIQP_error,lasso_error)))
error_data$V2 <- sapply(error_data$V2, as.numeric)
error_data
## V1 V2
## 1 MIQP_SSE 116.8272
## 2 LASSO_SSE 117.7158
The problem we want to solve is this: given “toy” data organized into training and test sets, what is a good variable selection method? To answer it, we examine the two main approaches used by our company: LASSO and a mixed-integer quadratic programming (MIQP) approach. Both are optimization problems, but they are solved in different ways. By examining both models’ machine-learning and computational performance, we decide which approach is more feasible for our company’s large-scale modeling operations.
LASSO is fast to fit and to use for inference, and it is generally easier to apply because it only requires an R package. Its penalty term also makes it more resistant to overfitting.
MIQP found a slightly lower SSE on the test set. It can also be viewed as more intuitive, since it minimizes the sum of squared errors directly, but its integrality constraints make it computationally expensive.
LASSO minimizes the sum of squared errors (residuals) over all data points plus a penalty term: the sum of the absolute values of the non-intercept coefficients, weighted by a parameter \(\lambda\). The penalty term can drive some coefficients exactly to 0, producing automatic variable selection. As noted earlier, this is easily done with an R package. A better way to optimize the fit, however, is cross-validation: for a given value of the hyperparameter \(\lambda\), split the dataset into n folds, train a LASSO regression on each combination of n-1 folds, and compute the sum of squared errors on the remaining fold. The average of these sums of squared errors is the cross-validation error. Comparing cross-validation errors across values of \(\lambda\) shows which \(\lambda\) yields the lowest error, and this search is provided by a function in the same R package as LASSO itself. The visualization below shows that the optimal \(\log(\lambda)\) from our 10-fold cross-validated LASSO, run within the training set, is about -2.5.
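In symbols, LASSO solves the standard problem described above (written out here for reference, not reproduced from our code):
\(\hat{\beta}=\underset{\beta}{\arg\min}\ \sum_{i=1}^{n}\left(y_{i}-\beta_{0}-\sum_{j=1}^{m}x_{ij}\beta_{j}\right)^{2}+\lambda\sum_{j=1}^{m}\left|\beta_{j}\right|\)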
plot(log(cvfit$lambda),sqrt(cvfit$cvm),
main="LASSO CV (k=10)",xlab="log(lambda)",
ylab = "RMSE",col=4,type="b",cex.lab=1.2)
abline(v=log(best.lambda),lty=2,col=2,lwd=2)
On the other hand, a more direct variable selection solution can be obtained through the less automated but more mathematically intuitive MIQP model, which optimizes only the sum of squared errors of the dataset, with no penalty term. Expanding \(\left\|y-X\beta\right\|^{2}=y^{\top}y-2y^{\top}X\beta+\beta^{\top}X^{\top}X\beta\) and dropping the constant \(y^{\top}y\), which does not affect the minimizer, gives the matrix quadratic objective:
\(\beta^{\top}\left(X^{\top} X\right) \beta+\left(-2 y^{\top} X\right) \beta\)
where \(\beta\) is the vector of all coefficients together with the corresponding binary control variables for the non-intercept coefficients, and \(X\) and \(y\) are the matrix of independent variables and the vector of dependent-variable values, respectively. The constraints, written out below, are as follows:

1. Using an appropriately large big M, tie each coefficient to its binary control variable, so that a coefficient can be nonzero only when its control variable equals 1.
2. To allow at most k nonzero coefficients, require the sum of the binary control variables to be less than or equal to k.
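In symbols, with binary control variables \(z_{j}\) (these are the Z columns of Amat in the code above):
\(-Mz_{j}\le\beta_{j}\le Mz_{j}\ \ (j=1,\dots,m),\qquad \sum_{j=1}^{m}z_{j}\le k,\qquad z_{j}\in\{0,1\}\)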
As with LASSO, cross-validation can easily be applied to this model, allowing the hyperparameter k to be tuned by trial and error. Doing so on the training set with k = 5, 10, 15, and so on up to 50, we find that the optimal value among the 10 candidates is k = 10. To finish the cross-validation analysis, we simply apply k = 10 to the MIQP solved on the entire training dataset. Applying the resulting coefficient vector \(\beta\) to the test dataset gives the test-set sum of squared errors, which we compare with the sum of squared errors from our earlier LASSO cross-validation analysis.
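For each candidate k, the quantity the cross-validation loop above computes and averages is, writing \(\hat{y}_{i}^{(k,f)}\) for the prediction of observation \(i\) in held-out fold \(f\) from the model trained without that fold:
\(\mathrm{CV}(k)=\frac{1}{10}\sum_{f=1}^{10}\sum_{i\in\mathrm{fold}_{f}}\left(\hat{y}_{i}^{(k,f)}-y_{i}\right)^{2}\)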
error_data
## V1 V2
## 1 MIQP_SSE 116.8272
## 2 LASSO_SSE 117.7158
Both models make use of separate training and test sets, which safeguards against overfitting and gives reasonable assurance about the accuracy of the reported sums of squared errors. In our results, the MIQP test SSE (116.83) is only marginally lower than the LASSO test SSE (117.72), so the two approaches are effectively tied on accuracy. We expect the gap in computation time, however, to grow much larger as our company’s actual datasets scale up from this “toy” dataset. Given MIQP’s relatively negligible advantage in accuracy versus LASSO’s far lower computational cost and runtime, cross-validated LASSO remains, at least for the time being, the better choice for our company, since we expect to perform these regression analyses at high volume.