library(gurobi)
## Loading required package: slam
timelimit = 3600 # Gurobi time limit, in seconds
train = read.csv('training_data.csv')
test = read.csv('test_data.csv')
X = model.matrix(y~., data=train)
y = train[,1]
MIQP = function(X,y,k,test_i){
m = ncol(X)-1
Q = matrix(0,(2*m+1),(2*m+1)) # Set Q to be a (2m+1) x (2m+1) matrix of all zeros
Q[(1):(m+1),(1):(m+1)] = t(X)%*%X # Assign the first m+1 rows and columns of Q to be t(X)%*%X
obj = rep(0, 2*m+1) # Set obj to be a list of (2m+1) zeros
obj[1:(m+1)] = -2*t(y)%*%X # Set the first m+1 entries of obj to the linear term -2*t(y)%*%X
M = 2
MDouble = FALSE
while (MDouble==FALSE) { # If any solved beta reaches the bound M, double M and re-solve
MDouble = TRUE
M = 2*M
Amat = matrix(0,2*m+1,2*m+1) # constraint matrix
# M constraints' lower bound
for(i in 1:m){
Amat[i,i+1] = -1 #Betas
Amat[i,i+m+1] = -M #Zs
}
# M constraints upper bound
for(i in 1:m){
Amat[i+m,i+1] = 1 #Betas
Amat[i+m,i+m+1] = -M #Zs
}
Amat[2*m+1,(m+2):(2*m+1)] = 1 # cardinality row: sum of the Zs
bvec = c(rep(0,2*m),k) # RHS
dir = c(rep('<',2*m+1)) # signs
lb = c(rep(-M,m+1),rep(0,m)) # lower bound
model = list()
model$modelsense = 'min'
model$Q = Q
model$lb = lb
model$obj = obj
model$A = Amat
model$rhs = bvec
model$sense = dir
model$vtype = c(rep('C',m+1),rep('B',m))
params = list()
params$TimeLimit = timelimit
params$outputflag = 0
qp = gurobi(model,params=params)
for (xi in qp$x[1:(m+1)]){ # note: 1:m+1 parses as (1:m)+1 and would skip the intercept
if (xi>=M){
print(xi)
MDouble = FALSE
}
if (xi<=-M){
print(xi)
MDouble = FALSE
}
}
cv.error = 0 # sum of squared prediction errors on the held-out fold
for(i in 1:nrow(test_i)){
error = (qp$x[1] + as.numeric(t(qp$x[2:(m+1)]))%*%as.numeric(test_i[i,-1]) - test_i[i,1])^2
cv.error = cv.error + error
}
}
return(cv.error)
}
We formulate the optimization for each value of k and each fold (10 folds and 10 values of k).
rerun = TRUE # TRUE loads cached cross-validation results below; set to FALSE to re-solve every MIQP
#Define the number of folds
folds = 10 #In this case, 10-fold cross validation
n = nrow(train)
set = 1:n
#setting the size of the training
nobs = round(n/folds, 0) # Number of observations per fold, rounded to an integer
if (rerun==FALSE){
klist = seq(from = 5, to = ncol(train)-1, by = 5)
cv.error.avg.list = c()
for (k in klist){
cat('############### k =',k,'#################\n')
cv.error.list = c()
for(i in 1:folds){
if(nobs<length(set)){
val = sample(set, size = nobs) # draw this fold's validation indices
}else{
val = set # last fold takes whatever indices remain
}
set = setdiff(set, val) # remove the fold's indices so folds do not overlap
#Create the train and test matrices
train_i = train[-val,]
test_i = train[val,]
X = model.matrix(y~., data=train_i)
y = train_i[,1]
#run function for MIQP problem
cv.error = MIQP(X,y,k,test_i)
cv.error.list = c(cv.error.list,cv.error)
} # for 10-fold
cv.error.avg = mean(cv.error.list)
cv.error.avg.list = c(cv.error.avg.list,cv.error.avg)
}
crossval_error_df = cbind(klist,cv.error.avg.list)
write.csv(crossval_error_df, "project3_cv_error.csv") # written alongside the script so the cached read below finds it
}else{
crossval_error_df = read.csv("project3_cv_error.csv", header=T)[,2:3]
}
crossval_error_df
## klist cv.error.avg.list
## 1 5 95.60751
## 2 10 66.07291
## 3 15 82.37168
## 4 20 74.24889
## 5 25 82.89555
## 6 30 95.48620
## 7 35 79.09778
## 8 40 86.50091
## 9 45 83.05626
## 10 50 91.89847
lowest.cv.error = min(crossval_error_df[,2])
lowest.k = crossval_error_df[which.min(crossval_error_df[,2]), 1]
cat('Best k =',lowest.k,'\n')
## Best k = 10
plot(crossval_error_df[,1], crossval_error_df[,2], type="b", col="blue",
lwd=2, xlab="k", ylab="cross validation error",
ylim=c(min(crossval_error_df[,2])-5,max(crossval_error_df[,2])+5))
abline(v=lowest.k, col="red", lwd=2, lty=2)
title("cross validation error vs k")
X = model.matrix(y~., data=train)
y = train[,1]
k = lowest.k
m = ncol(X)-1
Q = matrix(0,(2*m+1),(2*m+1)) # Set Q to be a (2m+1) x (2m+1) matrix of all zeros
Q[1:(m+1),1:(m+1)] = t(X)%*%X # Assign the first m+1 rows and columns of Q to be t(X)%*%X
obj = rep(0, 2*m+1) # Set obj to be a list of (2m+1) zeros; Beta*(m+1). z*(m)
obj[1:(m+1)] = -2*t(y)%*%X # Set the first m+1 entries of obj to the linear term -2*t(y)%*%X
M = 2
MDouble = FALSE
while (MDouble==FALSE) { # If any solved beta reaches the bound M, double M and re-solve
MDouble = TRUE
M = 2*M
Amat = matrix(0,2*m+1,2*m+1) # constraint matrix
# lower bound for M constraint
for(i in 1:m){
Amat[i,i+1] = -1 #Betas
Amat[i,i+m+1] = -M #Zs
}
# upper bound for M constraint
for(i in 1:m){
Amat[i+m,i+1] = 1 #Betas
Amat[i+m,i+m+1] = -M #Zs
}
Amat[2*m+1,(m+2):(2*m+1)] = 1 # cardinality row: sum of the Zs
bvec = c(rep(0,2*m),k) #rhs
dir = c(rep('<',2*m+1)) # signs
lb = c(rep(-M,m+1),rep(0,m)) # lower bound vector
model = list()
model$modelsense = 'min'
model$Q = Q
model$lb = lb
model$obj = obj
model$A = Amat
model$rhs = bvec
model$sense = dir
model$vtype = c(rep('C',m+1),rep('B',m)) # Beta: C, Z: B
params = list()
params$TimeLimit = timelimit
params$outputflag = 0
qp = gurobi(model,params=params)
for (xi in qp$x[1:(m+1)]){ # note: 1:m+1 parses as (1:m)+1 and would skip the intercept
if (xi>=M){
print(xi)
MDouble = FALSE
}
if (xi<=-M){
print(xi)
MDouble = FALSE
}
}
}
MIQP_pred = c()
for(i in 1:nrow(test)){
predict_y = qp$x[1] + as.numeric(t(qp$x[2:(m+1)]))%*%as.numeric(test[i,-1])
MIQP_pred = c(MIQP_pred,predict_y)
}
MIQP_pred
## [1] 6.1798588 5.0952430 3.2855953 3.7584854 -0.3329753 -5.1427368
## [7] -3.1445436 -1.2380629 1.3851109 -0.4417385 -1.6950022 2.7303503
## [13] 0.7474490 -0.9719223 -0.6868153 8.0452238 -7.9469847 3.8906397
## [19] -4.5814292 -3.2199208 -2.1621145 3.2168632 -3.1981053 0.1974073
## [25] -2.3598884 -0.4199989 -1.9125216 -3.3241859 -3.1417097 -3.5537932
## [31] -1.8084254 -0.3713430 1.8670808 5.0492789 -1.8000561 3.0942767
## [37] 4.3815431 2.6988627 1.6132886 5.9758464 -1.1973583 5.2232542
## [43] -5.8489989 -1.1446153 4.5180300 4.1877487 4.1204601 0.6148381
## [49] 1.9572325 -1.5490438
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-1
X = model.matrix(y~., data=train)
y = train[,1]
# 10-fold cross-validated LASSO regression using the glmnet package
# (X still includes the model.matrix intercept column; glmnet fits its own intercept)
cvfit = cv.glmnet(x=X, y=y, nfolds = 10, alpha=1)
#find best lambda
best.lambda = cvfit$lambda.min
#plot the log lambdas
plot(log(cvfit$lambda),sqrt(cvfit$cvm),
main="LASSO CV (k=10)",xlab="log(lambda)",
ylab = "RMSE",col=4,type="b",cex.lab=1.2)
abline(v=log(best.lambda),lty=2,col=2,lwd=2)
print(paste("Best Lambda is",best.lambda))
## [1] "Best Lambda is 0.0808687957895688"
# After finding the best lambda, fit a LASSO model to the entire training set using it
lassofit = glmnet(X, y, alpha=1, lambda=best.lambda) # using lambda.min rather than lambda.1se
lassofit
##
## Call: glmnet(x = X, y = y, alpha = 1, lambda = best.lambda)
##
## Df %Dev Lambda
## 1 18 87.91 0.08087
# prediction of the y values in the test set
lasso_pred = predict(lassofit, model.matrix(y~., data=test), s=best.lambda)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
MIQP_error = sum((MIQP_pred - test[,1])^2)
lasso_error = sum((lasso_pred - test[,1])^2)
error_data =as.data.frame(cbind(c('MIQP_SSE','LASSO_SSE'), c(MIQP_error,lasso_error)))
error_data$V2 <- sapply(error_data$V2, as.numeric)
error_data
## V1 V2
## 1 MIQP_SSE 116.8272
## 2 LASSO_SSE 117.7158
The problem we want to solve is this: given “toy” data organized into training and test sets, what is a good variable selection method? To answer it, we examine the two main approaches used by our company: LASSO and a mixed-integer quadratic programming (MIQP) approach. Both are optimization problems, but they are solved in different ways. By examining both models’ machine-learning and computational performance, we decide which approach is more feasible for our company’s large-scale modeling operations.
LASSO is fast to fit and to use for inference, and it is generally easier to apply because it only requires an R package. Its penalty term also makes it more resistant to overfitting.
MIQP found a slightly lower SSE on the test set. It can also be viewed as more intuitive, since it minimizes the sum of squared errors directly, but its integrality constraints make it computationally expensive.
LASSO minimizes the sum of squared errors (residuals) over all data points plus a penalty term: the sum of the absolute values of the non-intercept coefficients, weighted by a parameter \(\lambda\). The penalty term can drive some coefficients exactly to 0, producing automatic variable selection. As noted earlier, this is easily done with an R package. A better way to optimize the fit, however, is cross-validation: for a given value of the hyperparameter \(\lambda\), split the dataset into n folds, train a LASSO regression on each combination of n-1 folds, and compute the sum of squared errors on the remaining fold. The average of these sums of squared errors is the cross-validation error. Comparing cross-validation errors across values of \(\lambda\) shows which \(\lambda\) yields the lowest error, and this search is provided by a function in the same R package as LASSO itself. The visualization below shows that the optimal \(\log(\lambda)\) from our 10-fold cross-validated LASSO, run within the training set, is about -2.5.
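In symbols, LASSO solves the standard problem described above (written out here for reference, not reproduced from our code):
\(\hat{\beta}=\underset{\beta}{\arg\min}\ \sum_{i=1}^{n}\left(y_{i}-\beta_{0}-\sum_{j=1}^{m}x_{ij}\beta_{j}\right)^{2}+\lambda\sum_{j=1}^{m}\left|\beta_{j}\right|\)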
plot(log(cvfit$lambda),sqrt(cvfit$cvm),
main="LASSO CV (k=10)",xlab="log(lambda)",
ylab = "RMSE",col=4,type="b",cex.lab=1.2)
abline(v=log(best.lambda),lty=2,col=2,lwd=2)
On the other hand, a more direct variable selection solution can be obtained through the less automated but more mathematically intuitive MIQP model, which optimizes only the sum of squared errors of the dataset, with no penalty term. Expanding \(\left\|y-X\beta\right\|^{2}=y^{\top}y-2y^{\top}X\beta+\beta^{\top}X^{\top}X\beta\) and dropping the constant \(y^{\top}y\), which does not affect the minimizer, gives the matrix quadratic objective:
\(\beta^{\top}\left(X^{\top} X\right) \beta+\left(-2 y^{\top} X\right) \beta\)
where \(\beta\) is the vector of all coefficients together with the corresponding binary control variables for the non-intercept coefficients, and \(X\) and \(y\) are the matrix of independent variables and the vector of dependent-variable values, respectively. The constraints, written out below, are as follows:

1. Using an appropriately large big M, tie each coefficient to its binary control variable, so that a coefficient can be nonzero only when its control variable equals 1.
2. To allow at most k nonzero coefficients, require the sum of the binary control variables to be less than or equal to k.
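In symbols, with binary control variables \(z_{j}\) (these are the Z columns of Amat in the code above):
\(-Mz_{j}\le\beta_{j}\le Mz_{j}\ \ (j=1,\dots,m),\qquad \sum_{j=1}^{m}z_{j}\le k,\qquad z_{j}\in\{0,1\}\)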
As with LASSO, cross-validation can easily be applied to this model, allowing the hyperparameter k to be tuned by trial and error. Doing so on the training set with k = 5, 10, 15, and so on up to 50, we find that the optimal value among the 10 candidates is k = 10. To finish the cross-validation analysis, we simply apply k = 10 to the MIQP solved on the entire training dataset. Applying the resulting coefficient vector \(\beta\) to the test dataset gives the test-set sum of squared errors, which we compare with the sum of squared errors from our earlier LASSO cross-validation analysis.
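For each candidate k, the quantity the cross-validation loop above computes and averages is, writing \(\hat{y}_{i}^{(k,f)}\) for the prediction of observation \(i\) in held-out fold \(f\) from the model trained without that fold:
\(\mathrm{CV}(k)=\frac{1}{10}\sum_{f=1}^{10}\sum_{i\in\mathrm{fold}_{f}}\left(\hat{y}_{i}^{(k,f)}-y_{i}\right)^{2}\)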
error_data
## V1 V2
## 1 MIQP_SSE 116.8272
## 2 LASSO_SSE 117.7158
Both models make use of separate training and test sets, which safeguards against overfitting and gives reasonable assurance about the accuracy of the reported sums of squared errors. In our results, the MIQP test SSE (116.83) is only marginally lower than the LASSO test SSE (117.72), so the two approaches are effectively tied on accuracy. We expect the gap in computation time, however, to grow much larger as our company’s actual datasets scale up from this “toy” dataset. Given MIQP’s relatively negligible advantage in accuracy versus LASSO’s far lower computational cost and runtime, cross-validated LASSO remains, at least for the time being, the better choice for our company, since we expect to perform these regression analyses at high volume.