Executive Summary

We develop a demo, which is entirely contrived but we hope convincing, to show that, given a large number of variables, we can find not only the significant variables but also the coefficients for a regression problem using a suitable feature selection library in python. For this demonstration we want a relatively small percentage, say 5%, of the given variables to be significant. In R, we create a set of 200 variables and 200 observations. Note we must have at least 200 observations since we are given 200 variables. We then call python function defined in a python script. We are able to find the desired variables and the prescribed coefficients.

We expect each application will require customization, and even different methods. We call SelectKBest and pass a suitable score function. See .

# Define the independent variables x[[n]]

library(data.table)

## Warning: package 'data.table' was built under R version 4.0.5

len <- 20L
first <- 1
last <- len
inc <- 0L
factor = 1
sign = 1

set.seed(1008001)

xx <- NULL
for( n in 1L:200L ) {
  if(n %% 20L == 0) {
    # We define our arbitrarily selected variables so that they are independent and uncorrelated
     xx[[n]] = shift(c(first:last,rep(sign,180L)), n=inc, fill=sign) * factor

     inc = inc + len
     first = first + len
     last = last + len
     factor = factor + 1
     sign = -sign
  } else {
    # We define the other variables using uniformly random numbers from a fixed range. 
    # Since they are randomly chosen, we do not expect such variables will be significant.
     xx[[n]] = runif(200, min=1, max=200)
  }
}

# Add some noise to independent variables to demonstrate method's stability
x <- NULL
for( n in 1L:200L ) {
   x[[n]] =  jitter(xx[[n]])
}

for( n in seq(from=20, to=200, by=20) ) {
  plot(xx[[n]],pch = 1, cex = 1.3,col="green")
  points(x[[n]],pch = 9, cex = 1,col="orange")
}

# Define the dependent variable to be a linear combination of chosen variables
# We dispense with the jitter to have a "theoretical" or ideal solution.
# If we used the jitter, then we would find a perfect match.

factor = 1L

coef <- vector(mode="numeric",length=10L)
y <- rep(0,200)
for( n in seq(from=20, to=200, by=20) ) {
  coef[factor] = 1.0/factor
  y = xx[[n]]*coef[factor] + y
  factor = factor + 1
}

plot(y,pch = 1, cex = 1.3,col="blue")

setwd("~/R/feature_selection")
save(x,y,coef,file="xy_coef.rda")

Select significant variables

SelectKBest using the f_regression() function

We first try SelectKBest using the f_regression() function for the correlation statistic. We find it does not work satisfactorily.

setwd("~/R/feature_selection")
load("xy_coef.rda")

# Define variables with observations using x.

dfx <- as.data.frame(x)
col_seq = 1:length(x[[1]])
colnames(dfx) = col_seq

data <- cbind(dfx,y)

m <- as.matrix(data)

# Let's try k different variables
library(reticulate)
source_python("select_best.py")

k_RMSE <- 0L
vars_RMSE <- NULL
best_RMSE <- Inf

k_AVE_ERR <- 0L
vars_AVE_ERR <- NULL
best_AVE_ERR <- Inf

library(modelr)

## Warning: package 'modelr' was built under R version 4.0.5

for(k in 1L:20L) {  
  mask <- feature_selection_fr(m,k=k) # passed as numpy.ndarray
  
  mask <- mask[1:(length(mask)-1)]
  vars_list <- col_seq[mask]
  
  print(paste(c('Symbols:',vars_list),collapse = ' '))
  
  if( length(vars_list) ) {
  
    x <- dfx[,which(mask)]
    df <- as.data.frame(cbind(x,y))

    model <- lm(y~.,data=df)
    

    R2 = rsquare(model, data = df) #(Coefficient of determination) represents the coefficient of how well the values fit compared to the original values. The value from 0 to 1 interpreted as percentages. The higher the value is, the better the model is.
    RMSE = rmse(model, data = df) # (Root Mean Squared Error) is the error rate by the square root of MSE
    MAE = mae(model, data = df) # Mean absolute error
    
    if( RMSE < best_RMSE ) {
      k_RMSE <- k
      best_RMSE <- RMSE
      vars_RMSE <- vars_list
    }
    
    col1 <- summary(model)$coefficients[,1]
    intercept <- col1[1]
    W <- col1[2:length(col1)]
    
    df$y <- NULL
    
    z <- rowSums(as.data.table(Map("*",df,W)))+intercept
    
    ave_err = sum(abs(y-z))/length(y)
    
    if( ave_err < best_AVE_ERR ) {
      k_AVE_ERR <- k
      best_AVE_ERR <- ave_err
      vars_AVE_ERR <- vars_list
    }
    
    print(paste(c('k:',k,'\tR2:',R2,'\tRMSE:',RMSE,'\tMAE:',MAE,'\taverage error:',ave_err),collapse = ' '))
  }
  
}

## [1] "Symbols:"
## [1] "Symbols: 200"
## [1] "k: 2 \tR2: 0.275142023665736 \tRMSE: 49.3087706916245 \tMAE: 40.7387863776319 \taverage error: 40.7387863776319"
## [1] "Symbols: 20 200"
## [1] "k: 3 \tR2: 0.416597403163895 \tRMSE: 44.2366156078594 \tMAE: 36.4788640322102 \taverage error: 36.4788640322102"
## [1] "Symbols: 20 180 200"
## [1] "k: 4 \tR2: 0.595244261216596 \tRMSE: 36.8463286748386 \tMAE: 28.771213500138 \taverage error: 28.771213500138"
## [1] "Symbols: 20 40 180 200"
## [1] "k: 5 \tR2: 0.695244172053297 \tRMSE: 31.9722885619755 \tMAE: 24.3659675480722 \taverage error: 24.3659675480722"
## [1] "Symbols: 20 40 160 180 200"
## [1] "k: 6 \tR2: 0.806021739196549 \tRMSE: 25.507872927784 \tMAE: 18.3264756032655 \taverage error: 18.3264756032655"
## [1] "Symbols: 20 40 60 160 180 200"
## [1] "k: 7 \tR2: 0.850386992484333 \tRMSE: 22.4017729311639 \tMAE: 15.3896305818127 \taverage error: 15.3896305818127"
## [1] "Symbols: 4 20 40 60 160 180 200"
## [1] "k: 8 \tR2: 0.853270001872082 \tRMSE: 22.1848844085465 \tMAE: 15.349646985379 \taverage error: 15.349646985379"
## [1] "Symbols: 4 20 40 60 103 160 180 200"
## [1] "k: 9 \tR2: 0.853753361877024 \tRMSE: 22.1483133877953 \tMAE: 15.3534090207773 \taverage error: 15.3534090207773"
## [1] "Symbols: 4 20 31 40 60 103 160 180 200"
## [1] "k: 10 \tR2: 0.856646559407904 \tRMSE: 21.9281389567386 \tMAE: 15.443864938982 \taverage error: 15.443864938982"
## [1] "Symbols: 4 20 31 40 60 103 113 160 180 200"
## [1] "k: 11 \tR2: 0.856648128907521 \tRMSE: 21.9280189167181 \tMAE: 15.4500391761653 \taverage error: 15.4500391761653"
## [1] "Symbols: 4 20 31 40 60 103 113 160 165 180 200"
## [1] "k: 12 \tR2: 0.860315490803795 \tRMSE: 21.6457100824014 \tMAE: 15.3263972667448 \taverage error: 15.3263972667448"
## [1] "Symbols: 4 20 31 40 60 103 113 140 160 165 180 200"
## [1] "k: 13 \tR2: 0.90712156337855 \tRMSE: 17.650424811814 \tMAE: 11.8853368360012 \taverage error: 11.8853368360012"
## [1] "Symbols: 4 20 31 40 60 80 103 113 140 160 165 180 200"
## [1] "k: 14 \tR2: 0.911607958720316 \tRMSE: 17.2188560729911 \tMAE: 10.9471289784988 \taverage error: 10.9471289784988"
## [1] "Symbols: 4 20 31 40 60 80 103 113 139 140 160 165 180 200"
## [1] "k: 15 \tR2: 0.912199599379573 \tRMSE: 17.1611332312603 \tMAE: 11.0603671531672 \taverage error: 11.0603671531672"
## [1] "Symbols: 4 20 31 40 60 80 103 108 113 139 140 160 165 180 200"
## [1] "k: 16 \tR2: 0.914520353430834 \tRMSE: 16.9328114766753 \tMAE: 10.9164838219818 \taverage error: 10.9164838219818"
## [1] "Symbols: 4 20 31 40 49 60 80 103 108 113 139 140 160 165 180 200"
## [1] "k: 17 \tR2: 0.914568736973794 \tRMSE: 16.9280186083797 \tMAE: 10.9122528586832 \taverage error: 10.9122528586832"
## [1] "Symbols: 4 20 31 40 49 60 80 83 103 108 113 139 140 160 165 180 200"
## [1] "k: 18 \tR2: 0.915222665051205 \tRMSE: 16.8631069424349 \tMAE: 10.9435669405764 \taverage error: 10.9435669405764"
## [1] "Symbols: 4 20 31 40 49 60 80 83 103 108 113 139 140 156 160 165 180 200"
## [1] "k: 19 \tR2: 0.916667821393399 \tRMSE: 16.71876072235 \tMAE: 11.0434862955743 \taverage error: 11.0434862955743"
## [1] "Symbols: 4 20 31 40 49 60 80 83 88 103 108 113 139 140 156 160 165 180 200"
## [1] "k: 20 \tR2: 0.91721832208988 \tRMSE: 16.6634463161143 \tMAE: 11.0230927897532 \taverage error: 11.0230927897531"

print(paste(c('k:',k_RMSE,' Symbols:',vars_RMSE,' RMSE:',best_RMSE),collapse = ' '))

## [1] "k: 20  Symbols: 4 20 31 40 49 60 80 83 88 103 108 113 139 140 156 160 165 180 200  RMSE: 16.6634463161143"

print(paste(c('k:',k_AVE_ERR,' Symbols:',vars_AVE_ERR,' average error:',best_AVE_ERR),collapse = ' '))

## [1] "k: 17  Symbols: 4 20 31 40 49 60 80 103 108 113 139 140 160 165 180 200  average error: 10.9122528586832"

# By inspection, our choice for number of variables is 13 because R2 is sufficiently close to 1, and 
# RMSE is relatively low. It is true that R2 amd RMSE improve with more variables.

mask <- feature_selection_fr(m,k=13)
mask <- mask[1:(length(mask)-1)]
vars_list <- col_seq[mask]
print(paste(c('Symbols:',vars_list),collapse = ' '))

## [1] "Symbols: 4 20 31 40 60 103 113 140 160 165 180 200"

x <- dfx[,which(mask)]
df <- as.data.frame(cbind(x,y))

model <- lm(y~.,data=df)
  
col1 <- summary(model)$coefficients[,1]
intercept <- col1[1]
W <- col1[2:length(col1)]

save(vars_list,W,intercept,file="SelectKBest_f_regression.rda")

z <- rowSums(as.data.table(Map("*",x,W)))+intercept

plot(y,pch = 19, cex = .9,lwd = 1, col="green")
points(z,pch = 21, cex = 1.1,lwd = 1.5,col="orange")

SelectKBest using the mutual_info_regression() function

We next try SelectKBest using the mutual_info_regression() function for the correlation statistic. We find it works as desired.

setwd("~/R/feature_selection")
load("xy_coef.rda")

dfx <- as.data.frame(x)
col_seq = 1:length(x[[1]])
colnames(dfx) = col_seq

data <- cbind(dfx,y)

m <- as.matrix(data)

# Let's try k different variables
library(reticulate)
source_python("select_best.py")

k_RMSE <- 0L
vars_RMSE <- NULL
best_RMSE <- Inf

k_AVE_ERR <- 0L
vars_AVE_ERR <- NULL
best_AVE_ERR <- Inf

library(modelr)

for(k in 1L:20L) {  
  mask <- feature_selection_mir(m,k=k) # passed as numpy.ndarray
  
  mask <- mask[1:(length(mask)-1)]
  vars_list <- col_seq[mask]
  
  print(paste(c('Symbols:',vars_list),collapse = ' '))
  
  if( length(vars_list) ) {
  
    x <- dfx[,which(mask)]
    df <- as.data.frame(cbind(x,y))

    model <- lm(y~.,data=df)
    

    R2 = rsquare(model, data = df) #(Coefficient of determination) represents the coefficient of how well the values fit compared to the original values. The value from 0 to 1 interpreted as percentages. The higher the value is, the better the model is.
    RMSE = rmse(model, data = df) # (Root Mean Squared Error) is the error rate by the square root of MSE
    MAE = mae(model, data = df) # Mean absolute error
    
    if( RMSE < best_RMSE ) {
      k_RMSE <- k
      best_RMSE <- RMSE
      vars_RMSE <- vars_list
    }
    
    col1 <- summary(model)$coefficients[,1]
    intercept <- col1[1]
    W <- col1[2:length(col1)]
    
    df$y <- NULL
    
    # z <- rowSums(as.matrix(df)*W)+intercept
    z <- rowSums(as.data.table(Map("*",df,W)))+intercept
    
    ave_err = sum(abs(y-z))/length(y)
    
    if( ave_err < best_AVE_ERR ) {
      k_AVE_ERR <- k
      best_AVE_ERR <- ave_err
      vars_AVE_ERR <- vars_list
    }
    
    print(paste(c('k:',k,'\tR2:',R2,'\tRMSE:',RMSE,'\tMAE:',MAE,'\taverage error:',ave_err),collapse = ' '))
  }
  
}

## [1] "Symbols:"
## [1] "Symbols: 200"
## [1] "k: 2 \tR2: 0.275142023665736 \tRMSE: 49.3087706916245 \tMAE: 40.7387863776319 \taverage error: 40.7387863776319"
## [1] "Symbols: 180 200"
## [1] "k: 3 \tR2: 0.486055497184808 \tRMSE: 41.51985487171 \tMAE: 32.4458173131512 \taverage error: 32.4458173131512"
## [1] "Symbols: 160 180 200"
## [1] "k: 4 \tR2: 0.658443687491088 \tRMSE: 33.8476773388222 \tMAE: 25.1241047744123 \taverage error: 25.1241047744123"
## [1] "Symbols: 140 160 180 200"
## [1] "k: 5 \tR2: 0.780997876071051 \tRMSE: 27.1032816129095 \tMAE: 18.958081064238 \taverage error: 18.958081064238"
## [1] "Symbols: 120 140 160 180 200"
## [1] "k: 6 \tR2: 0.875101869740499 \tRMSE: 20.4680129329383 \tMAE: 13.3568194569542 \taverage error: 13.3568194569542"
## [1] "Symbols: 100 120 140 160 180 200"
## [1] "k: 7 \tR2: 0.933346380019058 \tRMSE: 14.9523544908846 \tMAE: 8.92734739225712 \taverage error: 8.92734739225712"
## [1] "Symbols: 80 100 120 140 160 180 200"
## [1] "k: 8 \tR2: 0.972574499607763 \tRMSE: 9.59124521064195 \tMAE: 5.42022206000553 \taverage error: 5.42022206000553"
## [1] "Symbols: 60 80 100 120 140 160 180 200"
## [1] "k: 9 \tR2: 0.99038745843025 \tRMSE: 5.67828033477872 \tMAE: 3.05021617037384 \taverage error: 3.05021617037384"
## [1] "Symbols: 40 60 80 100 120 140 160 180 200"
## [1] "k: 10 \tR2: 0.998832826455744 \tRMSE: 1.97863383697429 \tMAE: 1.04373981139342 \taverage error: 1.04373981139342"
## [1] "Symbols: 20 40 60 80 100 120 140 160 180 200"
## [1] "k: 11 \tR2: 0.999962212587139 \tRMSE: 0.356017456861795 \tMAE: 0.28287512424236 \taverage error: 0.28287512424236"
## [1] "Symbols: 20 40 60 80 100 103 120 140 160 180 200"
## [1] "k: 12 \tR2: 0.999962266534425 \tRMSE: 0.355763231526387 \tMAE: 0.283196273383772 \taverage error: 0.283196273383771"
## [1] "Symbols: 20 40 60 80 100 101 103 120 140 160 180 200"
## [1] "k: 13 \tR2: 0.999962374659974 \tRMSE: 0.355253144647945 \tMAE: 0.282816820017619 \taverage error: 0.282816820017619"
## [1] "Symbols: 20 40 60 80 100 101 103 120 140 151 160 180 200"
## [1] "k: 14 \tR2: 0.99996238866784 \tRMSE: 0.355187008348016 \tMAE: 0.282607265402878 \taverage error: 0.282607265402876"
## [1] "Symbols: 20 40 41 60 80 100 101 103 120 140 151 160 180 200"
## [1] "k: 15 \tR2: 0.999962469017612 \tRMSE: 0.354807409274044 \tMAE: 0.281602960070669 \taverage error: 0.281602960070668"
## [1] "Symbols: 20 40 41 60 80 100 101 103 120 140 151 160 170 180 200"
## [1] "k: 16 \tR2: 0.999962901381525 \tRMSE: 0.352757765357202 \tMAE: 0.278182329824079 \taverage error: 0.278182329824079"
## [1] "Symbols: 20 40 41 60 80 100 101 103 120 140 151 160 170 180 181 200"
## [1] "k: 17 \tR2: 0.999962915721342 \tRMSE: 0.352689582617975 \tMAE: 0.278619287139032 \taverage error: 0.278619287139032"
## [1] "Symbols: 20 24 40 41 60 80 100 101 103 120 140 151 160 170 180 181 200"
## [1] "k: 18 \tR2: 0.999963053166704 \tRMSE: 0.352035389672752 \tMAE: 0.281380246731073 \taverage error: 0.281380246731074"
## [1] "Symbols: 20 24 40 41 60 80 100 101 103 120 136 140 151 160 170 180 181 200"
## [1] "k: 19 \tR2: 0.999963067365822 \tRMSE: 0.351967737428922 \tMAE: 0.281643628939542 \taverage error: 0.281643628939543"
## [1] "Symbols: 4 20 24 40 41 60 80 100 101 103 120 136 140 151 160 170 180 181 200"
## [1] "k: 20 \tR2: 0.999963144541285 \tRMSE: 0.351599804318013 \tMAE: 0.280290688710759 \taverage error: 0.28029068871076"

print(paste(c('k:',k_RMSE,' Symbols:',vars_RMSE,' RMSE:',best_RMSE),collapse = ' '))

## [1] "k: 20  Symbols: 4 20 24 40 41 60 80 100 101 103 120 136 140 151 160 170 180 181 200  RMSE: 0.351599804318013"

print(paste(c('k:',k_AVE_ERR,' Symbols:',vars_AVE_ERR,' average error:',best_AVE_ERR),collapse = ' '))

## [1] "k: 16  Symbols: 20 40 41 60 80 100 101 103 120 140 151 160 170 180 200  average error: 0.278182329824079"

# By inspection, our choice for number of variables is 11.

mask <- feature_selection_mir(m, k=11)
mask <- mask[1:(length(mask)-1)]
vars_list <- col_seq[mask]
print(paste(c('Symbols:',vars_list),collapse = ' '))

## [1] "Symbols: 20 40 60 80 100 120 140 160 180 200"

x <- dfx[,which(mask)]
df <- as.data.frame(cbind(x,y))

model <- lm(y~.,data=df)
  
col1 <- summary(model)$coefficients[,1]
intercept <- col1[1]
W <- col1[2:length(col1)]

save(vars_list,W,intercept,file="SelectKBest_mutual_info_regression.rda")

z <- rowSums(as.data.table(Map("*",x,W)))+intercept

plot(y,pch = 19, cex = .9,lwd = 1, col="green")
points(z,pch = 21, cex = 1.1,lwd = 1.5,col="orange")

Conclusion

We succeeded to find precisely the desired variables and the prescribed coefficients. We first tried using SelectKBest with the popular score function f_regression for regression problems. This score function was not useful to find all of the desired variables and only those variables; consequently, we did not obtain the prescribed coefficients and the plot was unsatisfactory. We succeeded upon using SelectKBest with the score function mutual_info_regression.

Feature Selection for Regression Problem

Stephen Bique

6/23/2021

Executive Summary

Select significant variables

SelectKBest using the f_regression() function

SelectKBest using the mutual_info_regression() function

Conclusion