IBM-Summit

Introduction

We have discussed the HPC race previously. See: https://rpubs.com/alex-lev/696179, https://rpubs.com/alex-lev/694840, https://rpubs.com/alex-lev/693131, https://rpubs.com/alex-lev/553777

TOP500 data

Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/
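
For reproducibility, the spreadsheet can also be fetched from within R. A minimal sketch follows; the .xlsx address is only a placeholder and must be replaced with the actual download link from the page above:

# Sketch: download the November 2020 list into the "top500" folder used below.
# The URL is a placeholder, not a real direct link -- copy the actual .xlsx
# address from the TOP500 page above.
dir.create("top500", showWarnings = FALSE)
xlsx_url <- "https://www.top500.org/..."
download.file(xlsx_url, destfile = "top500/TOP500_202011.xlsx", mode = "wb")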

Filtering Data

Note: the original variable names were truncated, concatenated and simplified as much as possible to make data exploration and visualization easier (a sketch of this kind of name cleaning is shown after the variable listing below).

library(MASS)          # lda(); loaded before tidyverse so dplyr::select() is not masked
library(readxl)
library(tidyverse)
library(tidyquant)
library(broom)
library(DT)            # datatable()
library(knitr)         # kable()
library(e1071)         # svm(), naiveBayes()
library(randomForest)



TOP500_202011 <- read_excel("top500/TOP500_202011.xlsx")
names(TOP500_202011)
##  [1] "Rank"                        "PreviousRank"               
##  [3] "FirstAppearance"             "FirstRank"                  
##  [5] "Name"                        "Computer"                   
##  [7] "Site"                        "Manufacturer"               
##  [9] "Country"                     "Year"                       
## [11] "Segment"                     "TotalCores"                 
## [13] "AcceleratorCoProcessorCores" "Rmax"                       
## [15] "Rpeak"                       "Nmax"                       
## [17] "Nhalf"                       "HPCG"                       
## [19] "Power"                       "PowerSource"                
## [21] "PowerEfficiency"             "Architecture"               
## [23] "Processor"                   "ProcessorTechnology"        
## [25] "ProcessorSpeed"              "OperatingSystem"            
## [27] "OSFamily"                    "AcceleratorCoProcessor"     
## [29] "CoresperSocket"              "ProcessorGeneration"        
## [31] "SystemModel"                 "SystemFamily"               
## [33] "InterconnectFamily"          "Interconnect"               
## [35] "Continent"                   "SiteID"                     
## [37] "SystemID"
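
A hypothetical illustration of the renaming described above (the raw headers shown here are assumptions for demonstration, not the actual column names of the downloaded file):

# Strip bracketed units and punctuation from raw headers to get compact names.
simplify_names <- function(x) {
  x <- gsub("\\[.*\\]|\\(.*\\)", "", x)  # drop bracketed units, if present
  gsub("[^[:alnum:]]", "", x)            # drop spaces, slashes and dashes
}
simplify_names(c("Accelerator/Co-Processor Cores", "Cores per Socket"))
## [1] "AcceleratorCoProcessorCores" "CoresperSocket"
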
TOP500_USCH <- as_tibble(TOP500_202011) %>%
  filter(Country %in% c("United States", "China")) %>%
  select(Country, Year, TotalCores, Rmax, Rpeak, Nmax, Power,
         PowerEfficiency, ProcessorSpeed, CoresperSocket) %>%
  mutate(Country = as.factor(Country)) %>%
  drop_na()

TOP500_USCH  %>% datatable()
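
Before fitting any models it is worth checking how many systems each country contributes to the filtered sample (a quick supplementary check, not reported in the original text):

# Class balance of the filtered US/China sample.
TOP500_USCH %>% count(Country)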

Problem

We want to compare machine learning (ML) classification methods on the TOP500 data for Chinese and US supercomputers. The methods considered are logistic regression, random forests (ensembles of classification trees), support vector machines, naive Bayes and linear discriminant analysis.

NOTE: to save time, the chunk below is not executed when the document is knitted; its results are saved to a file and loaded in the next section.
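
As a warm-up, here is a minimal single-split sketch (not part of the original loop) of what one iteration does in the GLM case, shown as a confusion matrix instead of an error rate; the seed is arbitrary:

# One 50/50 train/test split with logistic regression -- a hedged sketch of a
# single loop iteration.
set.seed(1)
tr <- sample_frac(TOP500_USCH, 0.5, replace = FALSE)
te <- TOP500_USCH %>% setdiff(tr)
m  <- glm(Country ~ ., data = tr, family = binomial(link = "logit"))
p  <- ifelse(predict(m, te, type = "response") > 0.5, "United States", "China")
table(predicted = p, actual = te$Country)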

set.seed(12345)
MisClsErrGLM <- c()
MisClsErrSVM <- c()
MisClsErrNB <- c()
MisClsErrRF <- c()
MisClsErrLDA <- c()
for (i in 1:3000) {
  #Split data
  train <- sample_frac(TOP500_USCH, 0.5, replace = FALSE)
  test  <- TOP500_USCH %>% setdiff(train)
  #GLM (logistic regression)
  fit.glm <- glm(Country ~ ., data = train, family = binomial(link = "logit"))
  res_test <- predict.glm(fit.glm, test, type = "response")
  fitted.results <- ifelse(res_test > 0.5, 1, 0)  # 1 = "United States", 0 = "China"
  MisClsErrGLM[i] <- round(mean(fitted.results != ifelse(test$Country == "China", 0, 1)), 3)
  #Random Forest
  usch.rf <- randomForest(Country ~ ., data = train, importance = TRUE,
                          proximity = TRUE, replace = FALSE, ntree = 500)
  pred_rf <- predict(usch.rf, test)                # classify the held-out systems
  MisClsErrRF[i] <- mean(pred_rf != test$Country)  # test-set misclassification rate
  #LDA
  fit.lda <- lda(Country ~ ., data = train)
  prd.lda <- predict(fit.lda, test)
  MisClsErrLDA[i] <- mean(prd.lda$class != test$Country)
  #Support Vector Machine
  model.svm <- svm(Country ~ ., data = train, kernel = "linear",
                   type = "C-classification", scale = TRUE, probability = TRUE)
  pred_svm <- predict(model.svm, test, probability = TRUE)
  MisClsErrSVM[i] <- mean(pred_svm != test$Country)
  #Naive Bayes
  model.nb <- naiveBayes(Country ~ ., data = train)
  pred_country <- predict(model.nb, test)
  MisClsErrNB[i] <- mean(pred_country != test$Country)

}

save(MisClsErrGLM,MisClsErrSVM,MisClsErrNB,MisClsErrRF,MisClsErrLDA,file = "MisClsML.dat")

Comparing accuracy of ML methods

load("MisClsML.dat") # loading results of the code above
GLM_AC <- summary(1 - MisClsErrGLM)
SVM_AC <- summary(1 - MisClsErrSVM)
NB_AC  <- summary(1 - MisClsErrNB)
RF_AC  <- summary(1 - MisClsErrRF)
LDA_AC <- summary(1 - MisClsErrLDA)
TAB_AC <- rbind(GLM_AC,SVM_AC,NB_AC,RF_AC,LDA_AC)
TAB_AC %>% kable(align = "r",caption = "ML Accuracy")
ML Accuracy

             Min.   1st Qu.    Median      Mean   3rd Qu.      Max.
GLM_AC  0.5260000 0.8840000 0.9160000 0.9070887 0.9370000 0.9790000
SVM_AC  0.3368421 0.6105263 0.7789474 0.7449368 0.8315789 0.9157895
NB_AC   0.3157895 0.7052632 0.7894737 0.7280526 0.8421053 0.9052632
RF_AC   0.2777778 0.7222222 0.7777778 0.7777778 0.8333333 1.0000000
LDA_AC  0.7263158 0.8315789 0.8526316 0.8495053 0.8631579 0.9368421
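
The summary above reports only location statistics; the spread of each accuracy distribution can be summarised as well (a supplementary sketch, not part of the original output):

# Standard deviation of each method's accuracy over the 3000 splits.
sapply(list(GLM = 1 - MisClsErrGLM, SVM = 1 - MisClsErrSVM, NB = 1 - MisClsErrNB,
            RF = 1 - MisClsErrRF, LDA = 1 - MisClsErrLDA), sd)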
hist(1-MisClsErrGLM,breaks = 20,col="blue",main = "True Classification Proportion  GLM",
     xlab = "Proportion")

hist(1-MisClsErrSVM,breaks = 20,col="grey",main = "True Classification Proportion  SVM",
     xlab = "Proportion")

hist(1-MisClsErrNB,breaks = 20,col="green",main = "True Classification Proportion  NB",
     xlab = "Proportion")

hist(1-MisClsErrRF,breaks = 20,col="magenta",main = "True Classification Proportion  RF",
     xlab = "Proportion")

hist(1-MisClsErrLDA,breaks = 20,col="yellow",main = "True Classification Proportion  LDA",
     xlab = "Proportion")
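
The five histograms are easier to compare side by side; a sketch of a combined boxplot (an addition, not in the original analysis):

# Side-by-side accuracy distributions of the five methods.
acc <- data.frame(GLM = 1 - MisClsErrGLM, SVM = 1 - MisClsErrSVM,
                  NB  = 1 - MisClsErrNB,  RF  = 1 - MisClsErrRF,
                  LDA = 1 - MisClsErrLDA)
boxplot(acc, col = c("blue", "grey", "green", "magenta", "yellow"),
        main = "True Classification Proportion by Method", ylab = "Proportion")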

Conclusion

Logistic regression (GLM) yields the most accurate classification of the five methods, with a median accuracy of about 0.92 over the 3000 random splits, followed by linear discriminant analysis (median about 0.85).
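
Whether the GLM advantage over LDA is systematic across the splits could be checked with a paired test (a supplementary sketch, not part of the original analysis):

# Paired comparison of GLM and LDA accuracy: both methods are evaluated on the
# same train/test split within each iteration, so a paired test is appropriate.
t.test(1 - MisClsErrGLM, 1 - MisClsErrLDA, paired = TRUE)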