IBM-Summit
We have discussed the HPC race previously. See: https://rpubs.com/alex-lev/696179, https://rpubs.com/alex-lev/694840, https://rpubs.com/alex-lev/693131, https://rpubs.com/alex-lev/553777
Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/
Note: the original variable names were truncated, concatenated, and simplified as far as possible to ease data exploration and visualization (a sketch of this kind of renaming follows the variable listing below).
library(readxl)
library(MASS)         # for lda(); loaded before tidyverse so dplyr::select() is not masked
library(tidyverse)
library(tidyquant)
library(broom)
library(DT)
library(knitr)
library(e1071)
library(randomForest)
TOP500_202011 <- read_excel("top500/TOP500_202011.xlsx")
names(TOP500_202011)
## [1] "Rank" "PreviousRank"
## [3] "FirstAppearance" "FirstRank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "TotalCores"
## [13] "AcceleratorCoProcessorCores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "PowerSource"
## [21] "PowerEfficiency" "Architecture"
## [23] "Processor" "ProcessorTechnology"
## [25] "ProcessorSpeed" "OperatingSystem"
## [27] "OSFamily" "AcceleratorCoProcessor"
## [29] "CoresperSocket" "ProcessorGeneration"
## [31] "SystemModel" "SystemFamily"
## [33] "InterconnectFamily" "Interconnect"
## [35] "Continent" "SiteID"
## [37] "SystemID"
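For illustration, renaming of that kind can be done with dplyr::rename(). The left-hand sides below are the shortened names actually used here; the right-hand sides are assumed original spreadsheet headers, shown for illustration only and not necessarily matching the real TOP500 file:

# Hypothetical sketch: map verbose spreadsheet headers to the short names above
TOP500_202011 <- TOP500_202011 %>%
  rename(TotalCores     = `Total Cores`,
         CoresperSocket = `Cores per Socket`,
         ProcessorSpeed = `Processor Speed (MHz)`)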
TOP500_USCH <- as_tibble(TOP500_202011) %>%
  filter(Country %in% c("United States", "China")) %>%
  select(Country, Year, TotalCores, Rmax, Rpeak, Nmax,
         Power, PowerEfficiency, ProcessorSpeed, CoresperSocket) %>%
  mutate(Country = as.factor(Country)) %>%
  drop_na()
TOP500_USCH %>% datatable()
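Before fitting any model it is worth checking the class balance between the two countries after filtering and drop_na(); a minimal check:

# Number of remaining systems per country
TOP500_USCH %>% count(Country)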
We want to compare Machine Learning (ML) classification methods using TOP500 data for Chinese and US systems. For background, see: logistic regression, random forests, support vector machines, naive Bayes, and linear discriminant analysis.
NOTE: to save rendering time, the simulation below is not re-executed here; its saved results are loaded further down.
set.seed(12345)
MisClsErrGLM <- c()
MisClsErrSVM <- c()
MisClsErrNB <- c()
MisClsErrRF <- c()
MisClsErrLDA <- c()
for (i in 1:3000) {
  # Split data 50/50 into train and test sets
  train <- sample_frac(TOP500_USCH, 0.5, replace = FALSE)
  test  <- TOP500_USCH %>% setdiff(train)
  # GLM (logistic regression); levels are sorted, so China = 0, United States = 1
  fit.glm <- glm(Country ~ ., data = train, family = binomial(link = "logit"))
  res_test <- predict.glm(fit.glm, test, type = "response")
  fitted.results <- ifelse(res_test > 0.5, 1, 0)
  MisClsErrGLM[i] <- round(mean(fitted.results != ifelse(test$Country == "China", 0, 1)), 3)
  # Random Forest; scored on the held-out test set like the other methods
  usch.rf <- randomForest(Country ~ ., data = train, importance = TRUE,
                          proximity = TRUE, replace = FALSE, ntree = 500)
  MisClsErrRF[i] <- mean(predict(usch.rf, test) != test$Country)
  # LDA
  fit.lda <- lda(Country ~ ., data = train)
  prd.lda <- predict(fit.lda, test)
  MisClsErrLDA[i] <- mean(prd.lda$class != test$Country)
  # Support Vector Machine with a linear kernel
  model.svm <- svm(Country ~ ., data = train, kernel = "linear",
                   type = "C-classification", scale = TRUE, probability = TRUE)
  pred_svm <- predict(model.svm, test, probability = TRUE)
  MisClsErrSVM[i] <- mean(pred_svm != test$Country)
  # Naive Bayes
  model.nb <- naiveBayes(Country ~ ., data = train)
  pred_country <- predict(model.nb, test)
  MisClsErrNB[i] <- mean(pred_country != test$Country)
}
save(MisClsErrGLM,MisClsErrSVM,MisClsErrNB,MisClsErrRF,MisClsErrLDA,file = "MisClsML.dat")
load("MisClsML.dat") # loading results of the code above
GLM_AC <- summary(1 - MisClsErrGLM)
SVM_AC <- summary(1 - MisClsErrSVM)
NB_AC  <- summary(1 - MisClsErrNB)
RF_AC  <- summary(1 - MisClsErrRF)
LDA_AC <- summary(1 - MisClsErrLDA)
TAB_AC <- rbind(GLM_AC, SVM_AC, NB_AC, RF_AC, LDA_AC)
TAB_AC %>% kable(align = "r", caption = "ML Accuracy")
| | Min. | 1st Qu. | Median | Mean | 3rd Qu. | Max. |
|---|---|---|---|---|---|---|
| GLM_AC | 0.5260000 | 0.8840000 | 0.9160000 | 0.9070887 | 0.9370000 | 0.9790000 |
| SVM_AC | 0.3368421 | 0.6105263 | 0.7789474 | 0.7449368 | 0.8315789 | 0.9157895 |
| NB_AC | 0.3157895 | 0.7052632 | 0.7894737 | 0.7280526 | 0.8421053 | 0.9052632 |
| RF_AC | 0.2777778 | 0.7222222 | 0.7777778 | 0.7777778 | 0.8333333 | 1.0000000 |
| LDA_AC | 0.7263158 | 0.8315789 | 0.8526316 | 0.8495053 | 0.8631579 | 0.9368421 |
hist(1-MisClsErrGLM,breaks = 20,col="blue",main = "True Classification Proportion GLM",
xlab = "Proportion")
hist(1-MisClsErrSVM,breaks = 20,col="grey",main = "True Classification Proportion SVM",
xlab = "Proportion")
hist(1-MisClsErrNB,breaks = 20,col="green",main = "True Classification Proportion NB",
xlab = "Proportion")
hist(1-MisClsErrRF,breaks = 20,col="magenta",main = "True Classification Proportion RF",
xlab = "Proportion")
hist(1-MisClsErrLDA,breaks = 20,col="yellow",main = "True Classification Proportion LDA",
xlab = "Proportion")
Logistic regression via GLM produces the most accurate classification results.
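That ranking can be checked more formally. For example, since the two best methods (GLM and LDA) were evaluated on the same 3000 splits, a paired Wilcoxon test applies; a sketch:

# Paired comparison of GLM vs. LDA accuracy over the same splits
wilcox.test(1 - MisClsErrGLM, 1 - MisClsErrLDA, paired = TRUE)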