IBM-Summit
We have discussed the HPC race previously; see: https://rpubs.com/alex-lev/696179, https://rpubs.com/alex-lev/694840, https://rpubs.com/alex-lev/693131, https://rpubs.com/alex-lev/553777
Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/
Note: the original variable names were changed (truncated, concatenated and simplified as much as possible) to make data exploration and visualization easier.
library(readxl)
library(tidyverse)
library(tidyquant)
library(broom)
library(DT)
library(knitr)
library(popbio)
library(pscl)
library(caTools)
# Import the TOP500 list from the local Excel workbook.
TOP500_202011 <- read_excel(path = "top500/TOP500_202011.xlsx")
# List the column names of the imported table.
names(TOP500_202011)
## [1] "Rank" "PreviousRank"
## [3] "FirstAppearance" "FirstRank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "TotalCores"
## [13] "AcceleratorCoProcessorCores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "PowerSource"
## [21] "PowerEfficiency" "Architecture"
## [23] "Processor" "ProcessorTechnology"
## [25] "ProcessorSpeed" "OperatingSystem"
## [27] "OSFamily" "AcceleratorCoProcessor"
## [29] "CoresperSocket" "ProcessorGeneration"
## [31] "SystemModel" "SystemFamily"
## [33] "InterconnectFamily" "Interconnect"
## [35] "Continent" "SiteID"
## [37] "SystemID"
We want to classify the HPC mainframes of China and the USA by applying a generalized linear model (logistic regression).
# Keep only the Chinese and US systems, restrict the table to the modelling
# columns, encode the response as a factor and drop incomplete rows.
TOP500_USCH <- TOP500_202011 %>%
  as_tibble() %>%
  filter(Country %in% c("United States", "China")) %>%
  select(
    Country, Year, TotalCores, Rmax, Rpeak, Nmax, Power,
    PowerEfficiency, ProcessorSpeed, CoresperSocket
  ) %>%
  mutate(Country = as.factor(Country)) %>%
  drop_na()

# Interactive table of the modelling data.
# Static alternative: kable(align = "r",caption = "TOP500: China - USA")
datatable(TOP500_USCH)
The dataset above contains 98 observations (mainframes) and 10 variables.
# Repeated random 50/50 hold-out validation of the logistic model:
# fit on one half of the rows, score the other half, repeat n_rep times.
set.seed(12345)
n_rep <- 3000
n_obs <- nrow(TOP500_USCH)
# Preallocate the result vector instead of growing it inside the loop.
MisClassError <- numeric(n_rep)
for (i in seq_len(n_rep)) {
  # Split by row index rather than with setdiff() on the data frames.
  # setdiff() is set-based (it collapses duplicated rows) and may be masked by
  # packages attached after dplyr, in which case it does not return the
  # held-out rows at all and the model is scored on its own training data.
  train_idx <- sample.int(n_obs, size = round(0.5 * n_obs))
  train <- TOP500_USCH[train_idx, ]
  test <- TOP500_USCH[base::setdiff(seq_len(n_obs), train_idx), ]

  fit.glm <- glm(Country ~ ., data = train, family = binomial(link = "logit"))

  # With a factor response glm() treats the first level as "failure", so the
  # predicted value is the probability of the non-first level (here the
  # non-"China" class, coded 1 below).
  res_test <- predict(fit.glm, newdata = test, type = "response")
  fitted.results <- ifelse(res_test > 0.5, 1, 0)
  observed <- ifelse(test$Country == "China", 0, 1)

  MisClassError[i] <- round(mean(fitted.results != observed), 3)
}
# Classification accuracy (1 - misclassification error) per resampling run.
class_accuracy <- 1 - MisClassError
summary(class_accuracy)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6420 0.8840 0.9160 0.9079 0.9370 0.9790
# 10th, 50th and 90th percentiles of the accuracy.
quantile(class_accuracy, probs = c(0.1, 0.5, 0.9))
## 10% 50% 90%
## 0.853 0.916 0.947
# Report the mean accuracy over all resampling runs, rounded to 3 decimals.
print(paste(
  "Mean Classification Accuracy =",
  round(mean(1 - MisClassError), 3)
))
## [1] "Mean Classification Accuracy = 0.908"
# Single-panel layout for the accuracy histogram.
par(mfrow = c(1, 1))
hist(
  1 - MisClassError,
  breaks = 20,
  col = "blue",
  main = "True Classification Proportion",
  xlab = "Proportion"
)
# Coefficient table of the model fitted in the last resampling iteration.
summary(fit.glm)
##
## Call:
## glm(formula = Country ~ ., family = binomial(link = "logit"),
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## 0.00 0.00 0.00 0.00 8.49
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.608e+16 1.355e+10 1925498 <2e-16 ***
## Year -1.401e+13 6.708e+06 -2088551 <2e-16 ***
## TotalCores -1.609e+09 1.425e+02 -11285719 <2e-16 ***
## Rmax 1.789e+11 2.701e+03 66240523 <2e-16 ***
## Rpeak -1.415e+11 2.835e+03 -49911351 <2e-16 ***
## Nmax 1.822e+08 6.895e+00 26421993 <2e-16 ***
## Power 3.534e+11 5.741e+03 61548681 <2e-16 ***
## PowerEfficiency 3.490e+13 4.708e+06 7413749 <2e-16 ***
## ProcessorSpeed 4.108e+10 3.844e+04 1068658 <2e-16 ***
## CoresperSocket 4.920e+13 1.019e+06 48267957 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 65.438 on 48 degrees of freedom
## Residual deviance: 360.437 on 39 degrees of freedom
## AIC: 380.44
##
## Number of Fisher Scoring iterations: 14
Here we generate classification charts for each specification (variable) used in the glm model above.
#' Plot the probability of an HPC system being Chinese against one predictor.
#'
#' Draws a logistic histogram plot (`logi.hist.plot()`) of the China indicator
#' against one column of `x`, then overlays the probabilities fitted by `model`.
#'
#' @param x Data frame whose first column holds the country label and whose
#'   other columns are predictors.
#' @param column Index of the predictor column to plot.
#' @param model Fitted model whose `fitted()` values are overlaid. Assumed to
#'   have been fitted on the rows of `x` with the first column as response.
#' @return Called for its plotting side effects.
fun1 <- function(x, column, model) {
  country <- x[[1]]
  is_china <- ifelse(country == "China", 1, 0)

  logi.hist.plot(
    x[[column]], is_china,
    boxp = FALSE, type = "hist", col = "gray",
    xlabel = colnames(x)[[column]],
    mainlabel = "Probability of HPC CHINA in TOP500", logi.mod = 2
  )

  # The chart shows P(China). A binomial glm() with a factor response treats
  # the first level as "failure", so when "China" is the first level the fitted
  # values are P(not China) and must be flipped before being overlaid.
  p_china <- fitted(model)
  if (is.factor(country) && identical(levels(country)[1], "China")) {
    p_china <- 1 - p_china
  }

  points(x[[column]], p_china, pch = 20)
  grid()
}
set.seed(12345)
# Refit the logistic model, this time on the complete China/USA table.
fit.glm <- glm(
  formula = Country ~ .,
  data = TOP500_USCH,
  family = binomial(link = "logit")
)
# 3 x 3 panel grid: one chart per column after the first (the response).
par(mfrow = c(3, 3))
for (i in 2:ncol(TOP500_USCH)) {
  fun1(TOP500_USCH, i, fit.glm)
}
The model glm(Country~Rmax/Rpeak) is discussed in https://rpubs.com/alex-lev/693131.