1 Introduction

We apply logistic regression for classification between USA and China top500 HPC mainframes as far as November 2020.

For more see previous topic https://rpubs.com/alex-lev/553777

2 TOP500 data

Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/

3 Preparing and Exploring Data

Note: the original names of variables were changed (truncated, concatenated and simplified) for the purpose of data exploration and visualization as possible.

library(readxl)
library(popbio)
library(ggplot2)
library(dplyr)
library(pscl)
library(DT)
TOP500_202011 <- read_excel("top500/TOP500_202011.xlsx")

names(TOP500_202011)
##  [1] "Rank"                        "PreviousRank"               
##  [3] "FirstAppearance"             "FirstRank"                  
##  [5] "Name"                        "Computer"                   
##  [7] "Site"                        "Manufacturer"               
##  [9] "Country"                     "Year"                       
## [11] "Segment"                     "TotalCores"                 
## [13] "AcceleratorCoProcessorCores" "Rmax"                       
## [15] "Rpeak"                       "Nmax"                       
## [17] "Nhalf"                       "HPCG"                       
## [19] "Power"                       "PowerSource"                
## [21] "PowerEfficiency"             "Architecture"               
## [23] "Processor"                   "ProcessorTechnology"        
## [25] "ProcessorSpeed"              "OperatingSystem"            
## [27] "OSFamily"                    "AcceleratorCoProcessor"     
## [29] "CoresperSocket"              "ProcessorGeneration"        
## [31] "SystemModel"                 "SystemFamily"               
## [33] "InterconnectFamily"          "Interconnect"               
## [35] "Continent"                   "SiteID"                     
## [37] "SystemID"
TOP500_USCH <- TOP500_202011%>% filter(Country==c("China","United States")) %>%
  mutate(EFF=round(Rmax/Rpeak,3),CAT=if_else(Country=="China",1,0)) %>% select_at(vars(EFF,CAT,Country))
as_tibble(TOP500_USCH)%>% datatable()
cor.test(TOP500_USCH$EFF,TOP500_USCH$CAT)
## 
##  Pearson's product-moment correlation
## 
## data:  TOP500_USCH$EFF and TOP500_USCH$CAT
## t = -14.085, df = 154, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8117537 -0.6724720
## sample estimates:
##       cor 
## -0.750324
ggplot(TOP500_USCH,aes(x=Country,y=EFF,fill=Country)) +
  geom_violin() + geom_jitter(na.rm = T) + 
  theme_bw() + ggtitle("EFF=Rmax/Rpeak by Country")+
  theme(plot.title = element_text(hjust = .5))

TOP500_USCH %>% group_by(Country) %>% summarise(M_EFF=round(mean(EFF),3)) %>% datatable()
t.test(EFF~Country, data=TOP500_USCH)# Student t-test for the mean values
## 
##  Welch Two Sample t-test
## 
## data:  EFF by Country
## t = -14.003, df = 103.44, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2650525 -0.1992918
## sample estimates:
##         mean in group China mean in group United States 
##                   0.5045825                   0.7367547

As we can see there is significant difference between USA and CHINA in terms of the mean values of \(EFF=Rmax/Rpeak\): \(M[EFF]_{USA}=0.74\), \(M[EFF]_{CHINA}=0.51\) that is China’s HPC mainframes produce half of the theoretical peak performance while the US ones produce 74%. Now we are ready to discriminate HPC mainframes of two leading countries applying logistic regression.

4 Logistic model

About logistic regression math see: https://en.wikipedia.org/wiki/Logistic_regression. \[\log{\frac{p}{1-p}}={b_0 + b_1x}\] or \[p=\frac{1}{1+e^{-(b_0+b_1x)}}\]

fit.glm <- glm(CAT~EFF,data = TOP500_USCH,family = binomial(link = "logit"))
# put the fitted values in the data.frame
fit.glm
## 
## Call:  glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = TOP500_USCH)
## 
## Coefficients:
## (Intercept)          EFF  
##       15.72       -24.54  
## 
## Degrees of Freedom: 155 Total (i.e. Null);  154 Residual
## Null Deviance:       199.9 
## Residual Deviance: 73.59     AIC: 77.59
TOP500_USCH$fitted.values <- fit.glm$fitted.values
pred <- predict.glm(fit.glm, data.frame(EFF = TOP500_USCH$EFF), type = "link", se.fit = TRUE)
TOP500_USCH$fit <- pred$fit
TOP500_USCH$se.fit <- pred$se.fit
# CI for fitted values
TOP500_USCH <- within(TOP500_USCH, {
  fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
  fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})

p <- ggplot(TOP500_USCH, aes(x = EFF, y = CAT))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = EFF, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = EFF, y = fitted.values), color = "red")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = paste("Observed and predicted probability of HPC CHINA top500.org in 2020")) + xlab("EFF=Rmax/Rpeak") + ylab("Probability") + theme_bw() 
print(p)

\[Prob(Country=CHINA)=\frac{1}{1+e^{-(15.72 - 24.54\frac{Rmax}{Rpeak})}}\]

#coefficients
fit.glm$coefficients
## (Intercept)         EFF 
##    15.72401   -24.53613
#mean  value of EFF=0.5 for CHINA
prob_ch <- predict.glm(fit.glm, data.frame(EFF = 0.5), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_ch$fit,3)))
## [1] "Probability for CHINA = 0.969"
#mean value of EFF=0.75 for USA
prob_us <- predict.glm(fit.glm, data.frame(EFF = 0.74), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_us$fit,3)))
## [1] "Probability for CHINA = 0.081"

5 Splitting Data Set and Training Model

MisClassError <- c()

N <- 3000
set.seed(12345)
for (i in 1:N) {
train <- sample_frac(TOP500_USCH,0.8,replace = F)
test <-TOP500_USCH %>% setdiff(train)
fit.glm <- glm(CAT~EFF,data = train,family = binomial(link = "logit"))

res_test <- predict.glm(fit.glm,test,type="response",se.fit = FALSE)

fitted.results <- ifelse(res_test > 0.5,1,0)
MisClassError[i] <- round(mean(fitted.results != test$CAT),3)

}

MCE <- 1-MisClassError
summary(MCE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5620  0.8120  0.8670  0.8583  0.9090  1.0000
print(paste('Mean Classification Accuracy =',round(mean(MCE),3)))
## [1] "Mean Classification Accuracy = 0.858"
print(paste('95% Confidence interval for MCE'))
## [1] "95% Confidence interval for MCE"
print(paste('MCE <=',round(mean(MCE)+2*sd(MCE)/sqrt(N),3)))
## [1] "MCE <= 0.861"
print(paste('MCE >=',round(mean(MCE)-2*sd(MCE)/sqrt(N),3)))
## [1] "MCE >= 0.856"
ggplot(as.data.frame(1-MisClassError), aes(x=1-MisClassError)) + 
 geom_histogram(aes(y=..density..), bins = 10,colour="black", fill="green")+
 geom_density(alpha=.2, fill="#FF6666") + theme_bw() + labs(title="True Classification Proportion") + xlab("Proportion") + ylab("Density")+
  theme(plot.title = element_text(hjust = .5))

summary(fit.glm)
## 
## Call:
## glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3216  -0.2260   0.2531   0.3932   1.7210  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    14.01       2.27   6.174 6.67e-10 ***
## EFF           -21.52       3.67  -5.863 4.55e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 151.859  on 124  degrees of freedom
## Residual deviance:  64.252  on 123  degrees of freedom
## AIC: 68.252
## 
## Number of Fisher Scoring iterations: 6
pR2(fit.glm)
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -32.1260145 -75.9296147  87.6072005   0.5768974   0.5038404   0.7164430
anova(fit.glm, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: CAT
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   124    151.859              
## EFF   1   87.607       123     64.252 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6 Classification Chart

6.1 China

x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,train$CAT,boxp=FALSE,type="hist",col="gray",
               xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC CHINA in TOP500 2020")
points(train$EFF,fitted(fit.glm),pch=20)
grid()

6.2 USA

fit.glm.2 <- glm(1-CAT~EFF,data = train ,family = binomial(link = "logit"))
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm.2,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,1-train$CAT,boxp=FALSE,type="hist",col="gray",
               xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC USA in TOP500 2020")
points(train$EFF,fitted(fit.glm.2),pch=20)
grid()

7 Conclusions

  1. Top500 HPC super mainframes of USA and China can be classified by logistic regression model.
  2. Accuracy of classification ( \(0.856\le MCE \le0.861\)) is appropriate for this purpose.
  3. Though China has got more HPC mainframes compared to USA, the overall effectiveness of US mainframes in terms of \(EFF=Rmax/Rpeak\) is obvious: \(M[EFF]_{USA}=0.74\), \(M[EFF]_{CHINA}=0.51\)
  4. The race is not over: Japan is a new leader by Rpeak Performance with “Fugaku” supercomputer N1 in TOP500 list (https://rpubs.com/alex-lev/632258)!