1 Introduction

We apply logistic regression for classification between USA and China top500 HPC mainframes.

For more see previous topic https://rpubs.com/alex-lev/553777

2 Preparing and Exploring Data

library(readr)
library(popbio)
library(ggplot2)
library(dplyr)
library(pscl)
library(DT)
TOP500_201911 <- read_csv("top500/TOP500_201911.csv")
names(TOP500_201911)
##  [1] "Rank"                          "Previous.Rank"                
##  [3] "First.Appearance"              "First.Rank"                   
##  [5] "Name"                          "Computer"                     
##  [7] "Site"                          "Manufacturer"                 
##  [9] "Country"                       "Year"                         
## [11] "Segment"                       "Total.Cores"                  
## [13] "Accelerator.CoProcessor.Cores" "Rmax"                         
## [15] "Rpeak"                         "Nmax"                         
## [17] "Nhalf"                         "HPCG"                         
## [19] "Power"                         "Power. Source"                
## [21] "Power.Effeciency"              "Architecture"                 
## [23] "Processor"                     "Processor.Technology"         
## [25] "Processor.Speed"               "Operating.System"             
## [27] "OS.Family"                     "Accelerator.CoProcessor"      
## [29] "Cores.per.Socket"              "Processor.Generation"         
## [31] "System.Model"                  "System.Family"                
## [33] "Interconnect.Family"           "Interconnect"                 
## [35] "Region"                        "Continent"                    
## [37] "Site.ID"                       "System. ID"
TOP500_USCH <- TOP500_201911%>% filter(Country==c("China","United States")) %>%
  mutate(EFF=round(Rmax/Rpeak,3),CAT=if_else(Country=="China",1,0)) %>% select_at(vars(EFF,CAT,Country))
as_tibble(TOP500_USCH)%>% datatable()
cor.test(TOP500_USCH$EFF,TOP500_USCH$CAT)
## 
##  Pearson's product-moment correlation
## 
## data:  TOP500_USCH$EFF and TOP500_USCH$CAT
## t = -14.095, df = 166, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8001040 -0.6605198
## sample estimates:
##        cor 
## -0.7381116
ggplot(TOP500_USCH,aes(x=Country,y=EFF,fill=Country)) +
  geom_violin() + geom_jitter(na.rm = T) + 
  theme_bw() + ggtitle("EFF=Rmax/Rpeak by Country")+
  theme(plot.title = element_text(hjust = .5))

TOP500_USCH %>% group_by(Country) %>% summarise(M_EFF=round(mean(EFF),3)) %>% datatable()
t.test(EFF~Country, data=TOP500_USCH)# Student t-test for the mean values
## 
##  Welch Two Sample t-test
## 
## data:  EFF by Country
## t = -15.118, df = 144.29, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.2907198 -0.2234932
## sample estimates:
##         mean in group China mean in group United States 
##                   0.4938257                   0.7509322

As we can see there is significant difference between USA and CHINA in terms of the mean values of \(EFF=Rmax/Rpeak\): \(M[EFF]_{USA}=0.75\), \(M[EFF]_{CHINA}=0.49\) that is China’s HPC mainframes produce half of the theoretical peak performance while the US ones produce 75%. Now we are ready to discriminate HPC mainframes of two leading countries applying logistic regression.

3 Logistic model

About logistic regression math see: https://en.wikipedia.org/wiki/Logistic_regression. \[\log{\frac{p}{1-p}}={b_0 + b_1x}\] or \[p=\frac{1}{1+e^{-(b_0+b_1x)}}\]

fit.glm <- glm(CAT~EFF,data = TOP500_USCH,family = binomial(link = "logit"))
# put the fitted values in the data.frame
fit.glm
## 
## Call:  glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = TOP500_USCH)
## 
## Coefficients:
## (Intercept)          EFF  
##       14.63       -22.31  
## 
## Degrees of Freedom: 167 Total (i.e. Null);  166 Residual
## Null Deviance:       217.8 
## Residual Deviance: 81.3  AIC: 85.3
TOP500_USCH$fitted.values <- fit.glm$fitted.values
pred <- predict(fit.glm, data.frame(EFF = TOP500_USCH$EFF), type = "link", se.fit = TRUE)
TOP500_USCH$fit <- pred$fit
TOP500_USCH$se.fit <- pred$se.fit
# CI for fitted values
TOP500_USCH <- within(TOP500_USCH, {
  fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
  fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})

p <- ggplot(TOP500_USCH, aes(x = EFF, y = CAT))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = EFF, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = EFF, y = fitted.values), color = "red")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = paste("Observed and predicted probability of HPC CHINA top500.org in 2019")) + xlab("EFF=Rmax/Rpeak") + ylab("Probability") + theme_bw() 
print(p)

\[Prob(Country=CHINA)=\frac{1}{1+e^{-(13.76 -21.01\frac{Rmax}{Rpeak})}}\]

#coefficients
fit.glm$coefficients
## (Intercept)         EFF 
##    14.62884   -22.31337
#mean  value of EFF=0.5 for CHINA
prob_ch <- predict(fit.glm, data.frame(EFF = 0.5), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_ch$fit,3)))
## [1] "Probability for CHINA = 0.97"
#mean value of EFF=0.75 for USA
prob_us <- predict(fit.glm, data.frame(EFF = 0.75), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_us$fit,3)))
## [1] "Probability for CHINA = 0.108"

4 Splitting Data Set and Training Model

MisClassError <- c()

N <- 3000
set.seed(12345)
for (i in 1:N) {
train <- sample_frac(TOP500_USCH,0.8,replace = F)
test <-TOP500_USCH %>% setdiff(train)
fit.glm <- glm(CAT~EFF,data = train,family = binomial(link = "logit"))

res_test <- predict.glm(fit.glm,test,type="response",se.fit = FALSE)

fitted.results <- ifelse(res_test > 0.5,1,0)
MisClassError[i] <- round(mean(fitted.results != test$CAT),3)

}

MCE <- 1-MisClassError
summary(MCE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.5450  0.8000  0.8570  0.8545  0.9090  1.0000
print(paste('Mean Classification Accuracy =',round(mean(MCE),3)))
## [1] "Mean Classification Accuracy = 0.854"
print(paste('95% Confidence interval for MCE'))
## [1] "95% Confidence interval for MCE"
print(paste('MCE <=',round(mean(MCE)+2*sd(MCE)/sqrt(N),3)))
## [1] "MCE <= 0.857"
print(paste('MCE >=',round(mean(MCE)-2*sd(MCE)/sqrt(N),3)))
## [1] "MCE >= 0.852"
ggplot(as.data.frame(1-MisClassError), aes(x=1-MisClassError)) + 
 geom_histogram(aes(y=..density..), bins = 10,colour="black", fill="green")+
 geom_density(alpha=.2, fill="#FF6666") + theme_bw() + labs(title="True Classification Proportion") + xlab("Proportion") + ylab("Density")+
  theme(plot.title = element_text(hjust = .5))

summary(fit.glm)
## 
## Call:
## glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3047  -0.3800   0.2001   0.3675   1.9918  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   13.757      2.182   6.306 2.86e-10 ***
## EFF          -21.014      3.390  -6.198 5.71e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 173.639  on 133  degrees of freedom
## Residual deviance:  72.228  on 132  degrees of freedom
## AIC: 76.228
## 
## Number of Fisher Scoring iterations: 6
pR2(fit.glm)
##         llh     llhNull          G2    McFadden        r2ML        r2CU 
## -36.1139950 -86.8195896 101.4111892   0.5840340   0.5308346   0.7308498
anova(fit.glm, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: CAT
## 
## Terms added sequentially (first to last)
## 
## 
##      Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                   133    173.639              
## EFF   1   101.41       132     72.228 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

5 Classification Chart

5.1 China

x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,train$CAT,boxp=FALSE,type="hist",col="gray",
               xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC CHINA in TOP500")
points(train$EFF,fitted(fit.glm),pch=20)
grid()

5.2 USA

fit.glm.2 <- glm(1-CAT~EFF,data = train ,family = binomial(link = "logit"))
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm.2,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,1-train$CAT,boxp=FALSE,type="hist",col="gray",
               xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC USA in TOP500")
points(train$EFF,fitted(fit.glm.2),pch=20)
grid()

6 Conclusions

  1. Top500 HPC super mainframes of USA and China can be classified by logistic regression model.
  2. Accuracy of classification ( \(0.852\le MCE \le0.857\)) is appropriate for this purpose.
  3. Though China has got more HPC mainframes compared to USA, the overall effectiveness of US mainframes in terms of \(EFF=Rmax/Rpeak\) is obvious: \(M[EFF]_{USA}=0.75\), \(M[EFF]_{CHINA}=0.49\)