IBM-Summit

1 Introduction

We have discussed the HPC race previously. See: https://rpubs.com/alex-lev/694840, https://rpubs.com/alex-lev/693131, https://rpubs.com/alex-lev/553777

2 TOP500 data

Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/

3 Filtering Data

Note: the original variable names were changed (truncated, concatenated and simplified) as much as possible to make data exploration and visualization easier.

library(readxl)
library(tidyverse)
library(tidyquant)
library(broom)
library(DT)
library(knitr)



# Read the November 2020 TOP500 spreadsheet from the local "top500" folder
# and list the column names available for the analysis.
TOP500_202011 <- read_excel(path = "top500/TOP500_202011.xlsx")
colnames(TOP500_202011)
##  [1] "Rank"                        "PreviousRank"               
##  [3] "FirstAppearance"             "FirstRank"                  
##  [5] "Name"                        "Computer"                   
##  [7] "Site"                        "Manufacturer"               
##  [9] "Country"                     "Year"                       
## [11] "Segment"                     "TotalCores"                 
## [13] "AcceleratorCoProcessorCores" "Rmax"                       
## [15] "Rpeak"                       "Nmax"                       
## [17] "Nhalf"                       "HPCG"                       
## [19] "Power"                       "PowerSource"                
## [21] "PowerEfficiency"             "Architecture"               
## [23] "Processor"                   "ProcessorTechnology"        
## [25] "ProcessorSpeed"              "OperatingSystem"            
## [27] "OSFamily"                    "AcceleratorCoProcessor"     
## [29] "CoresperSocket"              "ProcessorGeneration"        
## [31] "SystemModel"                 "SystemFamily"               
## [33] "InterconnectFamily"          "Interconnect"               
## [35] "Continent"                   "SiteID"                     
## [37] "SystemID"

4 Linear Regression

Now we can compare the five countries (USA, China, Japan, France, Germany) leading the TOP500 race by total number of HPC mainframes (see https://rpubs.com/alex-lev/694840). Here we apply linear regression to observe the pace of the race in terms of the coefficients of the log-log model \[\ln(Y_i) = B_0 + B_1 \ln(X_i) + E_i\] that is, \[\ln(Rpeak_i) = B_0 + B_1 \ln(Rmax_i) + E_i\]

4.1 USA

# Reduce the TOP500 table to the four columns used below, keep only the
# United States entries, and drop any row with a missing value.
TOP500.US <- TOP500_202011 %>%
  select(Country, Name, Rmax, Rpeak) %>%
  filter(Country == "United States") %>%
  drop_na()

# Render the subset with DT::datatable().
datatable(TOP500.US)

# Regress log(Rpeak) on log(Rmax) for this country and print the fit summary.
fit.us <- lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.US)
summary(fit.us)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.US)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25568 -0.06590 -0.05357  0.07494  0.62885 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.04388    0.14033   0.313    0.755    
## log(Rmax)    1.03519    0.01740  59.500   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1517 on 90 degrees of freedom
## Multiple R-squared:  0.9752, Adjusted R-squared:  0.9749 
## F-statistic:  3540 on 1 and 90 DF,  p-value: < 2.2e-16
 # Log-log scatter of the US subset (dark blue points) drawn on top of a
 # fitted linear trend; the title is centred.
 ggplot(data = TOP500.US, mapping = aes(x = log(Rmax), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(colour = "darkblue") +
   labs(title = "Rpeak ~ Rmax linear regression USA") +
   theme_bw() +
   theme(plot.title = element_text(hjust = 0.5))

4.2 CHINA

# Reduce the TOP500 table to the four columns used below, keep only the
# China entries, and drop any row with a missing value.
TOP500.CH <- TOP500_202011 %>%
  select(Country, Name, Rmax, Rpeak) %>%
  filter(Country == "China") %>%
  drop_na()

# Render the subset with DT::datatable().
datatable(TOP500.CH)

# Regress log(Rpeak) on log(Rmax) for this country and print the fit summary.
fit.china <- lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.CH)
summary(fit.china)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.CH)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.38385 -0.15664  0.00897  0.05696  0.83472 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.21505    0.32824   3.702 0.000413 ***
## log(Rmax)    0.92633    0.04298  21.550  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2398 on 73 degrees of freedom
## Multiple R-squared:  0.8642, Adjusted R-squared:  0.8623 
## F-statistic: 464.4 on 1 and 73 DF,  p-value: < 2.2e-16
 # Log-log scatter of the China subset (red points) drawn on top of a
 # fitted linear trend; the title is centred.
 ggplot(data = TOP500.CH, mapping = aes(x = log(Rmax), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(colour = "red") +
   labs(title = "Rpeak ~ Rmax linear regression CHINA") +
   theme_bw() +
   theme(plot.title = element_text(hjust = 0.5))

4.3 JAPAN

# Reduce the TOP500 table to the four columns used below, keep only the
# Japan entries, and drop any row with a missing value.
TOP500.JP <- TOP500_202011 %>%
  select(Country, Name, Rmax, Rpeak) %>%
  filter(Country == "Japan") %>%
  drop_na()

# Render the subset with DT::datatable().
datatable(TOP500.JP)

# Regress log(Rpeak) on log(Rmax) for this country and print the fit summary.
fit.jap <- lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.JP)
summary(fit.jap)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.JP)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34356 -0.08090 -0.00476  0.05102  0.77224 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.92589    0.30692   3.017  0.00614 ** 
## log(Rmax)    0.93985    0.03634  25.863  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2207 on 23 degrees of freedom
## Multiple R-squared:  0.9668, Adjusted R-squared:  0.9653 
## F-statistic: 668.9 on 1 and 23 DF,  p-value: < 2.2e-16
 # Log-log scatter of the Japan subset (yellow points) drawn on top of a
 # fitted linear trend; the title is centred.
 ggplot(data = TOP500.JP, mapping = aes(x = log(Rmax), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(colour = "yellow") +
   labs(title = "Rpeak ~ Rmax linear regression JAPAN") +
   theme_bw() +
   theme(plot.title = element_text(hjust = 0.5))

4.4 GERMANY

# Reduce the TOP500 table to the four columns used below, keep only the
# Germany entries, and drop any row with a missing value.
TOP500.GR <- TOP500_202011 %>%
  select(Country, Name, Rmax, Rpeak) %>%
  filter(Country == "Germany") %>%
  drop_na()

# Render the subset with DT::datatable().
datatable(TOP500.GR)

# Regress log(Rpeak) on log(Rmax) for this country and print the fit summary.
fit.gr <- lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.GR)
summary(fit.gr)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.GR)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.274392 -0.075001  0.000116  0.080323  0.180140 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.28069    0.29913   0.938    0.364    
## log(Rmax)    1.01110    0.03509  28.817 7.26e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1276 on 14 degrees of freedom
## Multiple R-squared:  0.9834, Adjusted R-squared:  0.9822 
## F-statistic: 830.4 on 1 and 14 DF,  p-value: 7.265e-14
 # Log-log scatter of the Germany subset (green points) drawn on top of a
 # fitted linear trend; the title is centred.
 ggplot(data = TOP500.GR, mapping = aes(x = log(Rmax), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(colour = "green") +
   labs(title = "Rpeak ~ Rmax linear regression GERMANY") +
   theme_bw() +
   theme(plot.title = element_text(hjust = 0.5))

4.5 FRANCE

# Reduce the TOP500 table to the four columns used below, keep only the
# France entries, and drop any row with a missing value.
TOP500.FR <- TOP500_202011 %>%
  select(Country, Name, Rmax, Rpeak) %>%
  filter(Country == "France") %>%
  drop_na()

# Render the subset with DT::datatable().
datatable(TOP500.FR)

# Regress log(Rpeak) on log(Rmax) for this country and print the fit summary.
fit.fr <- lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.FR)
summary(fit.fr)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.FR)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.19749 -0.13114  0.02325  0.11942  0.21029 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.32111    0.36805  -0.872    0.396    
## log(Rmax)    1.08322    0.04451  24.334 4.56e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1363 on 16 degrees of freedom
## Multiple R-squared:  0.9737, Adjusted R-squared:  0.972 
## F-statistic: 592.1 on 1 and 16 DF,  p-value: 4.562e-14
 # Log-log scatter of the France subset (blue points) drawn on top of a
 # fitted linear trend; the title is centred.
 ggplot(data = TOP500.FR, mapping = aes(x = log(Rmax), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(colour = "blue") +
   labs(title = "Rpeak ~ Rmax linear regression FRANCE") +
   theme_bw() +
   theme(plot.title = element_text(hjust = 0.5))

4.6 Comparison

# Stack the five per-country subsets into a single table so the countries
# can be compared side by side.
TOP500.5 <- bind_rows(TOP500.US, TOP500.CH, TOP500.JP, TOP500.FR, TOP500.GR)

# Refit log(Rpeak) ~ log(Rmax) separately within each country and tabulate
# the tidied coefficients. group_modify() is used instead of the superseded
# do(); the result is ungrouped before it is handed to kable().
TOP500.5 %>%
  group_by(Country) %>%
  group_modify(function(rows, key) {
    tidy(lm(log(Rpeak) ~ log(Rmax), data = rows))
  }) %>%
  ungroup() %>%
  kable(caption = "Linear model parameters by country", digits = 4)
Linear model parameters by country
Country term estimate std.error statistic p.value
China (Intercept) 1.2150 0.3282 3.7017 0.0004
China log(Rmax) 0.9263 0.0430 21.5501 0.0000
France (Intercept) -0.3211 0.3681 -0.8725 0.3959
France log(Rmax) 1.0832 0.0445 24.3339 0.0000
Germany (Intercept) 0.2807 0.2991 0.9383 0.3640
Germany log(Rmax) 1.0111 0.0351 28.8174 0.0000
Japan (Intercept) 0.9259 0.3069 3.0167 0.0061
Japan log(Rmax) 0.9399 0.0363 25.8631 0.0000
United States (Intercept) 0.0439 0.1403 0.3127 0.7552
United States log(Rmax) 1.0352 0.0174 59.4998 0.0000
# Log-log scatter of the combined table, coloured by country, with one
# fitted line per country and no confidence bands.
ggplot(TOP500.5, aes(x = log(Rmax), y = log(Rpeak), col = Country)) +
  # Spell out FALSE: the shorthand F is an ordinary variable and can be
  # reassigned elsewhere in a session.
  geom_smooth(method = "lm", se = FALSE) +
  geom_point() +
  theme_bw() +
  ggtitle("Rpeak ~ Rmax linear regression ") +
  theme(plot.title = element_text(hjust = .5))

 # 2D density contours of log(Rpeak) against log(Rmax), one colour per
 # country, drawn from the combined table.
 TOP500.5 %>%
   ggplot(aes(x = log(Rmax), y = log(Rpeak), col = Country)) +
   # Spell out TRUE: the shorthand T is an ordinary variable and can be
   # reassigned elsewhere in a session.
   geom_density2d(binwidth = 0.01, na.rm = TRUE) +
   theme_bw() +
   ggtitle("Density contour plot for Rpeak~Rmax") +
   theme(plot.title = element_text(hjust = .5))

5 Conclusion

  1. Applying the linear regression model lm(log(Rpeak)~log(Rmax)), that is \(\ln(Rpeak_i) = B_0 + B_1 \ln(Rmax_i) + E_i\), we can rank the five leaders of the HPC race in the following order according to the regression slope \(B_1\): 1. France - 1.0832, 2. USA - 1.0352, 3. Germany - 1.0111, 4. Japan - 0.9399, 5. China - 0.9263. The slope coefficient \(B_1\) (its p-value is significant for every country) shows how much peak performance \(Rpeak\) is associated with a given \(Rmax\); in a broad sense it can be read as an equivalent of the mainframe construction technology of a particular country, i.e. an index of advanced HPC construction ability as well as usability.
  2. However, China accounts for about 43% of all HPC mainframes in the TOP500 list, while Japan has produced Fugaku, currently the number one HPC system in the world. The race is not over!