IBM-Summit

1 Introduction

We have discussed the HPC race previously. See: https://rpubs.com/alex-lev/696179, https://rpubs.com/alex-lev/694840, https://rpubs.com/alex-lev/693131, https://rpubs.com/alex-lev/553777

2 TOP500 data

Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/

3 Filtering Data

Note: the original variable names were changed (truncated, concatenated and simplified) as much as possible to make data exploration and visualization easier.

library(readxl)
library(tidyverse)
library(tidyquant)
library(broom)
library(DT)
library(knitr)



# Read the TOP500 spreadsheet into a data frame and list its column names.
TOP500_202011 <- read_excel(path = "top500/TOP500_202011.xlsx")
colnames(TOP500_202011)
##  [1] "Rank"                        "PreviousRank"               
##  [3] "FirstAppearance"             "FirstRank"                  
##  [5] "Name"                        "Computer"                   
##  [7] "Site"                        "Manufacturer"               
##  [9] "Country"                     "Year"                       
## [11] "Segment"                     "TotalCores"                 
## [13] "AcceleratorCoProcessorCores" "Rmax"                       
## [15] "Rpeak"                       "Nmax"                       
## [17] "Nhalf"                       "HPCG"                       
## [19] "Power"                       "PowerSource"                
## [21] "PowerEfficiency"             "Architecture"               
## [23] "Processor"                   "ProcessorTechnology"        
## [25] "ProcessorSpeed"              "OperatingSystem"            
## [27] "OSFamily"                    "AcceleratorCoProcessor"     
## [29] "CoresperSocket"              "ProcessorGeneration"        
## [31] "SystemModel"                 "SystemFamily"               
## [33] "InterconnectFamily"          "Interconnect"               
## [35] "Continent"                   "SiteID"                     
## [37] "SystemID"

4 Linear Regression

Now we can compare the five countries (USA, China, Japan, France, Germany) leading the TOP500 race by total number of HPC mainframes (see https://rpubs.com/alex-lev/694840). Here we apply linear regression to observe the pace of the race in terms of the coefficients of the log-log model \[\ln(Y_i) = B_0 + B_1 \ln(X_i) + E_i,\] that is, \[\ln(Rpeak_i) = B_0 + B_1 \ln(Power_i) + E_i.\]

4.1 USA

# USA: keep the modelling columns for U.S. systems and drop incomplete rows.
TOP500.US <- TOP500_202011 %>%
  filter(Country == "United States") %>%
  select(Country, Name, Rpeak, TotalCores, Power) %>%
  na.omit()
# Interactive table of the filtered systems.
datatable(TOP500.US)
# Log-log regression of Rpeak on Power.
fit.us <- lm(log(Rpeak) ~ log(Power), data = TOP500.US)
summary(fit.us)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Power), data = TOP500.US)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7135 -0.6632 -0.1560  0.4305  2.5762 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   4.9225     1.0829   4.546 6.99e-05 ***
## log(Power)    0.5110     0.1469   3.479  0.00144 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.052 on 33 degrees of freedom
## Multiple R-squared:  0.2683, Adjusted R-squared:  0.2461 
## F-statistic:  12.1 on 1 and 33 DF,  p-value: 0.001437
 # Scatter of log(Rpeak) against log(Power) with the fitted lm line drawn underneath.
 TOP500.US %>%
   ggplot(aes(x = log(Power), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(colour = "darkblue") +
   theme_bw() +
   labs(title = "Rpeak ~ Power linear regression USA") +
   theme(plot.title = element_text(hjust = 0.5))

4.2 CHINA

# China: keep the modelling columns for Chinese systems and drop incomplete rows.
TOP500.CH <- TOP500_202011 %>%
  filter(Country == "China") %>%
  select(Country, Name, Rpeak, TotalCores, Power) %>%
  na.omit()
# Interactive table of the filtered systems.
datatable(TOP500.CH)
# Log-log regression of Rpeak on Power.
fit.china <- lm(log(Rpeak) ~ log(Power), data = TOP500.CH)
summary(fit.china)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Power), data = TOP500.CH)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10168 -0.39250  0.00672  0.26678  1.20704 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.49628    0.57882   6.040 9.66e-07 ***
## log(Power)   0.72988    0.08571   8.515 9.90e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5 on 32 degrees of freedom
## Multiple R-squared:  0.6938, Adjusted R-squared:  0.6842 
## F-statistic: 72.51 on 1 and 32 DF,  p-value: 9.899e-10
 # Scatter of log(Rpeak) against log(Power) with the fitted lm line drawn underneath.
 TOP500.CH %>%
   ggplot(aes(x = log(Power), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(colour = "red") +
   theme_bw() +
   labs(title = "Rpeak ~ Power linear regression CHINA") +
   theme(plot.title = element_text(hjust = 0.5))

4.3 JAPAN

# Japan: keep the modelling columns for Japanese systems and drop incomplete rows.
TOP500.JP <- TOP500_202011 %>%
  filter(Country == "Japan") %>%
  select(Country, Name, Rpeak, TotalCores, Power) %>%
  na.omit()
# Interactive table of the filtered systems.
datatable(TOP500.JP)
# Log-log regression of Rpeak on Power.
fit.jap <- lm(log(Rpeak) ~ log(Power), data = TOP500.JP)
summary(fit.jap)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Power), data = TOP500.JP)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4362 -0.4584 -0.2440  0.4569  1.2598 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.1674     1.0379   3.052   0.0101 *  
## log(Power)    0.8669     0.1500   5.780 8.75e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7693 on 12 degrees of freedom
## Multiple R-squared:  0.7357, Adjusted R-squared:  0.7137 
## F-statistic:  33.4 on 1 and 12 DF,  p-value: 8.747e-05
 # Scatter of log(Rpeak) against log(Power) with the fitted lm line.
 # The x aesthetic is log(Power) so the plot shows the same relationship as
 # fit.jap and the plot title (it previously plotted log(TotalCores)).
 ggplot(TOP500.JP, aes(x = log(Power), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(col = "yellow") +
   theme_bw() +
   ggtitle("Rpeak ~ Power linear regression JAPAN") +
   theme(plot.title = element_text(hjust = .5))

4.4 GERMANY

# Germany: keep the modelling columns for German systems and drop incomplete rows.
TOP500.GR <- TOP500_202011 %>%
  filter(Country == "Germany") %>%
  select(Country, Name, Rpeak, TotalCores, Power) %>%
  na.omit()
# Interactive table of the filtered systems.
datatable(TOP500.GR)
# Log-log regression of Rpeak on Power.
fit.gr <- lm(log(Rpeak) ~ log(Power), data = TOP500.GR)
summary(fit.gr)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Power), data = TOP500.GR)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.50218 -0.40176 -0.01438  0.21393  2.10039 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)   3.4451     3.4255   1.006    0.338
## log(Power)    0.7524     0.4726   1.592    0.142
## 
## Residual standard error: 0.8922 on 10 degrees of freedom
## Multiple R-squared:  0.2022, Adjusted R-squared:  0.1224 
## F-statistic: 2.534 on 1 and 10 DF,  p-value: 0.1425
 # Scatter of log(Rpeak) against log(Power) with the fitted lm line.
 # The x aesthetic is log(Power) so the plot shows the same relationship as
 # fit.gr and the plot title (it previously plotted log(TotalCores)).
 ggplot(TOP500.GR, aes(x = log(Power), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(col = "green") +
   theme_bw() +
   ggtitle("Rpeak ~ Power linear regression GERMANY") +
   theme(plot.title = element_text(hjust = .5))

4.5 FRANCE

# France: keep the modelling columns for French systems and drop incomplete rows.
TOP500.FR <- TOP500_202011 %>%
  filter(Country == "France") %>%
  select(Country, Name, Rpeak, TotalCores, Power) %>%
  na.omit()
# Interactive table of the filtered systems.
datatable(TOP500.FR)
# Log-log regression of Rpeak on Power.
fit.fr <- lm(log(Rpeak) ~ log(Power), data = TOP500.FR)
summary(fit.fr)
## 
## Call:
## lm(formula = log(Rpeak) ~ log(Power), data = TOP500.FR)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.4505 -0.6125  0.0034  0.5399  1.4488 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   5.5838     1.7582   3.176  0.00674 **
## log(Power)    0.4287     0.2491   1.721  0.10732   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8122 on 14 degrees of freedom
## Multiple R-squared:  0.1746, Adjusted R-squared:  0.1156 
## F-statistic: 2.961 on 1 and 14 DF,  p-value: 0.1073
 # Scatter of log(Rpeak) against log(Power) with the fitted lm line.
 # The x aesthetic is log(Power) so the plot shows the same relationship as
 # fit.fr and the plot title (it previously plotted log(TotalCores)).
 ggplot(TOP500.FR, aes(x = log(Power), y = log(Rpeak))) +
   geom_smooth(method = "lm") +
   geom_point(col = "blue") +
   theme_bw() +
   ggtitle("Rpeak ~ Power linear regression FRANCE") +
   theme(plot.title = element_text(hjust = .5))

4.6 Comparison

# Stack the five per-country data frames into one table for the comparison.
TOP500.5 <- bind_rows(TOP500.US, TOP500.CH, TOP500.JP, TOP500.FR, TOP500.GR)
# Fit the same log-log model within each country and tabulate the tidied
# coefficients. group_modify() replaces the superseded do(); the result is
# ungrouped before rendering so no grouping leaks into later steps.
TOP500.5 %>%
  group_by(Country) %>%
  group_modify(~ tidy(lm(log(Rpeak) ~ log(Power), data = .x))) %>%
  ungroup() %>%
  kable(caption = "Linear model parameters by country", digits = 4)
Linear model parameters by country
Country term estimate std.error statistic p.value
China (Intercept) 3.4963 0.5788 6.0403 0.0000
China log(Power) 0.7299 0.0857 8.5154 0.0000
France (Intercept) 5.5838 1.7582 3.1758 0.0067
France log(Power) 0.4287 0.2491 1.7207 0.1073
Germany (Intercept) 3.4451 3.4255 1.0057 0.3383
Germany log(Power) 0.7524 0.4726 1.5919 0.1425
Japan (Intercept) 3.1674 1.0379 3.0516 0.0101
Japan log(Power) 0.8669 0.1500 5.7796 0.0001
United States (Intercept) 4.9225 1.0829 4.5458 0.0001
United States log(Power) 0.5110 0.1469 3.4786 0.0014
 # Per-country regression lines (no confidence bands) over the pooled scatter,
 # coloured by Country. FALSE is spelled out because F is a reassignable alias.
 ggplot(TOP500.5, aes(x = log(Power), y = log(Rpeak), col = Country)) +
   geom_smooth(method = "lm", se = FALSE) +
   geom_point() +
   theme_bw() +
   ggtitle("Rpeak ~ Power linear regression ") +
   theme(plot.title = element_text(hjust = .5))

 # 2D density contours of log(Rpeak) vs log(Power), one colour per Country.
 # TRUE is spelled out because T is a reassignable alias; the data frame is
 # passed to ggplot() directly instead of via a redundant pipe placeholder.
 ggplot(TOP500.5, aes(x = log(Power), y = log(Rpeak), col = Country)) +
   geom_density2d(binwidth = 0.01, na.rm = TRUE) +
   theme_bw() +
   ggtitle("Density contour plot for Rpeak~Power") +
   theme(plot.title = element_text(hjust = .5))

5 Conclusion

Applying the linear regression lm(log(Rpeak)~log(Power)), that is \(\ln(Rpeak_i) = B_0 + B_1 \ln(Power_i) + E_i\), we can rank the three leaders in the HPC race in the following order according to the regression slope \(B_1\): 1. Japan - 0.867, 2. China - 0.730, 3. USA - 0.511. We excluded Germany and France because their slope estimates are not statistically significant (p-values 0.142 and 0.107, respectively). The slope coefficient \(B_1\) shows how strongly performance \(Rpeak\) grows with \(Power\): in this log-log model, a 1% increase in \(Power\) is associated with roughly a \(B_1\)% increase in \(Rpeak\), so a larger slope means additional power is converted into performance more efficiently, i.e. the systems are less power consuming. However, China has about 43% of the total HPC mainframes in the TOP500 list, while Japan has produced Fugaku, the number one HPC system in the world today. The race is not over!