IBM-Summit
We have discussed the HPC race previously. See: https://rpubs.com/alex-lev/694840, https://rpubs.com/alex-lev/693131, https://rpubs.com/alex-lev/553777
Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/
Note: the original variable names were changed (truncated, concatenated and simplified) as much as possible to ease data exploration and visualization.
library(readxl)
library(tidyverse)
library(tidyquant)
library(broom)
library(DT)
library(knitr)
# Load the November 2020 TOP500 workbook and list the columns it provides.
TOP500_202011 <- read_excel(path = "top500/TOP500_202011.xlsx")
colnames(TOP500_202011)
## [1] "Rank" "PreviousRank"
## [3] "FirstAppearance" "FirstRank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "TotalCores"
## [13] "AcceleratorCoProcessorCores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "PowerSource"
## [21] "PowerEfficiency" "Architecture"
## [23] "Processor" "ProcessorTechnology"
## [25] "ProcessorSpeed" "OperatingSystem"
## [27] "OSFamily" "AcceleratorCoProcessor"
## [29] "CoresperSocket" "ProcessorGeneration"
## [31] "SystemModel" "SystemFamily"
## [33] "InterconnectFamily" "Interconnect"
## [35] "Continent" "SiteID"
## [37] "SystemID"
Now we can compare the five countries (USA, China, Japan, France, Germany) leading the TOP500 race by total number of HPC mainframes (see https://rpubs.com/alex-lev/694840). Here we apply a log-log linear regression to observe the pace of the race in terms of its coefficients: \[\ln(Y_i)=B_0+B_1\ln(X_i)+E_i,\] that is, \[\ln(Rpeak_i)=B_0+B_1\ln(TotalCores_i)+E_i.\]
# United States: keep only the columns needed for the model, show the subset
# as an interactive table, then fit log(Rpeak) against log(TotalCores).
TOP500.US <- TOP500_202011 %>%
  filter(Country == "United States") %>%
  select(Country, Name, Rpeak, TotalCores)
datatable(TOP500.US)
fit.us <- lm(log(Rpeak) ~ log(TotalCores), data = TOP500.US)
summary(fit.us)
##
## Call:
## lm(formula = log(Rpeak) ~ log(TotalCores), data = TOP500.US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0870 -0.2548 -0.2548 0.4811 1.1014
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.03935 0.62384 -4.872 3.7e-06 ***
## log(TotalCores) 0.99926 0.05511 18.133 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4545 on 111 degrees of freedom
## Multiple R-squared: 0.7476, Adjusted R-squared: 0.7453
## F-statistic: 328.8 on 1 and 111 DF, p-value: < 2.2e-16
# US systems on the log-log scale: fitted line with its confidence band first,
# observed points drawn on top.
TOP500.US %>%
  ggplot(aes(x = log(TotalCores), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "darkblue") +
  theme_bw() +
  labs(title = "Rpeak ~ TotalCores linear regression USA") +
  theme(plot.title = element_text(hjust = 0.5))
# China: keep only the columns needed for the model, show the subset as an
# interactive table, then fit log(Rpeak) against log(TotalCores).
TOP500.CH <- TOP500_202011 %>%
  filter(Country == "China") %>%
  select(Country, Name, Rpeak, TotalCores)
datatable(TOP500.CH)
fit.china <- lm(log(Rpeak) ~ log(TotalCores), data = TOP500.CH)
summary(fit.china)
##
## Call:
## lm(formula = log(Rpeak) ~ log(TotalCores), data = TOP500.CH)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.00401 -0.08666 0.06091 0.14633 2.49540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.29949 0.32767 3.966 1e-04 ***
## log(TotalCores) 0.64083 0.02986 21.459 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3063 on 211 degrees of freedom
## Multiple R-squared: 0.6858, Adjusted R-squared: 0.6843
## F-statistic: 460.5 on 1 and 211 DF, p-value: < 2.2e-16
# Chinese systems on the log-log scale: fitted line with its confidence band
# first, observed points drawn on top.
TOP500.CH %>%
  ggplot(aes(x = log(TotalCores), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "red") +
  theme_bw() +
  labs(title = "Rpeak ~ TotalCores linear regression CHINA") +
  theme(plot.title = element_text(hjust = 0.5))
# Japan: keep only the columns needed for the model, show the subset as an
# interactive table, then fit log(Rpeak) against log(TotalCores).
TOP500.JP <- TOP500_202011 %>%
  filter(Country == "Japan") %>%
  select(Country, Name, Rpeak, TotalCores)
datatable(TOP500.JP)
fit.jap <- lm(log(Rpeak) ~ log(TotalCores), data = TOP500.JP)
summary(fit.jap)
##
## Call:
## lm(formula = log(Rpeak) ~ log(TotalCores), data = TOP500.JP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.88031 -0.25192 -0.01535 0.13663 2.00880
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.88784 0.89905 0.988 0.331
## log(TotalCores) 0.69498 0.07961 8.730 5.64e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5888 on 32 degrees of freedom
## Multiple R-squared: 0.7043, Adjusted R-squared: 0.695
## F-statistic: 76.21 on 1 and 32 DF, p-value: 5.639e-10
# Japanese systems on the log-log scale: fitted line with its confidence band
# first, observed points drawn on top.
TOP500.JP %>%
  ggplot(aes(x = log(TotalCores), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "yellow") +
  theme_bw() +
  labs(title = "Rpeak ~ TotalCores linear regression JAPAN") +
  theme(plot.title = element_text(hjust = 0.5))
# Germany: keep only the columns needed for the model, show the subset as an
# interactive table, then fit log(Rpeak) against log(TotalCores).
TOP500.GR <- TOP500_202011 %>%
  filter(Country == "Germany") %>%
  select(Country, Name, Rpeak, TotalCores)
datatable(TOP500.GR)
fit.gr <- lm(log(Rpeak) ~ log(TotalCores), data = TOP500.GR)
summary(fit.gr)
##
## Call:
## lm(formula = log(Rpeak) ~ log(TotalCores), data = TOP500.GR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.18382 -0.29307 -0.02513 0.23074 1.30078
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.2103 1.9075 0.11 0.913591
## log(TotalCores) 0.7421 0.1646 4.51 0.000356 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6296 on 16 degrees of freedom
## Multiple R-squared: 0.5597, Adjusted R-squared: 0.5322
## F-statistic: 20.34 on 1 and 16 DF, p-value: 0.0003564
# German systems on the log-log scale: fitted line with its confidence band
# first, observed points drawn on top.
TOP500.GR %>%
  ggplot(aes(x = log(TotalCores), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "green") +
  theme_bw() +
  labs(title = "Rpeak ~ TotalCores linear regression GERMANY") +
  theme(plot.title = element_text(hjust = 0.5))
# France: keep only the columns needed for the model, show the subset as an
# interactive table, then fit log(Rpeak) against log(TotalCores).
TOP500.FR <- TOP500_202011 %>%
  filter(Country == "France") %>%
  select(Country, Name, Rpeak, TotalCores)
datatable(TOP500.FR)
fit.fr <- lm(log(Rpeak) ~ log(TotalCores), data = TOP500.FR)
summary(fit.fr)
##
## Call:
## lm(formula = log(Rpeak) ~ log(TotalCores), data = TOP500.FR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93569 -0.40902 0.00097 0.37450 0.84794
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4647 1.7255 -0.269 0.791
## log(TotalCores) 0.7837 0.1488 5.267 7.68e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5081 on 16 degrees of freedom
## Multiple R-squared: 0.6342, Adjusted R-squared: 0.6113
## F-statistic: 27.74 on 1 and 16 DF, p-value: 7.675e-05
# French systems on the log-log scale: fitted line with its confidence band
# first, observed points drawn on top.
TOP500.FR %>%
  ggplot(aes(x = log(TotalCores), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "blue") +
  theme_bw() +
  labs(title = "Rpeak ~ TotalCores linear regression FRANCE") +
  theme(plot.title = element_text(hjust = 0.5))
# Stack the five national subsets into one table (one row per system).
TOP500.5 <- bind_rows(TOP500.US, TOP500.CH, TOP500.JP, TOP500.FR, TOP500.GR)

# Refit the same log-log model separately for every country and collect the
# coefficient tables. group_modify() replaces the superseded do(): the formula
# receives each country's rows as `.x`, and the grouping column is put back in
# front of the tidy() output.
TOP500.5 %>%
  group_by(Country) %>%
  group_modify(~ tidy(lm(log(Rpeak) ~ log(TotalCores), data = .x))) %>%
  kable(caption = "Linear model parameters by country", digits = 4)
Country | term | estimate | std.error | statistic | p.value |
---|---|---|---|---|---|
China | (Intercept) | 1.2995 | 0.3277 | 3.9658 | 0.0001 |
China | log(TotalCores) | 0.6408 | 0.0299 | 21.4592 | 0.0000 |
France | (Intercept) | -0.4647 | 1.7255 | -0.2693 | 0.7911 |
France | log(TotalCores) | 0.7837 | 0.1488 | 5.2666 | 0.0001 |
Germany | (Intercept) | 0.2103 | 1.9075 | 0.1102 | 0.9136 |
Germany | log(TotalCores) | 0.7421 | 0.1646 | 4.5096 | 0.0004 |
Japan | (Intercept) | 0.8878 | 0.8991 | 0.9875 | 0.3308 |
Japan | log(TotalCores) | 0.6950 | 0.0796 | 8.7296 | 0.0000 |
United States | (Intercept) | -3.0393 | 0.6238 | -4.8720 | 0.0000 |
United States | log(TotalCores) | 0.9993 | 0.0551 | 18.1333 | 0.0000 |
# All five countries on one log-log panel, one colour and one fitted line per
# country. Confidence bands are switched off so the lines stay readable;
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned.
ggplot(TOP500.5, aes(x = log(TotalCores), y = log(Rpeak), col = Country)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point() +
  theme_bw() +
  ggtitle("Rpeak ~ TotalCores linear regression ") +
  theme(plot.title = element_text(hjust = .5))
# 2D density contours of the systems in the (log cores, log Rpeak) plane,
# coloured by country. `TRUE` is spelled out because `T` is an ordinary
# variable that can be reassigned; the data frame is passed directly instead
# of through a redundant `%>%` placeholder.
ggplot(TOP500.5, aes(x = log(TotalCores), y = log(Rpeak), col = Country)) +
  geom_density2d(binwidth = 0.01, na.rm = TRUE) +
  theme_bw() +
  ggtitle("Density contour plot for Rpeak~TotalCores") +
  theme(plot.title = element_text(hjust = .5))
Suppose we want to predict \(Rpeak\) from \(TotalCores\) for each country by applying the national linear models above. Take \(TotalCores=7630848\), the core count of Fugaku.
# Core count of the first system in the Japanese subset, used as the Fugaku
# reference value for the predictions.
TOP500.JP[["TotalCores"]][1]
## [1] 7630848
# Predict Rpeak for a system with Fugaku's core count under each national
# log-log model. predict.lm() works on the log scale, so exp() maps the point
# estimate and both prediction-interval bounds back to Rpeak units.
# NOTE(review): assumes the first row of TOP500.JP is Fugaku - confirm.
fugaku.newdata <- data.frame(TotalCores = TOP500.JP$TotalCores[1])

us.fugaku <- exp(predict.lm(fit.us, newdata = fugaku.newdata, interval = "prediction"))
china.fugaku <- exp(predict.lm(fit.china, newdata = fugaku.newdata, interval = "prediction"))
japan.fugaku <- exp(predict.lm(fit.jap, newdata = fugaku.newdata, interval = "prediction"))
france.fugaku <- exp(predict.lm(fit.fr, newdata = fugaku.newdata, interval = "prediction"))
germany.fugaku <- exp(predict.lm(fit.gr, newdata = fugaku.newdata, interval = "prediction"))

# Pair every prediction with its country label explicitly. Deriving the labels
# from a table sorted by group size only matched the row order by coincidence
# and would mislabel rows as soon as the per-country counts changed.
fugaku.by.country <- list(
  "China" = china.fugaku,
  "United States" = us.fugaku,
  "Japan" = japan.fugaku,
  "France" = france.fugaku,
  "Germany" = germany.fugaku
)
Country_name <- tibble(Country = names(fugaku.by.country))
all.fugaku <- as_tibble(do.call(rbind, unname(fugaku.by.country)))

# Widest upper prediction bound first.
all.fugaku %>%
  add_column(Country = Country_name$Country) %>%
  arrange(desc(upr)) %>%
  kable(align = "r", caption = "Predicted Rpeak by model")
fit | lwr | upr | Country |
---|---|---|---|
158150.87 | 20765.06 | 1204508.6 | Germany |
360982.61 | 128577.68 | 1013460.9 | United States |
155517.19 | 27144.68 | 890988.6 | France |
147517.37 | 35316.63 | 616179.2 | Japan |
94390.39 | 48285.56 | 184517.8 | China |
Using the linear model lm(log(Rpeak)~log(TotalCores)),
that is \(\ln(Rpeak_i) = B_0+B_1\ln(TotalCores_i)\), we can rank the five leaders in the HPC race in the following order according to the regression slope \(B_1\): 1. USA - 0.999, 2. France - 0.784, 3. Germany - 0.742, 4. Japan - 0.695, 5. China - 0.641. The slope coefficient \(B_1\) shows how much performance \(Rpeak\) is produced by \(TotalCores\) on the log scale: a 1% increase in cores is associated with roughly a \(B_1\)% increase in \(Rpeak\). In a broad sense it is an equivalent of the mainframe construction technology of a particular country, i.e. an index of advanced HPC construction ability as well as usability.