IBM-Summit
We discussed the HPC race previously; see: https://rpubs.com/alex-lev/694840, https://rpubs.com/alex-lev/693131, https://rpubs.com/alex-lev/553777
Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/
Note: the original variable names were changed (truncated, concatenated, and simplified) to make data exploration and visualization as easy as possible.
library(readxl)
library(tidyverse)
library(tidyquant)
library(broom)
library(DT)
library(knitr)
# Load the November 2020 TOP500 workbook (path is relative to the working
# directory) and list its column names.
TOP500_202011 <- read_excel(path = "top500/TOP500_202011.xlsx")
names(TOP500_202011)
## [1] "Rank" "PreviousRank"
## [3] "FirstAppearance" "FirstRank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "TotalCores"
## [13] "AcceleratorCoProcessorCores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "PowerSource"
## [21] "PowerEfficiency" "Architecture"
## [23] "Processor" "ProcessorTechnology"
## [25] "ProcessorSpeed" "OperatingSystem"
## [27] "OSFamily" "AcceleratorCoProcessor"
## [29] "CoresperSocket" "ProcessorGeneration"
## [31] "SystemModel" "SystemFamily"
## [33] "InterconnectFamily" "Interconnect"
## [35] "Continent" "SiteID"
## [37] "SystemID"
Now we can compare the five countries (USA, China, Japan, France, Germany) that lead the TOP500 race by total number of HPC mainframes (see https://rpubs.com/alex-lev/694840). Here we apply linear regression to observe the pace of the race in terms of the model coefficients: \[\ln(Y_i) = B_0 + B_1 \ln(X_i) + E_i,\] that is, \[\ln(Rpeak_i) = B_0 + B_1 \ln(Rmax_i) + E_i.\]
# United States: keep only the columns used below and drop rows that have a
# missing value in any of them.
TOP500.US <- TOP500_202011 %>%
  filter(Country == "United States") %>%
  select(Country, Name, Rmax, Rpeak) %>%
  drop_na()
# Interactive table of the selected systems.
datatable(TOP500.US)
# Log-log regression of Rpeak on Rmax.
fit.us <- lm(log(Rpeak) ~ log(Rmax), data = TOP500.US)
summary(fit.us)
##
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.US)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.25568 -0.06590 -0.05357 0.07494 0.62885
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.04388 0.14033 0.313 0.755
## log(Rmax) 1.03519 0.01740 59.500 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1517 on 90 degrees of freedom
## Multiple R-squared: 0.9752, Adjusted R-squared: 0.9749
## F-statistic: 3540 on 1 and 90 DF, p-value: < 2.2e-16
# USA: log(Rmax) vs log(Rpeak) with the fitted lm line; the smoother is added
# first so the points are drawn on top of it.
ggplot(TOP500.US, aes(x = log(Rmax), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "darkblue") +
  ggtitle("Rpeak ~ Rmax linear regression USA") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title
# China: keep only the columns used below and drop rows that have a missing
# value in any of them.
TOP500.CH <- TOP500_202011 %>%
  filter(Country == "China") %>%
  select(Country, Name, Rmax, Rpeak) %>%
  drop_na()
# Interactive table of the selected systems.
datatable(TOP500.CH)
# Log-log regression of Rpeak on Rmax.
fit.china <- lm(log(Rpeak) ~ log(Rmax), data = TOP500.CH)
summary(fit.china)
##
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.CH)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38385 -0.15664 0.00897 0.05696 0.83472
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.21505 0.32824 3.702 0.000413 ***
## log(Rmax) 0.92633 0.04298 21.550 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2398 on 73 degrees of freedom
## Multiple R-squared: 0.8642, Adjusted R-squared: 0.8623
## F-statistic: 464.4 on 1 and 73 DF, p-value: < 2.2e-16
# China: log(Rmax) vs log(Rpeak) with the fitted lm line; the smoother is
# added first so the points are drawn on top of it.
ggplot(TOP500.CH, aes(x = log(Rmax), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "red") +
  ggtitle("Rpeak ~ Rmax linear regression CHINA") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title
# Japan: keep only the columns used below and drop rows that have a missing
# value in any of them.
TOP500.JP <- TOP500_202011 %>%
  filter(Country == "Japan") %>%
  select(Country, Name, Rmax, Rpeak) %>%
  drop_na()
# Interactive table of the selected systems.
datatable(TOP500.JP)
# Log-log regression of Rpeak on Rmax.
fit.jap <- lm(log(Rpeak) ~ log(Rmax), data = TOP500.JP)
summary(fit.jap)
##
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.JP)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.34356 -0.08090 -0.00476 0.05102 0.77224
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.92589 0.30692 3.017 0.00614 **
## log(Rmax) 0.93985 0.03634 25.863 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2207 on 23 degrees of freedom
## Multiple R-squared: 0.9668, Adjusted R-squared: 0.9653
## F-statistic: 668.9 on 1 and 23 DF, p-value: < 2.2e-16
# Japan: log(Rmax) vs log(Rpeak) with the fitted lm line; the smoother is
# added first so the points are drawn on top of it.
ggplot(TOP500.JP, aes(x = log(Rmax), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "yellow") +
  ggtitle("Rpeak ~ Rmax linear regression JAPAN") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title
# Germany: keep only the columns used below and drop rows that have a missing
# value in any of them.
TOP500.GR <- TOP500_202011 %>%
  filter(Country == "Germany") %>%
  select(Country, Name, Rmax, Rpeak) %>%
  drop_na()
# Interactive table of the selected systems.
datatable(TOP500.GR)
# Log-log regression of Rpeak on Rmax.
fit.gr <- lm(log(Rpeak) ~ log(Rmax), data = TOP500.GR)
summary(fit.gr)
##
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.GR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.274392 -0.075001 0.000116 0.080323 0.180140
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.28069 0.29913 0.938 0.364
## log(Rmax) 1.01110 0.03509 28.817 7.26e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1276 on 14 degrees of freedom
## Multiple R-squared: 0.9834, Adjusted R-squared: 0.9822
## F-statistic: 830.4 on 1 and 14 DF, p-value: 7.265e-14
# Germany: log(Rmax) vs log(Rpeak) with the fitted lm line; the smoother is
# added first so the points are drawn on top of it.
ggplot(TOP500.GR, aes(x = log(Rmax), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "green") +
  ggtitle("Rpeak ~ Rmax linear regression GERMANY") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title
# France: keep only the columns used below and drop rows that have a missing
# value in any of them.
TOP500.FR <- TOP500_202011 %>%
  filter(Country == "France") %>%
  select(Country, Name, Rmax, Rpeak) %>%
  drop_na()
# Interactive table of the selected systems.
datatable(TOP500.FR)
# Log-log regression of Rpeak on Rmax.
fit.fr <- lm(log(Rpeak) ~ log(Rmax), data = TOP500.FR)
summary(fit.fr)
##
## Call:
## lm(formula = log(Rpeak) ~ log(Rmax), data = TOP500.FR)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19749 -0.13114 0.02325 0.11942 0.21029
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.32111 0.36805 -0.872 0.396
## log(Rmax) 1.08322 0.04451 24.334 4.56e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1363 on 16 degrees of freedom
## Multiple R-squared: 0.9737, Adjusted R-squared: 0.972
## F-statistic: 592.1 on 1 and 16 DF, p-value: 4.562e-14
# France: log(Rmax) vs log(Rpeak) with the fitted lm line; the smoother is
# added first so the points are drawn on top of it.
ggplot(TOP500.FR, aes(x = log(Rmax), y = log(Rpeak))) +
  geom_smooth(method = "lm") +
  geom_point(colour = "blue") +
  ggtitle("Rpeak ~ Rmax linear regression FRANCE") +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title
# Stack the five per-country tables into a single tibble. All of them share
# the columns Country, Name, Rmax and Rpeak.
TOP500.5 <- bind_rows(TOP500.US, TOP500.CH, TOP500.JP, TOP500.FR, TOP500.GR)

# Fit log(Rpeak) ~ log(Rmax) separately for every country and tabulate the
# tidy coefficient table (one row per term and country).
# group_modify() replaces the superseded do(); the result is ungrouped before
# rendering so no grouping leaks into later steps.
TOP500.5 %>%
  group_by(Country) %>%
  group_modify(~ tidy(lm(log(Rpeak) ~ log(Rmax), data = .x))) %>%
  ungroup() %>%
  kable(caption = "Linear model parameters by country", digits = 4)
| Country | term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|---|
| China | (Intercept) | 1.2150 | 0.3282 | 3.7017 | 0.0004 |
| China | log(Rmax) | 0.9263 | 0.0430 | 21.5501 | 0.0000 |
| France | (Intercept) | -0.3211 | 0.3681 | -0.8725 | 0.3959 |
| France | log(Rmax) | 1.0832 | 0.0445 | 24.3339 | 0.0000 |
| Germany | (Intercept) | 0.2807 | 0.2991 | 0.9383 | 0.3640 |
| Germany | log(Rmax) | 1.0111 | 0.0351 | 28.8174 | 0.0000 |
| Japan | (Intercept) | 0.9259 | 0.3069 | 3.0167 | 0.0061 |
| Japan | log(Rmax) | 0.9399 | 0.0363 | 25.8631 | 0.0000 |
| United States | (Intercept) | 0.0439 | 0.1403 | 0.3127 | 0.7552 |
| United States | log(Rmax) | 1.0352 | 0.0174 | 59.4998 | 0.0000 |
# All five countries on one panel: one lm line per country (colour mapped to
# Country), drawn without the confidence band, with the points on top.
# se is spelled FALSE rather than F because F is an ordinary, reassignable
# variable.
ggplot(TOP500.5, aes(x = log(Rmax), y = log(Rpeak), col = Country)) +
  geom_smooth(method = "lm", se = FALSE) +
  geom_point() +
  theme_bw() +
  ggtitle("Rpeak ~ Rmax linear regression ") +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title
# 2D density contours of log(Rmax) vs log(Rpeak), one set per country.
# na.rm is spelled TRUE rather than T because T is an ordinary, reassignable
# variable; the redundant `.` placeholder in ggplot() is dropped since the
# pipe already supplies the data as the first argument.
TOP500.5 %>%
  ggplot(aes(x = log(Rmax), y = log(Rpeak), col = Country)) +
  geom_density_2d(binwidth = 0.01, na.rm = TRUE) +
  theme_bw() +
  ggtitle("Density contour plot for Rpeak~Rmax") +
  theme(plot.title = element_text(hjust = 0.5))  # centre the title
Using the model lm(log(Rpeak)~log(Rmax)), that is \(\ln(Rpeak_i) = B_0 + B_1 \ln(Rmax_i)\), we can rank the five leaders in the HPC race in the following order according to the regression slope \(B_1\): 1. France - 1.0832, 2. USA - 1.0352, 3. Germany - 1.0111, 4. Japan - 0.9399, 5. China - 0.9263. The slope coefficient \(B_1\) (statistically significant for every country) shows how much peak performance \(Rpeak\) is produced for a given \(Rmax\): in this log-log model, a 1% increase in \(Rmax\) corresponds to approximately a \(B_1\)% increase in \(Rpeak\). In a broad sense, this is equivalent to the mainframe construction technology of a particular country, i.e. an index of advanced HPC construction ability as well as usability.