We have discussed HPC mainframes from the TOP500 list many times using data science and R. See: https://rpubs.com/alex-lev/350548, https://rpubs.com/alex-lev/557956, https://rpubs.com/alex-lev/216789, https://rpubs.com/alex-lev/71014.
For the first time Japan has introduced the most powerful supercomputer in the world - Fugaku. The new top system, Fugaku, turned in a High Performance Linpack (HPL) result of 415.5 petaflops, besting the now second-place Summit system by a factor of 2.8x. The new system is installed at RIKEN Center for Computational Science (R-CCS) in Kobe, Japan. See: https://www.fujitsu.com/global/about/innovation/fugaku/specifications/
This time we want to verify our previous linear model (https://rpubs.com/alex-lev/216789) by refitting it on current TOP500 data and using it to predict Fugaku's peak performance.
We use the same open data source as before, this time the November 2019 list - https://www.top500.org/lists/2019/11/.
library(dplyr)
library(tibble)
library(DT)
library(rstanarm)
library(bayesplot)
library(ggplot2)
# Load the November 2019 TOP500 list from the local CSV export and show
# which columns it provides.
TOP500_201911 <- read.csv(file = "top500/TOP500_201911.csv")
colnames(TOP500_201911)
## [1] "Rank" "Previous.Rank"
## [3] "First.Appearance" "First.Rank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "Total.Cores"
## [13] "Accelerator.CoProcessor.Cores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "Power..Source"
## [21] "Power.Effeciency" "Architecture"
## [23] "Processor" "Processor.Technology"
## [25] "Processor.Speed" "Operating.System"
## [27] "OS.Family" "Accelerator.CoProcessor"
## [29] "Cores.per.Socket" "Processor.Generation"
## [31] "System.Model" "System.Family"
## [33] "Interconnect.Family" "Interconnect"
## [35] "Region" "Continent"
## [37] "Site.ID" "System..ID"
# Per-country totals: number of listed systems, summed peak performance and
# summed core count, restricted to the ten countries with the largest summed
# Rpeak and shown in descending order of that sum.
TOP500_201911.tbl <- as_tibble(TOP500_201911)
TOP500_201911.tbl %>%
  group_by(Country) %>%
  summarise(
    Mainframes = n(),
    Rpeak.Sum = sum(Rpeak),
    Total.Cores.Sum = sum(Total.Cores)
  ) %>%
  # Rank explicitly by Rpeak.Sum. Without `wt`, top_n() falls back to the last
  # column (Total.Cores.Sum), so the "top 10" was chosen by core count while
  # the table was sorted by Rpeak. Any table printed from the old call
  # reflects that core-count selection and should be regenerated.
  top_n(10, wt = Rpeak.Sum) %>%
  arrange(desc(Rpeak.Sum)) # %>% datatable()
## Selecting by Total.Cores.Sum
## # A tibble: 10 x 4
## Country Mainframes Rpeak.Sum Total.Cores.Sum
## <fct> <int> <dbl> <int>
## 1 China 228 1132046. 30463860
## 2 United States 117 861974. 17224104
## 3 Japan 29 174575. 3977228
## 4 France 18 105646. 2122784
## 5 Germany 16 98394. 1732022
## 6 Italy 5 47844. 794032
## 7 United Kingdom 11 39455. 1189608
## 8 Netherlands 15 31795. 864000
## 9 Korea, South 3 31497. 709220
## 10 Ireland 14 29676. 806400
For more see: https://rpubs.com/alex-lev/216789
# Ordinary least squares on the log scale: peak performance explained by
# power draw, core count and installation year.
fit.lm <- lm(
  log(Rpeak) ~ log(Power) + log(Total.Cores) + log(Year),
  data = TOP500_201911.tbl
)
# Coefficient table, residual spread and goodness-of-fit statistics.
summary(fit.lm)
##
## Call:
## lm(formula = log(Rpeak) ~ log(Power) + log(Total.Cores) + log(Year),
## data = TOP500_201911.tbl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.3194 -0.2510 0.0421 0.2808 1.1223
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.213e+03 2.841e+02 -7.788 3.07e-13 ***
## log(Power) 1.545e-01 5.399e-02 2.861 0.00465 **
## log(Total.Cores) 6.760e-01 5.144e-02 13.142 < 2e-16 ***
## log(Year) 2.907e+02 3.733e+01 7.788 3.07e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4791 on 210 degrees of freedom
## (286 observations deleted due to missingness)
## Multiple R-squared: 0.6884, Adjusted R-squared: 0.684
## F-statistic: 154.7 on 3 and 210 DF, p-value: < 2.2e-16
# Fitted value and 95% confidence interval for a Fugaku-like system,
# back-transformed from the log scale and divided by 1000.
# NOTE(review): Power = 283345.5 looks ten times larger than the roughly
# 28,334.5 kW that TOP500 reports for Fugaku - confirm the value and its units.
local({
  fugaku <- data.frame(
    Power = 283345.5,
    Total.Cores = 48 * 158976,
    Year = 2020
  )
  log.rpeak <- predict(fit.lm, fugaku, interval = "confidence", level = 0.95)
  exp(log.rpeak) / 1000
})
## fit lwr upr
## 1 317.6142 191.5502 526.6439
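As we can see, the point estimate is \(\hat{R}_{Peak}=317.6\) petaflops, with a 95% confidence interval for the expected value of \(191.5\le M[R_{Peak}]\le 526.6\). Note that this is a confidence interval for the mean response, not a prediction interval for an individual system, which would be wider.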
# Let Stan use every core that parallel::detectCores() reports.
options(mc.cores = parallel::detectCores())

# Bayesian version of the same log-log regression: 4 chains of 10000
# iterations each, with a fixed seed so the draws are reproducible.
fit.lm.bs <- stan_glm(
  log(Rpeak) ~ log(Power) + log(Total.Cores) + log(Year),
  data = TOP500_201911.tbl,
  chains = 4,
  iter = 10000,
  seed = 12345
)

# Compact print of posterior medians and MAD_SD for each parameter.
fit.lm.bs
## stan_glm
## family: gaussian [identity]
## formula: log(Rpeak) ~ log(Power) + log(Total.Cores) + log(Year)
## observations: 214
## predictors: 4
## ------
## Median MAD_SD
## (Intercept) -2213.2 279.2
## log(Power) 0.2 0.1
## log(Total.Cores) 0.7 0.1
## log(Year) 290.8 36.7
##
## Auxiliary parameter(s):
## Median MAD_SD
## sigma 0.5 0.0
##
## Sample avg. posterior predictive distribution of y:
## Median MAD_SD
## mean_PPD 8.3 0.0
##
## ------
## * For help interpreting the printed output see ?print.stanreg
## * For info on the priors used see ?prior_summary.stanreg
# Full posterior summary of the Bayesian fit: per-parameter means, standard
# deviations and quantiles, followed by the MCMC diagnostics (mcse, Rhat, n_eff).
summary(fit.lm.bs)
##
## Model Info:
##
## function: stan_glm
## family: gaussian [identity]
## formula: log(Rpeak) ~ log(Power) + log(Total.Cores) + log(Year)
## algorithm: sampling
## priors: see help('prior_summary')
## sample: 20000 (posterior sample size)
## observations: 214
## predictors: 4
##
## Estimates:
## mean sd 2.5% 25% 50% 75% 97.5%
## (Intercept) -2214.4 284.4 -2777.6 -2401.7 -2213.2 -2025.3 -1648.1
## log(Power) 0.2 0.1 0.0 0.1 0.2 0.2 0.3
## log(Total.Cores) 0.7 0.1 0.6 0.6 0.7 0.7 0.8
## log(Year) 291.0 37.4 216.6 266.1 290.8 315.6 365.0
## sigma 0.5 0.0 0.4 0.5 0.5 0.5 0.5
## mean_PPD 8.3 0.0 8.2 8.2 8.3 8.3 8.4
## log-posterior -154.1 1.6 -158.1 -155.0 -153.8 -153.0 -152.1
##
## Diagnostics:
## mcse Rhat n_eff
## (Intercept) 2.4 1.0 14048
## log(Power) 0.0 1.0 11264
## log(Total.Cores) 0.0 1.0 12811
## log(Year) 0.3 1.0 14041
## sigma 0.0 1.0 17370
## mean_PPD 0.0 1.0 17851
## log-posterior 0.0 1.0 8867
##
## For each parameter, mcse is Monte Carlo standard error, n_eff is a crude measure of effective sample size, and Rhat is the potential scale reduction factor on split chains (at convergence Rhat=1).
# Default plot of the fitted rstanarm model.
plot(fit.lm.bs)
# Compare the posterior against the prior; this call draws from the prior
# before plotting (it reports "Drawing from prior...").
posterior_vs_prior(fit.lm.bs)
##
## Drawing from prior...
# Convert the fit to an array of posterior draws, the input passed to the
# bayesplot mcmc_* functions used here.
fit.lm.bs2<-as.array(fit.lm.bs)
# Histograms of the posterior draws for each parameter.
mcmc_hist(fit.lm.bs2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Trace plots of the draws, for a visual check of how the chains mix.
mcmc_trace(fit.lm.bs2)
# Posterior predictive draws of log(Rpeak) for a Fugaku-like system.
# NOTE(review): Power = 283345.5 looks ten times larger than the roughly
# 28,334.5 kW that TOP500 reports for Fugaku - confirm the value and its units.
fujitsu.pp <- posterior_predict(
  fit.lm.bs,
  newdata = data.frame(
    Power = 283345.5,
    Total.Cores = 48 * 158976,
    Year = 2020
  ),
  seed = 12345
)

# Undo the log transform so the draws are on the original Rpeak scale.
fujitsu.peak <- exp(fujitsu.pp)

# Median and the bounds of the central 70% predictive interval.
quantile(fujitsu.peak, probs = c(0.15, 0.5, 0.85))
## 15% 50% 85%
## 181268.4 317443.0 555888.1
# Histogram of the posterior predictive Rpeak draws for Fugaku, with a
# horizontal bar marking the central 70% interval (15% to 85% quantiles).

# Flatten the ppd matrix into a plain numeric column so ggplot2 maps a real
# data-frame variable instead of picking up the matrix from the global
# environment (which triggered the "Don't know how to automatically pick
# scale for object of type ppd/matrix" message).
fugaku.draws <- data.frame(Rpeak = as.numeric(fujitsu.peak))

# One-row data frame for the interval bar, so it is drawn once rather than
# once per posterior draw.
fugaku.band <- data.frame(
  y = 0.05,
  xmin = unname(quantile(fugaku.draws$Rpeak, 0.15)),
  xmax = unname(quantile(fugaku.draws$Rpeak, 0.85))
)

ggplot(fugaku.draws, aes(x = Rpeak)) +
  geom_histogram(bins = 50, col = "black", fill = "green") +
  geom_errorbarh(
    aes(y = y, xmin = xmin, xmax = xmax),
    data = fugaku.band,
    inherit.aes = FALSE,
    col = "#0094EA",
    size = 3
  ) +
  ggtitle(label = "Probability density of Fugaku Rpeak") +
  # The draws are in the units of the Rpeak column (TFlop/s in the TOP500
  # export; the linear-model prediction divides by 1000 to report petaflops),
  # so the previous "Gflop" label understated the scale by a factor of 1000.
  xlab("Rpeak, Tflop/s") +
  theme_bw()
## Don't know how to automatically pick scale for object of type ppd/matrix. Defaulting to continuous.