1 Introduction

We had discussed US-China HPC race previously in 2017. See: https://rpubs.com/alex-lev/350548

2 Data

We use the same open source for current date - https://www.top500.org/lists/2019/11/.

library(readr)
library(ggplot2)
library(dplyr)
library(broom)
library(tidyr)
library(corrplot)
library(ggpmisc)
library(scales)
library(tibble)
library(DT)
library(popbio)

TOP500_201911 <- read_csv("top500/TOP500_201911.csv")
names(TOP500_201911)
##  [1] "Rank"                          "Previous.Rank"                
##  [3] "First.Appearance"              "First.Rank"                   
##  [5] "Name"                          "Computer"                     
##  [7] "Site"                          "Manufacturer"                 
##  [9] "Country"                       "Year"                         
## [11] "Segment"                       "Total.Cores"                  
## [13] "Accelerator.CoProcessor.Cores" "Rmax"                         
## [15] "Rpeak"                         "Nmax"                         
## [17] "Nhalf"                         "HPCG"                         
## [19] "Power"                         "Power. Source"                
## [21] "Power.Effeciency"              "Architecture"                 
## [23] "Processor"                     "Processor.Technology"         
## [25] "Processor.Speed"               "Operating.System"             
## [27] "OS.Family"                     "Accelerator.CoProcessor"      
## [29] "Cores.per.Socket"              "Processor.Generation"         
## [31] "System.Model"                  "System.Family"                
## [33] "Interconnect.Family"           "Interconnect"                 
## [35] "Region"                        "Continent"                    
## [37] "Site.ID"                       "System. ID"
TOP500_201911.tbl <-as_tibble(TOP500_201911)

3 Listing of leaders

#TOP500_201911.tbl %>% select(Rank, Name, Country, Rpeak, Total.Cores)%>%group_by(Rank)

TOP500_201911.tbl %>%
  count(Country) %>%
  mutate(Mainframes=n, Percent=(n/500)*100) %>%
  select(Country,Mainframes,Percent)%>%
  arrange(desc(Mainframes),Percent) %>% top_n(.,10) %>% datatable()
## Selecting by Percent
TOP500_201911.tbl %>%
  group_by(Country) %>%
  summarise(Mean_EFF=round(mean(Rmax/Rpeak),2),Mean_Rmax=round(mean(Rmax),2),
            Mean_Rpeak=round(mean(Rpeak),2),
            Sum_Cores=round(sum(Total.Cores),2),
            Sum_Rpeak=round(sum(Rpeak),2)) %>% 
  arrange(desc(Sum_Rpeak)) %>% top_n(.,10) %>% datatable()
## Selecting by Sum_Rpeak
TOP5 <-as.data.frame(TOP500_201911.tbl %>%
  count(Country) %>%
  mutate(Mainframes=n, Percent=n/sum(n)*100)%>%
  mutate(ypos = cumsum(Percent)- 0.5*Percent ) %>% 
  select(Country,Mainframes,Percent,ypos)%>%
  arrange(desc(Percent)) %>% top_n(.,5,wt = Percent))


ggplot(TOP5,aes(x="",y=Percent,fill=Country))+
  geom_bar(stat = "identity",width = 1,color="white") +
  coord_polar(theta = "y",start = 0) +
  theme_void() +
  #theme(legend.position="none") +
  #geom_text(aes(y=Percent,label=Country), color = "white", size=)+
  scale_fill_brewer(palette="Set1")+
  ggtitle("TOP5 countries by HPC mainframes")

TOP500_201911.tbl %>% group_by(Country) %>%
  summarise(Rpeak.Sum=sum(Rpeak),Total.Cores.Sum=sum(Total.Cores))%>%
  arrange(desc(Rpeak.Sum))%>% top_n(.,10) %>% datatable()
## Selecting by Total.Cores.Sum

4 Race details

topUSCH <- TOP500_201911.tbl %>% filter(Country==c("China","United States")) %>%
  select(Country,Year,Manufacturer,Segment,Total.Cores,
         Rpeak,Rmax,Processor.Speed,Power,Architecture,Accelerator.CoProcessor.Cores)

topUSCH%>%group_by(Country,Year)%>%count(Year) %>% arrange(Year) %>% datatable()
topUSCH %>% ggplot(.,aes(Year))+ geom_bar(aes(fill=Country))+
  theme_bw()+ ggtitle("HPC by Year")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Country,Segment)%>%count() %>% datatable()
topUSCH%>%ggplot(.,aes(Segment)) + geom_bar(aes(fill=Country))+
  theme_bw() + ggtitle("HPC by Segment")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Country,Architecture)%>%count() %>% datatable()
topUSCH%>%ggplot(.,aes(Architecture)) + geom_bar(aes(fill=Country))+
  theme_bw() + ggtitle("HPC by Architecture")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%filter(Country=="China")%>%group_by(Country,Segment,Manufacturer)%>%
  count()%>%arrange(desc(n)) %>% datatable()
topUSCH%>%filter(Country=="China")%>%ggplot(.,aes(Segment)) +
  geom_bar(aes(fill=Manufacturer))+
  theme_bw() + ggtitle("China HPC by Manufacturer and Segment")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%filter(Country=="United States")%>%  group_by(Country,Segment,Manufacturer)%>%count()%>%arrange(desc(n)) %>% datatable()
topUSCH%>%filter(Country=="United States")%>%ggplot(.,aes(Segment)) +
  geom_bar(aes(fill=Manufacturer))+
  theme_bw() + ggtitle("US HPC by Manufacturer and Segment")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Country)%>%  summarise("M[Rmax]"=mean(Rmax),"S[Rmax]"=sum(Rmax),            "M[Rpeak]"=mean(Rpeak),"S[Rpeak]"=sum(Rmax),        "M[Total.Cores]"=mean(Total.Cores),"S[Total.Cores]"=sum(Total.Cores))%>% datatable()
topUSCH%>%group_by(Country)%>%do(tidy(lm(log(Rpeak)~log(Total.Cores),data = .))) %>%  mutate_if(is.numeric, round, 2) %>% datatable()
## `mutate_if()` ignored the following grouping variables:
## Column `Country`
topUSCH%>%group_by(Country)%>%filter(Segment=="Industry")%>%
  do(tidy(lm(log(Rpeak)~log(Total.Cores),data = .))) %>% 
 mutate_if(is.numeric, round, 2)%>% datatable
## `mutate_if()` ignored the following grouping variables:
## Column `Country`
topUSCH%>%group_by(Country)%>%filter(Segment=="Academic")%>%
  do(tidy(lm(log(Rpeak)~log(Total.Cores),data = .))) %>% 
 mutate_if(is.numeric, round, 2)%>% datatable
## `mutate_if()` ignored the following grouping variables:
## Column `Country`
ggplot(topUSCH,aes(x=log(Total.Cores),y=log(Rpeak),col=Country))+
  geom_smooth(method="lm") + geom_point() + facet_wrap(~Country) +
  theme_bw() + ggtitle("Rpeak ~ Total.Cores linear regression")+ggtitle("Rpeak ~ Total.Cores linear regression by country")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Country)%>%filter(Segment!="Vendor")%>%
  ggplot(.,aes(log(Total.Cores),log(Rpeak),col=Country)) +
  geom_point(na.rm = T) + geom_smooth(method = lm,se = F,na.rm = T) + facet_wrap(~Segment)+theme_bw() + ggtitle("Rpeak ~ Total.Cores linear regression by Segment ")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Manufacturer)%>%
  #filter(Manufacturer=="Lenovo"|Manufacturer=="HPE")%>%
  ggplot(.,aes(log(Total.Cores),log(Rpeak),col=Manufacturer)) +
  geom_point(na.rm = T) + geom_smooth(method = lm,se = F,na.rm = T) +   facet_wrap(~Country)+
  theme_bw() + ggtitle("Rpeak ~ Total.Cores linear regression by Manufacturer")+
  theme(plot.title = element_text(hjust = .5))

ggplot(topUSCH,aes(log(Total.Cores),fill=Country)) +
  geom_density(alpha=0.7) + theme_bw() + ggtitle("Total.Cores density")+
  theme(plot.title = element_text(hjust = .5))

ggplot(topUSCH,aes(x=Country,y=log(Total.Cores),fill=Country)) +
  geom_violin() + geom_jitter() + theme_bw() + ggtitle("Total.Cores boxplot")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Country)%>%summarise("Mean[Rmax/Rpeak]"=mean(Rmax/Rpeak, na.rm = T))%>% datatable()
ggplot(topUSCH,aes(x=Country,y=Rmax/Rpeak,fill=Country)) +
  geom_violin() + geom_jitter() + theme_bw() + ggtitle("Rmax/Rpeak boxplot")+
  theme(plot.title = element_text(hjust = .5))

ggplot(topUSCH,aes(Rmax/Rpeak,fill=Country)) +
  geom_density(alpha=0.5) + theme_bw() + ggtitle("Rmax/Rpeak density")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Country)%>% summarise("S[Power]"=sum(Power,na.rm = T),"M[Power]"=mean(Power,na.rm = T),"M[Rpeak/Power]"=mean(Rpeak/Power,na.rm = T),"M[Rpeak]"=mean(Rpeak,na.rm = T),"Rpeak/Rmax~Power"=cor(Rpeak/Rmax,Power,use = "pairwise"),     "Rpeak~Power"=cor(Rpeak,Power,use = "pairwise"))%>% datatable()
topUSCH%>%group_by(Country)%>%
  ggplot(.,aes(log(Power),log(Rpeak),col=Country)) + geom_point(na.rm = T) + geom_smooth(method = lm)+ facet_wrap(~Country) +
  theme_bw() + ggtitle("Rpeak ~ Power linear regression")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_smooth).

topUSCH%>%group_by(Country)%>%filter(Segment!="Vendor")%>%
  ggplot(.,aes(log(Power),log(Rpeak),col=Country)) + geom_point(na.rm = T) + geom_smooth(method = lm,se = F,na.rm = T) +facet_wrap(~Segment) +
  theme_bw() + ggtitle("Rpeak ~ Power linear regression by Segment")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%group_by(Country)%>%
  ggplot(.,aes(log(Total.Cores),log(Accelerator.CoProcessor.Cores),col=Country))+
  geom_point(na.rm = T)+geom_smooth(method = lm,se = F) + theme_bw() + ggtitle("Accelerator.CoProcessor.Cores ~ Total.Cores linear regression ")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 111 rows containing non-finite values (stat_smooth).

topUSCH%>%group_by(Country)%>%
  ggplot(.,aes(log(Accelerator.CoProcessor.Cores),log(Rpeak),col=Country))+
  geom_point(na.rm = T)+geom_smooth(method = lm,se = F)+
  theme_bw() + ggtitle("Rpeak ~ Accelerator.CoProcessor.Cores linear regression")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 111 rows containing non-finite values (stat_smooth).

topUSCH%>%select(Country,Power)%>%ggplot(.,aes(x=Country,y=Power,fill=Country)) +
  geom_violin() + geom_jitter(na.rm = T) + 
  theme_bw() + ggtitle("Power boxplot")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_ydensity).

ggplot(topUSCH,aes(Power,fill=Country)) +
  geom_density(alpha=0.5) + 
  theme_bw() + ggtitle("Power density")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_density).

topUSCH%>%select(Country,Power,Rpeak)%>%
  ggplot(.,aes(x=Country,y=Rpeak/Power,fill=Country)) +
  geom_violin() + geom_jitter(na.rm = T) + 
  theme_bw() + ggtitle("Rpeak/Power boxplot")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_ydensity).

ggplot(topUSCH,aes(Rpeak/Power,fill=Country)) +
  geom_density(alpha=0.5) + 
  theme_bw() + ggtitle("Rpeak/Power density")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_density).

topUSCH%>%filter(Segment!="Vendor")%>% group_by(Country,Segment)%>%  summarise("M[Rpeak/Power]"=round(mean(Rpeak/Power, na.rm = T),2),"M[Processor.Speed]"=round(mean(Processor.Speed,na.rm = T),2))%>% datatable()
topUSCH%>%group_by(Country)%>%  summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2),            "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2),            "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
topUSCH%>%group_by(Country, Segment)%>%filter(Segment!="Vendor")%>%  summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2),            "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2),            "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
## Warning in cor(Rpeak, Accelerator.CoProcessor.Cores, use = "pairwise"): the
## standard deviation is zero
topUSCH%>%group_by(Country,Manufacturer)%>%  filter(Manufacturer=="Lenovo", Country=="China")%>%  summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2),            "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2),            "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
topUSCH%>%group_by(Country,Manufacturer)%>%  filter(Manufacturer=="IBM / NVIDIA / Mellanox", Country=="United States")%>%  summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2),            "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2),            "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
MCCH <- topUSCH%>%filter(Country=="China")%>%
  select(-Manufacturer,-Architecture,-Segment,-Country)%>%
  rename(TC=Total.Cores,PS=Processor.Speed,ACPC=Accelerator.CoProcessor.Cores)%>%
  cor(.,use = "pairwise")

MCUS <- topUSCH%>%filter(Country=="United States")%>%
  select(-Manufacturer,-Architecture,-Segment,-Country)%>%
  rename(TC=Total.Cores,PS=Processor.Speed,ACPC=Accelerator.CoProcessor.Cores)%>%
  cor(.,use = "pairwise")

round(MCCH,2)%>% datatable()
round(MCUS,2)%>% datatable()
par(mfrow=c(1,2))
corrplot(MCCH,method = "ellipse",title = "China")
corrplot(MCUS,method = "ellipse",title = "US")

topUSCH%>%ggplot(.,aes(log(Power),log(Rpeak),col=Country)) +
  geom_point() + geom_smooth(method = lm,se = F,na.rm = T) +
  theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Rpeak~Power")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing missing values (geom_point).

topUSCH%>%ggplot(.,aes(log(Rmax),log(Rpeak),col=Country)) +
  geom_point() +geom_smooth(method = lm,se = F) +
  theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Rpeak~Rmax")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%ggplot(.,aes(log(Total.Cores),log(Rpeak),col=Country)) +
  geom_point() +geom_smooth(method = lm,se = F) +
  theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Rpeak~Total.Cores")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%
  ggplot(.,aes(log(Total.Cores),log(Accelerator.CoProcessor.Cores),col=Country))+  geom_point() +geom_smooth(method = lm,se = F,na.rm = T)+
  theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Accelerator.CoProcessor.Cores~Total.Cores")+
  theme(plot.title = element_text(hjust = .5))
## Warning: Removed 111 rows containing missing values (geom_point).

topUSCH%>%ggplot(.,aes(x=log(Total.Cores),y=log(Rpeak),col=Country))+
  geom_density2d(binwidth = 0.01, na.rm = T)+
  theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Rpeak~Total.Cores")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%ggplot(.,aes(x=log(Rmax),y=log(Rpeak),col=Country))+
  geom_density2d(binwidth = 0.01, na.rm = T)+
  theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Rpeak~Rmax")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%ggplot(.,aes(x=log(Power),y=log(Rpeak),col=Country))+
  geom_density2d(binwidth = 0.01,na.rm = T)+
  theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Rpeak~Power")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%
  ggplot(.,aes(x=log(Total.Cores),y=log(Accelerator.CoProcessor.Cores),col=Country))+
  geom_density2d(binwidth = 0.01,na.rm = T)+
  theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Accelerator.CoProcessor.Cores~Total.Cores")+
  theme(plot.title = element_text(hjust = .5))

topUSCH%>%ggplot(., aes(log(Total.Cores), log(Rpeak))) +
  geom_polygon(aes(fill = Country)) + 
  theme_bw() + ggtitle("TOP500, 11/2019: Polygon plot for Rpeak~Total.Cores")+facet_wrap(~Country)+
  theme(plot.title = element_text(hjust = .5))

5 Classification by logistic regression

topUSCH2 <- topUSCH %>% mutate(EFF=round(Rmax/Rpeak,2),CAT=if_else(Country=="China",1,0)) %>% 
  select(Country,CAT,EFF) 
topUSCH2 %>% datatable()
fit.glm <- glm(CAT~EFF,data = topUSCH2,family = binomial(link = "logit"))
summary(fit.glm)
## 
## Call:
## glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = topUSCH2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3851  -0.3435   0.1764   0.3067   2.0210  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   14.702      2.102   6.994 2.68e-12 ***
## EFF          -22.440      3.267  -6.868 6.52e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 217.789  on 167  degrees of freedom
## Residual deviance:  80.361  on 166  degrees of freedom
## AIC: 84.361
## 
## Number of Fisher Scoring iterations: 6
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict(fit.glm,data.frame(EFF=x),type = "response")
par(mfrow=c(1,1))
logi.hist.plot(topUSCH2$EFF,topUSCH2$CAT,boxp=FALSE,type="hist",col="gray",
               xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC CHINA in TOP500")
points(topUSCH2$EFF,fitted(fit.glm),pch=20)
grid()

fit.glm.2 <- glm(1-CAT~EFF,data = topUSCH2,family = binomial(link = "logit"))
summary(fit.glm.2)
## 
## Call:
## glm(formula = 1 - CAT ~ EFF, family = binomial(link = "logit"), 
##     data = topUSCH2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0210  -0.3067  -0.1764   0.3435   3.3851  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -14.702      2.102  -6.994 2.68e-12 ***
## EFF           22.440      3.267   6.868 6.52e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 217.789  on 167  degrees of freedom
## Residual deviance:  80.361  on 166  degrees of freedom
## AIC: 84.361
## 
## Number of Fisher Scoring iterations: 6
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict(fit.glm.2,data.frame(EFF=x),type = "response")
par(mfrow=c(1,1))
logi.hist.plot(topUSCH2$EFF,1-topUSCH2$CAT,boxp=FALSE,type="hist",col="gray",
               xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC USA in TOP500")
points(topUSCH2$EFF,fitted(fit.glm.2),pch=20)
grid()