We had discussed US-China HPC race previously in 2017. See: https://rpubs.com/alex-lev/350548
We use the same open source for current date - https://www.top500.org/lists/2019/11/.
library(readr)
library(ggplot2)
library(dplyr)
library(broom)
library(tidyr)
library(corrplot)
library(ggpmisc)
library(scales)
library(tibble)
library(DT)
library(popbio)
TOP500_201911 <- read_csv("top500/TOP500_201911.csv")
names(TOP500_201911)
## [1] "Rank" "Previous.Rank"
## [3] "First.Appearance" "First.Rank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "Total.Cores"
## [13] "Accelerator.CoProcessor.Cores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "Power. Source"
## [21] "Power.Effeciency" "Architecture"
## [23] "Processor" "Processor.Technology"
## [25] "Processor.Speed" "Operating.System"
## [27] "OS.Family" "Accelerator.CoProcessor"
## [29] "Cores.per.Socket" "Processor.Generation"
## [31] "System.Model" "System.Family"
## [33] "Interconnect.Family" "Interconnect"
## [35] "Region" "Continent"
## [37] "Site.ID" "System. ID"
TOP500_201911.tbl <-as_tibble(TOP500_201911)
#TOP500_201911.tbl %>% select(Rank, Name, Country, Rpeak, Total.Cores)%>%group_by(Rank)
TOP500_201911.tbl %>%
count(Country) %>%
mutate(Mainframes=n, Percent=(n/500)*100) %>%
select(Country,Mainframes,Percent)%>%
arrange(desc(Mainframes),Percent) %>% top_n(.,10) %>% datatable()
## Selecting by Percent
TOP500_201911.tbl %>%
group_by(Country) %>%
summarise(Mean_EFF=round(mean(Rmax/Rpeak),2),Mean_Rmax=round(mean(Rmax),2),
Mean_Rpeak=round(mean(Rpeak),2),
Sum_Cores=round(sum(Total.Cores),2),
Sum_Rpeak=round(sum(Rpeak),2)) %>%
arrange(desc(Sum_Rpeak)) %>% top_n(.,10) %>% datatable()
## Selecting by Sum_Rpeak
TOP5 <-as.data.frame(TOP500_201911.tbl %>%
count(Country) %>%
mutate(Mainframes=n, Percent=n/sum(n)*100)%>%
mutate(ypos = cumsum(Percent)- 0.5*Percent ) %>%
select(Country,Mainframes,Percent,ypos)%>%
arrange(desc(Percent)) %>% top_n(.,5,wt = Percent))
ggplot(TOP5,aes(x="",y=Percent,fill=Country))+
geom_bar(stat = "identity",width = 1,color="white") +
coord_polar(theta = "y",start = 0) +
theme_void() +
#theme(legend.position="none") +
#geom_text(aes(y=Percent,label=Country), color = "white", size=)+
scale_fill_brewer(palette="Set1")+
ggtitle("TOP5 countries by HPC mainframes")
TOP500_201911.tbl %>% group_by(Country) %>%
summarise(Rpeak.Sum=sum(Rpeak),Total.Cores.Sum=sum(Total.Cores))%>%
arrange(desc(Rpeak.Sum))%>% top_n(.,10) %>% datatable()
## Selecting by Total.Cores.Sum
topUSCH <- TOP500_201911.tbl %>% filter(Country==c("China","United States")) %>%
select(Country,Year,Manufacturer,Segment,Total.Cores,
Rpeak,Rmax,Processor.Speed,Power,Architecture,Accelerator.CoProcessor.Cores)
topUSCH%>%group_by(Country,Year)%>%count(Year) %>% arrange(Year) %>% datatable()
topUSCH %>% ggplot(.,aes(Year))+ geom_bar(aes(fill=Country))+
theme_bw()+ ggtitle("HPC by Year")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Country,Segment)%>%count() %>% datatable()
topUSCH%>%ggplot(.,aes(Segment)) + geom_bar(aes(fill=Country))+
theme_bw() + ggtitle("HPC by Segment")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Country,Architecture)%>%count() %>% datatable()
topUSCH%>%ggplot(.,aes(Architecture)) + geom_bar(aes(fill=Country))+
theme_bw() + ggtitle("HPC by Architecture")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%filter(Country=="China")%>%group_by(Country,Segment,Manufacturer)%>%
count()%>%arrange(desc(n)) %>% datatable()
topUSCH%>%filter(Country=="China")%>%ggplot(.,aes(Segment)) +
geom_bar(aes(fill=Manufacturer))+
theme_bw() + ggtitle("China HPC by Manufacturer and Segment")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%filter(Country=="United States")%>% group_by(Country,Segment,Manufacturer)%>%count()%>%arrange(desc(n)) %>% datatable()
topUSCH%>%filter(Country=="United States")%>%ggplot(.,aes(Segment)) +
geom_bar(aes(fill=Manufacturer))+
theme_bw() + ggtitle("US HPC by Manufacturer and Segment")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Country)%>% summarise("M[Rmax]"=mean(Rmax),"S[Rmax]"=sum(Rmax), "M[Rpeak]"=mean(Rpeak),"S[Rpeak]"=sum(Rmax), "M[Total.Cores]"=mean(Total.Cores),"S[Total.Cores]"=sum(Total.Cores))%>% datatable()
topUSCH%>%group_by(Country)%>%do(tidy(lm(log(Rpeak)~log(Total.Cores),data = .))) %>% mutate_if(is.numeric, round, 2) %>% datatable()
## `mutate_if()` ignored the following grouping variables:
## Column `Country`
topUSCH%>%group_by(Country)%>%filter(Segment=="Industry")%>%
do(tidy(lm(log(Rpeak)~log(Total.Cores),data = .))) %>%
mutate_if(is.numeric, round, 2)%>% datatable
## `mutate_if()` ignored the following grouping variables:
## Column `Country`
topUSCH%>%group_by(Country)%>%filter(Segment=="Academic")%>%
do(tidy(lm(log(Rpeak)~log(Total.Cores),data = .))) %>%
mutate_if(is.numeric, round, 2)%>% datatable
## `mutate_if()` ignored the following grouping variables:
## Column `Country`
ggplot(topUSCH,aes(x=log(Total.Cores),y=log(Rpeak),col=Country))+
geom_smooth(method="lm") + geom_point() + facet_wrap(~Country) +
theme_bw() + ggtitle("Rpeak ~ Total.Cores linear regression")+ggtitle("Rpeak ~ Total.Cores linear regression by country")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Country)%>%filter(Segment!="Vendor")%>%
ggplot(.,aes(log(Total.Cores),log(Rpeak),col=Country)) +
geom_point(na.rm = T) + geom_smooth(method = lm,se = F,na.rm = T) + facet_wrap(~Segment)+theme_bw() + ggtitle("Rpeak ~ Total.Cores linear regression by Segment ")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Manufacturer)%>%
#filter(Manufacturer=="Lenovo"|Manufacturer=="HPE")%>%
ggplot(.,aes(log(Total.Cores),log(Rpeak),col=Manufacturer)) +
geom_point(na.rm = T) + geom_smooth(method = lm,se = F,na.rm = T) + facet_wrap(~Country)+
theme_bw() + ggtitle("Rpeak ~ Total.Cores linear regression by Manufacturer")+
theme(plot.title = element_text(hjust = .5))
ggplot(topUSCH,aes(log(Total.Cores),fill=Country)) +
geom_density(alpha=0.7) + theme_bw() + ggtitle("Total.Cores density")+
theme(plot.title = element_text(hjust = .5))
ggplot(topUSCH,aes(x=Country,y=log(Total.Cores),fill=Country)) +
geom_violin() + geom_jitter() + theme_bw() + ggtitle("Total.Cores boxplot")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Country)%>%summarise("Mean[Rmax/Rpeak]"=mean(Rmax/Rpeak, na.rm = T))%>% datatable()
ggplot(topUSCH,aes(x=Country,y=Rmax/Rpeak,fill=Country)) +
geom_violin() + geom_jitter() + theme_bw() + ggtitle("Rmax/Rpeak boxplot")+
theme(plot.title = element_text(hjust = .5))
ggplot(topUSCH,aes(Rmax/Rpeak,fill=Country)) +
geom_density(alpha=0.5) + theme_bw() + ggtitle("Rmax/Rpeak density")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Country)%>% summarise("S[Power]"=sum(Power,na.rm = T),"M[Power]"=mean(Power,na.rm = T),"M[Rpeak/Power]"=mean(Rpeak/Power,na.rm = T),"M[Rpeak]"=mean(Rpeak,na.rm = T),"Rpeak/Rmax~Power"=cor(Rpeak/Rmax,Power,use = "pairwise"), "Rpeak~Power"=cor(Rpeak,Power,use = "pairwise"))%>% datatable()
topUSCH%>%group_by(Country)%>%
ggplot(.,aes(log(Power),log(Rpeak),col=Country)) + geom_point(na.rm = T) + geom_smooth(method = lm)+ facet_wrap(~Country) +
theme_bw() + ggtitle("Rpeak ~ Power linear regression")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_smooth).
topUSCH%>%group_by(Country)%>%filter(Segment!="Vendor")%>%
ggplot(.,aes(log(Power),log(Rpeak),col=Country)) + geom_point(na.rm = T) + geom_smooth(method = lm,se = F,na.rm = T) +facet_wrap(~Segment) +
theme_bw() + ggtitle("Rpeak ~ Power linear regression by Segment")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%group_by(Country)%>%
ggplot(.,aes(log(Total.Cores),log(Accelerator.CoProcessor.Cores),col=Country))+
geom_point(na.rm = T)+geom_smooth(method = lm,se = F) + theme_bw() + ggtitle("Accelerator.CoProcessor.Cores ~ Total.Cores linear regression ")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 111 rows containing non-finite values (stat_smooth).
topUSCH%>%group_by(Country)%>%
ggplot(.,aes(log(Accelerator.CoProcessor.Cores),log(Rpeak),col=Country))+
geom_point(na.rm = T)+geom_smooth(method = lm,se = F)+
theme_bw() + ggtitle("Rpeak ~ Accelerator.CoProcessor.Cores linear regression")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 111 rows containing non-finite values (stat_smooth).
topUSCH%>%select(Country,Power)%>%ggplot(.,aes(x=Country,y=Power,fill=Country)) +
geom_violin() + geom_jitter(na.rm = T) +
theme_bw() + ggtitle("Power boxplot")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_ydensity).
ggplot(topUSCH,aes(Power,fill=Country)) +
geom_density(alpha=0.5) +
theme_bw() + ggtitle("Power density")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_density).
topUSCH%>%select(Country,Power,Rpeak)%>%
ggplot(.,aes(x=Country,y=Rpeak/Power,fill=Country)) +
geom_violin() + geom_jitter(na.rm = T) +
theme_bw() + ggtitle("Rpeak/Power boxplot")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_ydensity).
ggplot(topUSCH,aes(Rpeak/Power,fill=Country)) +
geom_density(alpha=0.5) +
theme_bw() + ggtitle("Rpeak/Power density")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing non-finite values (stat_density).
topUSCH%>%filter(Segment!="Vendor")%>% group_by(Country,Segment)%>% summarise("M[Rpeak/Power]"=round(mean(Rpeak/Power, na.rm = T),2),"M[Processor.Speed]"=round(mean(Processor.Speed,na.rm = T),2))%>% datatable()
topUSCH%>%group_by(Country)%>% summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2), "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2), "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
topUSCH%>%group_by(Country, Segment)%>%filter(Segment!="Vendor")%>% summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2), "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2), "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
## Warning in cor(Rpeak, Accelerator.CoProcessor.Cores, use = "pairwise"): the
## standard deviation is zero
topUSCH%>%group_by(Country,Manufacturer)%>% filter(Manufacturer=="Lenovo", Country=="China")%>% summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2), "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2), "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
topUSCH%>%group_by(Country,Manufacturer)%>% filter(Manufacturer=="IBM / NVIDIA / Mellanox", Country=="United States")%>% summarise("Rpeak~Power"=round(cor(Rpeak,Power, use = "pairwise"),2), "Rpeak~Total.Cores"=round(cor(Rpeak,Total.Cores, use = "pairwise"),2), "Rpeak~Accelerator.CoProcessor.Cores"=round(cor(Rpeak,Accelerator.CoProcessor.Cores, use = "pairwise"),2))%>% datatable()
MCCH <- topUSCH%>%filter(Country=="China")%>%
select(-Manufacturer,-Architecture,-Segment,-Country)%>%
rename(TC=Total.Cores,PS=Processor.Speed,ACPC=Accelerator.CoProcessor.Cores)%>%
cor(.,use = "pairwise")
MCUS <- topUSCH%>%filter(Country=="United States")%>%
select(-Manufacturer,-Architecture,-Segment,-Country)%>%
rename(TC=Total.Cores,PS=Processor.Speed,ACPC=Accelerator.CoProcessor.Cores)%>%
cor(.,use = "pairwise")
round(MCCH,2)%>% datatable()
round(MCUS,2)%>% datatable()
par(mfrow=c(1,2))
corrplot(MCCH,method = "ellipse",title = "China")
corrplot(MCUS,method = "ellipse",title = "US")
topUSCH%>%ggplot(.,aes(log(Power),log(Rpeak),col=Country)) +
geom_point() + geom_smooth(method = lm,se = F,na.rm = T) +
theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Rpeak~Power")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 108 rows containing missing values (geom_point).
topUSCH%>%ggplot(.,aes(log(Rmax),log(Rpeak),col=Country)) +
geom_point() +geom_smooth(method = lm,se = F) +
theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Rpeak~Rmax")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%ggplot(.,aes(log(Total.Cores),log(Rpeak),col=Country)) +
geom_point() +geom_smooth(method = lm,se = F) +
theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Rpeak~Total.Cores")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%
ggplot(.,aes(log(Total.Cores),log(Accelerator.CoProcessor.Cores),col=Country))+ geom_point() +geom_smooth(method = lm,se = F,na.rm = T)+
theme_bw() + ggtitle("TOP500, 11/2019: Linear regression Accelerator.CoProcessor.Cores~Total.Cores")+
theme(plot.title = element_text(hjust = .5))
## Warning: Removed 111 rows containing missing values (geom_point).
topUSCH%>%ggplot(.,aes(x=log(Total.Cores),y=log(Rpeak),col=Country))+
geom_density2d(binwidth = 0.01, na.rm = T)+
theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Rpeak~Total.Cores")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%ggplot(.,aes(x=log(Rmax),y=log(Rpeak),col=Country))+
geom_density2d(binwidth = 0.01, na.rm = T)+
theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Rpeak~Rmax")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%ggplot(.,aes(x=log(Power),y=log(Rpeak),col=Country))+
geom_density2d(binwidth = 0.01,na.rm = T)+
theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Rpeak~Power")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%
ggplot(.,aes(x=log(Total.Cores),y=log(Accelerator.CoProcessor.Cores),col=Country))+
geom_density2d(binwidth = 0.01,na.rm = T)+
theme_bw() + ggtitle("TOP500, 11/2019: Density contour plot for Accelerator.CoProcessor.Cores~Total.Cores")+
theme(plot.title = element_text(hjust = .5))
topUSCH%>%ggplot(., aes(log(Total.Cores), log(Rpeak))) +
geom_polygon(aes(fill = Country)) +
theme_bw() + ggtitle("TOP500, 11/2019: Polygon plot for Rpeak~Total.Cores")+facet_wrap(~Country)+
theme(plot.title = element_text(hjust = .5))
topUSCH2 <- topUSCH %>% mutate(EFF=round(Rmax/Rpeak,2),CAT=if_else(Country=="China",1,0)) %>%
select(Country,CAT,EFF)
topUSCH2 %>% datatable()
fit.glm <- glm(CAT~EFF,data = topUSCH2,family = binomial(link = "logit"))
summary(fit.glm)
##
## Call:
## glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = topUSCH2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3851 -0.3435 0.1764 0.3067 2.0210
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.702 2.102 6.994 2.68e-12 ***
## EFF -22.440 3.267 -6.868 6.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 217.789 on 167 degrees of freedom
## Residual deviance: 80.361 on 166 degrees of freedom
## AIC: 84.361
##
## Number of Fisher Scoring iterations: 6
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict(fit.glm,data.frame(EFF=x),type = "response")
par(mfrow=c(1,1))
logi.hist.plot(topUSCH2$EFF,topUSCH2$CAT,boxp=FALSE,type="hist",col="gray",
xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC CHINA in TOP500")
points(topUSCH2$EFF,fitted(fit.glm),pch=20)
grid()
fit.glm.2 <- glm(1-CAT~EFF,data = topUSCH2,family = binomial(link = "logit"))
summary(fit.glm.2)
##
## Call:
## glm(formula = 1 - CAT ~ EFF, family = binomial(link = "logit"),
## data = topUSCH2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0210 -0.3067 -0.1764 0.3435 3.3851
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -14.702 2.102 -6.994 2.68e-12 ***
## EFF 22.440 3.267 6.868 6.52e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 217.789 on 167 degrees of freedom
## Residual deviance: 80.361 on 166 degrees of freedom
## AIC: 84.361
##
## Number of Fisher Scoring iterations: 6
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict(fit.glm.2,data.frame(EFF=x),type = "response")
par(mfrow=c(1,1))
logi.hist.plot(topUSCH2$EFF,1-topUSCH2$CAT,boxp=FALSE,type="hist",col="gray",
xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC USA in TOP500")
points(topUSCH2$EFF,fitted(fit.glm.2),pch=20)
grid()