We apply logistic regression for classification between USA and China top500 HPC mainframes as far as November 2020.
For more see previous topic https://rpubs.com/alex-lev/553777
Data for November 2020 can be downloaded here: https://www.top500.org/lists/top500/2020/11/
Note: the original names of variables were changed (truncated, concatenated and simplified) for the purpose of data exploration and visualization as possible.
library(readxl)
library(popbio)
library(ggplot2)
library(dplyr)
library(pscl)
library(DT)
TOP500_202011 <- read_excel("top500/TOP500_202011.xlsx")
names(TOP500_202011)
## [1] "Rank" "PreviousRank"
## [3] "FirstAppearance" "FirstRank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "TotalCores"
## [13] "AcceleratorCoProcessorCores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "PowerSource"
## [21] "PowerEfficiency" "Architecture"
## [23] "Processor" "ProcessorTechnology"
## [25] "ProcessorSpeed" "OperatingSystem"
## [27] "OSFamily" "AcceleratorCoProcessor"
## [29] "CoresperSocket" "ProcessorGeneration"
## [31] "SystemModel" "SystemFamily"
## [33] "InterconnectFamily" "Interconnect"
## [35] "Continent" "SiteID"
## [37] "SystemID"
TOP500_USCH <- TOP500_202011%>% filter(Country==c("China","United States")) %>%
mutate(EFF=round(Rmax/Rpeak,3),CAT=if_else(Country=="China",1,0)) %>% select_at(vars(EFF,CAT,Country))
as_tibble(TOP500_USCH)%>% datatable()
cor.test(TOP500_USCH$EFF,TOP500_USCH$CAT)
##
## Pearson's product-moment correlation
##
## data: TOP500_USCH$EFF and TOP500_USCH$CAT
## t = -14.085, df = 154, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8117537 -0.6724720
## sample estimates:
## cor
## -0.750324
ggplot(TOP500_USCH,aes(x=Country,y=EFF,fill=Country)) +
geom_violin() + geom_jitter(na.rm = T) +
theme_bw() + ggtitle("EFF=Rmax/Rpeak by Country")+
theme(plot.title = element_text(hjust = .5))
TOP500_USCH %>% group_by(Country) %>% summarise(M_EFF=round(mean(EFF),3)) %>% datatable()
t.test(EFF~Country, data=TOP500_USCH)# Student t-test for the mean values
##
## Welch Two Sample t-test
##
## data: EFF by Country
## t = -14.003, df = 103.44, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2650525 -0.1992918
## sample estimates:
## mean in group China mean in group United States
## 0.5045825 0.7367547
As we can see there is significant difference between USA and CHINA in terms of the mean values of \(EFF=Rmax/Rpeak\): \(M[EFF]_{USA}=0.74\), \(M[EFF]_{CHINA}=0.51\) that is China’s HPC mainframes produce half of the theoretical peak performance while the US ones produce 74%. Now we are ready to discriminate HPC mainframes of two leading countries applying logistic regression.
About logistic regression math see: https://en.wikipedia.org/wiki/Logistic_regression. \[\log{\frac{p}{1-p}}={b_0 + b_1x}\] or \[p=\frac{1}{1+e^{-(b_0+b_1x)}}\]
fit.glm <- glm(CAT~EFF,data = TOP500_USCH,family = binomial(link = "logit"))
# put the fitted values in the data.frame
fit.glm
##
## Call: glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = TOP500_USCH)
##
## Coefficients:
## (Intercept) EFF
## 15.72 -24.54
##
## Degrees of Freedom: 155 Total (i.e. Null); 154 Residual
## Null Deviance: 199.9
## Residual Deviance: 73.59 AIC: 77.59
TOP500_USCH$fitted.values <- fit.glm$fitted.values
pred <- predict.glm(fit.glm, data.frame(EFF = TOP500_USCH$EFF), type = "link", se.fit = TRUE)
TOP500_USCH$fit <- pred$fit
TOP500_USCH$se.fit <- pred$se.fit
# CI for fitted values
TOP500_USCH <- within(TOP500_USCH, {
fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})
p <- ggplot(TOP500_USCH, aes(x = EFF, y = CAT))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = EFF, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = EFF, y = fitted.values), color = "red")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = paste("Observed and predicted probability of HPC CHINA top500.org in 2020")) + xlab("EFF=Rmax/Rpeak") + ylab("Probability") + theme_bw()
print(p)
\[Prob(Country=CHINA)=\frac{1}{1+e^{-(15.72 - 24.54\frac{Rmax}{Rpeak})}}\]
#coefficients
fit.glm$coefficients
## (Intercept) EFF
## 15.72401 -24.53613
#mean value of EFF=0.5 for CHINA
prob_ch <- predict.glm(fit.glm, data.frame(EFF = 0.5), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_ch$fit,3)))
## [1] "Probability for CHINA = 0.969"
#mean value of EFF=0.75 for USA
prob_us <- predict.glm(fit.glm, data.frame(EFF = 0.74), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_us$fit,3)))
## [1] "Probability for CHINA = 0.081"
MisClassError <- c()
N <- 3000
set.seed(12345)
for (i in 1:N) {
train <- sample_frac(TOP500_USCH,0.8,replace = F)
test <-TOP500_USCH %>% setdiff(train)
fit.glm <- glm(CAT~EFF,data = train,family = binomial(link = "logit"))
res_test <- predict.glm(fit.glm,test,type="response",se.fit = FALSE)
fitted.results <- ifelse(res_test > 0.5,1,0)
MisClassError[i] <- round(mean(fitted.results != test$CAT),3)
}
MCE <- 1-MisClassError
summary(MCE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5620 0.8120 0.8670 0.8583 0.9090 1.0000
print(paste('Mean Classification Accuracy =',round(mean(MCE),3)))
## [1] "Mean Classification Accuracy = 0.858"
print(paste('95% Confidence interval for MCE'))
## [1] "95% Confidence interval for MCE"
print(paste('MCE <=',round(mean(MCE)+2*sd(MCE)/sqrt(N),3)))
## [1] "MCE <= 0.861"
print(paste('MCE >=',round(mean(MCE)-2*sd(MCE)/sqrt(N),3)))
## [1] "MCE >= 0.856"
ggplot(as.data.frame(1-MisClassError), aes(x=1-MisClassError)) +
geom_histogram(aes(y=..density..), bins = 10,colour="black", fill="green")+
geom_density(alpha=.2, fill="#FF6666") + theme_bw() + labs(title="True Classification Proportion") + xlab("Proportion") + ylab("Density")+
theme(plot.title = element_text(hjust = .5))
summary(fit.glm)
##
## Call:
## glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3216 -0.2260 0.2531 0.3932 1.7210
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.01 2.27 6.174 6.67e-10 ***
## EFF -21.52 3.67 -5.863 4.55e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 151.859 on 124 degrees of freedom
## Residual deviance: 64.252 on 123 degrees of freedom
## AIC: 68.252
##
## Number of Fisher Scoring iterations: 6
pR2(fit.glm)
## llh llhNull G2 McFadden r2ML r2CU
## -32.1260145 -75.9296147 87.6072005 0.5768974 0.5038404 0.7164430
anova(fit.glm, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: CAT
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 124 151.859
## EFF 1 87.607 123 64.252 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,train$CAT,boxp=FALSE,type="hist",col="gray",
xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC CHINA in TOP500 2020")
points(train$EFF,fitted(fit.glm),pch=20)
grid()
fit.glm.2 <- glm(1-CAT~EFF,data = train ,family = binomial(link = "logit"))
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm.2,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,1-train$CAT,boxp=FALSE,type="hist",col="gray",
xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC USA in TOP500 2020")
points(train$EFF,fitted(fit.glm.2),pch=20)
grid()