We apply logistic regression for classification between USA and China top500 HPC mainframes.
For more see previous topic https://rpubs.com/alex-lev/553777
library(readr)
library(popbio)
library(ggplot2)
library(dplyr)
library(pscl)
library(DT)
TOP500_201911 <- read_csv("top500/TOP500_201911.csv")
names(TOP500_201911)
## [1] "Rank" "Previous.Rank"
## [3] "First.Appearance" "First.Rank"
## [5] "Name" "Computer"
## [7] "Site" "Manufacturer"
## [9] "Country" "Year"
## [11] "Segment" "Total.Cores"
## [13] "Accelerator.CoProcessor.Cores" "Rmax"
## [15] "Rpeak" "Nmax"
## [17] "Nhalf" "HPCG"
## [19] "Power" "Power. Source"
## [21] "Power.Effeciency" "Architecture"
## [23] "Processor" "Processor.Technology"
## [25] "Processor.Speed" "Operating.System"
## [27] "OS.Family" "Accelerator.CoProcessor"
## [29] "Cores.per.Socket" "Processor.Generation"
## [31] "System.Model" "System.Family"
## [33] "Interconnect.Family" "Interconnect"
## [35] "Region" "Continent"
## [37] "Site.ID" "System. ID"
TOP500_USCH <- TOP500_201911%>% filter(Country==c("China","United States")) %>%
mutate(EFF=round(Rmax/Rpeak,3),CAT=if_else(Country=="China",1,0)) %>% select_at(vars(EFF,CAT,Country))
as_tibble(TOP500_USCH)%>% datatable()
cor.test(TOP500_USCH$EFF,TOP500_USCH$CAT)
##
## Pearson's product-moment correlation
##
## data: TOP500_USCH$EFF and TOP500_USCH$CAT
## t = -14.095, df = 166, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8001040 -0.6605198
## sample estimates:
## cor
## -0.7381116
ggplot(TOP500_USCH,aes(x=Country,y=EFF,fill=Country)) +
geom_violin() + geom_jitter(na.rm = T) +
theme_bw() + ggtitle("EFF=Rmax/Rpeak by Country")+
theme(plot.title = element_text(hjust = .5))
TOP500_USCH %>% group_by(Country) %>% summarise(M_EFF=round(mean(EFF),3)) %>% datatable()
t.test(EFF~Country, data=TOP500_USCH)# Student t-test for the mean values
##
## Welch Two Sample t-test
##
## data: EFF by Country
## t = -15.118, df = 144.29, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2907198 -0.2234932
## sample estimates:
## mean in group China mean in group United States
## 0.4938257 0.7509322
As we can see there is significant difference between USA and CHINA in terms of the mean values of \(EFF=Rmax/Rpeak\): \(M[EFF]_{USA}=0.75\), \(M[EFF]_{CHINA}=0.49\) that is China’s HPC mainframes produce half of the theoretical peak performance while the US ones produce 75%. Now we are ready to discriminate HPC mainframes of two leading countries applying logistic regression.
About logistic regression math see: https://en.wikipedia.org/wiki/Logistic_regression. \[\log{\frac{p}{1-p}}={b_0 + b_1x}\] or \[p=\frac{1}{1+e^{-(b_0+b_1x)}}\]
fit.glm <- glm(CAT~EFF,data = TOP500_USCH,family = binomial(link = "logit"))
# put the fitted values in the data.frame
fit.glm
##
## Call: glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = TOP500_USCH)
##
## Coefficients:
## (Intercept) EFF
## 14.63 -22.31
##
## Degrees of Freedom: 167 Total (i.e. Null); 166 Residual
## Null Deviance: 217.8
## Residual Deviance: 81.3 AIC: 85.3
TOP500_USCH$fitted.values <- fit.glm$fitted.values
pred <- predict(fit.glm, data.frame(EFF = TOP500_USCH$EFF), type = "link", se.fit = TRUE)
TOP500_USCH$fit <- pred$fit
TOP500_USCH$se.fit <- pred$se.fit
# CI for fitted values
TOP500_USCH <- within(TOP500_USCH, {
fit.lower = exp(fit - 1.96 * se.fit) / (1 + exp(fit - 1.96 * se.fit))
fit.upper = exp(fit + 1.96 * se.fit) / (1 + exp(fit + 1.96 * se.fit))
})
p <- ggplot(TOP500_USCH, aes(x = EFF, y = CAT))
# predicted curve and point-wise 95% CI
p <- p + geom_ribbon(aes(x = EFF, ymin = fit.lower, ymax = fit.upper), alpha = 0.2)
p <- p + geom_line(aes(x = EFF, y = fitted.values), color = "red")
# fitted values
p <- p + geom_point(aes(y = fitted.values), color = "red", size=2)
# observed values
p <- p + geom_point(size=2)
p <- p + labs(title = paste("Observed and predicted probability of HPC CHINA top500.org in 2019")) + xlab("EFF=Rmax/Rpeak") + ylab("Probability") + theme_bw()
print(p)
\[Prob(Country=CHINA)=\frac{1}{1+e^{-(13.76 -21.01\frac{Rmax}{Rpeak})}}\]
#coefficients
fit.glm$coefficients
## (Intercept) EFF
## 14.62884 -22.31337
#mean value of EFF=0.5 for CHINA
prob_ch <- predict(fit.glm, data.frame(EFF = 0.5), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_ch$fit,3)))
## [1] "Probability for CHINA = 0.97"
#mean value of EFF=0.75 for USA
prob_us <- predict(fit.glm, data.frame(EFF = 0.75), type = "response", se.fit = TRUE)
print(paste('Probability for CHINA =',round(prob_us$fit,3)))
## [1] "Probability for CHINA = 0.108"
MisClassError <- c()
N <- 3000
set.seed(12345)
for (i in 1:N) {
train <- sample_frac(TOP500_USCH,0.8,replace = F)
test <-TOP500_USCH %>% setdiff(train)
fit.glm <- glm(CAT~EFF,data = train,family = binomial(link = "logit"))
res_test <- predict.glm(fit.glm,test,type="response",se.fit = FALSE)
fitted.results <- ifelse(res_test > 0.5,1,0)
MisClassError[i] <- round(mean(fitted.results != test$CAT),3)
}
MCE <- 1-MisClassError
summary(MCE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.5450 0.8000 0.8570 0.8545 0.9090 1.0000
print(paste('Mean Classification Accuracy =',round(mean(MCE),3)))
## [1] "Mean Classification Accuracy = 0.854"
print(paste('95% Confidence interval for MCE'))
## [1] "95% Confidence interval for MCE"
print(paste('MCE <=',round(mean(MCE)+2*sd(MCE)/sqrt(N),3)))
## [1] "MCE <= 0.857"
print(paste('MCE >=',round(mean(MCE)-2*sd(MCE)/sqrt(N),3)))
## [1] "MCE >= 0.852"
ggplot(as.data.frame(1-MisClassError), aes(x=1-MisClassError)) +
geom_histogram(aes(y=..density..), bins = 10,colour="black", fill="green")+
geom_density(alpha=.2, fill="#FF6666") + theme_bw() + labs(title="True Classification Proportion") + xlab("Proportion") + ylab("Density")+
theme(plot.title = element_text(hjust = .5))
summary(fit.glm)
##
## Call:
## glm(formula = CAT ~ EFF, family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3047 -0.3800 0.2001 0.3675 1.9918
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 13.757 2.182 6.306 2.86e-10 ***
## EFF -21.014 3.390 -6.198 5.71e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 173.639 on 133 degrees of freedom
## Residual deviance: 72.228 on 132 degrees of freedom
## AIC: 76.228
##
## Number of Fisher Scoring iterations: 6
pR2(fit.glm)
## llh llhNull G2 McFadden r2ML r2CU
## -36.1139950 -86.8195896 101.4111892 0.5840340 0.5308346 0.7308498
anova(fit.glm, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: CAT
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 133 173.639
## EFF 1 101.41 132 72.228 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,train$CAT,boxp=FALSE,type="hist",col="gray",
xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC CHINA in TOP500")
points(train$EFF,fitted(fit.glm),pch=20)
grid()
fit.glm.2 <- glm(1-CAT~EFF,data = train ,family = binomial(link = "logit"))
x<-seq(from=0.1,to=1.0,by=0.1)
y=predict.glm(fit.glm.2,type = "response")
par(mfrow=c(1,1))
logi.hist.plot(train$EFF,1-train$CAT,boxp=FALSE,type="hist",col="gray",
xlabel = "Rmax/Rpeak",mainlabel = "Probability of HPC USA in TOP500")
points(train$EFF,fitted(fit.glm.2),pch=20)
grid()