이 문서는 SPEC.org에 공개된 CPU2017 Floating Point Rate 벤치마크 결과를 분석함
spec.org(http://www.spec.org)
library(dplyr)
library(httr)
library(tidyverse)
library(xml2)
library(rvest)
library(XML)
library(xlsx)
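R에서 웹에 있는 데이터를 가져오는 방법은 4단계를 거침: 1) GET()으로 페이지 요청, 2) read_html()로 HTML 파싱, 3) html_node()로 원하는 테이블 노드 선택, 4) html_table()로 데이터 프레임 변환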
# Download the SPEC CPU2017 floating-point rate results page and turn its
# result table into a data frame.
date <- Sys.Date() # run date; not referenced in the visible part of the script
fp <- GET(url = "https://www.spec.org/cpu2017/results/rfp2017.html")
# Abort on an HTTP error status instead of silently parsing an error page.
stop_for_status(fp)
dfp <- fp %>%
  read_html(encoding = "utf-8") %>%
  html_node(css = "#CFP2017_ratediv > table") %>%
  html_table(trim = TRUE, fill = TRUE)
# Short column names for the ten scraped columns. The page's own labels are
# still present as the first data row; there totalCores is "BaseCopies",
# coresPerChip is "EnabledCores" and chips is "EnabledChips".
names(dfp) <- c(
  "sponsor", "system", "totalCores", "coresPerChip", "chips",
  "threadPerCore", "baseResult", "peakResult", "baseEnergy", "peakEnergy"
)
제거할 문자열(시스템 이름 뒤에 붙는 결과 파일 링크 텍스트): "\n HTML | \n CSV | \n Text | \n PDF | \n PS | \n Config"
# Preview the first six rows of the raw scraped table.
head(x = dfp, n = 6L)
sponsor
1 Test Sponsor
2 ASUSTeK Computer Inc.
3 ASUSTeK Computer Inc.
4 ASUSTeK Computer Inc.
5 ASUSTeK Computer Inc.
6 ASUSTeK Computer Inc.
system
1 System Name
2 ASUS RS700-E9(Z11PP-D24) Server System (2.70 GHz, Intel Xeon Gold 6150)\n HTML | \n CSV | \n Text | \n PDF | \n PS | \n Config
3 ASUS RS700-E9(Z11PP-D24) Server System (2.10 GHz, Intel Xeon Platinum 8176)\n HTML | \n CSV | \n Text | \n PDF | \n PS | \n Config
4 ASUS RS700-E9(Z11PP-D24) Server System (2.70 GHz, Intel Xeon Gold 6150)\n HTML | \n CSV | \n Text | \n PDF | \n PS | \n Config
5 ASUS RS700-E9(Z11PP-D24) Server System (2.10 GHz, Intel Xeon Platinum 8176)\n HTML | \n CSV | \n Text | \n PDF | \n PS | \n Config
6 ASUS WS C621E SAGE Server System (2.50 GHz, Intel Xeon Platinum 8180)\n HTML | \n CSV | \n Text | \n PDF | \n PS | \n Config
totalCores coresPerChip chips threadPerCore baseResult peakResult
1 BaseCopies EnabledCores EnabledChips Threads/Core Base Peak
2 72 36 2 2 199 201
3 112 56 2 2 233 237
4 72 36 2 2 199 202
5 112 56 2 2 233 236
6 112 56 2 2 252 257
baseEnergy peakEnergy
1 Base Peak
2 -- --
3 -- --
4 -- --
5 -- --
6 -- --
# Strip the trailing link text ("HTML | CSV | Text | PDF | PS | Config") that
# was scraped together with each system name.
dfp$system <- str_remove_all(
  dfp$system,
  pattern = "\\n HTML \\| \n CSV \\| \n Text \\| \n PDF \\| \n PS \\| \n Config"
)
# Drop the row that repeats the table's header labels.
dfp <- filter(dfp, sponsor != "Test Sponsor")
# Keep systems whose name contains "Intel Xeon " followed by P, G, S or B
# (presumably Platinum/Gold/Silver/Bronze) or "AMD EPYC 7", ignoring case.
# The class is written "[PGSB]": inside brackets "|" is a literal pipe, not
# alternation, so "[P|G|S|B]" would also accept "Intel Xeon |".
dfp1 <- dfp[
  grep("Intel Xeon [PGSB]|AMD EPYC 7", dfp$system, ignore.case = TRUE),
]
head(dfp1)
sponsor
1 ASUSTeK Computer Inc.
2 ASUSTeK Computer Inc.
3 ASUSTeK Computer Inc.
4 ASUSTeK Computer Inc.
5 ASUSTeK Computer Inc.
6 ASUSTeK Computer Inc.
system
1 ASUS RS700-E9(Z11PP-D24) Server System (2.70 GHz, Intel Xeon Gold 6150)
2 ASUS RS700-E9(Z11PP-D24) Server System (2.10 GHz, Intel Xeon Platinum 8176)
3 ASUS RS700-E9(Z11PP-D24) Server System (2.70 GHz, Intel Xeon Gold 6150)
4 ASUS RS700-E9(Z11PP-D24) Server System (2.10 GHz, Intel Xeon Platinum 8176)
5 ASUS WS C621E SAGE Server System (2.50 GHz, Intel Xeon Platinum 8180)
6 ASUS RS720Q-E9(Z11PH-D12) Server System (2.70 GHz, Intel Xeon Gold 6150)
totalCores coresPerChip chips threadPerCore baseResult peakResult
1 72 36 2 2 199 201
2 112 56 2 2 233 237
3 72 36 2 2 199 202
4 112 56 2 2 233 236
5 112 56 2 2 252 257
6 72 36 2 2 205 209
baseEnergy peakEnergy
1 -- --
2 -- --
3 -- --
4 -- --
5 -- --
6 -- --
# Keep the first seven columns (sponsor through baseResult); the peak and
# energy columns are discarded.
dfp1 <- dfp1[, seq_len(7)]
# Per-column count of missing values in columns 3 to 7.
dfp1[, 3:7] %>%
  is.na() %>%
  summary()
totalCores coresPerChip chips threadPerCore
Mode :logical Mode :logical Mode :logical Mode :logical
FALSE:4828 FALSE:4828 FALSE:4828 FALSE:4828
baseResult
Mode :logical
FALSE:4828
# Remove rows whose base result is the literal text "NC" rather than a number.
dfp1 <- dfp1 %>%
  filter(baseResult != "NC")
‘list’ object cannot be coerced to type ‘double’ 에러를 방지하기 위해, as.numeric()을 적용하기 전에 unlist() 함수로 여러 열(리스트)을 하나의 벡터로 풀어줌
# Show the structure of dfp1 (column types and a few leading values).
str(object = dfp1)
'data.frame': 4819 obs. of 7 variables:
$ sponsor : chr "ASUSTeK Computer Inc." "ASUSTeK Computer Inc." "ASUSTeK Computer Inc." "ASUSTeK Computer Inc." ...
$ system : chr "ASUS RS700-E9(Z11PP-D24) Server System (2.70 GHz, Intel Xeon Gold 6150)" "ASUS RS700-E9(Z11PP-D24) Server System (2.10 GHz, Intel Xeon Platinum 8176)" "ASUS RS700-E9(Z11PP-D24) Server System (2.70 GHz, Intel Xeon Gold 6150)" "ASUS RS700-E9(Z11PP-D24) Server System (2.10 GHz, Intel Xeon Platinum 8176)" ...
$ totalCores : chr "72" "112" "72" "112" ...
$ coresPerChip : chr "36" "56" "36" "56" ...
$ chips : chr "2" "2" "2" "2" ...
$ threadPerCore: chr "2" "2" "2" "2" ...
$ baseResult : chr "199" "233" "199" "233" ...
# Convert columns 3 to 7 (totalCores through baseResult) from character to
# numeric. unlist() flattens them column by column and the assignment
# refills the same columns in the same order.
dfp1[, 3:7] <- as.numeric(unlist(dfp1[, 3:7]))
# Base score divided by the number of chips.
dfp1$resultPerChip <- dfp1$baseResult / dfp1$chips
# Overwrite the scraped coresPerChip with totalCores / chips.
dfp1$coresPerChip <- dfp1$totalCores / dfp1$chips
# First "d.dd" number in the system name (e.g. the 2.70 of "2.70 GHz");
# NA when the name contains no such number.
dfp1$freq <- str_extract(dfp1$system, pattern = "[0-9]\\.[0-9][0-9]")
# Processor model: "Intel Xeon <family> <4 digits>" with an optional one-letter
# suffix, or "AMD EPYC <4 alphanumerics>".
# - "[PBSG]" instead of "[P|B|S|G]": "|" inside brackets is a literal pipe.
# - "[A-Za-z]" instead of "[A-z]": the range A-z also covers [ \ ] ^ _ and `.
# - The optional suffix "[A-Za-z]?" replaces two otherwise identical
#   alternatives (with and without the suffix letter).
# For ASCII names "[[:alnum:]]" could equally be written "[A-Za-z0-9]".
pattern <- paste0(
  "Intel Xeon [PBSG][A-Za-z]{3,} [0-9]{4}[A-Za-z]?",
  "|AMD EPYC [[:alnum:]]{4}"
)
dfp1$proc <- str_extract(dfp1$system, pattern = pattern)
system 이름에서 freq(주파수)를 추출하지 못한 행은, 같은 proc을 가진 다른 행의 freq 값으로 채움. 이 작업은 R의 merge를 이용하는데, 순서는 다음과 같음
1) proc, freq 컬럼만 추출 -> pid
2) pid에서 freq가 NA인 행을 제거한 뒤, 'proc' 기준으로 중복행 제거
3) dfp1과 pid 테이블을 merge -> dfp2로 저장
4) 기존의 freq는 freq.x로 바뀌고, freq.y가 새로 생성됨. freq.x를 지우고 freq.y를 freq로 변경
# Lookup table holding the extracted frequency and processor model of every
# row (columns 9 and 10 of dfp1).
pid <- dfp1[, c("freq", "proc")]
# Number of rows for which no frequency could be extracted.
sum(is.na(pid[["freq"]]))
[1] 1103
# Keep only rows with a known frequency. Missing values are tested with
# is.na(); comparing against the string 'NA' only dropped them because the
# comparison itself evaluates to NA.
pid <- filter(pid, !is.na(freq))
# Number of rows whose processor model already appeared in an earlier row.
sum(duplicated(pid$proc))
[1] 3543
# Keep the first row for each processor model. A logical mask is used because
# pid[-which(...), ] returns zero rows when there are no duplicates
# (which() yields integer(0), and negating it still selects nothing).
pid <- pid[!duplicated(pid$proc), ]
# Attach the per-model frequency to every result row. Both tables contain a
# freq column, so the merge result carries freq.x (from dfp1) and freq.y
# (from pid).
dfp2 <- merge(dfp1, pid, by = "proc", all = TRUE)
names(dfp2)
[1] "proc" "sponsor" "system" "totalCores"
[5] "coresPerChip" "chips" "threadPerCore" "baseResult"
[9] "resultPerChip" "freq.x" "freq.y"
# Discard the original per-row frequency (column 10, freq.x) and let the
# per-model value freq.y take over the name freq.
dfp2$freq.x <- NULL
names(dfp2)[names(dfp2) == "freq.y"] <- "freq"
# Reorder the columns.
dfp2 <- dfp2[, c(
  "sponsor", "system", "proc", "freq", "totalCores",
  "coresPerChip", "chips", "threadPerCore", "baseResult", "resultPerChip"
)]
names(dfp2)
[1] "sponsor" "system" "proc" "freq"
[5] "totalCores" "coresPerChip" "chips" "threadPerCore"
[9] "baseResult" "resultPerChip"
ggplot2 경고: "Using size for a discrete variable is not advised." (이산형 변수를 size에 매핑하는 것은 권장되지 않음)
따라서 숫자를 범주형으로 만들고 싶을 때는 순서가 있는 범주형을 만드는 ordered() 함수를 이용
또한 여러 개의 컬럼에 대해 한 번에 적용하고 싶을 때는 lapply() 함수 이용
# Turn columns 4 to 8 (freq, totalCores, coresPerChip, chips, threadPerCore)
# into ordered factors.
dfp2[, 4:8] <- lapply(dfp2[, 4:8], ordered)
# Add the vendor column `com`, derived case-insensitively from the processor
# name.
dfp2 <- mutate(
  dfp2,
  com = case_when(
    str_detect(proc, "(?i)AMD") ~ "AMD",
    str_detect(proc, "(?i)Intel") ~ "Intel",
    TRUE ~ "ETC" # use TRUE ~ NA_character_ to fill with NA instead
  )
)
# Open a separate graphics window for the plot. windows() is only available
# in R for Windows, so other platforms fall back to dev.new().
if (.Platform$OS.type == "windows") {
  windows()
} else {
  dev.new()
}
# Jittered scatter of per-chip score by processor model, coloured by vendor
# and sized by cores per chip.
ggplot(dfp2) +
  aes(x = proc, y = resultPerChip, col = com, size = coresPerChip) +
  geom_jitter()
# Number of best results to keep per vendor.
n <- 10
# Sort all rows by per-chip score, highest first, then take the first n rows
# within each vendor. The result stays grouped by com.
top <- dfp2 %>%
  arrange(desc(resultPerChip)) %>%
  group_by(com) %>%
  slice(1:n)
# Per-chip score against cores per chip for the top results, with each point
# labelled by its processor model in parentheses.
passmark <- ggplot(top, aes(y = resultPerChip, colour = com)) +
  geom_jitter(aes(x = coresPerChip, size = coresPerChip)) +
  geom_text(
    aes(x = coresPerChip, label = paste("(", proc, ")")),
    check_overlap = TRUE,
    size = 2.5,
    vjust = -0.5,
    hjust = 0.5
  )
passmark
# Base plot of the per-chip score distribution by vendor. The object is named
# `c` (shadowing base::c as a variable) because the following plots reuse it.
c <- ggplot(
  data = top,
  mapping = aes(x = resultPerChip, colour = com, fill = com)
)
# Binned counts drawn as an area.
c + geom_area(stat = "bin")
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Density of the per-chip score per vendor, once with the kernel named
# explicitly and once with the default arguments.
c + geom_density(kernel = "gaussian")
c + geom_density()
# Per-chip score against chip count, with each point labelled by processor.
e <- ggplot(
  data = top,
  mapping = aes(x = chips, y = resultPerChip, colour = com)
)
e + geom_label(mapping = aes(label = proc), nudge_x = 1, nudge_y = 1)
# Descriptive statistics of the per-chip score for every vendor/processor
# pair. The output columns keep the names of the calls that produce them.
static <- dfp2 %>%
  group_by(com, proc) %>%
  summarise(
    `mean(resultPerChip)` = mean(resultPerChip),
    `min(resultPerChip)` = min(resultPerChip),
    `max(resultPerChip)` = max(resultPerChip),
    `sd(resultPerChip)` = sd(resultPerChip),
    `var(resultPerChip)` = var(resultPerChip)
  )
# Two-way analysis of variance of the per-chip score on frequency, cores per
# chip and their interaction. Both predictors were converted to ordered
# factors earlier, so they enter the model as categorical terms.
result <- aov(resultPerChip ~ freq+coresPerChip+freq:coresPerChip, data = dfp2)
# Print the ANOVA table.
summary(result)
Df Sum Sq Mean Sq F value Pr(>F)
freq 24 2469194 102883 860.09 <2e-16 ***
coresPerChip 32 3962784 123837 1035.26 <2e-16 ***
freq:coresPerChip 124 530788 4281 35.78 <2e-16 ***
Residuals 4638 554794 120
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1