library(dplyr)
library(httr) # for GET function
library(rvest) # for html_node
library(XML)
library(tidyverse)
library(stringi)
#원하는 내용 추출하기
date <- Sys.Date()
int <- GET(url = 'https://www.spec.org/cpu2017/results/rint2017.html')
fp <- GET(url = 'https://www.spec.org/cpu2017/results/rfp2017.html')
dint<- int %>%
read_html(encoding = 'utf-8') %>%
html_node(css = '#CINT2017_ratediv > table') %>% # css나 xpath로 못가져올 때는 html_nodes(res, "table")[[2]]로 시도해본다.
html_table(trim = TRUE, fill = TRUE)
dfp <-fp %>%
read_html(encoding = 'utf-8') %>%
html_node(css = '#CFP2017_ratediv > table') %>%
html_table(trim = T, fill = T)
# add metric column
dint$metric <- 'int'
dfp$metric <- 'fp'
# row bind
df <- rbind(dint, dfp)
df <- df[,c(11,1:10)]
# 컬럼 이름 변경
names(df)=c('metric', 'sponsor','system','totalCores','coresPerChip','chips','threadPerCore','baseResult','peakResult','baseEnergy','peakEnergy')
# garbage 행 제거
df$system<-str_remove_all(df$system,pattern = "\\n HTML \\| \\n CSV \\| \\n Text \\| \\n PDF \\| \\n PS \\| \\n Config")
df <-filter(df, sponsor!='Test Sponsor')
# write.csv(df, file = 'tbl4.csv') # 파일에 쓰고 싶을 때
df[,4:6] %>% sapply(table, useNA='ifany') # NA 포함해서 중복된 원소의 개수 파악
$totalCores
1 1016 104 112 1152 12 128 144 1536 16 160 168 176 1792 191 192
2 1 262 642 2 379 488 291 2 670 205 4 97 2 2 301
2 20 2016 2032 208 224 24 255 256 28 3040 3048 3072 32 320 336
40 26 1 1 84 245 107 2 74 14 1 1 2 1261 1 9
36 383 384 4 40 4064 415 416 44 447 448 48 508 512 52 56
15 4 38 302 605 2 4 13 2 7 45 965 2 2 7 318
576 6 63 64 72 768 8 80 88 896 96 992
2 114 2 1288 612 6 255 678 219 3 1055 1
$coresPerChip
0 1 10 1024 104 112 12 128 14 1536 16 168 18 192 2 20
2 2 26 2 91 266 293 124 10 2 1410 9 15 43 110 595
2048 208 22 224 24 256 26 28 288 32 36 384 4 40 44 448
2 17 2 53 949 7 2 318 2 1251 602 6 457 687 214 3
48 512 52 56 576 6 64 72 8 80 84 88 896 96
1042 2 260 623 2 309 480 300 542 207 4 102 2 338
$chips
1 16 2 24 3 32 4 6 8
1770 15 7586 2 4 8 2241 9 150
df[,2:6] %>% sapply(n_distinct) # 몇 개의 범주로 이루어졌는지 파악
sponsor system totalCores coresPerChip chips
35 5602 60 46 9
# System Column이 어떻게 이루어져 있는지 집중적으로 파악
df %>% nrow() # 11785
[1] 11785
df %>% filter(str_detect(df$system,'intel|Intel|AMD')) %>% nrow()
[1] 11744
df %>% filter(!str_detect(df$system,'intel|Intel|AMD')) %>% nrow()
[1] 41
df %>% filter(!str_detect(df$system,'intel|Intel|AMD')) %>% select(system)
system
1 PowerEdge R6515 (redacted)
2 ASUS RS500-E8-RS4 v2
3 ASUS RS500-E8-RS4 v2
4 ASUS RS500-E8-RS4 v2
5 Fujitsu SPARC M12-2S
6 Fujitsu SPARC M12-2S
7 Fujitsu SPARC M12-2S
8 Fujitsu SPARC M12-2S
9 Fujitsu SPARC M12-2
10 Fujitsu SPARC M12-1
11 Fujitsu SPARC M12-2S
12 Fujitsu SPARC M12-2S
13 Fujitsu SPARC M12-2S
14 IBM Power S924 (3.4 - 3.9 GHz, 24 core, SLES)
15 IBM Power E950 (3.4 - 3.8 GHz, 40 core, SLES)
16 Sun Fire V490
17 1-Chip VM with SPARC M7
18 Huawei TaiShan 200 Server (Model 2480) (2.6 GHz,Huawei Kunpeng 920 7260)
19 Huawei TaiShan 200 Server (Model 2280) (2.6 GHz, Huawei Kunpeng 920 7260)
20 SuperServer 7048R-C1R4+
21 SuperServer 2049U-TR4
22 Yadro Vesnin (3.32 GHz, 32 cores, RHEL 7.2)
23 Yadro Vesnin (2.92 GHz, 40 cores, RHEL 7.4)
24 PowerEdge R6515 (redacted)
25 Fujitsu SPARC M12-2S
26 Fujitsu SPARC M12-2S
27 Fujitsu SPARC M12-2S
28 Fujitsu SPARC M12-2S
29 Fujitsu SPARC M12-2
30 Fujitsu SPARC M12-1
31 Fujitsu SPARC M12-2S
32 Fujitsu SPARC M12-2S
33 Fujitsu SPARC M12-2S
34 Sun Fire V490
35 1-Chip VM with SPARC M7
36 Huawei TaiShan 200 Server (Model 2280) (2.6 GHz,Huawei Kunpeng 920 7260)
37 Huawei TaiShan 200 Server (Model 2480) (2.6 GHz,Huawei Kunpeng 920 7260)
38 Huawei TaiShan 200 Server (Model 2480) (2.6 GHz,Huawei Kunpeng 920 7260)
39 Huawei TaiShan 200 Server (Model 2280) (2.6 GHz,Huawei Kunpeng 920 7260)
40 Yadro Vesnin (3.32 GHz, 32 cores, RHEL 7.2)
41 Yadro Vesnin (2.92 GHz, 40 cores, RHEL 7.4)
df %>% filter(!str_detect(df$system,'(?i)intel|(?i)amd|SPARC|Power|Kunpeng')) %>% select(system)
system
1 ASUS RS500-E8-RS4 v2
2 ASUS RS500-E8-RS4 v2
3 ASUS RS500-E8-RS4 v2
4 Sun Fire V490
5 SuperServer 7048R-C1R4+
6 SuperServer 2049U-TR4
7 Yadro Vesnin (3.32 GHz, 32 cores, RHEL 7.2)
8 Yadro Vesnin (2.92 GHz, 40 cores, RHEL 7.4)
9 Sun Fire V490
10 Yadro Vesnin (3.32 GHz, 32 cores, RHEL 7.2)
11 Yadro Vesnin (2.92 GHz, 40 cores, RHEL 7.4)
# Intel, AMD, SPARC, Power, Kunpeng를 제외한 나머지는 11개 밖에 안되고, 전혀 모르는 업체니까 삭제
df <- df %>% filter(str_detect(df$system,'(?i)intel|(?i)amd|SPARC|Power|Kunpeng'))
df %>% nrow() # 11774(11개 뺀 나머지)
[1] 11774
df %>% filter(!str_detect(df$system, '(?i)ghz')) %>%
select(system) %>%
nrow() # 3102
[1] 3102
df %>% filter(str_detect(df$system, '(?i)ghz')) %>%
select(system) %>%
nrow() # 8672
[1] 8672
df %>% filter(str_detect(df$system, '[:digit:]\\.[:digit:]+[:space:]*(?i)ghz')) %>%
select(system) %>%
nrow() # 8672
[1] 8672
패턴 분석 [:digit:]\\.[:digit:]+[:space:]*(?i)ghz
1. [:digit:] : 1개의숫자
2. \. : 마침표. ‘.’
3. [:digit:]+ : 1개 이상의 숫자
4. [:space:]* : 0개 이상의 공백
5. (?i)ghz : 대소문자 구분없이 ghz
6. example : 2.5ghz, 2.678 GHz, 1.56GHz 등에서 숫자만 추출.
df <- df %>%
mutate( Clock=str_extract(df$system, '[:digit:]\\.[:digit:]+[:space:]*(?i)ghz'))
df$Clock <- str_remove(df$Clock, '[:space:]*(?i)ghz')
# Intel CPU 종류는 'Intel|Intel Celeron|Intel Pentium|Intel Core|(?i)intel xeon' 이게 다군.
df %>% filter(!str_detect(df$system,'Intel|Intel Celeron|Intel Pentium|Intel Core|(?i)intel xeon|(?i)amd|SPARC|Power|Kunpeng')) %>% select(system)
[1] system
<0 rows> (or 0-length row.names)
df <- df %>%
mutate(cpu1=str_extract(df$system, 'Intel.*|AMD.*|IBM Power.*|SPARC.*|Kunpeng.*'))
trimws()
1. \)| : )(오른쪽 괄호) 2. \\d\\.\\d+\\s*(?i)ghz| : 위의 [:digit:]\\.[:digit:]+[:space:]*(?i)ghz와 동일한 다른 표현
3. , : ,(쉼표)
4. CPU : CPU 문자
5. trimws() : 좌우 공백 제거 (=str_trim()과 동일)
df$cpu1 <- str_remove_all(df$cpu1, '\\)|\\d\\.\\d+\\s*(?i)ghz|,|CPU') %>%
trimws()
df <- df %>%
mutate(cpu_no=str_sub(df$cpu1, start = stri_locate_last(df$cpu1, regex = '\\s') %>% .[,1]))
df$cpu_no<-str_squish(df$cpu_no) # ' aaa bb ' to 'aaa bb'
[
’, 3): str_split으로 반환되는 값이 list이고, 3번째 원소들만 추출.# cpu_no에 32-Core, 64-Core 등 잘못 들어간 행 찾기
e_value <- which(str_detect(df$cpu_no, 'Core'))
# [1] 28 145 5631 5632 6030 6148 11437 11438
# 위의 행 번호의 cpu1에서 cpu_no를 추출하여 cpu_no에 집어 넣기
kkk <- df[e_value,]%>%
select(cpu1) %>%
unlist() %>% # select를 실행하면 list로 나오는데, vector로 변경
str_split('\\s+') %>%
sapply(`[`, 3)
# "7601" "7H12" "7742" "7742" "7601" "7H12" "7742" "7742"
df[e_value, 'cpu_no'] <- c(kkk)
패턴에 나와있는 cpu 종류를 제외하면 0 인 것으로 봐서 cpu 종류는 아래와 같음.
#### ‘Intel|Intel Celeron|Intel Pentium|Intel Core|(?i)intel xeon|(?i)amd|SPARC|Power|Kunpeng’
df %>% filter(!str_detect(df$system,'Intel|Intel Celeron|Intel Pentium|Intel Core|(?i)intel xeon|(?i)amd|SPARC|Power|Kunpeng')) %>% select(system)
[1] system
<0 rows> (or 0-length row.names)
# 또는 df %>% filter(str_detect(cpu1, "Intel")) %>% filter(!str_detect(cpu1, 'Intel Celeron|Intel Pentium|Intel Core|(?i)intel xeon'))
case_when을 쓸 때, 가장 작은 집합부터 큰 집합으로 기술할 것.
이렇게 하면 모든 집합을 표현이 가능함.
즉, Intel만 있는 것, Intel Xeon만 있는 것, Intel Xeon Gold 인 것, Intel Core 인 것 등등.
df <- df %>%
mutate(cpu=case_when(
str_detect(cpu1, 'AMD') ~ 'EPYC',
str_detect(cpu1, 'SPARC') ~ 'SPARC',
str_detect(cpu1, 'IBM Power') ~ 'Power',
str_detect(cpu1, 'Kunpeng') ~ 'Kunpeng',
str_detect(cpu1, 'Intel Celeron') ~ 'Celeron',
str_detect(cpu1, 'Intel Pentium') ~ 'Pentium',
str_detect(cpu1, 'Intel Core') ~ 'Core',
str_detect(cpu1, '(?i)Intel Xeon gold') ~ 'gold',
str_detect(cpu1, '(?i)Intel Xeon silver') ~ 'silver',
str_detect(cpu1, '(?i)Intel Xeon platinum') ~ 'platinum',
str_detect(cpu1, '(?i)Intel Xeon bronze') ~ 'bronze',
str_detect(cpu1, '(?i)Intel Xeon') ~ 'xeon',
str_detect(cpu1, '(?i)Intel') ~ 'intel',
TRUE~NA_character_ #NA로 채우로 싶으면: TRUE~NA_character_
))
df$cpu %>% table(useNA = 'ifany') # NA가 있으면 함께 테이블로 표현.
.
bronze Celeron Core EPYC gold intel Kunpeng Pentium
321 36 76 1155 5697 9 6 66
platinum Power silver SPARC xeon <NA>
2130 2 1462 20 792 2
df <- df %>%
mutate(Com=case_when(
str_detect(cpu1, 'AMD') ~ 'AMD',
str_detect(cpu1, 'SPARC') ~ 'SPARC',
str_detect(cpu1, 'IBM Power') ~ 'IBM',
str_detect(cpu1, 'Kunpeng') ~ 'Huawai',
TRUE~'Intel' #NA로 채우로 싶으면: TRUE~NA_character_
))
먼저 cpu_no에 잘못 들어간 값을 찾아내서 정확한 값을 넣고, NA 값을 평균으로 채움
df$Clock %>% table(useNA = 'ifany') # NA 포함 각 범주의 원소의 개수 파악
.
1.30 1.70 1.80 1.90 2.0 2.00 2.10 2.20 2.25 2.30 2.35 2.4 2.40 2.5 2.50 2.6
2 109 142 270 2 466 1425 941 53 572 34 6 608 4 534 8
2.60 2.70 2.8 2.80 2.9 2.90 3.0 3.00 3.10 3.2 3.20 3.3 3.30 3.4 3.40 3.50
573 573 3 168 3 195 9 321 135 5 265 1 172 1 207 153
3.60 3.70 3.8 3.80 3.9 3.90 4.00 <NA>
320 73 1 248 1 31 38 3102
df %>%
distinct(cpu_no, Clock, .keep_all = T) %>% # df에서 (cpu_no, Clock) 컬럼의 값이 유일한 행. .keep_all=T; 모든 열을 보여줌.
select(cpu1, cpu_no, Clock) %>%
filter(!is.na(Clock)) %>%
arrange(cpu_no) %>% # cpu_no 컬럼을 기준으로 정렬
filter(duplicated(cpu_no)|duplicated(cpu_no, fromLast = T)) # 중복된 행을 모두 찾고 싶을 때.
cpu1 cpu_no Clock
1 Intel Xeon Silver 4116 4116 2.10
2 Intel Xeon Silver 4116 4116 2.20
3 Intel Xeon Silver 4210R 4210R 2.40
4 Intel Xeon Silver 4210R 4210R 2.4
5 Intel Xeon Silver 4214R 4214R 2.40
6 Intel Xeon Silver 4214R 4214R 2.4
7 Intel Xeon Silver 4215 4215 2.50
8 Intel Xeon Silver 4215 4215 2.5
9 Intel Xeon Silver 4215R 4215R 3.20
10 Intel Xeon Silver 4215R 4215R 3.2
11 Intel Xeon Gold 5217 5217 3.00
12 Intel Xeon Gold 5217 5217 3.0
13 Intel Xeon Gold 6148 6148 2.4
14 Intel Xeon Gold 6148 6148 2.40
15 Intel Xeon Gold 6248R 6248R 3.00
16 Intel Xeon Gold 6248R 6248R 3.0
17 AMD EPYC 7261 7261 2.50
18 AMD EPYC 7261 7261 2.20
19 AMD EPYC 7262 7262 3.20
20 AMD EPYC 7262 7262 3.2
21 AMD EPYC 7272 7272 2.90
22 AMD Epyc 7272 AMD EPYC 7272 7272 2.9
23 AMD EPYC 7281 7281 2.10
24 AMD EPYC 7281 7281 2.70
25 AMD EPYC 7281 7281 2.20
26 AMD EPYC 7302 7302 3.00
27 AMD Epyc 7302 AMD EPYC 7302 7302 3.0
28 AMD EPYC 7402 7402 2.80
29 AMD Epyc 7402 AMD EPYC 7402 7402 2.8
30 AMD EPYC 7542 7542 2.90
31 AMD EPYC 7542 7542 2.9
32 AMD EPYC 7702 7702 2.00
33 AMD EPYC 7702 7702 2.0
34 AMD EPYC 7H12 7H12 2.60
35 AMD EPYC 7H12 64-Core 7H12 2.6
36 Intel Xeon Platinum 8158 8158 3.00
37 Intel Xeon Platinum 8158 8158 3.0
38 Intel Xeon E-2136 E-2136 3.30
39 Intel Xeon E-2136 E-2136 3.50
40 Intel Xeon E-2236 E-2236 3.40
41 Intel Xeon E-2236 E-2236 3.4
42 IBM Power S924 (3.4 - 24 core SLES SLES 3.9
43 IBM Power E950 (3.4 - 40 core SLES SLES 3.8
44 Intel Xeon E7-8890 v4 v4 2.20
45 Intel Xeon E5-2620 v4 v4 2.10
46 Intel Xeon E5-2699A v4 v4 2.40
47 Intel Xeon E3-1225 v6 v6 3.3
48 Intel Xeon E3-1220 v6 v6 3.0
49 Intel Xeon E3-1220 v6 v6 3.00
50 Intel Xeon E3-1225 v6 v6 3.30
51 Intel Xeon E3-1230 v6 v6 3.50
52 Intel Xeon E3-1240 v6 v6 3.70
53 Intel Xeon E3-1270 v6 v6 3.80
54 Intel Xeon E3-1280 v6 v6 3.90
# 위에서 찾은 cpu_no의 정확한 Clock을 찾아서 넣는다.
df$Clock <- case_when(
df$cpu_no=='4116' ~ 2.1,
df$cpu_no=='7281' ~ 2.1,
df$cpu_no=='7261' ~ 2.5,
TRUE ~ df$Clock)
Error: must be a double vector, not a character vector.
# 이제 Clock의 NA 값을 cpu_no 그룹 별로(group_by) 평균값을 넣는다.
df<-df %>%
group_by(cpu_no) %>%
mutate(Clock=case_when(
Clock=is.na(Clock) ~ mean(Clock, na.rm = T),
TRUE ~ Clock))
Warning: Problem with `mutate()` input `Clock`.
i argument is not numeric or logical: returning NA
i Input `Clock` is `case_when(...)`.
i The error occurred in group 1: cpu_no = "3104".
Warning in mean.default(Clock, na.rm = T): argument is not numeric or logical:
returning NA
Error: Problem with `mutate()` input `Clock`.
x must be a double vector, not a character vector.
i Input `Clock` is `case_when(...)`.
i The error occurred in group 1: cpu_no = "3104".
df[,4:12] <- df[,4:12] %>% sapply(as.numeric)
Warning in lapply(X = X, FUN = FUN, ...): 강제형변환에 의해 생성된 NA 입니다
Warning in lapply(X = X, FUN = FUN, ...): 강제형변환에 의해 생성된 NA 입니다
Warning in lapply(X = X, FUN = FUN, ...): 강제형변환에 의해 생성된 NA 입니다
Warning in lapply(X = X, FUN = FUN, ...): 강제형변환에 의해 생성된 NA 입니다
df<-df %>%
group_by(cpu_no) %>%
mutate(baseResult=case_when(
baseResult==is.na(baseResult) ~ mean(baseResult, na.rm = T),
TRUE ~ baseResult))
df<-df %>%
group_by(cpu_no) %>%
mutate(baseEnergy=case_when(
baseEnergy==is.na(baseEnergy) ~ mean(baseEnergy, na.rm = T),
TRUE ~ baseEnergy))
df<-df %>%
group_by(cpu_no) %>%
mutate(peakEnergy=case_when(
peakEnergy==is.na(peakEnergy) ~ mean(peakEnergy, na.rm = T),
TRUE ~ peakEnergy))
# basePerCore, basePerChip 컬럼 생성(base_core, base_chip)
df <- df %>%
group_by(cpu_no) %>%
mutate(basePerCore=baseResult/totalCores,
basePerChip=baseResult/chips)
###데이터 탐색
library(ggplot2)
# cpu가 EPYC,platinum인 것의 평균값. 이 외에도 다양하게 변수를 바꾸어가며 시각화.
df %>%
filter(metric=='int' & cpu=='EPYC'|cpu=='platinum') %>%
group_by(Com, cpu_no) %>%
summarise(mean=mean(basePerChip, na.rm = T)) %>%
ggplot()+
aes(x = cpu_no, y = mean, col=Com, label=cpu_no )+
geom_jitter()+
theme(legend.position = 'none')+
#facet_grid(. ~ Com)+
geom_text(check_overlap = T, size=3.5, vjust=-0.5, hjust=0.5, angle=30)
df %>%
filter(metric=='fp' & cpu=='platinum'|cpu=='silver'|cpu=='gold'|cpu=='bronze') %>%
group_by(cpu_no) %>%
summarise(mean=mean(basePerChip, na.rm = T)) %>%
ggplot()+
aes(x = cpu_no, y = mean, col=cpu_no, label=cpu_no)+
geom_jitter()+
theme(legend.position = 'none')+
#facet_grid(. ~ Com)+
geom_text( size=3.5, vjust=-0.5, hjust=0.5, angle=30, check_overlap = T)+
labs(title = "Intel CPU Floating Point per chip(cpu Average)")+
theme(plot.title = element_text(hjust = 0.5))
df %>%
filter(metric=='fp' & cpu=='EPYC') %>%
group_by(cpu_no) %>%
summarise(mean=mean(basePerChip, na.rm = T)) %>%
ggplot()+
aes(x = cpu_no, y = mean, col=cpu_no, label=cpu_no)+
geom_jitter()+
theme(legend.position = 'none')+
#facet_grid(. ~ Com)+
geom_text( size=3.5, vjust=-0.5, hjust=0.5, angle=30, check_overlap = T)+
labs(title = "AMD EPYC CPU Floating Point per chip(cpu Average)")+
theme(plot.title = element_text(hjust = 0.5))