Syslog raw 데이터 분석에 앞서 fault 데이터의 주요 내용을 살펴본다. 사용 대상 데이터의 내용은 아래와 같다.
## Source: local data frame [1,592 x 9]
##
## ttl host regr
## (fctr) (fctr) (fctr)
## 1 one-wlan-02(10.83.44.108) Ping Fail one-wlan-02 NSight
## 2 one-guest-01(10.83.44.101) Ping Fail one-guest-01 NSight
## 3 one-wlan-01(10.83.44.104) Ping Fail one-wlan-01 NSight
## 4 one-wired-01(10.83.44.103) Ping Fail one-wired-01 NSight
## 5 win-ng2qcf6eu4i(10.67.12.219) Ping Fail mirae-nswire-02 NSight
## 6 one-guest-02(10.83.44.105) Ping Fail one-guest-02 NSight
## 7 one-ipt-01(10.83.44.102) Ping Fail one-ipt-01 NSight
## 8 one-wired-02(10.83.44.107) Ping Fail one-wired-02 NSight
## 9 one-ipt-02(10.83.44.106) Ping Fail one-ipt-02 NSight
## 10 sdata113.lineobs(10.47.38.21) Ping Fail sdata113.lineobs NSight
## .. ... ... ...
## Variables not shown: faultymdt (time), type (fctr), cont (fctr), ip
## (fctr), faulthour (time), faultday (time)
### count by ip
# Number of fault rows per ip, sorted from most to least frequent.
# Rows whose ip is NA are counted as a group of their own.
fault %>%
  group_by(ip) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  print()
## Source: local data frame [525 x 2]
##
## ip count
## (fctr) (int)
## 1 NA 1029
## 2 10.112.128.141 7
## 3 10.101.4.57 3
## 4 10.114.200.146 3
## 5 10.114.43.219 3
## 6 10.115.111.83 3
## 7 10.97.66.227 3
## 8 10.101.48.204 2
## 9 10.101.49.202 2
## 10 10.101.53.178 2
## .. ... ...
### count by type
# Number of fault rows per fault type, most frequent type first.
# Optional extra summary column, disabled in the original:
#   regr = n_distinct(regr)
fault %>%
  group_by(type) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  print()
## Source: local data frame [14 x 2]
##
## type count
## (fctr) (int)
## 1 [서버] Disk Error 687
## 2 [서버] 기타 351
## 3 [서버] Ping Fail 244
## 4 [서버] NIC Error 92
## 5 [서버] Hang 81
## 6 [서버] Kernel Panic 28
## 7 [서버] Down 26
## 8 [서버] 자동 Rebooting 26
## 9 [서버] R/C Error 21
## 10 [서버] Memory Error 16
## 11 [기반시설] 설비문제 10
## 12 [서버] Power Supply Error 5
## 13 [서버] HW LED 점등 3
## 14 [서버] CPU Error 2
### count by regr
# Number of fault rows per regr value, most frequent first.
# Optional extra summary column, disabled in the original:
#   type = n_distinct(type)
fault %>%
  group_by(regr) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  print()
## Source: local data frame [31 x 2]
##
## regr count
## (fctr) (int)
## 1 SEPORTAL 676
## 2 NSight 563
## 3 김태훈 66
## 4 홍현기 58
## 5 이정민 33
## 6 김해랑 25
## 7 박하령 22
## 8 양시현 19
## 9 佐野裕 19
## 10 이준형 18
## .. ... ...
### count by day
# Number of fault rows per faultday, busiest day first.
fault %>%
  group_by(faultday) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  print()
## Source: local data frame [30 x 2]
##
## faultday count
## (time) (int)
## 1 2015-11-02 226
## 2 2015-11-16 187
## 3 2015-11-30 107
## 4 2015-11-23 96
## 5 2015-11-05 95
## 6 2015-11-24 94
## 7 2015-11-09 86
## 8 2015-11-19 78
## 9 2015-11-27 61
## 10 2015-11-12 58
## .. ... ...
## Source: local data frame [30 x 3]
##
## day syslog_count fault_count
## (time) (int) (int)
## 1 2015-11-01 14 12
## 2 2015-11-02 49 226
## 3 2015-11-03 77 50
## 4 2015-11-04 18 39
## 5 2015-11-05 30 95
## 6 2015-11-06 12 21
## 7 2015-11-07 3 5
## 8 2015-11-08 7 23
## 9 2015-11-09 61 86
## 10 2015-11-10 45 47
## .. ... ... ...
## ggplot lm
# Scatter plot of the daily syslog event count (x) against the daily fault
# count (y), with a red linear-model fit drawn on top of the points.
ggplot(byday, aes(x = syslog_count, y = fault_count)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "red") +
  labs(title = "Syslog event vs Fault by day - Linear Regression") +
  theme_bw()
회귀 값은 0.22 정도로 매우 낮지만, 그래도 참고 삼아 살펴본다.
## Source: local data frame [290 x 3]
##
## hour syslog_count fault_count
## (time) (int) (dbl)
## 1 2015-11-01 01:00:00 1 1
## 2 2015-11-01 02:00:00 1 0
## 3 2015-11-01 07:00:00 1 0
## 4 2015-11-01 09:00:00 1 0
## 5 2015-11-01 10:00:00 1 1
## 6 2015-11-01 11:00:00 2 3
## 7 2015-11-01 13:00:00 3 0
## 8 2015-11-01 16:00:00 1 2
## 9 2015-11-01 19:00:00 2 0
## 10 2015-11-01 23:00:00 1 0
## .. ... ... ...
시간 단위로 보면 날짜 단위 대비 더 낮은 값이 나온다.
## ggplot lm
# Scatter plot of the hourly syslog event count (x) against the hourly fault
# count (y), with a red linear-model fit drawn on top of the points.
ggplot(byhour, aes(x = syslog_count, y = fault_count)) +
  geom_point() +
  stat_smooth(method = "lm", colour = "red") +
  labs(title = "Syslog event vs Fault by hour- Linear Regression (2015.11)") +
  theme_bw()
## 시간 기준 syslog event count 와 fault count 비교
## tidy byhour
# Reshape byhour from wide to long: the syslog_count and fault_count columns
# are stacked into a `type` column (the source column name) and a `count`
# column (its value), giving one row per hour and count type.
byhour_tidy <- gather(
  byhour,
  key = type,
  value = count,
  syslog_count:fault_count
)
print(byhour_tidy)
## Source: local data frame [580 x 3]
##
## hour type count
## (time) (chr) (dbl)
## 1 2015-11-01 01:00:00 syslog_count 1
## 2 2015-11-01 02:00:00 syslog_count 1
## 3 2015-11-01 07:00:00 syslog_count 1
## 4 2015-11-01 09:00:00 syslog_count 1
## 5 2015-11-01 10:00:00 syslog_count 1
## 6 2015-11-01 11:00:00 syslog_count 2
## 7 2015-11-01 13:00:00 syslog_count 3
## 8 2015-11-01 16:00:00 syslog_count 1
## 9 2015-11-01 19:00:00 syslog_count 2
## 10 2015-11-01 23:00:00 syslog_count 1
## .. ... ... ...
# Bar chart over time: bars are positioned by hour, use the precomputed
# counts as their heights, and are filled by count type
# (syslog_count vs fault_count).
ggplot(byhour_tidy, aes(x = hour, y = count)) +
  geom_bar(aes(fill = type), stat = "identity") +
  labs(title = "Syslog event vs Fault by hour. (2015.11)") +
  theme_bw()
## 시간 + ip 기준 syslog event count 와 fault count 사이의 선형 회귀
### syslog count by hour + ip
# Count syslog events per (sysloghour, ip) for rows whose syslogday lies in
# [2015-11-01, 2015-12-01) and whose ip is not missing.
# The columns are renamed to hour / syslog_count so the result can be joined
# with the fault counts on ip + hour. The result stays grouped by hour.
byhourip_syslog <- syslog_event %>%
  filter(
    syslogday >= "2015-11-01",
    syslogday < "2015-12-01",
    !is.na(ip)
  ) %>%
  group_by(sysloghour, ip) %>%
  summarise(syslog_count = n()) %>%
  select(ip, hour = sysloghour, syslog_count)
print(byhourip_syslog)
## Source: local data frame [724 x 3]
## Groups: hour [290]
##
## ip hour syslog_count
## (fctr) (time) (int)
## 1 10.114.106.80 2015-11-01 01:00:00 1
## 2 10.114.62.226 2015-11-01 02:00:00 1
## 3 10.97.68.30 2015-11-01 07:00:00 1
## 4 10.114.26.149 2015-11-01 09:00:00 1
## 5 10.101.57.118 2015-11-01 10:00:00 1
## 6 10.25.149.153 2015-11-01 11:00:00 1
## 7 10.97.67.145 2015-11-01 11:00:00 1
## 8 10.113.216.115 2015-11-01 13:00:00 1
## 9 10.114.117.36 2015-11-01 13:00:00 1
## 10 10.99.125.77 2015-11-01 13:00:00 1
## .. ... ... ...
### fault count by hour + ip
# Count fault rows per (faulthour, ip), skipping rows whose ip is missing.
# The columns are renamed to hour / fault_count so the result can be joined
# with the syslog counts on ip + hour. The result stays grouped by hour.
byhourip_fault <- fault %>%
  filter(!is.na(ip)) %>%
  group_by(faulthour, ip) %>%
  summarise(fault_count = n()) %>%
  select(ip, hour = faulthour, fault_count)
print(byhourip_fault)
## Source: local data frame [552 x 3]
## Groups: hour [200]
##
## ip hour fault_count
## (fctr) (time) (int)
## 1 10.114.43.219 2015-11-01 01:00:00 1
## 2 10.114.37.214 2015-11-01 06:00:00 1
## 3 10.97.70.218 2015-11-01 08:00:00 1
## 4 10.114.7.21 2015-11-01 10:00:00 1
## 5 10.25.149.153 2015-11-01 11:00:00 1
## 6 10.97.70.41 2015-11-01 11:00:00 1
## 7 10.115.40.18 2015-11-01 16:00:00 1
## 8 10.99.125.77 2015-11-01 16:00:00 1
## 9 10.114.64.207 2015-11-01 17:00:00 1
## 10 10.97.23.86 2015-11-01 22:00:00 1
## .. ... ... ...
## join byhour
# Attach the fault counts to the syslog counts on ip + hour, then keep only
# the pairs that have a matching fault row (unmatched rows get an NA
# fault_count from the left join and are dropped).
# ip is a factor with different levels in the two inputs, so the join
# coerces it to character and emits a warning.
# Disabled alternative from the original: replace(is.na(.), 0), which would
# keep the unmatched syslog rows with a fault count of 0 instead.
byhourip <- left_join(
  byhourip_syslog,
  byhourip_fault,
  by = c("ip", "hour")
) %>%
  filter(!is.na(fault_count))
print(byhourip)
## Warning in left_join_impl(x, y, by$x, by$y): joining factors with different
## levels, coercing to character vector
## Source: local data frame [12 x 4]
## Groups: hour [12]
##
## ip hour syslog_count fault_count
## (chr) (time) (int) (int)
## 1 10.25.149.153 2015-11-01 11:00:00 1 1
## 2 10.99.125.77 2015-11-01 16:00:00 1 1
## 3 10.114.172.170 2015-11-03 14:00:00 1 1
## 4 10.114.43.219 2015-11-03 17:00:00 1 1
## 5 10.114.60.202 2015-11-09 11:00:00 1 1
## 6 10.97.66.227 2015-11-11 09:00:00 1 1
## 7 10.97.0.70 2015-11-12 13:00:00 1 1
## 8 10.114.45.213 2015-11-19 10:00:00 1 1
## 9 10.114.131.209 2015-11-25 11:00:00 1 1
## 10 10.97.68.111 2015-11-26 00:00:00 1 1
## 11 10.20.188.198 2015-11-27 06:00:00 1 1
## 12 10.99.86.164 2015-11-29 11:00:00 1 1
# Per-column summary of the joined ip + hour table.
byhourip %>%
  summary()
## ip hour syslog_count
## Length:12 Min. :2015-11-01 11:00:00 Min. :1
## Class :character 1st Qu.:2015-11-03 16:15:00 1st Qu.:1
## Mode :character Median :2015-11-11 23:00:00 Median :1
## Mean :2015-11-14 06:45:00 Mean :1
## 3rd Qu.:2015-11-25 14:15:00 3rd Qu.:1
## Max. :2015-11-29 11:00:00 Max. :1
## fault_count
## Min. :1
## 1st Qu.:1
## Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
# 1시간 범위 내에서 ip가 일치하는 건은 총 12건
## ggplot scatter
# Scatter plot of syslog_count (x) against fault_count (y) for the matched
# ip + hour pairs. Only points are drawn; no regression layer is added here.
ggplot(data = byhourip, mapping = aes(x = syslog_count, y = fault_count)) +
  geom_point()