library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
library(stringr)
library(DescTools)
library(lubridate)
# Load the access log and view its first six rows.
# Load the raw Apache access log. The file is addressed by its full path
# rather than via setwd(), so running the script does not change the
# session's working directory as a side effect. Adjust `log_dir` to wherever
# the sample logs live on your machine.
log_dir <- "/home/danny/Downloads/fyp 2016/apache-samples/access_log"
# header = FALSE: the log has no header row, so columns are named V1, V2, ...
rawdata <- read.table(file.path(log_dir, "access_log.txt"), header = FALSE)
# Preview the first six rows.
head(rawdata)
## V1 V2 V3 V4 V5
## 1 64.242.88.10 - - [07/Mar/2004:16:05:49 -0800]
## 2 64.242.88.10 - - [07/Mar/2004:16:06:51 -0800]
## 3 64.242.88.10 - - [07/Mar/2004:16:10:02 -0800]
## 4 64.242.88.10 - - [07/Mar/2004:16:11:58 -0800]
## 5 64.242.88.10 - - [07/Mar/2004:16:20:55 -0800]
## 6 64.242.88.10 - - [07/Mar/2004:16:23:12 -0800]
## V6
## 1 GET /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables HTTP/1.1
## 2 GET /twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1.3&rev2=1.2 HTTP/1.1
## 3 GET /mailman/listinfo/hsdivision HTTP/1.1
## 4 GET /twiki/bin/view/TWiki/WikiSyntax HTTP/1.1
## 5 GET /twiki/bin/view/Main/DCCAndPostFix HTTP/1.1
## 6 GET /twiki/bin/oops/TWiki/AppendixFileSystem?template=oopsmore&param1=1.12&param2=1.12 HTTP/1.1
## V7 V8
## 1 401 12846
## 2 200 4523
## 3 200 6291
## 4 200 7352
## 5 200 5253
## 6 200 11382
# Dimensions of the data (rows x columns).
# Report the size of the raw table as (rows, columns).
dim(rawdata)
## [1] 1546 8
# Discard the 2nd and 3rd columns (V2 and V3), which hold "-" in every
# previewed row.
rawdata <- rawdata[-c(2, 3)]
# Split the request field into its parts and clean up the column names.
# Split the request line (V6, e.g. "GET /path HTTP/1.1") into its three
# space-separated parts. Row 907 has fewer than three parts (the original run
# warned "Too few values at 1 locations: 907"), so fill = "right" pads the
# missing trailing fields with NA explicitly instead of relying on a warning.
rawdata <- separate(
  rawdata, V6,
  into = c("method", "resource", "protocol"),
  sep = " ",
  fill = "right"
)
# Replace the positional V* names with descriptive ones.
names(rawdata) <- c(
  "ip", "timestamp", "timezone", "method", "resource", "protocol",
  "status_code", "bytes"
)
head(rawdata)
## ip timestamp timezone method
## 1 64.242.88.10 [07/Mar/2004:16:05:49 -0800] GET
## 2 64.242.88.10 [07/Mar/2004:16:06:51 -0800] GET
## 3 64.242.88.10 [07/Mar/2004:16:10:02 -0800] GET
## 4 64.242.88.10 [07/Mar/2004:16:11:58 -0800] GET
## 5 64.242.88.10 [07/Mar/2004:16:20:55 -0800] GET
## 6 64.242.88.10 [07/Mar/2004:16:23:12 -0800] GET
## resource
## 1 /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables
## 2 /twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1.3&rev2=1.2
## 3 /mailman/listinfo/hsdivision
## 4 /twiki/bin/view/TWiki/WikiSyntax
## 5 /twiki/bin/view/Main/DCCAndPostFix
## 6 /twiki/bin/oops/TWiki/AppendixFileSystem?template=oopsmore&param1=1.12&param2=1.12
## protocol status_code bytes
## 1 HTTP/1.1 401 12846
## 2 HTTP/1.1 200 4523
## 3 HTTP/1.1 200 6291
## 4 HTTP/1.1 200 7352
## 5 HTTP/1.1 200 5253
## 6 HTTP/1.1 200 11382
# Per-column summary of the cleaned table. Note that the recorded output
# tabulates bytes as category counts (including 139 "-" entries) rather than
# as numeric quantiles, i.e. bytes is not numeric at this point.
summary(rawdata)
## ip timestamp
## 64.242.88.10 :452 [08/Mar/2004:10:48:37: 12
## 10.0.0.153 :270 [08/Mar/2004:09:18:57: 11
## h24-71-236-129.ca.shawcable.net: 51 [08/Mar/2004:12:59:37: 11
## cr020r01-3.sac.overture.com : 44 [08/Mar/2004:22:03:29: 11
## h24-70-69-74.ca.shawcable.net : 32 [10/Mar/2004:08:36:58: 11
## market-mail.panduit.com : 29 [12/Mar/2004:12:23:41: 11
## (Other) :668 (Other) :1479
## timezone method resource protocol
## -0800]:1546 Length:1546 Length:1546 Length:1546
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
## status_code bytes
## Min. :200.0 - : 139
## 1st Qu.:200.0 12846 : 94
## Median :200.0 209 : 35
## Mean :226.4 10419 : 32
## 3rd Qu.:200.0 2877 : 32
## Max. :408.0 3169 : 31
## (Other):1183
# Check the dimensions again after the split and rename.
dim(rawdata)
## [1] 1546 8
# Glue the two timestamp pieces back into a single string, e.g.
# "[07/Mar/2004:16:05:49-0800]".
rawdata[["timeStamp"]] <- paste0(
  rawdata[["timestamp"]],
  rawdata[["timezone"]]
)
# Working copy without the separate timestamp and timezone columns
# (positions 2 and 3), which the combined column now replaces.
log1 <- rawdata[-(2:3)]
head(log1)
## ip method
## 1 64.242.88.10 GET
## 2 64.242.88.10 GET
## 3 64.242.88.10 GET
## 4 64.242.88.10 GET
## 5 64.242.88.10 GET
## 6 64.242.88.10 GET
## resource
## 1 /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables
## 2 /twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1.3&rev2=1.2
## 3 /mailman/listinfo/hsdivision
## 4 /twiki/bin/view/TWiki/WikiSyntax
## 5 /twiki/bin/view/Main/DCCAndPostFix
## 6 /twiki/bin/oops/TWiki/AppendixFileSystem?template=oopsmore&param1=1.12&param2=1.12
## protocol status_code bytes timeStamp
## 1 HTTP/1.1 401 12846 [07/Mar/2004:16:05:49-0800]
## 2 HTTP/1.1 200 4523 [07/Mar/2004:16:06:51-0800]
## 3 HTTP/1.1 200 6291 [07/Mar/2004:16:10:02-0800]
## 4 HTTP/1.1 200 7352 [07/Mar/2004:16:11:58-0800]
## 5 HTTP/1.1 200 5253 [07/Mar/2004:16:20:55-0800]
## 6 HTTP/1.1 200 11382 [07/Mar/2004:16:23:12-0800]
# Number of requests per HTTP method.
methodCount <- log1 %>%
  group_by(method) %>%
  summarise(count = n())
methodCount
## Source: local data frame [5 x 2]
##
## method count
## (chr) (int)
## 1 - 1
## 2 GET 1525
## 3 HEAD 5
## 4 OPTIONS 1
## 5 POST 14
# The ten client hosts with the most requests, busiest first.
# Selection is by rank rather than by a count cutoff (such as count >= 20),
# because a fixed cutoff returns exactly ten rows only for one particular log.
top10IP <- log1 %>%
  group_by(ip) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)
top10IP
## Source: local data frame [10 x 2]
##
## ip count
## (fctr) (int)
## 1 64.242.88.10 452
## 2 10.0.0.153 270
## 3 h24-71-236-129.ca.shawcable.net 51
## 4 cr020r01-3.sac.overture.com 44
## 5 h24-70-69-74.ca.shawcable.net 32
## 6 market-mail.panduit.com 29
## 7 ts04-ip92.hevanet.com 28
## 8 mail.geovariances.fr 23
## 9 ip68-228-43-49.tc.ph.cox.net 22
## 10 207.195.59.160 20
# The ten most frequently requested resources, most requested first.
# Selection is by rank rather than by a count cutoff (such as count >= 18),
# because a fixed cutoff returns exactly ten rows only for one particular log.
top10RESOURCE <- log1 %>%
  group_by(resource) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)
top10RESOURCE
## Source: local data frame [10 x 2]
##
## resource count
## (chr) (int)
## 1 /twiki/pub/TWiki/TWikiLogos/twikiRobot46x50.gif 64
## 2 / 47
## 3 /twiki/bin/view/Main/WebHome 41
## 4 /icons/gnu-head-tiny.jpg 37
## 5 /icons/mailman.jpg 37
## 6 /icons/PythonPowered.png 37
## 7 /favicon.ico 28
## 8 /robots.txt 27
## 9 /razor.html 26
## 10 /twiki/bin/view/Main/SpamAssassinTaggingOnly 18
# Number of responses per HTTP status code.
statusCode <- log1 %>%
  group_by(status_code) %>%
  summarise(codes = n())
statusCode
## Source: local data frame [6 x 2]
##
## status_code codes
## (int) (int)
## 1 200 1274
## 2 302 6
## 3 304 137
## 4 401 123
## 5 404 5
## 6 408 1
# Parse the bracketed Apache timestamp (e.g. "[07/Mar/2004:16:05:49-0800]")
# into a date-time expressed in UTC; the -0800 offset is applied, so 16:05:49
# local is stored as 2004-03-08 00:05:49.
log1$time1 <- dmy_hms(log1$timeStamp, tz = "UTC")
# Drop the raw text timestamp by name rather than by position, so the step
# keeps working if columns are added or reordered.
log1$timeStamp <- NULL
# Convert bytes to numbers via character. Calling as.numeric() directly on a
# factor returns the internal level codes, not the byte counts. In Common Log
# Format a "-" in the size field means no bytes were sent, so it becomes 0.
bytes_text <- as.character(log1$bytes)
bytes_text[bytes_text == "-"] <- "0"
log1$bytes <- as.numeric(bytes_text)
# Mean response size in bytes across all requests.
meanBytes <- log1 %>%
  summarise(mean = mean(bytes))
meanBytes
## (output not re-recorded: the value previously shown here, 279.4276, was the
## mean of factor level codes rather than of byte counts)
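# Status code 401 is HTTP "Unauthorized": the request lacked valid authentication credentials (the original "HERE" link to a reference was lost).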
# Requests answered with status 401, reduced to host, resource, status and
# parsed time.
code401 <- log1 %>%
  filter(status_code == 401) %>%
  select(ip, resource, status_code, time1)
head(code401)
## ip
## 1 64.242.88.10
## 2 64.242.88.10
## 3 64.242.88.10
## 4 64.242.88.10
## 5 64.242.88.10
## 6 64.242.88.10
## resource
## 1 /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables
## 2 /twiki/bin/edit/Main/Header_checks?topicparent=Main.ConfigurationVariables
## 3 /twiki/bin/attach/Main/OfficeLocations
## 4 /twiki/bin/edit/Main/Smtpd_etrn_restrictions?topicparent=Main.ConfigurationVariables
## 5 /twiki/bin/attach/Main/PostfixCommands
## 6 /twiki/bin/edit/Main/Flush_service_name?topicparent=Main.ConfigurationVariables
## status_code time1
## 1 401 2004-03-08 00:05:49
## 2 401 2004-03-08 00:29:16
## 3 401 2004-03-08 00:30:29
## 4 401 2004-03-08 00:33:53
## 5 401 2004-03-08 00:45:56
## 6 401 2004-03-08 00:52:35
# Find which IPs received 401 responses.
# Count the 401 responses per client host, most affected host first.
ip401 <- code401 %>%
  group_by(ip) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
ip401
## Source: local data frame [7 x 2]
##
## ip count
## (fctr) (int)
## 1 64.242.88.10 112
## 2 cr020r01-3.sac.overture.com 6
## 3 195.246.13.119 1
## 4 207.195.59.160 1
## 5 market-mail.panduit.com 1
## 6 p213.54.168.132.tisdip.tiscali.de 1
## 7 prxint-sxb3.e-i.net 1