Import Libraries

library(ggplot2)
library(dplyr)
library(tidyr)
library(scales)
library(stringr)
library(DescTools)
library(lubridate)

Preview the first 6 rows of the data

# Work from the directory that holds the sample Apache access log.
setwd("/home/danny/Downloads/fyp 2016/apache-samples/access_log")
# The log has no header row, so read.table() assigns default names V1..V8.
rawdata <- read.table(file = "access_log.txt", header = FALSE)
# Show the first six records.
head(rawdata, n = 6L)
##             V1 V2 V3                    V4     V5
## 1 64.242.88.10  -  - [07/Mar/2004:16:05:49 -0800]
## 2 64.242.88.10  -  - [07/Mar/2004:16:06:51 -0800]
## 3 64.242.88.10  -  - [07/Mar/2004:16:10:02 -0800]
## 4 64.242.88.10  -  - [07/Mar/2004:16:11:58 -0800]
## 5 64.242.88.10  -  - [07/Mar/2004:16:20:55 -0800]
## 6 64.242.88.10  -  - [07/Mar/2004:16:23:12 -0800]
##                                                                                                V6
## 1  GET /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables HTTP/1.1
## 2                           GET /twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1.3&rev2=1.2 HTTP/1.1
## 3                                                       GET /mailman/listinfo/hsdivision HTTP/1.1
## 4                                                   GET /twiki/bin/view/TWiki/WikiSyntax HTTP/1.1
## 5                                                 GET /twiki/bin/view/Main/DCCAndPostFix HTTP/1.1
## 6 GET /twiki/bin/oops/TWiki/AppendixFileSystem?template=oopsmore&param1=1.12&param2=1.12 HTTP/1.1
##    V7    V8
## 1 401 12846
## 2 200  4523
## 3 200  6291
## 4 200  7352
## 5 200  5253
## 6 200 11382

dimensions of data

rows x cols

# Number of rows and columns in the freshly loaded log.
rawdata %>% dim()
## [1] 1546    8

drop columns 2 and 3

# V2 and V3 contain only "-" in the preview above, so discard them.
rawdata <- rawdata[-c(2, 3)]

cleaning column names

# Break the request field (V6) into its three space-separated parts.
rawdata <- rawdata %>%
  separate(V6, into = c("method", "resource", "protocol"), sep = " ")
## Warning: Too few values at 1 locations: 907
# Replace the remaining default V* names with descriptive ones
# (assigned by position, so the order here must match the column order).
colnames(rawdata) <- c(
  "ip", "timestamp", "timezone",
  "method", "resource", "protocol",
  "status_code", "bytes"
)
head(rawdata, n = 6L)
##             ip             timestamp timezone method
## 1 64.242.88.10 [07/Mar/2004:16:05:49   -0800]    GET
## 2 64.242.88.10 [07/Mar/2004:16:06:51   -0800]    GET
## 3 64.242.88.10 [07/Mar/2004:16:10:02   -0800]    GET
## 4 64.242.88.10 [07/Mar/2004:16:11:58   -0800]    GET
## 5 64.242.88.10 [07/Mar/2004:16:20:55   -0800]    GET
## 6 64.242.88.10 [07/Mar/2004:16:23:12   -0800]    GET
##                                                                             resource
## 1  /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables
## 2                           /twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1.3&rev2=1.2
## 3                                                       /mailman/listinfo/hsdivision
## 4                                                   /twiki/bin/view/TWiki/WikiSyntax
## 5                                                 /twiki/bin/view/Main/DCCAndPostFix
## 6 /twiki/bin/oops/TWiki/AppendixFileSystem?template=oopsmore&param1=1.12&param2=1.12
##   protocol status_code bytes
## 1 HTTP/1.1         401 12846
## 2 HTTP/1.1         200  4523
## 3 HTTP/1.1         200  6291
## 4 HTTP/1.1         200  7352
## 5 HTTP/1.1         200  5253
## 6 HTTP/1.1         200 11382

summary of data

# Per-column overview: level counts for factors, quantiles for numerics.
rawdata %>% summary()
##                                ip                      timestamp   
##  64.242.88.10                   :452   [08/Mar/2004:10:48:37:  12  
##  10.0.0.153                     :270   [08/Mar/2004:09:18:57:  11  
##  h24-71-236-129.ca.shawcable.net: 51   [08/Mar/2004:12:59:37:  11  
##  cr020r01-3.sac.overture.com    : 44   [08/Mar/2004:22:03:29:  11  
##  h24-70-69-74.ca.shawcable.net  : 32   [10/Mar/2004:08:36:58:  11  
##  market-mail.panduit.com        : 29   [12/Mar/2004:12:23:41:  11  
##  (Other)                        :668   (Other)              :1479  
##    timezone       method            resource           protocol        
##  -0800]:1546   Length:1546        Length:1546        Length:1546       
##                Class :character   Class :character   Class :character  
##                Mode  :character   Mode  :character   Mode  :character  
##                                                                        
##                                                                        
##                                                                        
##                                                                        
##   status_code        bytes     
##  Min.   :200.0   -      : 139  
##  1st Qu.:200.0   12846  :  94  
##  Median :200.0   209    :  35  
##  Mean   :226.4   10419  :  32  
##  3rd Qu.:200.0   2877   :  32  
##  Max.   :408.0   3169   :  31  
##                  (Other):1183
# Shape check after dropping two columns and splitting the request in three.
rawdata %>% dim()
## [1] 1546    8

cleaning timestamp

# read.table() split the bracketed timestamp at the space before the UTC
# offset; glue the two halves back into a single string.
rawdata[["timeStamp"]] <- paste0(rawdata[["timestamp"]], rawdata[["timezone"]])
# Columns 2 and 3 are the two halves just merged; leave them out of log1.
log1 <- rawdata[-c(2, 3)]
head(log1, n = 6L)
##             ip method
## 1 64.242.88.10    GET
## 2 64.242.88.10    GET
## 3 64.242.88.10    GET
## 4 64.242.88.10    GET
## 5 64.242.88.10    GET
## 6 64.242.88.10    GET
##                                                                             resource
## 1  /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables
## 2                           /twiki/bin/rdiff/TWiki/NewUserTemplate?rev1=1.3&rev2=1.2
## 3                                                       /mailman/listinfo/hsdivision
## 4                                                   /twiki/bin/view/TWiki/WikiSyntax
## 5                                                 /twiki/bin/view/Main/DCCAndPostFix
## 6 /twiki/bin/oops/TWiki/AppendixFileSystem?template=oopsmore&param1=1.12&param2=1.12
##   protocol status_code bytes                   timeStamp
## 1 HTTP/1.1         401 12846 [07/Mar/2004:16:05:49-0800]
## 2 HTTP/1.1         200  4523 [07/Mar/2004:16:06:51-0800]
## 3 HTTP/1.1         200  6291 [07/Mar/2004:16:10:02-0800]
## 4 HTTP/1.1         200  7352 [07/Mar/2004:16:11:58-0800]
## 5 HTTP/1.1         200  5253 [07/Mar/2004:16:20:55-0800]
## 6 HTTP/1.1         200 11382 [07/Mar/2004:16:23:12-0800]

method count

# Number of requests per HTTP method.
methodCount <- log1 %>%
  group_by(method) %>%
  summarise(count = n())

methodCount
## Source: local data frame [5 x 2]
## 
##    method count
##     (chr) (int)
## 1       -     1
## 2     GET  1525
## 3    HEAD     5
## 4 OPTIONS     1
## 5    POST    14

top 10 IP

# Rank client hosts by number of requests and keep the ten busiest.
# head(10) replaces the earlier `count >= 20` cut-off, which only produced
# ten rows because it had been tuned by hand to this particular log.
top10IP <- log1 %>%
  group_by(ip) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

top10IP
## Source: local data frame [10 x 2]
## 
##                                 ip count
##                             (fctr) (int)
## 1                     64.242.88.10   452
## 2                       10.0.0.153   270
## 3  h24-71-236-129.ca.shawcable.net    51
## 4      cr020r01-3.sac.overture.com    44
## 5    h24-70-69-74.ca.shawcable.net    32
## 6          market-mail.panduit.com    29
## 7            ts04-ip92.hevanet.com    28
## 8             mail.geovariances.fr    23
## 9     ip68-228-43-49.tc.ph.cox.net    22
## 10                  207.195.59.160    20

top 10 resource

# Rank requested resources by hit count and keep the ten most requested.
# head(10) replaces the earlier `count >= 18` cut-off, which only produced
# ten rows because it had been tuned by hand to this particular log.
top10RESOURCE <- log1 %>%
  group_by(resource) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  head(10)

top10RESOURCE
## Source: local data frame [10 x 2]
## 
##                                           resource count
##                                              (chr) (int)
## 1  /twiki/pub/TWiki/TWikiLogos/twikiRobot46x50.gif    64
## 2                                                /    47
## 3                     /twiki/bin/view/Main/WebHome    41
## 4                         /icons/gnu-head-tiny.jpg    37
## 5                               /icons/mailman.jpg    37
## 6                         /icons/PythonPowered.png    37
## 7                                     /favicon.ico    28
## 8                                      /robots.txt    27
## 9                                      /razor.html    26
## 10    /twiki/bin/view/Main/SpamAssassinTaggingOnly    18

status codes

# Number of responses per HTTP status code.
statusCode <- log1 %>%
  group_by(status_code) %>%
  summarise(codes = n())

statusCode
## Source: local data frame [6 x 2]
## 
##   status_code codes
##         (int) (int)
## 1         200  1274
## 2         302     6
## 3         304   137
## 4         401   123
## 5         404     5
## 6         408     1

using the lubridate package to standardize the time and convert it to the UTC timezone

# Parse the day/month/year h:m:s string (with its -0800 offset) into a
# date-time expressed in UTC.
log1[["time1"]] <- dmy_hms(log1[["timeStamp"]], tz = "UTC")
# Column 7 is the raw timeStamp string, now superseded by time1.
log1 <- log1[-7L]

mean bytes

# `bytes` was read as a factor because the log writes "-" for some responses
# (see the summary of rawdata). as.numeric() on a factor returns the internal
# level codes, not the byte counts, so convert through character first.
# Apache's %b field logs "-" instead of 0 when no bytes were sent, so "-" is
# mapped to 0 before the numeric conversion.
log1$bytes <- as.numeric(sub("^-$", "0", as.character(log1$bytes)))
# na.rm guards against any other non-numeric entry, which as.numeric() would
# have turned into NA (with a warning).
meanBytes <- log1 %>% summarise(mean = mean(bytes, na.rm = TRUE))

meanBytes
## NOTE: re-run to regenerate this output. The value recorded previously
## (mean 279.4276) was the mean of factor level codes, not of byte counts.

401 status code

See the HTTP specification (RFC 7235) for the meaning of status code 401 (Unauthorized).

# Requests that were answered with 401, keeping only the columns of interest.
code401 <- log1 %>%
  filter(status_code == 401) %>%
  select(ip, resource, status_code, time1)
head(code401, n = 6L)
##             ip
## 1 64.242.88.10
## 2 64.242.88.10
## 3 64.242.88.10
## 4 64.242.88.10
## 5 64.242.88.10
## 6 64.242.88.10
##                                                                               resource
## 1    /twiki/bin/edit/Main/Double_bounce_sender?topicparent=Main.ConfigurationVariables
## 2           /twiki/bin/edit/Main/Header_checks?topicparent=Main.ConfigurationVariables
## 3                                               /twiki/bin/attach/Main/OfficeLocations
## 4 /twiki/bin/edit/Main/Smtpd_etrn_restrictions?topicparent=Main.ConfigurationVariables
## 5                                               /twiki/bin/attach/Main/PostfixCommands
## 6      /twiki/bin/edit/Main/Flush_service_name?topicparent=Main.ConfigurationVariables
##   status_code               time1
## 1         401 2004-03-08 00:05:49
## 2         401 2004-03-08 00:29:16
## 3         401 2004-03-08 00:30:29
## 4         401 2004-03-08 00:33:53
## 5         401 2004-03-08 00:45:56
## 6         401 2004-03-08 00:52:35

ip 401

finding which IPs received 401 responses

# Count the 401 responses per client, most affected client first.
ip401 <- code401 %>%
  group_by(ip) %>%
  summarise(count = n()) %>%
  arrange(desc(count))

ip401
## Source: local data frame [7 x 2]
## 
##                                  ip count
##                              (fctr) (int)
## 1                      64.242.88.10   112
## 2       cr020r01-3.sac.overture.com     6
## 3                    195.246.13.119     1
## 4                    207.195.59.160     1
## 5           market-mail.panduit.com     1
## 6 p213.54.168.132.tisdip.tiscali.de     1
## 7               prxint-sxb3.e-i.net     1