I had an issue changing the working directory: reading files failed with errors such as Error in file(file, “rt”) : cannot open the connection, and when I attempted to set the working directory using setwd(“E:/School/Summer 2021/Security and Data Governance/Handout/Handout 6/book/ch04/data”), R issued another error saying that the working directory could not be changed in the chunk.
I fixed this by going to Session in the menu and setting the working directory there.
# set working directory to chapter location
# (change for where you set up files in ch 2)
#getwd()
#setwd(file.path("E:", "School", "Summer 2021", "Security and Data Governance", "Handout","Handout 6","book","ch04"))
#setwd("~/book/ch04")
# make sure the packages for this chapter
# are installed, install if necessary
pkg <- c("bitops", "ggplot2", "mapproj", "stringr", "maps",
"grid", "gridExtra", "RColorBrewer", "igraph",
"colorspace", "scales", "stringr", "reshape2")
new.pkg <- pkg[!(pkg %in% installed.packages())]
if (length(new.pkg)) {
install.packages(new.pkg)
}
# requires packages: bitops
library(bitops) # load the bitops functions
# Define functions for converting IP addresses to/from integers
# take an IP address string in dotted octets (e.g. "192.168.0.1")
# and convert it to a 32-bit long integer (e.g. 3232235521)
ip2long <- function(ip) {
  # break the dotted-quad string apart on literal "." and parse each piece
  octets <- as.integer(unlist(strsplit(ip, ".", fixed = TRUE)))
  # fold left to right: shift the running value up one byte, then OR in
  # the next octet
  Reduce(function(acc, nxt) bitOr(bitShiftL(acc, 8), nxt), octets)
}
# take an 32-bit integer IP address (e.g. 3232235521)
# and convert it to a (e.g. "192.168.0.1").
long2ip <- function(longip) {
  # bit offsets of the four octets, most significant first
  shifts <- c(24, 16, 8, 0)
  # shift each octet down to the low byte and mask off everything above it
  octets <- lapply(shifts, function(s) bitAnd(bitShiftR(longip, s), 0xFF))
  # paste converts the values to character and joins them with dots
  paste(octets, sep = "", collapse = ".")
}
#Test the functionality by reviewing the output of the following code
long2ip(ip2long("192.168.0.0"))
[1] "192.168.0.0"
#Test the functionality by reviewing the output of the following code
long2ip(ip2long("192.168.100.6"))
[1] "192.168.100.6"
# Define function to test for IP CIDR membership
# take an IP address (string) and a CIDR (string) and
# return whether the given IP address is in the CIDR range
ip.is.in.cidr <- function(ip, cidr) {
  # Test whether a dotted-quad IPv4 address lies inside a CIDR block.
  #   ip:   address string, e.g. "10.0.1.15"
  #   cidr: block string "<address>/<prefix length>", e.g. "10.0.1.0/24"
  # Returns TRUE when ip and the block's address agree on the leading
  # <prefix length> bits, FALSE otherwise. Stops with an error when cidr
  # is not of the form "<address>/<n>" with n between 0 and 32.
  cidr.parts <- unlist(strsplit(cidr, "/", fixed = TRUE))
  prefix.len <- suppressWarnings(as.integer(cidr.parts[2]))
  if (length(cidr.parts) != 2 || is.na(prefix.len) ||
      prefix.len < 0 || prefix.len > 32) {
    stop("cidr must look like 'a.b.c.d/n' with n between 0 and 32: ", cidr,
         call. = FALSE)
  }
  # A /0 block covers every address, so answer directly.
  # NOTE(review): building the mask for /0 would need a shift by 32 bits;
  # bitShiftL is not expected to produce an all-zero mask for that shift
  # count -- confirm against the bitops documentation.
  if (prefix.len == 0) {
    return(TRUE)
  }
  long.ip <- ip2long(ip)
  cidr.range <- ip2long(cidr.parts[1])
  # all 32 bits set, shifted left so only the leading prefix.len bits remain
  cidr.mask <- bitShiftL(bitFlip(0), 32 - prefix.len)
  bitAnd(long.ip, cidr.mask) == bitAnd(cidr.range, cidr.mask)
}
# Let us know that the IP falls within a certain range, in this case it does, comes back as TRUE.
ip.is.in.cidr("10.0.1.15","10.0.1.3/24")
[1] TRUE
# Let us know that the IP falls within a certain range, in this case it does not, comes back as FALSE.
ip.is.in.cidr("10.0.1.15","10.0.2.255/24")
[1] FALSE
Here I received an error in strsplit. I put the whole path in the variable avRep, and that cleared the problem.
# R code to extract longitude/latitude pairs from AlienVault data
# read in the AlienVault reputation data
avRep <- "E:/School/Summer 2021/Security and Data Governance/Handout/Handout 6/book/ch04/data/reputation.data"
av.df <- read.csv(avRep, sep="#", header=FALSE)
colnames(av.df) <- c("IP", "Reliability", "Risk", "Type",
"Country", "Locale", "Coords", "x")
# create a vector of lat/long data by splitting on ","
av.coords.vec <- unlist(strsplit(as.character(av.df$Coords), ","))
# convert the vector in a 2-column matrix
av.coords.mat <- matrix(av.coords.vec, ncol=2, byrow=TRUE)
# project into a data frame
av.coords.df <- as.data.frame(av.coords.mat)
# name the columns
colnames(av.coords.df) <- c("lat","long")
# convert the characters to numeric values
av.coords.df$long <- as.double(as.character(av.coords.df$long))
av.coords.df$lat <- as.double(as.character(av.coords.df$lat))
# requires packages: ggplot2, maps, RColorBrewer, scales
# requires object: av.coords.df (4-3)
# R code to extract longitude/latitude pairs from AlienVault data
# need plotting and mapping functions plus colors
library(ggplot2)
library(maps)
library(RColorBrewer)
library(scales)
Which of the libraries listed above was previously used in this course? # ggplot2
# extract a color palette from the RColorBrewer package
set2 <- brewer.pal(8,"Set2")
# extract the polygon information for the world map, minus Antarctica
world <- map_data('world')
world <- subset(world, region != "Antarctica")
Do you see the utility of the code run above? Please explain. # Yes. # extract the polygon information for the world map, minus Antarctica # > world <- map_data(‘world’) # > world <- subset(world, region != “Antarctica”) # The first line creates a variable world holding the polygon data that map_data returns for the “world” map (from the maps package). # The second line drops Antarctica from that world map data.
# plot the map with the points marking lat/lon of the geocoded entries
# plotting ~200K takes a bit of time
c
function (...) .Primitive("c")
Make comments about the output as well as the syntax used in the previous task.
This allows us to see each IP address at its physical location. The map shows a high density of points on the eastern side of the United States and some in California; we also see the same for Europe and on the coast of Asia.
Created a variable gg to hold the function of ggplot # gg <- ggplot()
Here it is gather the long and groups the plots by use longitude and latitude and group the points filling the inside with white using the geom_polygon function and ggplot and storing in gg. # gg <- gg + geom_polygon(data=world, aes(long, lat, group=group), # fill=“white”) gg variable is going plot the point of the data frame setting the color, the size. # gg <- gg + geom_point(data=av.coords.df, aes(x=long, y=lat),
# color=set2[2], size=1, alpha=0.1) # gg <- gg + labs(x="“, y=”") gg variable is setting the theme and backgroud of the map. # gg <- gg + theme(panel.background=element_rect(fill=alpha(set2[3],0.2), # colour=‘white’)) Displays the gg map. # gg
I had an issue with the directory again; I just added the path from above and ran it.
# requires packages: stringr
# requires object: av.df (4-3)
# R code to incporporate IANA IPv4 allocations
# retrieve IANA prefix list
library(stringr)
ianaURL <- "http://www.iana.org/assignments/ipv4-address-space/ipv4-address-space.csv"
ianaData <- "E:/School/Summer 2021/Security and Data Governance/Handout/Handout 6/book/ch04/data/ipv4-address-space.csv"
if (file.access(ianaData)) {
download.file(ianaURL, ianaData)
}
# read in the IANA table
iana <- read.csv(ianaData)
# clean up the iana prefix since it uses the old/BSD-
# number formatting (i.e. allows leading zeroes and
# we do not need to know the CIDR component.
iana$Prefix <- sub("^(00|0)", "", iana$Prefix, perl=TRUE)
iana$Prefix <- sub("/8$", "", iana$Prefix, perl=TRUE)
# define function to strip 'n' characters from a string
# (character vector) and return the shortened string.
# note that this function is 'vectorized' (you can pass it a single
# string or a vector of them)
rstrip <- function(x, n) {
  # position of the last character to keep, computed per element of x
  keep <- nchar(x) - n
  substr(x, start = 1, stop = keep)
}
# extract just the prefix from the AlienVault list
av.IP.prefix <- rstrip(str_extract(as.character(av.df$IP),
"^([0-9]+)\\."), 1)
# there are faster ways than 'sapply()' but we wanted you to
# see the general "apply" pattern in action as you will use it
# quite a bit throughout your work in R
av.df$Designation <- sapply(av.IP.prefix, function(ip) {
iana[iana$Prefix == ip, ]$Designation
})
# summarize, order & review the findings
summary(factor(av.df$Designation))
Administered by AFRINIC Administered by APNIC Administered by ARIN Administered by RIPE NCC AFRINIC
322 2615 17974 5893 1896
APNIC ARIN AT&T Bell Laboratories Digital Equipment Corporation Hewlett-Packard Company
93776 42358 24 1 3
LACNIC Level 3 Communications, Inc. PSINet, Inc. RIPE NCC
18914 31 30 74789
# requires packages: ggplot2, maps, RColorBrewer
# requires object: av.coords.df (4-3), iana (4-5)
# Code to extract IANA block assignments & compare w/AlienVault groups
# create a new data frame from the iana designation factors
iana.df <- data.frame(table(iana$Designation))
colnames(iana.df) <- c("Registry", "IANA.Block.Count")
# make a data frame of the counts of the av iana
# designation factor
tmp.df <- data.frame(table(factor(av.df$Designation)))
colnames(tmp.df) <- c("Registry", "AlienVault.IANA.Count")
# merge (join) the data frames on the "reg" column
combined.df <- merge(iana.df, tmp.df)
print(combined.df[with(combined.df, order(-IANA.Block.Count)),],
row.names=FALSE)
# requires packages: reshape, grid, gridExtra, ggplot2, RColorBrewer
# requires object: combined.df (4-6), set2 (4-4)
# generates Figure 4-3
# plot charts from IANA data
# flatten the data frame by making one entry per "count" type
# versus having the counts in individual columns
# need the 'melt()' function from the reshape package
# to transform the data frame shape
library(reshape2)
library(grid)
library(gridExtra)
# normalize the IANA and AV values to % so bar chart scales
# match and make it easier to compare
combined.df$IANA.pct <- 100 * (combined.df$IANA.Block.Count /
sum(combined.df$IANA.Block.Count))
combined.df$AV.pct <- 100 * (combined.df$AlienVault.IANA.Count /
sum(combined.df$AlienVault.IANA.Count))
combined.df$IANA.vs.AV.pct <- combined.df$IANA.pct - combined.df$AV.pct
melted.df <- melt(combined.df)
Using Registry as id variables
# plot the new melted data frame values
gg1 <- ggplot(data=melted.df[melted.df$variable=="IANA.pct",],
aes(x=reorder(Registry, -value), y=value))
# set min/max for axis so scale is same for both charts
gg1 <- gg1 + ylim(0,40)
gg1 <- gg1 + geom_bar(stat="identity", fill=set2[3]) # using bars
# make a better label for the y axis
gg1 <- gg1 + labs(x="Registry", y="%", title="IANA %")
# make bar chart horizontal
gg1 <- gg1 + coord_flip()
# rotate the x-axis labels and remove the legend
gg1 <- gg1 + theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.background = element_blank(),
legend.position = "none")
gg1

gg2 <- ggplot(data=melted.df[melted.df$variable=="AV.pct",],
aes(x=reorder(Registry,-value), y=value))
gg2 <- gg2 + geom_bar(stat="identity", fill=set2[4]) # using bars
gg2 <- gg2 + ylim(0,40)
gg2 <- gg2 + labs(x="Registry", y="%", title="AlienVault IANA %")
gg2 <- gg2 + coord_flip()
gg2 <- gg2 + theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.background = element_blank(),
legend.position = "none")
gg2

# grid.arrange makes it possible to do very precise placement of
# multiple ggplot objects
grid.arrange(gg1, gg2, ncol=1, nrow=2)

I would like you to explain the output as well as the syntax.In addition, make comments about the possibility to redefine the functions gg1 and gg2 in such a way that the y-axis shows a more clear description of the problem.
The output shows that there is variation, but a few larger blocks, such as RIPE NCC, Administered by ARIN and LACNIC, contribute the majority of malicious hosts.
Plot the new melted data frame values
gg2 <- ggplot(data=melted.df[melted.df$variable==“AV.pct”,], aes(x=reorder(Registry,-value), y=value)) gg2 <- gg2 + geom_bar(stat=“identity”, fill=set2[4]) # using bars
Set the min/max for axis so scale is same for both charts
gg2 <- gg2 + ylim(0,40) gg2 <- gg2 + labs(x=“Registry”, y=“%”, title=“AlienVault IANA %”) gg2 <- gg2 + coord_flip()
Rotate the x-axis labels and remove the legend
gg2 <- gg2 + theme(axis.text.x = element_text(angle = 90, hjust = 1), panel.background = element_blank(), legend.position = “none”) gg2 # Can deduce the same for AlienVaultIANA %, start with Administered by ARIN, LACNIC, ARIN, RIPE NCC APNIC contribute large blocks of malicious hosts.
# requires packages: ggplot2
# requires object: combined.df (4-7), set2 (4-4)
gg <- ggplot(data=combined.df,
aes(x=reorder(Registry, -IANA.Block.Count), y=AV.pct ))
gg <- gg + geom_bar(stat="identity", fill=set2[2])
gg <- gg + labs(x="Registry", y="Count",
title="AlienVault/IANA sorted by IANA (low-to-high")
gg <- gg + coord_flip()
gg <- gg + theme(axis.text.x = element_text(angle = 90, hjust = 1),
panel.background = element_blank(),
legend.position = "none")
gg

Explain the output as well as the syntax used in the task we just completed above.
The output tells us that the AlienVault population does gravitate towards the IANA blocks with the most allocations.
The code plots the AlienVault population per IANA block, sorted by IANA block size.
requires packages: ggplot2
requires object: combined.df (4-7), set2 (4-4)
gg <- ggplot(data=combined.df, aes(x=reorder(Registry, -IANA.Block.Count), y=AV.pct )) gg <- gg + geom_bar(stat=“identity”, fill=set2[2]) gg <- gg + labs(x=“Registry”, y=“Count”, title=“AlienVault/IANA sorted by IANA (low-to-high”) gg <- gg + coord_flip() gg <- gg + theme(axis.text.x = element_text(angle = 90, hjust = 1), panel.background = element_blank(), legend.position = “none”) gg
In order to fully understand the task we are performing in the query below as well as the output, we would need to run the code listed in Handout7. Handouts 6 and 7 are to be completed within the same time frame so that our findings and conclusions can be more solid.
# requires packages: ggplot2, scales
# requires object: combined.df (4-7), set2 (4-4)
# generates figure 4-6
library(scales)
gg <- ggplot(data=combined.df)
gg <- gg + geom_point(aes(x=IANA.Block.Count,
y=AlienVault.IANA.Count),
color=set2[2], size=4)
gg <- gg + labs(x="IANA Block Count", y="AlienVault IANA Count",
title="IANA ~ AlienVault")
gg <- gg + theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none")
gg <- gg + theme(panel.background=element_rect(fill=alpha(set2[3],0.2),
colour='white'))
gg

Explain the output as well as the syntax we included in the chunk above. Use Handout 7 and class discussion to support your answers. # The scatterplot is showing a positive correlation.
requires packages: ggplot2, scales
requires object: combined.df (4-7), set2 (4-4)
Creating a scatterplot by displaying the relationship between tow continous variables.
gg <- gg + geom_point(aes(x=IANA.Block.Count, y=AlienVault.IANA.Count), color=set2[2], size=4)
Creating labels for the scatterplot diagram.
gg <- gg + labs(x=“IANA Block Count”, y=“AlienVault IANA Count”, title=“IANA ~ AlienVault”)
Create a theme to customize the non-data components of the plots.
gg <- gg + theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = “none”) gg <- gg + theme(panel.background=element_rect(fill=alpha(set2[3],0.2), colour=‘white’))
This returns the correlation coefficient. The result falls close to +1, which indicates a strong positive monotonic (rank-order) relationship between the two variables, since the Spearman method is used.
cor(combined.df$IANA.Block.Count,
combined.df$AlienVault.IANA.Count, method="spearman")
[1] 0.9488598
Make comments about both the output and the syntax. # This returns the correlation coefficient. The result falls close to +1, which indicates a strong positive monotonic (rank-order) relationship between the two variables. # Spearman correlation was applied; it produces a rank correlation coefficient and is better suited to variables that do not have a normal distribution. The visual patterns show that larger blocks of networks contain more malicious hosts.
---
title: "Handout 6"
output: html_notebook
---

# I had an issue changing the working directory: reading files failed with errors such as Error in file(file, "rt") : cannot open the connection, and when I attempted to set the working directory using setwd("E:/School/Summer 2021/Security and Data Governance/Handout/Handout 6/book/ch04/data"), R issued another error saying that the working directory could not be changed in the chunk.
# I fixed this by going to Session in the menu and setting the working directory there.
```{r}
# set working directory to chapter location
# (change for where you set up files in ch 2)
#getwd()
#setwd(file.path("E:", "School", "Summer 2021", "Security and Data Governance", "Handout","Handout 6","book","ch04"))
#setwd("~/book/ch04")
# make sure the packages for this chapter
# are installed, install if necessary
pkg <- c("bitops", "ggplot2", "mapproj", "stringr", "maps",
         "grid", "gridExtra", "RColorBrewer", "igraph",
         "colorspace", "scales", "reshape2")
# installed.packages() returns a matrix with one row per installed package;
# compare against its row names (the package names) rather than against
# every cell of the matrix
new.pkg <- setdiff(pkg, rownames(installed.packages()))
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}
```


```{r}
# requires packages: bitops
library(bitops) # load the bitops functions

# Define functions for converting IP addresses to/from integers
# take an IP address string in dotted octets (e.g. "192.168.0.1")
# and convert it to a 32-bit long integer (e.g. 3232235521)
ip2long <- function(ip) {
  # break the dotted-quad string apart on literal "." and parse each piece
  octets <- as.integer(unlist(strsplit(ip, ".", fixed = TRUE)))
  # fold left to right: shift the running value up one byte, then OR in
  # the next octet
  Reduce(function(acc, nxt) bitOr(bitShiftL(acc, 8), nxt), octets)
}

```




```{r}
# take an 32-bit integer IP address (e.g. 3232235521)
# and convert it to a (e.g. "192.168.0.1").
long2ip <- function(longip) {
  # bit offsets of the four octets, most significant first
  shifts <- c(24, 16, 8, 0)
  # shift each octet down to the low byte and mask off everything above it
  octets <- lapply(shifts, function(s) bitAnd(bitShiftR(longip, s), 0xFF))
  # paste converts the values to character and joins them with dots
  paste(octets, sep = "", collapse = ".")
}
```



```{r}
#Test the functionality by reviewing the output of the following code
long2ip(ip2long("192.168.0.0"))
```



```{r}
#Test the functionality by reviewing the output of the following code
long2ip(ip2long("192.168.100.6"))
```



```{r}
# Define function to test for IP CIDR membership
# take an IP address (string) and a CIDR (string) and
# return whether the given IP address is in the CIDR range
ip.is.in.cidr <- function(ip, cidr) {
  # Test whether a dotted-quad IPv4 address lies inside a CIDR block.
  #   ip:   address string, e.g. "10.0.1.15"
  #   cidr: block string "<address>/<prefix length>", e.g. "10.0.1.0/24"
  # Returns TRUE when ip and the block's address agree on the leading
  # <prefix length> bits, FALSE otherwise. Stops with an error when cidr
  # is not of the form "<address>/<n>" with n between 0 and 32.
  cidr.parts <- unlist(strsplit(cidr, "/", fixed = TRUE))
  prefix.len <- suppressWarnings(as.integer(cidr.parts[2]))
  if (length(cidr.parts) != 2 || is.na(prefix.len) ||
      prefix.len < 0 || prefix.len > 32) {
    stop("cidr must look like 'a.b.c.d/n' with n between 0 and 32: ", cidr,
         call. = FALSE)
  }
  # A /0 block covers every address, so answer directly.
  # NOTE(review): building the mask for /0 would need a shift by 32 bits;
  # bitShiftL is not expected to produce an all-zero mask for that shift
  # count -- confirm against the bitops documentation.
  if (prefix.len == 0) {
    return(TRUE)
  }
  long.ip <- ip2long(ip)
  cidr.range <- ip2long(cidr.parts[1])
  # all 32 bits set, shifted left so only the leading prefix.len bits remain
  cidr.mask <- bitShiftL(bitFlip(0), 32 - prefix.len)
  bitAnd(long.ip, cidr.mask) == bitAnd(cidr.range, cidr.mask)
}
```


```{r}
# Let us know that the IP falls within a certain range, in this case it does, comes back as TRUE.
ip.is.in.cidr("10.0.1.15","10.0.1.3/24")
```

```{r}
# Let us know that the IP falls within a certain range, in this case it does not, comes back as FALSE.
ip.is.in.cidr("10.0.1.15","10.0.2.255/24")
```

# Here I received an error in strsplit. I put the whole path in the variable avRep, and that cleared the problem.
```{r}
# Load the AlienVault reputation file for the longitude/latitude extraction
# that follows. The file has no header row and uses "#" between fields,
# so the column names are attached right after reading.
avRep <- "E:/School/Summer 2021/Security and Data Governance/Handout/Handout 6/book/ch04/data/reputation.data"
av.df <- setNames(
  read.csv(avRep, sep="#", header=FALSE),
  c("IP", "Reliability", "Risk", "Type",
    "Country", "Locale", "Coords", "x")
)
```



```{r}
# create a vector of lat/long data by splitting on ","
av.coords.vec <- unlist(strsplit(as.character(av.df$Coords), ","))
```


```{r}
# convert the vector into a 2-column matrix
# byrow=TRUE fills one row per consecutive pair of values
# NOTE(review): assumes every Coords entry was split into exactly two
# pieces; otherwise the pairs would shift across rows -- TODO confirm
# against the data
av.coords.mat <- matrix(av.coords.vec, ncol=2, byrow=TRUE)
```



```{r}
# project into a data frame
av.coords.df <- as.data.frame(av.coords.mat)
```


```{r}
# name the columns 
colnames(av.coords.df) <- c("lat","long")
```


```{r}
# turn both coordinate columns from text into doubles
av.coords.df[c("long", "lat")] <- lapply(
  av.coords.df[c("long", "lat")],
  function(column) as.double(as.character(column))
)
```

```{r}
# requires packages: ggplot2, maps, RColorBrewer, scales
# requires object: av.coords.df (4-3)
# R code to extract longitude/latitude pairs from AlienVault data
# need plotting and mapping functions plus colors
library(ggplot2)
library(maps)
library(RColorBrewer)
library(scales)
```

Which of the libraries listed above was previously used in this course?
# ggplot2

```{r}
# extract a color palette from the RColorBrewer package
set2 <- brewer.pal(8,"Set2")
```

```{r}
# extract the polygon information for the world map, minus Antarctica
world <- map_data('world')
world <- subset(world, region != "Antarctica")
```

Do you see the utility of the code ran above? Please explain.
# Yes.
# extract the polygon information for the world map, minus Antarctica
# > world <- map_data('world')
# > world <- subset(world, region != "Antarctica")
# The first line creates a variable world holding the polygon data that map_data returns for the "world" map (from the maps package).
# The second line drops Antarctica from that world map data.
```{r}
# plot the map with the points marking lat/lon of the geocoded entries
# plotting ~200K takes a bit of time
# start from an empty plot and add the layers one at a time
gg <- ggplot()
# world outline: one white-filled polygon per group in the world data
gg <- gg + geom_polygon(data=world, aes(long, lat, group=group),
                        fill="white")
# one small, mostly transparent point per AlienVault coordinate pair
gg <- gg + geom_point(data=av.coords.df, aes(x=long, y=lat),
                      color=set2[2], size=1, alpha=0.1)
# blank out the axis titles
gg <- gg + labs(x="", y="")
# lightly tinted panel background with a white border
gg <- gg + theme(panel.background=element_rect(fill=alpha(set2[3],0.2),
                                               colour='white'))
gg
```


Make comments about the output as well as the syntax used in the previous task.

# This allows us to see each IP address at its physical location. The map shows a high density of points on the eastern side of the United States and some in California; we also see the same for Europe and on the coast of Asia.

Create a variable gg to hold the plot object returned by ggplot().
# gg <- ggplot()

Here geom_polygon draws the world map from the longitude and latitude columns, grouping the points by the group column and filling the polygons with white; the result is added to gg.
# gg <- gg + geom_polygon(data=world, aes(long, lat, group=group), 
#                         fill="white")
Next, geom_point plots the points of the av.coords.df data frame, setting their color, size and transparency.
# gg <- gg + geom_point(data=av.coords.df, aes(x=long, y=lat),  
#                       color=set2[2], size=1, alpha=0.1)
# gg <- gg + labs(x="", y="")
The gg variable then gets a theme that sets the background of the map.
# gg <- gg + theme(panel.background=element_rect(fill=alpha(set2[3],0.2), 
#                                               colour='white'))
Displays the gg map.
# gg

# I had an issue with the directory again; I just added the path from above and ran it.
```{r}
# requires packages: stringr
# requires object: av.df (4-3)
# R code to incporporate IANA IPv4 allocations
# retrieve IANA prefix list
library(stringr)

ianaURL <- "http://www.iana.org/assignments/ipv4-address-space/ipv4-address-space.csv"
ianaData <- "E:/School/Summer 2021/Security and Data Governance/Handout/Handout 6/book/ch04/data/ipv4-address-space.csv"
# download the IANA table only when there is no local copy yet
# (file.exists() states the condition directly; the earlier file.access()
# test relied on its -1 failure code being treated as TRUE)
if (!file.exists(ianaData)) {
  download.file(ianaURL, ianaData)
}
```



```{r}
# read in the IANA table
iana <- read.csv(ianaData)
```



```{r}
# clean up the iana prefix since it uses the old/BSD-
# number formatting (i.e. allows leading zeroes and
# we do not need to know the CIDR component.
iana$Prefix <- sub("^(00|0)", "", iana$Prefix, perl=TRUE)
iana$Prefix <- sub("/8$", "", iana$Prefix, perl=TRUE)
```



```{r}
# define function to strip 'n' characters from a string
# (character vector) and return the shortened string.
# note that this function is 'vectorized' (you can pass it a single
# string or a vector of them)
rstrip <- function(x, n) {
  # position of the last character to keep, computed per element of x
  keep <- nchar(x) - n
  substr(x, start = 1, stop = keep)
}
```



```{r}
# extract just the prefix from the AlienVault list
av.IP.prefix <- rstrip(str_extract(as.character(av.df$IP),
                                  "^([0-9]+)\\."), 1)
```



```{r}
# Look up the IANA designation for every AlienVault prefix.
# match() returns, for each prefix, the position of the first iana row
# with that Prefix (NA when the prefix is not listed), so the result is
# always a plain vector with one entry per av.df row. The previous
# sapply() lookup returned a list whenever a prefix matched no row or
# more than one row.
av.df$Designation <- iana$Designation[match(av.IP.prefix, iana$Prefix)]
```

```{r}
# summarize, order & review the findings
summary(factor(av.df$Designation))
```


```{r}
# requires packages: ggplot2, maps, RColorBrewer
# requires object: av.coords.df (4-3), iana (4-5)
# Code to extract IANA block assignments & compare w/AlienVault groups
# create a new data frame from the iana designation factors
iana.df <- data.frame(table(iana$Designation))
colnames(iana.df) <- c("Registry", "IANA.Block.Count")

# make a data frame of the counts of the av iana
# designation factor
tmp.df <- data.frame(table(factor(av.df$Designation)))
colnames(tmp.df) <- c("Registry", "AlienVault.IANA.Count")
```



```{r}
# join the two count tables on their shared "Registry" column
combined.df <- merge(iana.df, tmp.df, by = "Registry")
# show the joined table, largest IANA block count first, without row names
print(combined.df[order(-combined.df$IANA.Block.Count), ],
      row.names = FALSE)
```



```{r}
# requires packages: reshape, grid, gridExtra, ggplot2, RColorBrewer
# requires object: combined.df (4-6), set2 (4-4)
# generates Figure 4-3
# plot charts from IANA data
# flatten the data frame by making one entry per "count" type
# versus having the counts in individual columns
# need the 'melt()' function from the reshape package
# to transform the data frame shape
library(reshape2) 
library(grid)
library(gridExtra)
```



```{r}
# express both counts as a percentage of their column total so the two
# bar charts share a scale and are easy to compare
combined.df$IANA.pct <- 100 * prop.table(combined.df$IANA.Block.Count)
combined.df$AV.pct <- 100 * prop.table(combined.df$AlienVault.IANA.Count)

# difference between the two shares, per registry
combined.df$IANA.vs.AV.pct <- with(combined.df, IANA.pct - AV.pct)
```


```{r}
# reshape to long form: one row per Registry and measured column.
# Registry is named as the id column explicitly instead of leaving melt()
# to guess it from the column types
melted.df <- melt(combined.df, id.vars = "Registry")
# plot the new melted data frame values
```

```{r}
# horizontal bar chart of the IANA.pct rows, one bar per registry
gg1 <- ggplot(data=melted.df[melted.df$variable=="IANA.pct",],
              aes(x=reorder(Registry, -value), y=value)) +
  # fixed 0-40 range so both charts use the same scale
  ylim(0,40) +
  geom_bar(stat="identity", fill=set2[3]) +
  # axis labels and chart title
  labs(x="Registry", y="%", title="IANA %") +
  # turn the bars sideways
  coord_flip() +
  # rotated axis text, no panel background, no legend
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.background = element_blank(),
        legend.position = "none")
gg1
```

```{r}
# horizontal bar chart of the AV.pct rows, one bar per registry
gg2 <- ggplot(data=melted.df[melted.df$variable=="AV.pct",],
              aes(x=reorder(Registry,-value), y=value)) +
  geom_bar(stat="identity", fill=set2[4]) +
  # fixed 0-40 range so both charts use the same scale
  ylim(0,40) +
  # axis labels and chart title
  labs(x="Registry", y="%", title="AlienVault IANA %") +
  # turn the bars sideways
  coord_flip() +
  # rotated axis text, no panel background, no legend
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        panel.background = element_blank(),
        legend.position = "none")
gg2
```

```{r}
# grid.arrange makes it possible to do very precise placement of 
# multiple ggplot objects
grid.arrange(gg1, gg2, ncol=1, nrow=2)
```


I would like you to explain the output as well as the syntax.In addition, make comments about the possibility to redefine the functions gg1 and gg2 in such a way that the y-axis shows a more clear description of the problem.

# The output shows that there is variation, but a few larger blocks, such as RIPE NCC, Administered by ARIN and LACNIC, contribute the majority of malicious hosts.

# Plot the new melted data frame values
gg2 <- ggplot(data=melted.df[melted.df$variable=="AV.pct",], 
              aes(x=reorder(Registry,-value), y=value))
gg2 <- gg2 + geom_bar(stat="identity", fill=set2[4]) # using bars

# Set the min/max for axis so scale is same for both charts
gg2 <- gg2 + ylim(0,40)
gg2 <- gg2 + labs(x="Registry", y="%", title="AlienVault IANA %") 
gg2 <- gg2 + coord_flip()

# Rotate the x-axis labels and remove the legend
gg2 <- gg2 + theme(axis.text.x = element_text(angle = 90, hjust = 1), 
                   panel.background = element_blank(),
                   legend.position = "none")
gg2
# We can deduce the same for the AlienVault IANA % chart: starting with Administered by ARIN, then LACNIC, ARIN, RIPE NCC and APNIC, these registries contribute large blocks of malicious hosts.


```{r}
# requires packages: ggplot2
# requires object: combined.df (4-7), set2 (4-4)
# horizontal bar chart of each registry's AV.pct value, with the
# registries ordered by their IANA block count
gg <- ggplot(data=combined.df,
             aes(x=reorder(Registry, -IANA.Block.Count), y=AV.pct ))
gg <- gg + geom_bar(stat="identity", fill=set2[2])
# AV.pct is a percentage, so the axis is labelled "%" (it was labelled
# "Count"); the title also gets its missing closing parenthesis
gg <- gg + labs(x="Registry", y="%",
                title="AlienVault/IANA sorted by IANA (low-to-high)")
# turn the bars sideways
gg <- gg + coord_flip()
# rotated axis text, no panel background, no legend
gg <- gg + theme(axis.text.x = element_text(angle = 90, hjust = 1),
                 panel.background = element_blank(),
                 legend.position = "none")
gg
```

Explain the output as well as the syntax used in the task we just completed above.

# The output tells us that the AlienVault population does gravitate towards the IANA blocks with the most allocations.


# The code plots the AlienVault population per IANA block, sorted by IANA block size.

# requires packages: ggplot2
# requires object: combined.df (4-7), set2 (4-4)
gg <- ggplot(data=combined.df, 
             aes(x=reorder(Registry, -IANA.Block.Count), y=AV.pct ))
gg <- gg + geom_bar(stat="identity", fill=set2[2])
gg <- gg + labs(x="Registry", y="Count",
                title="AlienVault/IANA sorted by IANA (low-to-high") 
gg <- gg + coord_flip()
gg <- gg + theme(axis.text.x = element_text(angle = 90, hjust = 1), 
                 panel.background = element_blank(),
                 legend.position = "none")
gg

In order to fully understand the task we are performing in the query below as well as the output, we would need to run the code listed in Handout7. Handouts 6 and 7 are to be completed within the same time frame so that our findings and conclusions can be more solid.


```{r}
# requires packages: ggplot2, scales
# requires object: combined.df (4-7), set2 (4-4)
# generates figure 4-6
library(scales)
# scatter plot: one point per registry, IANA block count on x against
# AlienVault count on y
gg <- ggplot(data=combined.df) +
  geom_point(aes(x=IANA.Block.Count,
                 y=AlienVault.IANA.Count),
             color=set2[2], size=4) +
  # axis labels and chart title
  labs(x="IANA Block Count", y="AlienVault IANA Count",
       title="IANA ~ AlienVault") +
  # rotated x-axis text, no legend
  theme(axis.text.x = element_text(angle = 90, hjust = 1),
        legend.position = "none") +
  # lightly tinted panel background with a white border
  theme(panel.background=element_rect(fill=alpha(set2[3],0.2),
                                      colour='white'))
gg
```

Explain the output as well as the syntax we included in the chunk above. Use Handout 7 and class discussion to support your answers.
# The scatterplot is showing a positive correlation.

# requires packages: ggplot2, scales
# requires object: combined.df (4-7), set2 (4-4)
# generates figure 4-6
library(scales)
#Creating a variable gg to hold the ggplot from the combined.df 
gg <- ggplot(data=combined.df)

# Creating a scatterplot that displays the relationship between two continuous variables.
gg <- gg + geom_point(aes(x=IANA.Block.Count, 
                          y=AlienVault.IANA.Count),
                      color=set2[2], size=4)
                      
# Creating labels for the scatterplot diagram.
gg <- gg + labs(x="IANA Block Count", y="AlienVault IANA Count",
                title="IANA ~ AlienVault")

# Create a theme to customize the non-data components of the plots.
gg <- gg + theme(axis.text.x = element_text(angle = 90, hjust = 1), 
                 legend.position = "none")
gg <- gg + theme(panel.background=element_rect(fill=alpha(set2[3],0.2),
                                               colour='white'))
                                               
# Display scatterplot.
gg


# This returns the correlation coefficient. The result falls close to +1, which indicates a strong positive monotonic (rank-order) relationship between the two variables, since the Spearman method is used.
```{r}
cor(combined.df$IANA.Block.Count,
    combined.df$AlienVault.IANA.Count, method="spearman")
```

Make comments about both the output and the syntax.
# This returns the correlation coefficient. The result falls close to +1, which indicates a strong positive monotonic (rank-order) relationship between the two variables.
# Spearman correlation was applied; it produces a rank correlation coefficient and is better suited to variables that do not have a normal distribution. The visual patterns show that larger blocks of networks contain more malicious hosts.
