Welcome to handout #5. Let's start by getting our working directory.
# Show the current working directory; reputation.data is read from it by a relative path below.
getwd()
[1] "E:/School/Summer 2021/Security and Data Governance/Handout/Handout 5"
# Packages this handout needs: ggplot2 for the bar charts, lattice for levelplot().
pkg <- c("ggplot2", "lattice")
# Compare against the installed package *names* only. installed.packages()
# returns a character matrix, so `%in% installed.packages()` would test every
# cell (versions, licences, dependency strings, ...), not just the names.
new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}
library(ggplot2)
library(lattice)

# Read reputation.data without a header row, using "#" as the field separator.
av <- read.csv("reputation.data", sep = "#", header = FALSE)
# View() opens a spreadsheet-style viewer, so only call it in an interactive session.
if (interactive()) {
  View(av)
}
# We just loaded the reputation data from our working directory into a data frame named av; the fields of each record were separated by "#".
# Give the eight unnamed columns of av descriptive names.
names(av) <- c(
  "IP", "reliability", "Risk", "Type",
  "Country", "Locale", "Coords", "x"
)
av
str(av) # compact overview of the data frame: dimensions, column types, first values
'data.frame': 258626 obs. of 8 variables:
$ IP : chr "222.76.212.189" "222.76.212.185" "222.76.212.186" "5.34.246.67" ...
$ reliability: int 4 4 4 6 4 4 4 4 4 6 ...
$ Risk : int 2 2 2 3 5 2 2 2 2 3 ...
$ Type : chr "Scanning Host" "Scanning Host" "Scanning Host" "Spamming" ...
$ Country : chr "CN" "CN" "CN" "US" ...
$ Locale : chr "Xiamen" "Xiamen" "Xiamen" "" ...
$ Coords : chr "24.4797992706,118.08190155" "24.4797992706,118.08190155" "24.4797992706,118.08190155" "38.0,-97.0" ...
$ x : chr "11" "11" "11" "12" ...
# We loaded reputation.data into av and then named its columns, so instead of unnamed columns we have nicely organized ones.
# Convert Country from character to factor so that R treats it as a categorical variable (for example, summary() can then count rows per country). A factor behaves a little bit like an integer vector because R encodes the levels as integers.
# Factors are data objects used to categorize data and store it as levels. They can store both strings and integers. They are useful for columns that have a limited number of unique values, like "Male", "Female" or TRUE, FALSE. They are useful in data analysis for statistical modeling.
# This was needed to correct the errors we were getting when running the ggplot charts and the computations.
# Store Country as a factor, then preview the first six rows of av.
av[["Country"]] <- as.factor(av[["Country"]])
head(av, n = 6L)
# head(av) shows the first rows of the table in an organized, tabular view, while str() gives a messier but more complete description of the table, such as how many levels each factor has.
# Minimum, quartiles, median, mean and maximum of the reliability scores.
summary(av$reliability)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 2.000 2.798 4.000 10.000
# The same summary for the Risk scores.
summary(av$Risk)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.000 2.000 2.000 2.221 2.000 7.000
# Number of rows at each reliability score.
table(av$reliability)
1 2 3 4 5 6 7 8 9 10
5612 149117 10892 87040 7 4758 297 21 686 196
# Number of rows at each Risk score.
table(av$Risk)
1 2 3 4 5 6 7
39 213852 33719 9588 1328 90 10
# Here we got a summary of both reliability and Risk, with numbers such as min, mean, median and max.
# We also got the number of rows for each score with table().
# NOTE(review): Type is still a character column (see the str() output), so summary() only reports
# Length/Class/Mode here; summary(factor(av$Type), maxsum = 10) would give per-type counts instead.
summary(av$Type,maxsum=10)
Length Class Mode
258626 character character
# Country is a factor, so summary() returns per-country counts: the most frequent
# countries plus an "(Other)" bucket, at most 40 entries in total.
summary(av$Country, maxsum=40)
CN US TR DE NL RU GB IN FR TW BR UA RO KR CA AR
68583 50387 13958 10055 9953 7931 6346 6293 5480 5449 4399 3811 3443 3274 3101 3051 3046
MX TH IT HK ES CL AE JP HU PL VE EG ID RS PK VN LV
3039 2572 2448 2361 1929 1896 1827 1811 1636 1610 1589 1452 1378 1323 1309 1203 1056
NO CZ BG SG IR (Other)
958 928 871 868 866 15136
# We requested summaries of both of these columns, capped at 10 and 40 entries respectively.
library(ggplot2)
# Names of the 20 most frequent countries. Sort the counts explicitly instead
# of relying on summary(), which only orders a factor by frequency when it has
# more levels than maxsum. head() also avoids NA padding if there are fewer
# than 20 countries.
country.top20 <- head(names(sort(table(av$Country), decreasing = TRUE)), 20)
# country.top20 is a character vector of country codes (not a column of av).
# An empty country code ("") is counted like any other value.
country.top20
[1] "CN" "US" "TR" "" "DE" "NL" "RU" "GB" "IN" "FR" "TW" "BR" "UA" "RO" "KR" "CA" "AR" "MX" "TH" "IT"
# Bar chart of row counts for the countries in country.top20, ordered by count.
gg <- ggplot(
  data = av[av$Country %in% country.top20, ],
  mapping = aes(x = reorder(Country, Country, length))
) +
  geom_bar(fill = "#000099") +
  labs(title = "Country Counts", x = "Country", y = "Count") +
  # flip the axes so the country codes are easier to read
  coord_flip()
print(gg)
# Strip the grid lines and the panel background, then draw the chart again.
gg <- gg +
  theme(
    panel.grid = element_blank(),
    panel.background = element_blank()
  )
print(gg)
# Here we have a chart of the top 20 countries by count, from highest to lowest; we labeled the x and y axes and gave the chart a title.
# (Stray text, apparently pasted from a ggplot2 warning: "limits = factor(...) or scale_*_continuous()?")
# Now we are going to create a bar graph of counts by risk.
# Bar chart of row counts per Risk score.
# Map Risk as a factor so the x axis is genuinely discrete.
gg <- ggplot(data = av, aes(x = factor(Risk)))
gg <- gg + geom_bar(fill = "#000099")
# List every score from 1 to the maximum as a limit of the discrete x scale.
# The limits must contain one entry per score: a single-level factor such as
# factor(max(av$Risk)) gives the scale only one limit.
gg <- gg + scale_x_discrete(limits = as.character(seq_len(max(av$Risk))))
gg <- gg + labs(title = "'Risk' Counts", x = "Risk Score", y = "Count")
# Remove the grid lines and the panel background to reduce clutter.
gg <- gg + theme(panel.grid = element_blank(),
                 panel.background = element_blank())
print(gg)
# We just created a bar chart called 'Risk' Counts: we labeled the x and y axes, gave the graph a title, and took all of the data from the Risk column of our av table.
# (Stray text, apparently pasted from a ggplot2 warning: "limits = factor(...) or scale_*_continuous()?")
# In this part of the handout we are going to create yet another bar graph: counts by reliability.
# Bar chart of row counts per reliability score.
# Map reliability as a factor so the x axis is genuinely discrete.
gg <- ggplot(data = av, aes(x = factor(reliability)))
gg <- gg + geom_bar(fill = "#000099")
# List every score from 1 to the maximum as a limit of the discrete x scale.
# The limits must contain one entry per score: a single-level factor such as
# factor(max(av$reliability)) gives the scale only one limit.
gg <- gg + scale_x_discrete(limits = as.character(seq_len(max(av$reliability))))
gg <- gg + labs(title = "'Reliability' Counts", x = "Reliability Score", y = "Count")
# Remove the grid lines and the panel background to reduce clutter.
gg <- gg + theme(panel.grid = element_blank(),
                 panel.background = element_blank())
print(gg)
# We created another bar graph, called 'Reliability' Counts, with the x and y axes labeled; the data comes from the reliability column of our av table.
# Counts for the most frequent countries, capped at 10 entries in total.
country_top <- summary(av$Country, maxsum = 10)
# Turn the counts into fractions of all rows in av.
country.perc10 <- country_top / nrow(av)
print(country.perc10)
CN US TR DE NL RU GB IN (Other)
0.26518215 0.19482573 0.05396983 0.03887854 0.03848414 0.03066590 0.02453736 0.02433243 0.02118890 0.30793501
# Here we took the top countries (plus an "(Other)" bucket) and converted their counts into fractions of the total by dividing by the number of rows in av, then displayed them; multiply by 100 to read them as percentages.
# Cross-tabulate Risk against reliability: each cell of the resulting
# contingency table counts the rows of av at that (Risk, reliability) pair.
rr.tab <- xtabs(~ Risk + reliability, data = av)
# Print the table in ftable()'s flat layout.
print(ftable(rr.tab))
reliability 1 2 3 4 5 6 7 8 9 10
Risk
1 0 0 16 7 0 8 8 0 0 0
2 804 149114 3670 57653 4 2084 85 11 345 82
3 2225 3 6668 22168 2 2151 156 7 260 79
4 2129 0 481 6447 0 404 43 2 58 24
5 432 0 55 700 1 103 5 1 20 11
6 19 0 2 60 0 8 0 0 1 0
7 3 0 0 5 0 0 0 0 2 0
# We just created a contingency table that shows how many rows fall into each combination of Risk and reliability.
# Graphical view of the same counts.
# levelplot() comes from the lattice package loaded at the beginning of the handout.
# First cast the contingency table into a data frame with one row per (Risk, Reliability) cell.
rr.df <- data.frame(table(av$Risk, av$reliability))
# Name the columns.
colnames(rr.df) <- c("Risk", "Reliability", "Freq")
# In the formula Freq ~ Reliability * Risk, Reliability is drawn on the x axis
# and Risk on the y axis, so the axis labels have to follow that order.
levelplot(Freq ~ Reliability * Risk, data = rr.df, main = "Risk ~ Reliability",
          xlab = "Reliability", ylab = "Risk", shrink = c(0.5, 1),
          col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20))
# We just created a levelplot, using the lattice package we loaded, from our Risk and reliability columns to see where the counts concentrate across the combinations of those two scores.
# Generate independent random samples for Risk and Reliability and re-run the
# tabulation and level plot on them.
# Start the PRNG from a reproducible point.
set.seed(1492) # as it leads to discovery
# Generate 260,000 random samples of each score, using the ranges seen in the
# data (reliability runs 1-10, Risk runs 1-7).
rel <- sample(1:10, 260000, replace = TRUE)
rsk <- sample(1:7, 260000, replace = TRUE)
# Cast the table into a data frame.
tmp.df <- data.frame(table(factor(rsk), factor(rel)))
# Name the columns of the random-sample frame (tmp.df, not rr.df, which holds
# the real data) and plot that frame.
colnames(tmp.df) <- c("Risk", "Reliability", "Freq")
# Reliability is the x variable and Risk the y variable in the formula.
levelplot(Freq ~ Reliability * Risk, data = tmp.df, main = "Risk ~ Reliability",
          xlab = "Reliability", ylab = "Risk", shrink = c(0.5, 1),
          col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20))
# Build a simplified type column, "simpletype", in which every entry that
# contains a ";" (several categories listed together) gets the label "Multiples".
av$simpletype <- as.character(av$Type)
av$simpletype[grepl(";", av$simpletype, fixed = TRUE)] <- "Multiples"
# Store the simplified labels as a factor again.
av$simpletype <- factor(av$simpletype)
# Count rows per (Risk, reliability, simpletype) combination and plot one
# Risk-by-Reliability panel per type.
rrt.df <- data.frame(table(av$Risk, av$reliability, av$simpletype))
names(rrt.df) <- c("Risk", "Reliability", "simpletype", "Freq")
levelplot(
  Freq ~ Reliability * Risk | simpletype,
  data = rrt.df,
  main = "Risk ~ Reliability | Type",
  xlab = "Reliability",
  ylab = "Risk",
  shrink = c(0.5, 1),
  col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20)
)
# We just created a levelplot just like the one before, but we added a new column called simpletype, which shows us the types and where we see more risk and reliability per type.
# Draw the same level plot again, this time filtering out "Scanning Host",
# which was the most common type in the earlier level plot.
rrt.df <- rrt.df[rrt.df$simpletype != "Scanning Host", ]
levelplot(
  Freq ~ Reliability * Risk | simpletype,
  data = rrt.df,
  main = "Risk ~ Reliability | Type",
  xlab = "Reliability",
  ylab = "Risk",
  shrink = c(0.5, 1),
  col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20)
)
# Here we have the same level plot as in the earlier example, but without Scanning Host as a category, and the counts look much more evenly distributed than before.
# Also drop the two malware categories, then report how many rows remain and
# what share of av they represent.
rrt.df <- rrt.df[
  !(rrt.df$simpletype %in% c("Malware distribution", "Malware Domain")),
]
sprintf(
  "Count: %d; Percent: %2.1f%%",
  sum(rrt.df$Freq),
  100 * sum(rrt.df$Freq) / nrow(av)
)
[1] "Count: 15171; Percent: 5.9%"
# Level plot of the remaining types, one Risk-by-Reliability panel per type.
levelplot(
  Freq ~ Reliability * Risk | simpletype,
  data = rrt.df,
  main = "Risk ~ Reliability | Type",
  xlab = "Reliability",
  ylab = "Risk",
  shrink = c(0.5, 1),
  col.regions = colorRampPalette(c("#F5F5F5", "#01665E"))(20)
)
# Here we created another levelplot with fewer categories: this time we removed Malware distribution and Malware Domain, and yet again we see the counts better distributed across the graph.