These packages are needed for this assignment:
# Install any of the required packages that are not already available.
pkg <- c("ggplot2", "RColorBrewer")
# Compare against installed package *names* only. installed.packages()
# returns a character matrix whose other columns (Depends, Imports, ...) can
# also hold package names, so matching against the whole matrix may report a
# package as installed merely because another package lists it.
new.pkg <- pkg[!(pkg %in% rownames(installed.packages()))]
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}
Let us first make sure we are working in a directory we feel comfortable with. So I will start by changing my working directory to the one listed below
Let us now read the file and start exploring the dataset memproc.
# Load the memproc dataset (first line of the file holds the column names)
# and show a per-column summary.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
mydata1 <- read.csv("data/memproc.csv", header = TRUE)
summary(mydata1)
host proc mem state
Length:247 Min. :-3.1517 Min. :-3.5939 Length:247
Class :character 1st Qu.:-1.2056 1st Qu.:-1.4202 Class :character
Mode :character Median :-0.4484 Median :-0.6212 Mode :character
Mean :-0.4287 Mean :-0.5181
3rd Qu.: 0.3689 3rd Qu.: 0.2413
Max. : 3.1428 Max. : 3.2184
# Dump the full dataset to the console.
print(mydata1)
In order to explore this dataset more in detail, let us create a plot to compare the processor and memory usage, and differentiate it based on the malware state.
# Scatter plot of processor usage against memory usage, coloured by state.
library(ggplot2)
gg <- ggplot(mydata1, aes(proc, mem, color = state)) +
  scale_color_brewer(palette = "Set2") +
  geom_point(size = 3) +
  theme_bw()
print(gg)

I suggest you now create a new graph, this time including a title in the chart. The title should be “Memory vs Processor Usage as function of the Malaware state”. # Added the title by appending + ggtitle(“Memory vs Processor Usage as function of the Malaware state”) to the plot inside the print call: # print(gg + ggtitle(“Memory vs Processor Usage as function of the Malaware state”))
# Same scatter plot as before; the title is added only to the printed copy,
# so `gg` itself stays untitled.
library(ggplot2)
gg <- ggplot(mydata1, aes(proc, mem, color = state)) +
  scale_color_brewer(palette = "Set2") +
  geom_point(size = 3) +
  theme_bw()
chart.title <- "Memory vs Processor Usage as function of the Malaware state"
print(gg + ggtitle(chart.title))

We are going to take 1/3 of the data as test data and then train the algorithm
on the remaining 2/3.
# Reproducible 1/3 test, 2/3 train split of mydata1.
set.seed(1492)
# count how many rows are in the overall sample
n <- nrow(mydata1)
# set the test.size to be 1/3rd of the rows (rounded down)
test.size <- as.integer(n / 3)
# randomly sample the row indices for the test set
testset <- sample(n, test.size)
# now split the data into test and train.
# The train rows are selected positively: mydata1[-testset, ] would return
# zero rows (instead of all rows) whenever testset is empty.
test <- mydata1[testset, ]
train <- mydata1[setdiff(seq_len(n), testset), ]
Splitting the data into test and train
# Mean proc and mem of the training rows, computed separately for the
# "Infected" and the "Normal" hosts. Each result is a named vector with the
# elements 'proc' and 'mem'.
feature.cols <- c("proc", "mem")
is.infected <- train$state == "Infected"
is.normal <- train$state == "Normal"
inf <- colMeans(train[is.infected, feature.cols])
nrm <- colMeans(train[is.normal, feature.cols])
Display the results of colMeans() on the two columns; each call returns a vector with
two elements.
print(inf)
proc mem
0.9354513 1.0868010
print(nrm)
proc mem
-0.7907962 -0.9352974
Predicting malware: predict.malware takes in a single named vector called
data, extracts the proc and mem values, then calculates how far those are from the means
that were generated during the training. The distance is calculated using the
Pythagorean theorem, a^2 + b^2 = c^2, so c = sqrt(a^2 + b^2). Once the differences between the trained proc mean
and the test proc value, and between the trained mem mean and the test mem value, have been turned into a distance for each class, we
then just compare the two distances.
# Classify one host as "Infected" or "Normal" by nearest class mean.
#
# Args:
#   data:      a single named vector (or one-row data frame / list) holding
#              the elements 'proc' and 'mem'. Values may be character, as
#              produced by apply() over a data frame; they are converted
#              with as.numeric().
#   inf.means: named vector with 'proc' and 'mem' means of infected hosts.
#              Defaults to the global `inf` computed from the training set.
#   nrm.means: named vector with 'proc' and 'mem' means of normal hosts.
#              Defaults to the global `nrm` computed from the training set.
#
# Returns:
#   "Infected" when the point is strictly closer to inf.means, otherwise
#   "Normal" (ties go to "Normal"); NA when a distance cannot be computed.
predict.malware <- function(data, inf.means = inf, nrm.means = nrm) {
  # get 'proc' and 'mem' as numeric values
  proc <- as.numeric(data[["proc"]])
  mem <- as.numeric(data[["mem"]])
  # pythagorean distance c = sqrt(a^2 + b^2) to the infected means;
  # [[ ]] drops the element names so the result carries no stray label
  inf.dist <- sqrt((inf.means[["proc"]] - proc)^2 +
                     (inf.means[["mem"]] - mem)^2)
  # repeat for normal systems
  nrm.dist <- sqrt((nrm.means[["proc"]] - proc)^2 +
                     (nrm.means[["mem"]] - mem)^2)
  # a missing input yields a missing prediction rather than an error
  if (is.na(inf.dist) || is.na(nrm.dist)) {
    return(NA_character_)
  }
  # assign the label of the closest (smallest) distance
  if (inf.dist < nrm.dist) "Infected" else "Normal"
}
# could test with these if you uncomment them
# (the function takes ONE named vector, so pass the mean vectors themselves)
# predict.malware(inf)
# expect "Infected"
# predict.malware(nrm)
# expect "Normal"
This step requires the test set created earlier (test <- mydata1[testset, ]) and the predict.malware
function defined above.
# Classify every row of the test set with predict.malware().
# Each row is passed as a one-row data frame instead of going through
# apply(): apply() first coerces the whole data frame to a character matrix
# (host and state are character columns), so proc and mem would be turned
# into formatted strings and parsed back, losing precision.
prediction <- vapply(
  seq_len(nrow(test)),
  function(i) as.character(predict.malware(test[i, c("proc", "mem")])),
  character(1)
)
# keep the row-name labels that apply() attached to its result
names(prediction) <- rownames(test)
Now have a set of predictions and the ability to compare
them to the real values to determine how well it did, look at
the proportion of correctly predicted results on the test data.
Then calculate that by taking the number of correct predictions
(where the real test$state and the predicted prediction match) and then
dividing that by the total number of predictions.
This simple algorithm predicted 90% of the values correctly.
# Proportion of test rows whose predicted label equals the real state.
is.correct <- test$state == prediction
sum(is.correct) / nrow(test)
[1] 0.902439
# Figure 9-2 #########################################################
# Decision boundary: the line through the midpoint of the two class means,
# perpendicular to the line that joins them.
mem.diff <- inf['mem'] - nrm['mem']
proc.diff <- inf['proc'] - nrm['proc']
# negative reciprocal of the slope of the line joining the two means
slope <- -1 * (1 / (mem.diff / proc.diff))
mid.mem <- mean(c(inf['mem'], nrm['mem']))
mid.proc <- mean(c(inf['proc'], nrm['proc']))
intercept <- mid.mem - (slope * mid.proc)
# Attach the predictions to the test rows and flag each one as right/wrong.
result <- cbind(test, predict = prediction)
result$Accurate <- ifelse(result$state == result$predict, "Yes", "No")
# TRUE is spelled out because `T` is an ordinary, reassignable variable
result$Accurate <- factor(result$Accurate, levels = c("Yes", "No"),
                          ordered = TRUE)
This classifier creates a line halfway between the two means and perpendicular to an intersecting line. Anything above the line is predicted as infected; anything below is predicted to be normal.
The misclassified values are clearly marked. There are normal systems above the line that are mislabeled, as well as infected systems below the line.
# Test points coloured by real state; size and shape mark whether the
# prediction was accurate (first level "Yes": small dot, "No": large star).
# The gray line is the decision boundary.
gg <- ggplot(
  result,
  aes(proc, mem, color = state, size = Accurate, shape = Accurate)
) +
  scale_shape_manual(values = c(16, 8)) +
  scale_size_manual(values = c(3, 6)) +
  scale_color_brewer(palette = "Set2") +
  geom_point() +
  theme_bw() +
  geom_abline(intercept = intercept, slope = slope, color = "gray80")
print(gg)

Here we are going to fit a linear regression to non-linear data. This is possible because
the "linear" in linear regression refers to the estimated coefficients,
not to the data.
# Simulate noisy cubic data and overlay a degree-3 polynomial lm fit.
set.seed(1)
x <- runif(200, min = -10, max = 10)
# cubic signal plus Gaussian noise, generated from the unshifted x
y <- 1.377 * (x^3) + 0.92 * (x^2) + .3 * x + rnorm(200, sd = 250) + 1572
# shift x by 10 only after y has been computed
x <- x + 10
# NOTE(review): newer ggplot2 releases prefer `linewidth` over `size` for
# lines — confirm against the installed version before changing it.
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
smooth <- ggplot(data.frame(x, y), aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), size = 1, se = FALSE) +
  theme_bw()
print(smooth)

# Reload the dataset, add a numeric 0/1 target and redo the 1/3 - 2/3 split.
# TRUE is spelled out because `T` is an ordinary, reassignable variable.
memproc <- read.csv("data/memproc.csv", header = TRUE)
# 1 for "Infected" rows, 0 otherwise
memproc$infected <- ifelse(memproc$state == "Infected", 1, 0)
set.seed(1492)
n <- nrow(memproc)
test.size <- as.integer(n / 3)
testset <- sample(n, test.size)
test <- memproc[testset, ]
# Positive selection of the remaining rows: memproc[-testset, ] would return
# zero rows (instead of all rows) whenever testset is empty.
train <- memproc[setdiff(seq_len(n), testset), ]
The plot shows logistic regression on the infection test data.
The x-axis shows the estimated probability of a host being infected
based on the input variables. This is plotted against the known values in the test
data on the y-axis (real life).
The glm() function is one of several approaches to logistic regression. This function
can handle most situations.
# Logistic regression of the 0/1 infected flag on proc and mem.
# NOTE(review): the model is fitted on `test` (not `train`) and is later
# also evaluated on `test` — confirm this is intended.
glm.out <- glm(
  formula = infected ~ proc + mem,
  family = binomial(logit),
  data = test
)
print(summary(glm.out))
Call:
glm(formula = infected ~ proc + mem, family = binomial(logit),
data = test)
Deviance Residuals:
Min 1Q Median 3Q Max
-1.51110 -0.17718 -0.08015 -0.01132 2.28073
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.1905 0.6943 -3.155 0.00160 **
proc 2.1378 0.7192 2.972 0.00295 **
mem 1.6530 0.5682 2.909 0.00362 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 68.275 on 81 degrees of freedom
Residual deviance: 25.348 on 79 degrees of freedom
AIC: 31.348
Number of Fisher Scoring iterations: 8
# Estimated infection probability for every test row, plotted against the
# known infection status.
modelog <- predict(glm.out, newdata = test, type = "response")
known.infected <- ifelse(test$infected > 0.5, "Yes", "No")
gg <- ggplot(data.frame(x = modelog, y = known.infected), aes(x, y)) +
  geom_point(size = 3, fill = "steelblue", color = "black", shape = 4) +
  ylab("Known Infected Host") +
  xlab("Estimated Probability of Infected Host") +
  theme_bw()
print(gg)

Creating diagrams of the same data with multiple different k-values
kmeans() function will perform k-means clustering.
The first diagram is k-means with 3 clusters.
The second diagram is k-means with 4 clusters.
The third diagram is k-means with 5 clusters.
The fourth diagram is k-means with 6 clusters.
# Cluster the same synthetic 2-D data with k-means for several values of k
# and draw one diagram per k.
set.seed(1) # repeatable
x <- c(rnorm(200), rnorm(400) + 2, rnorm(400) - 2)
y <- c(rnorm(200), rnorm(200) + 2, rnorm(200) - 2,
       rnorm(200) + 2, rnorm(200) - 2)
randata <- data.frame(x = x, y = y)
k.values <- c(3, 4, 5, 6)
# one slot per k, filled by position so the k values can change freely
out <- vector("list", length(k.values))
for (idx in seq_along(k.values)) {
  k <- k.values[idx]
  # Cluster on the coordinates only. randata gains a `cluster` column below,
  # and passing the whole data frame would feed the previous iteration's
  # labels into kmeans() as an extra feature.
  km <- kmeans(randata[, c("x", "y")], k)
  centers <- data.frame(x = km$centers[, 1], y = km$centers[, 2], cluster = 1)
  randata$cluster <- factor(km$cluster)
  gg <- ggplot(randata, aes(x, y, color = cluster)) + geom_point(size = 2)
  # mark the cluster centres with black stars
  gg <- gg + geom_point(data = centers, aes(x, y), shape = 8,
                        color = "black", size = 4)
  gg <- gg + scale_x_continuous(expand = c(0, 0.1))
  gg <- gg + scale_y_continuous(expand = c(0, 0.1))
  gg <- gg + ggtitle(paste("k-means with", k, "clusters"))
  # strip grid, axes and legend so only the points and the frame remain
  gg <- gg + theme(panel.grid = element_blank(),
                   panel.background = element_rect(colour = "black", fill = NA),
                   axis.text = element_blank(),
                   axis.title = element_blank(),
                   legend.position = "none",
                   axis.ticks = element_blank())
  out[[idx]] <- gg
}
# print the diagrams in order of increasing k
for (diagram in out) {
  print(diagram)
}

