These packages are needed for this assignment:

# Install any required packages that are not available yet.
# Compare against the *names* of the installed packages (the row names of the
# installed.packages() matrix); matching against the whole matrix would also
# search unrelated fields such as Depends, Imports or Suggests.
pkg <- c("ggplot2", "RColorBrewer")
new.pkg <- setdiff(pkg, rownames(installed.packages()))
if (length(new.pkg) > 0) {
  install.packages(new.pkg)
}

Let us first make sure we are working in a directory we feel comfortable with. So I will start by changing my working directory to the one listed below

Let us now read the file and start exploring the dataset memproc.

# Load the memproc dataset; header = TRUE takes the column names from the
# first line of the file. TRUE is spelled out because the shorthand T is an
# ordinary variable that can be reassigned.
mydata1 <- read.csv("data/memproc.csv", header = TRUE)
# Per-column overview of the loaded data.
summary(mydata1)
     host                proc              mem             state          
 Length:247         Min.   :-3.1517   Min.   :-3.5939   Length:247        
 Class :character   1st Qu.:-1.2056   1st Qu.:-1.4202   Class :character  
 Mode  :character   Median :-0.4484   Median :-0.6212   Mode  :character  
                    Mean   :-0.4287   Mean   :-0.5181                     
                    3rd Qu.: 0.3689   3rd Qu.: 0.2413                     
                    Max.   : 3.1428   Max.   : 3.2184                     
# Print the entire data frame for inspection.
mydata1

In order to explore this dataset in more detail, let us create a plot to compare the processor and memory usage, and differentiate it based on the malware state.

library(ggplot2)
# Scatter plot of processor vs. memory usage, one colour per malware state.
gg <- ggplot(mydata1, aes(x = proc, y = mem, color = state)) +
  scale_color_brewer(palette = "Set2") +
  geom_point(size = 3) +
  theme_bw()
print(gg)

I suggest you now create a new graph, this time including a title in the chart. The title would be “Memory vs Processor Usage as function of the Malaware state”. # Added a title by adding + ggtitle(“Memory vs Processor Usage as function of the Malaware state”) to the print: # print(gg + ggtitle(“Memory vs Processor Usage as function of the Malaware state”))

library(ggplot2)
# Rebuild the same scatter plot; the title is added only at print time, so
# `gg` itself stays untitled.
gg <- ggplot(mydata1, aes(x = proc, y = mem, color = state)) +
  scale_color_brewer(palette = "Set2") +
  geom_point(size = 3) +
  theme_bw()
print(
  gg + ggtitle("Memory vs Processor Usage as function of the Malaware state")
)

Going to take 1/3 of the data for test data and then train the algorithm

on the remaining 2/3.

# Fix the RNG seed so the train/test split is reproducible.
set.seed(1492)
# Number of observations in the overall sample.
n <- nrow(mydata1)
# Hold out one third of the rows (rounded down) for testing.
test.size <- as.integer(n / 3)
# Randomly sample the row indices that make up the test set.
testset <- sample(n, test.size)
# Split the data into test and train. The training rows are selected
# positively with setdiff(): negative indexing with an empty `testset`
# (fewer than three rows) would select *no* rows instead of all of them.
test <- mydata1[testset, ]
train <- mydata1[setdiff(seq_len(n), testset), ]

Having split the data into test and train sets, compute the class means on the training set.

# Class centroids: the column means of proc and mem over the training rows of
# each state. colMeans() yields a named two-element vector (proc, mem).
inf <- local({
  infected <- train[train$state == "Infected", ]
  colMeans(infected[, c("proc", "mem")])
})
nrm <- local({
  normal <- train[train$state == "Normal", ]
  colMeans(normal[, c("proc", "mem")])
})

Display the results of colMeans() on the two columns; each result is a vector with

two elements.

# Mean proc and mem of the training rows whose state is "Infected".
print(inf)
     proc       mem 
0.9354513 1.0868010 
# Mean proc and mem of the training rows whose state is "Normal".
print(nrm)
      proc        mem 
-0.7907962 -0.9352974 

Predicting malware: predict.malware takes in a single named vector called

data, extracts the proc and mem values, then calculates how far those are from the means

that were generated during the training. The distance is calculated by using the

Pythagorean theorem $a^2 + b^2 = c^2$. Once the differences between the trained proc mean

and the test proc value, and between the trained mem mean and the test mem value, are calculated, we

then just compare the resulting distances.

# Classify observations as "Infected" or "Normal" by nearest class centroid.
#
# Arguments:
#   data      - a named vector, list or data frame with elements 'proc' and
#               'mem'. Values are coerced with as.numeric(), so character
#               input (e.g. a row handed over by apply()) is accepted.
#   inf.means - named vector holding the 'proc' and 'mem' means of infected
#               systems; defaults to the `inf` vector computed in training.
#   nrm.means - the same for normal systems; defaults to `nrm`.
#
# Returns an unnamed character vector with one label per observation.
# A tie between the two distances is labelled "Normal".
predict.malware <- function(data, inf.means = inf, nrm.means = nrm) {
  # get 'proc' and 'mem' as numeric values
  proc <- as.numeric(data[["proc"]])
  mem <- as.numeric(data[["mem"]])
  # pythagorean distance c = sqrt(a^2 + b^2) to the infected means.
  # [[ ]] extracts bare numbers, so no stray "proc" name ends up on the label.
  inf.a <- inf.means[["proc"]] - proc
  inf.b <- inf.means[["mem"]] - mem
  inf.dist <- sqrt(inf.a^2 + inf.b^2)
  # repeat for normal systems
  nrm.a <- nrm.means[["proc"]] - proc
  nrm.b <- nrm.means[["mem"]] - mem
  nrm.dist <- sqrt(nrm.a^2 + nrm.b^2)
  # assign the label of the closest (smallest distance) class;
  # ifelse() keeps the function vectorised over several observations
  ifelse(inf.dist < nrm.dist, "Infected", "Normal")
}
# could test with these if you uncomment them
# predict.malware(inf)
# expect "Infected"
# predict.malware(nrm)
# expect "Normal"

This requires the test set, test <- mydata1[testset, ], and the predict.malware function

defined earlier.

# Classify every test host. Only the numeric feature columns are handed to
# apply(): applying over the whole data frame coerces it to a character
# matrix, which passes the measurements through format() and rounds them.
prediction <- apply(as.matrix(test[, c("proc", "mem")]), 1, predict.malware)

Now have a set of predictions and the ability to compare

them to the real values to determine how well it did, look at

the proportion of correctly predicted results on the test data.

Then calculate that by taking the number of correct predictions

(where the real test$state and the predicted prediction match) and then

dividing that by the total number of predictions.

This simple algorithm predicted 90% of the values correctly.

# Accuracy: share of test rows whose predicted label equals the true state.
sum(prediction == test$state) / nrow(test)
[1] 0.902439
# Figure 9-2 #########################################################
# Decision boundary of the classifier: the line through the midpoint of the
# two class means, perpendicular to the segment joining them. Its slope is
# therefore the negative reciprocal of that segment's slope.
# [[ ]] extracts plain numbers so slope/intercept carry no stray names.
slope <- -(inf[["proc"]] - nrm[["proc"]]) / (inf[["mem"]] - nrm[["mem"]])
intercept <- mean(c(inf[["mem"]], nrm[["mem"]])) -
  slope * mean(c(inf[["proc"]], nrm[["proc"]]))
# Attach the predictions to the test rows and flag whether each is correct.
result <- cbind(test, predict = prediction)
# Ordered factor with "Yes" first, so manual shape/size scales map their
# first value to correct predictions and their second to misclassifications.
result$Accurate <- factor(
  ifelse(result$state == result$predict, "Yes", "No"),
  levels = c("Yes", "No"),
  ordered = TRUE
)

This classifier creates a line halfway between the two means and perpendicular to an intersecting line. Anything above the line is predicted as infected; anything below is predicted to be normal.

The misclassified values are clearly marked. There are normal systems above the line that are mislabeled, as well as infected systems below the line.

# Plot the classified test hosts: colour shows the true state, size and shape
# follow the Accurate flag, and the gray line is the decision boundary.
gg <- ggplot(
  result,
  aes(x = proc, y = mem, color = state, size = Accurate, shape = Accurate)
) +
  scale_shape_manual(values = c(16, 8)) +
  scale_size_manual(values = c(3, 6)) +
  scale_color_brewer(palette = "Set2") +
  geom_point() +
  theme_bw() +
  geom_abline(intercept = intercept, slope = slope, color = "gray80")
print(gg)

Here we are going to fit a linear regression to non-linear data. This is possible because

the “linear” in linear regression refers to the estimated coefficients,

not the data.

# Simulate noisy cubic data and overlay a third-degree polynomial lm() fit.
set.seed(1)
x <- runif(200, min = -10, max = 10)
y <- 1.377 * (x^3) + 0.92 * (x^2) + 0.3 * x + rnorm(200, sd = 250) + 1572
# Shift x by 10 so the plotted range runs from 0 to 20 instead of -10 to 10.
x <- x + 10
# se = FALSE is spelled out: the shorthand F is a reassignable variable.
# NOTE(review): recent ggplot2 releases prefer `linewidth` over `size` for
# lines; `size` is kept here so older ggplot2 versions keep working.
smooth <- ggplot(data.frame(x, y), aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 3), size = 1, se = FALSE) +
  theme_bw()
print(smooth)

# Reload the data under a new name and add a numeric 0/1 response column
# (1 when state is "Infected", 0 otherwise).
memproc <- read.csv("data/memproc.csv", header = TRUE)
memproc$infected <- ifelse(memproc$state == "Infected", 1, 0)
# Reproducible split: one third of the rows (rounded down) form the test set.
set.seed(1492)
n <- nrow(memproc)
test.size <- as.integer(n / 3)
testset <- sample(n, test.size)
test <- memproc[testset, ]
# Select the training rows positively; `memproc[-testset, ]` would return
# no rows at all if `testset` were empty.
train <- memproc[setdiff(seq_len(n), testset), ]

The plot shows a logistic regression on the infection test data.

The value on the x-axis is the estimated probability of a host being infected

based on the input variables. This is plotted against the known values in the test

data on the y-axis (real life).

The glm() function is one of several approaches to logistic regression. This function

can handle most situations.

# Fit a logistic regression (binomial family, logit link) of the 0/1
# `infected` response on proc and mem, then show the fit summary.
# NOTE(review): the model is fitted on `test` rather than `train` — confirm
# this is intended.
glm.out <- glm(
  formula = infected ~ proc + mem,
  family = binomial(logit),
  data = test
)
summary(glm.out)

Call:
glm(formula = infected ~ proc + mem, family = binomial(logit), 
    data = test)

Deviance Residuals: 
     Min        1Q    Median        3Q       Max  
-1.51110  -0.17718  -0.08015  -0.01132   2.28073  

Coefficients:
            Estimate Std. Error z value Pr(>|z|)   
(Intercept)  -2.1905     0.6943  -3.155  0.00160 **
proc          2.1378     0.7192   2.972  0.00295 **
mem           1.6530     0.5682   2.909  0.00362 **
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 68.275  on 81  degrees of freedom
Residual deviance: 25.348  on 79  degrees of freedom
AIC: 31.348

Number of Fisher Scoring iterations: 8
# Predicted infection probability for each test row (response scale).
modelog <- predict(glm.out, newdata = test, type = "response")
# Plot the estimated probability (x) against the known infection status (y).
gg <- ggplot(
  data.frame(x = modelog, y = ifelse(test$infected > 0.5, "Yes", "No")),
  aes(x, y)
) +
  geom_point(size = 3, fill = "steelblue", color = "black", shape = 4) +
  ylab("Known Infected Host") +
  xlab("Estimated Probability of Infected Host") +
  theme_bw()
print(gg)

Creating diagrams from the same data with multiple different k-values

kmeans() function will perform k-means clustering.

The first diagram is k-means with 3 clusters.

The second diagram is k-means with 4 clusters.

The third diagram is k-means with 5 clusters.

The fourth diagram is k-means with 6 clusters.

set.seed(1) # repeatable
# Simulated data: 1000 points scattered around five centres,
# (0, 0), (2, 2), (2, -2), (-2, 2) and (-2, -2).
x <- c(rnorm(200), rnorm(400) + 2, rnorm(400) - 2)
y <- c(
  rnorm(200),
  rnorm(200) + 2, rnorm(200) - 2,
  rnorm(200) + 2, rnorm(200) - 2
)
randata <- data.frame(x = x, y = y)
k.values <- c(3, 4, 5, 6)
# Preallocate one slot per k instead of growing the list inside the loop.
out <- vector("list", length(k.values))
for (idx in seq_along(k.values)) {
  i <- k.values[idx]
  # Cluster on the coordinates only. `randata` gains a `cluster` column
  # below, so passing the whole data frame would feed the previous
  # iteration's labels to kmeans() as an extra feature from the second
  # pass onwards.
  km <- kmeans(randata[, c("x", "y")], i)
  # `cluster` is a constant placeholder column; presumably it is there so the
  # inherited `color = cluster` mapping can be resolved on the centres layer
  # (TODO confirm it is still required).
  centers <- data.frame(
    x = km$centers[, "x"],
    y = km$centers[, "y"],
    cluster = 1
  )
  randata$cluster <- factor(km$cluster)
  gg <- ggplot(randata, aes(x, y, color = cluster)) + geom_point(size = 2)
  # Mark the cluster centres with black stars.
  gg <- gg +
    geom_point(data = centers, aes(x, y), shape = 8, color = "black", size = 4)
  gg <- gg + scale_x_continuous(expand = c(0, 0.1))
  gg <- gg + scale_y_continuous(expand = c(0, 0.1))
  gg <- gg + ggtitle(paste("k-means with", i, "clusters"))
  # Strip grid, axes and legend so only the points and the frame remain.
  gg <- gg + theme(
    panel.grid = element_blank(),
    panel.background = element_rect(colour = "black", fill = NA),
    axis.text = element_blank(),
    axis.title = element_blank(),
    legend.position = "none",
    axis.ticks = element_blank()
  )
  out[[idx]] <- gg
}
# Print one diagram per k, in the order of `k.values`.
invisible(lapply(out, print))

