Introduction

The malware industry continues to be a well-organized, well-funded market dedicated to evading traditional security measures. Once a computer is infected by malware, criminals can hurt consumers and enterprises in many ways.

With more than one billion enterprise and consumer customers, Microsoft takes this problem very seriously and is deeply invested in improving security.

As one part of their overall strategy for doing so, Microsoft is challenging the data science community to develop techniques to predict if a machine will soon be hit with malware. As with their previous Malware Challenge (2015), Microsoft is providing Kagglers with an unprecedented malware dataset to encourage open-source progress on effective techniques for predicting malware occurrences.

Cleansing

First, let’s load the relevant packages. Then we’ll load the train dataset. Our third step is to reduce the number of columns, as Kaggle lets us know that some columns contain NAs.

# Mark the start of the run. This entry is written before setwd(), so it is
# appended to log.txt in the directory R was started from.
write(paste0(date(), "::", "Start!!!"), file = "log.txt", append = TRUE)

library(gbm)
library(caret)
library(dplyr)

setwd("G:/DataScienceProject/Kaggle_Microsoft_Malware _Prediction")

# Read the header plus at most the first 603000 rows of the training file.
MalwareDF <- read.csv("train.csv", header = TRUE, nrows = 603000)

# Columns removed from the training frame before any further processing.
train_dropped_columns <- c(
  "RtpStateBitfield",
  "IsSxsPassiveMode",
  "AVProductsInstalled",
  "AVProductsEnabled",
  "IeVerIdentifier",
  "Census_OEMNameIdentifier",
  "Census_OEMModelIdentifier",
  "Census_ProcessorManufacturerIdentifier",
  "Census_ProcessorModelIdentifier",
  "Census_InternalBatteryType",
  "Census_InternalBatteryNumberOfCharges",
  "Census_OSInstallLanguageIdentifier",
  "Census_OSUILocaleIdentifier",
  "Census_IsFlightingInternal",
  "Census_ThresholdOptIn",
  "Census_FirmwareManufacturerIdentifier",
  "Census_FirmwareVersionIdentifier",
  "Census_IsWIMBootEnabled",
  "Wdft_RegionIdentifier"
)

# Keep every other column in its original order; names that are not present
# in the frame are simply ignored, as with the `$<- NULL` form.
train_kept_columns <- setdiff(names(MalwareDF), train_dropped_columns)
MalwareDF <- MalwareDF[train_kept_columns]

# The machine id is stored as plain text rather than as a factor.
MalwareDF[["MachineIdentifier"]] <-
  as.character(MalwareDF[["MachineIdentifier"]])

write(
  paste0(date(), "::", "LoadTrainData - DONE"),
  file = "log.txt",
  append = TRUE
)

Load the test dataset and remove the same NA columns.

# Read the complete test file.
MalwareTest <- read.csv("test.csv", header = TRUE)

# Columns removed from the test frame before any further processing.
test_dropped_columns <- c(
  "RtpStateBitfield",
  "IsSxsPassiveMode",
  "AVProductsInstalled",
  "AVProductsEnabled",
  "IeVerIdentifier",
  "Census_OEMNameIdentifier",
  "Census_OEMModelIdentifier",
  "Census_ProcessorManufacturerIdentifier",
  "Census_ProcessorModelIdentifier",
  "Census_InternalBatteryType",
  "Census_InternalBatteryNumberOfCharges",
  "Census_OSInstallLanguageIdentifier",
  "Census_OSUILocaleIdentifier",
  "Census_IsFlightingInternal",
  "Census_ThresholdOptIn",
  "Census_FirmwareManufacturerIdentifier",
  "Census_FirmwareVersionIdentifier",
  "Census_IsWIMBootEnabled",
  "Wdft_RegionIdentifier"
)

# Keep every other column in its original order; names that are not present
# in the frame are simply ignored, as with the `$<- NULL` form.
test_kept_columns <- setdiff(names(MalwareTest), test_dropped_columns)
MalwareTest <- MalwareTest[test_kept_columns]

# The machine id is stored as plain text rather than as a factor.
MalwareTest[["MachineIdentifier"]] <-
  as.character(MalwareTest[["MachineIdentifier"]])

# Placeholder for the target column; it is filled in by the prediction step.
MalwareTest$HasDetections <- 0

write(
  paste0(date(), "::", "LoadTestData - DONE"),
  file = "log.txt",
  append = TRUE
)

To make things easier for the machine learning step, we will convert the factor columns into numeric ones. This can be achieved by creating a dictionary that is applied to both the train and test datasets.

# Integer-encode every categorical column of MalwareDF (all columns except
# the first, which holds the machine id) and apply the same encoding to the
# column of the same name in MalwareTest.
#
# For each encoded column the code of a value is its position in the vector
# of distinct training values, in order of first appearance. That vector is
# kept as a one-column data frame in a global named index<i>, where i is the
# column's position in MalwareDF.
#
# Values are looked up with match(), i.e. compared as whole strings. A
# pattern-based replacement (gsub) must not be used here: it treats each
# category as a regular expression and as a substring, so categories such as
# "1.1.15100.1", "", or a category contained in another one would rewrite
# unrelated cells, including codes written by earlier replacements.
colNum <- ncol(MalwareDF)
train_col_names <- names(MalwareDF)

for (i in seq_len(colNum)[-1]) {
  col_name <- train_col_names[i]
  train_values <- MalwareDF[[i]]

  # read.csv() yields factors on R < 4.0 and character vectors on R >= 4.0;
  # both are categorical and need encoding. Everything else is left as is.
  if (!(is.factor(train_values) || is.character(train_values))) {
    next
  }

  train_values <- as.character(train_values)
  # NA is not a category: missing values stay NA after encoding.
  categories <- unique(train_values[!is.na(train_values)])

  MalwareDF[[i]] <- match(train_values, categories)

  # Address the test column by name so a different column order in the test
  # frame cannot pair a dictionary with the wrong column. Categories that
  # never occur in the training data become NA.
  if (col_name %in% names(MalwareTest)) {
    MalwareTest[[col_name]] <-
      match(as.character(MalwareTest[[col_name]]), categories)
  }

  # Store the dictionary only for columns that were actually encoded.
  dictionary <- data.frame(categories, stringsAsFactors = FALSE)
  colnames(dictionary) <- col_name
  assign(paste0("index", i), dictionary)
}

write.csv(MalwareDF, file = "MalwareDF-1.csv", row.names = FALSE)
write.csv(MalwareTest, file = "MalwareTest-1.csv", row.names = FALSE)
write(
  paste0(date(), "::", "IndexFactor - DONE"),
  file = "log.txt",
  append = TRUE
)

This step is optional, but it can make things easier for the ML step. We’d like to convert the double columns into integers:

# Convert the two double-valued columns to integers in both data frames:
#   * the primary disk capacity is divided by 1000 and rounded,
#   * the display diagonal is rounded to the nearest whole number.
#
# The arithmetic is applied to whole columns at once. Updating a data-frame
# column one element at a time is extremely slow on frames of this size, and
# a `1:length(x)` loop also misbehaves when the column is empty.
disk_col <- "Census_PrimaryDiskTotalCapacity"
display_col <- "Census_InternalPrimaryDiagonalDisplaySizeInInches"

MalwareDF[[disk_col]] <- as.integer(round(MalwareDF[[disk_col]] / 1000))
MalwareDF[[display_col]] <- as.integer(round(MalwareDF[[display_col]]))

MalwareTest[[disk_col]] <- as.integer(round(MalwareTest[[disk_col]] / 1000))
MalwareTest[[display_col]] <- as.integer(round(MalwareTest[[display_col]]))

write.csv(MalwareDF, file = "MalwareDF-2.csv", row.names = FALSE)
write.csv(MalwareTest, file = "MalwareTest-2.csv", row.names = FALSE)
write(
  paste0(date(), "::", "HandlieDbl - DONE"),
  file = "log.txt",
  append = TRUE
)

At this step, we run the Boosting Trees machine learning model.

# Fit a gradient-boosted tree model for the binary HasDetections target,
# using every column of the training frame except the machine id.
Train_cv <- MalwareDF
modFit <- gbm(
  formula = HasDetections ~ . - MachineIdentifier,
  data = Train_cv,
  distribution = "bernoulli",
  # Ensemble size and learning rate.
  n.trees = 300,
  shrinkage = 0.1,
  # Shape of each individual tree.
  interaction.depth = 2,
  n.minobsinnode = 10,    # minimum number of observations in a terminal node
  # Sampling and validation.
  bag.fraction = 0.5,     # share of training rows drawn for each new tree
  train.fraction = 0.95,  # leading share of rows used for fitting
  cv.folds = 10,          # number of cross-validation folds
  # Bookkeeping.
  keep.data = TRUE,
  verbose = FALSE,
  n.cores = 1
)

# Estimate the best number of iterations, once from the cross-validation
# error and once from the out-of-bag estimate.
gbm.perf(modFit, method = "cv")
gbm.perf(modFit, method = "OOB")

saveRDS(modFit, "modFit-gbm-1.rds")
write(
  paste0(date(), "::", "BoostingTrees - DONE"),
  file = "log.txt",
  append = TRUE
)

Now we can run the prediction on our test dataset.

# Score the test set with the fitted model, in chunks of 10000 rows, and
# store the link-scale predictions in MalwareTest$HasDetections.
#
# The number of chunks is derived from the size of the test frame, so the
# final, shorter chunk needs no special handling and no row count is
# hard-coded. Each chunk's predictions are written to exactly the rows they
# were computed from (rows startRow..endRow); writing them to startRow + j
# would shift every score down by one row and leave row 1 unscored.
chunk_size <- 10000L
n_test_rows <- nrow(MalwareTest)

# Scores are collected in a plain vector and attached to the data frame
# once, which avoids modifying the large frame on every iteration.
link_scores <- numeric(n_test_rows)

for (chunk in seq_len(ceiling(n_test_rows / chunk_size))) {
  # Integer arithmetic keeps the row bounds from being rendered in
  # scientific notation in the log.
  startRow <- (chunk - 1L) * chunk_size + 1L
  endRow <- min(chunk * chunk_size, n_test_rows)
  chunk_rows <- startRow:endRow

  PredicSample <- MalwareTest[chunk_rows, , drop = FALSE]
  link_scores[chunk_rows] <- predict(
    modFit,
    newdata = PredicSample,
    n.trees = 300,
    type = "link"
  )

  write(
    paste0(date(), "::", "Prediction - ", startRow, " : ", endRow),
    file = "log.txt",
    append = TRUE
  )
}

MalwareTest$HasDetections <- link_scores

# The scored frame is written once, after all chunks are done; rewriting the
# whole file after every chunk only repeated the same large write.
write.csv(MalwareTest, file = "mySubmision.csv", row.names = FALSE)
write(
  paste0(date(), "::", "Prediction - DONE"),
  file = "log.txt",
  append = TRUE
)

Since the predictions are on a scale centered at zero (the link scale), we can split the dataset into two classes: 0 and 1.

# Reduce the scored test frame to the two submission columns.
submission_columns <- c("MachineIdentifier", "HasDetections")
MalwareTest <- MalwareTest[, submission_columns]

# Split the rows by the sign of the link-scale score. Rows whose score is NA
# belong to neither group and are therefore left out of the submission.
link_score <- MalwareTest$HasDetections
has_score <- !is.na(link_score)
MalwareTest1 <- MalwareTest[has_score & link_score > 0, ]
MalwareTest2 <- MalwareTest[has_score & link_score <= 0, ]

# Replace the scores with hard class labels.
MalwareTest1$HasDetections <- 1
MalwareTest2$HasDetections <- 0

# Stack the positive rows on top of the negative rows, then sort by machine id.
MalwareSubmit <- rbind(MalwareTest1, MalwareTest2)
submission_order <- order(MalwareSubmit$MachineIdentifier)
MalwareSubmit <- MalwareSubmit[submission_order, ]

write(
  paste0(date(), "::", "Submission - DONE"),
  file = "log.txt",
  append = TRUE
)
write.csv(MalwareSubmit, file = "mySubmision1.csv", row.names = FALSE)