Cleansing

First, let’s load the relevant packages. Then we’ll load the train dataset. Our third step is to reduce the number of columns, as Kaggle lets us know that some columns are NAs.

# Load the training data, drop the columns this analysis does not use, and
# record progress in log.txt.
write(paste0(date(), "::", "Start!!!"), file = "log.txt", append = TRUE)
library(gbm)
library(caret)
library(dplyr)
# NOTE(review): hard-coded, machine-specific working directory. Every relative
# path used by this script ("train.csv", "log.txt", ...) resolves against it.
setwd("G:/DataScienceProject/Kaggle_Microsoft_Malware _Prediction")

# `nrows` is spelled out in full; the original `nrow` only worked through
# partial argument matching. `stringsAsFactors = TRUE` pins the pre-R-4.0
# default explicitly: the factor-encoding step later in this script tests for
# factor columns, and since R 4.0.0 read.csv() returns character columns
# unless asked otherwise.
MalwareDF <- read.csv(
  file = "train.csv",
  header = TRUE,
  nrows = 603000,
  stringsAsFactors = TRUE
)

# Columns removed up front. Names that are not present in the file are simply
# ignored, matching the behaviour of `df$col <- NULL` on a missing column.
unusedTrainColumns <- c(
  "RtpStateBitfield",
  "IsSxsPassiveMode",
  "AVProductsInstalled",
  "AVProductsEnabled",
  "IeVerIdentifier",
  "Census_OEMNameIdentifier",
  "Census_OEMModelIdentifier",
  "Census_ProcessorManufacturerIdentifier",
  "Census_ProcessorModelIdentifier",
  "Census_InternalBatteryType",
  "Census_InternalBatteryNumberOfCharges",
  "Census_OSInstallLanguageIdentifier",
  "Census_OSUILocaleIdentifier",
  "Census_IsFlightingInternal",
  "Census_ThresholdOptIn",
  "Census_FirmwareManufacturerIdentifier",
  "Census_FirmwareVersionIdentifier",
  "Census_IsWIMBootEnabled",
  "Wdft_RegionIdentifier"
)
MalwareDF <- MalwareDF[
  , setdiff(names(MalwareDF), unusedTrainColumns),
  drop = FALSE
]

# The identifier is a row key, not a category, so keep it as plain text.
MalwareDF$MachineIdentifier <- as.character(MalwareDF$MachineIdentifier)
write(paste0(date(), "::", "LoadTrainData - DONE"), file = "log.txt", append = TRUE)

Load the test dataset & remove the NA columns.

# Read the full test set, then keep only the columns that are not on the
# discard list below (the same list that is removed from the training data).
MalwareTest <- read.csv(file = "test.csv", header = TRUE)
MalwareTest <- MalwareTest[
  ,
  !(names(MalwareTest) %in% c(
    "RtpStateBitfield",
    "IsSxsPassiveMode",
    "AVProductsInstalled",
    "AVProductsEnabled",
    "IeVerIdentifier",
    "Census_OEMNameIdentifier",
    "Census_OEMModelIdentifier",
    "Census_ProcessorManufacturerIdentifier",
    "Census_ProcessorModelIdentifier",
    "Census_InternalBatteryType",
    "Census_InternalBatteryNumberOfCharges",
    "Census_OSInstallLanguageIdentifier",
    "Census_OSUILocaleIdentifier",
    "Census_IsFlightingInternal",
    "Census_ThresholdOptIn",
    "Census_FirmwareManufacturerIdentifier",
    "Census_FirmwareVersionIdentifier",
    "Census_IsWIMBootEnabled",
    "Wdft_RegionIdentifier"
  )),
  drop = FALSE
]

# Keep the row key as text and add a placeholder target column that the
# prediction step fills in later.
MalwareTest$MachineIdentifier <- as.character(MalwareTest$MachineIdentifier)
MalwareTest$HasDetections <- 0
write(paste0(date(), "::", "LoadTestData - DONE"), file = "log.txt", append = TRUE)

For easing the load on machine learning, we will convert factor columns into numeric ones. This can be achieved by creating a dictionary shared by both the train & test datasets.

# Replace every categorical (factor or character) column of MalwareDF, except
# column 1 (MachineIdentifier), with integer codes, and apply the same
# dictionary to the column of the same name in MalwareTest.
#
# The code of a value is its position among the distinct non-missing training
# values, in order of first appearance. Values that are missing, or that occur
# only in the test set, become NA.
#
# For each encoded column i the dictionary is kept as a one-column data frame
# named index<i> (e.g. index2), whose row number is the integer code.
#
# Why match() instead of the previous per-category gsub():
#   * gsub() treats the category as a regular expression and replaces
#     substrings, so "." matched any character, an empty-string category
#     inserted the code between every character, and an NA category turned
#     the whole column into NA;
#   * later categories could match inside codes written by earlier ones;
#   * one pass per category over every row was needlessly slow.
colNum <- ncol(MalwareDF)
for (i in seq_len(colNum)[-1]) {
  trainValues <- MalwareDF[[i]]
  if (!(is.factor(trainValues) || is.character(trainValues))) {
    # Numeric columns are left untouched and get no dictionary. (Previously a
    # stale dictionary from an earlier column was stored for them.)
    next
  }

  columnName <- names(MalwareDF)[i]
  trainValues <- as.character(trainValues)

  # Dictionary of distinct, non-missing training values.
  n <- data.frame(
    unique(trainValues[!is.na(trainValues)]),
    stringsAsFactors = FALSE
  )
  colnames(n) <- columnName

  MalwareDF[[i]] <- match(trainValues, n[[1]])

  # Look the test column up by name rather than by position so a differing
  # column order between the two files cannot mis-encode a column.
  if (columnName %in% names(MalwareTest)) {
    MalwareTest[[columnName]] <- match(
      as.character(MalwareTest[[columnName]]),
      n[[1]]
    )
  }

  assign(paste0("index", i), n)
}
write.csv(MalwareDF, file = "MalwareDF-1.csv", row.names = FALSE)
write.csv(MalwareTest, file = "MalwareTest-1.csv", row.names = FALSE)
write(paste0(date(), "::", "IndexFactor - DONE"), file = "log.txt", append = TRUE)

This step is optional, but it can ease the load on ML. We’d like to convert double columns into integer ones:

# Convert two double-valued columns to integers in both data sets:
#   * Census_PrimaryDiskTotalCapacity is divided by 1000 and rounded;
#   * Census_InternalPrimaryDiagonalDisplaySizeInInches is rounded.
#
# round() and `/` are vectorised, so whole columns are transformed at once.
# The results are the same as the previous row-by-row loops, without the cost
# of one data-frame assignment per row and without the `1:length(x)` pitfall
# on an empty column.
MalwareDF$Census_PrimaryDiskTotalCapacity <-
  as.integer(round(MalwareDF$Census_PrimaryDiskTotalCapacity / 1000))
MalwareDF$Census_InternalPrimaryDiagonalDisplaySizeInInches <-
  as.integer(round(MalwareDF$Census_InternalPrimaryDiagonalDisplaySizeInInches))

MalwareTest$Census_PrimaryDiskTotalCapacity <-
  as.integer(round(MalwareTest$Census_PrimaryDiskTotalCapacity / 1000))
MalwareTest$Census_InternalPrimaryDiagonalDisplaySizeInInches <-
  as.integer(round(MalwareTest$Census_InternalPrimaryDiagonalDisplaySizeInInches))

write.csv(MalwareDF, file = "MalwareDF-2.csv", row.names = FALSE)
write.csv(MalwareTest, file = "MalwareTest-2.csv", row.names = FALSE)
write(paste0(date(), "::", "HandlieDbl - DONE"), file = "log.txt", append = TRUE)

At this step, we run the Boosting Trees machine learning.

# Fit a gradient-boosted tree classifier for HasDetections on every column of
# the training frame except MachineIdentifier, inspect the fit, and save the
# model to disk. Parameter notes below follow the gbm package documentation.
Train_cv <- MalwareDF
modFit <- gbm(HasDetections ~.-MachineIdentifier, data=Train_cv, 
              distribution = 'bernoulli', 
              n.trees = 300, # total number of trees (boosting iterations) to fit
              shrinkage = 0.1, # learning rate applied to each tree
              interaction.depth = 2, 
              bag.fraction = 0.5, # fraction of training rows sampled to propose the next tree
              train.fraction = 0.95, # first 95% of rows fit the model; the rest give out-of-sample loss
              n.minobsinnode = 10, # minimum number of observations in a terminal node
              cv.folds = 10, # number of cross-validation folds
              keep.data = TRUE,
              verbose = FALSE, n.cores = 1)
# gbm.perf() estimates the best number of boosting iterations (by
# cross-validation, then by the out-of-bag estimate). The returned values are
# not stored here; the prediction step later in this script uses a fixed
# n.trees = 300.
gbm.perf(modFit, method="cv")
gbm.perf(modFit, method="OOB")
# Persist the fitted model so it can be reloaded with readRDS().
saveRDS(modFit, "modFit-gbm-1.rds")
write(paste0(date(), "::", "BoostingTrees - DONE"), file = "log.txt", append = TRUE)

Now, we can run the prediction on our test data set.

# Score the test set with the fitted model in chunks of `chunkSize` rows and
# store the raw link-scale predictions in MalwareTest$HasDetections.
#
# Changes from the previous version:
#   * Off-by-one fix: each prediction is now written to the row it was
#     computed for. The old loop wrote to `startRow + j`, one row too far, so
#     every chunk was shifted down by one row and row 1 never got a score.
#   * The number of chunks is derived from nrow(MalwareTest) instead of the
#     hard-coded 785, and the final partial chunk is handled by the same loop
#     (it no longer depends on the value `i` happens to have after the loop).
#   * Predictions are assigned as one vector per chunk, not element by
#     element.
#   * The CSV is written once at the end. Rewriting the entire test frame
#     after every chunk produced the same final file at far greater cost.
chunkSize <- 10000
totalRows <- nrow(MalwareTest)
chunkCount <- ceiling(totalRows / chunkSize)

for (i in seq_len(chunkCount)) {
  startRow <- (i - 1) * chunkSize + 1
  endRow <- min(i * chunkSize, totalRows)

  PredicSample <- MalwareTest[startRow:endRow, ]
  Prediction <- predict(modFit, newdata = PredicSample, n.trees = 300, type = "link")
  MalwareTest$HasDetections[startRow:endRow] <- Prediction

  write(paste0(date(), "::", "Prediction - ", startRow, " : ", endRow), file = "log.txt", append = TRUE)
}

write.csv(MalwareTest, file = "mySubmision.csv", row.names = FALSE)
write(paste0(date(), "::", "Prediction - DONE"), file = "log.txt", append = TRUE)

Since the prediction is on a scale whose mean is zero, we can split the dataset into 2 types: 0 & 1.

# Build the submission: keep the identifier and the score, turn the score
# into a 0/1 label (1 when the link-scale prediction is above zero, 0
# otherwise), sort by MachineIdentifier and write the result.
#
# The label is computed with one vectorised comparison. The previous
# split / relabel / rbind approach failed with an error when either class was
# empty and silently dropped rows whose score was NA; here every input row is
# kept (an NA score stays NA so the problem is visible in the output).
MalwareTest <- MalwareTest[, c("MachineIdentifier", "HasDetections")]

MalwareSubmit <- MalwareTest
MalwareSubmit$HasDetections <- as.integer(MalwareSubmit$HasDetections > 0)
MalwareSubmit <- MalwareSubmit[order(MalwareSubmit$MachineIdentifier), ]

# Write the file first so that "DONE" is only logged once it exists.
write.csv(MalwareSubmit, file = "mySubmision1.csv", row.names = FALSE)
write(paste0(date(), "::", "Submission - DONE"), file = "log.txt", append = TRUE)

Kaggle_Microsoft_Malware _Prediction

Gilad Shtern

Jan 11, 2019

Introduction

Cleansing