Introduction

The human eye has been the arbiter for the classification of astronomical sources in the night sky for hundreds of years. But a new facility – the Large Synoptic Survey Telescope (LSST) – is about to revolutionize the field, discovering 10 to 100 times more astronomical sources that vary in the night sky than we’ve ever known. Some of these sources will be completely unprecedented!

The Photometric LSST Astronomical Time-Series Classification Challenge (PLAsTiCC) asks Kagglers to help prepare to classify the data from this new survey. Competitors will classify astronomical sources that vary with time into different classes, scaling from a small training set to a very large test set of the type the LSST will discover.

Data Handling

Step#1 - Load relevant libs & files.

library(astroFns)
library(randomForest)
library(caret)
# Work from the project folder so the relative file names below resolve.
setwd("G:/DataScienceProject/Kaggle_PLAsTiCC_Astronomical_Classification")

# Load the training light curves plus the train/test metadata tables.
# read.csv() treats the first line as the header by default.
Trainset <- read.csv("training_set.csv")
TrainMeta <- read.csv("training_set_metadata.csv")
TestMeta <- read.csv("test_set_metadata.csv")

Step#2 - Clean the data & add relevant columns: 1. Add hostgal_specz (together with ddf and target) from the metadata as reference. 2. Keep only the rows with ddf equal to ‘1’. 3. Handle negative flux & flux error. 4. Remove the columns that are no longer needed after processing.

# Step 2: clean the training observations and attach per-object metadata.
# Every operation is vectorised; the metadata lookup is a single match() on
# object_id instead of one regex search per observation.

# Keep only the observations flagged as detected.
Trainset <- Trainset[which(Trainset$detected == 1), ]

# Round the observation date (mjd) to the nearest integer.
Trainset$mjd <- round(Trainset$mjd)

# Negative flux readings are clamped to zero together with their error.
# NA flux values are left untouched.
negative_flux <- !is.na(Trainset$flux) & Trainset$flux < 0
Trainset$flux[negative_flux] <- 0
Trainset$flux_err[negative_flux] <- 0

# Row of TrainMeta that describes each observation's object. Objects without
# a metadata row get NA here and are dropped by the ddf filter below.
meta_row <- match(Trainset$object_id, TrainMeta$object_id)
Trainset$hostgal_specz <- TrainMeta$hostgal_specz[meta_row]
Trainset$ddf <- TrainMeta$ddf[meta_row]
Trainset$target <- TrainMeta$target[meta_row]

# Keep only objects with ddf == 1, then drop the helper columns that are not
# used as model features.
Trainset <- Trainset[which(Trainset$ddf == 1), ]
Trainset$ddf <- NULL
Trainset$detected <- NULL
write.csv(Trainset, file = "Trainset-new-baseline.csv", row.names = FALSE)

Step#3 - Since I have already seen the feature importance, I can remove the low-impact features.

# Metadata columns that are not used as features; drop them in one pass.
# Names that are not present in TestMeta are simply ignored.
low_impact_cols <- c(
  "ra", "gal_l", "decl", "mwebv", "distmod",
  "hostgal_photoz_err", "hostgal_photoz", "gal_b", "ddf"
)
TestMeta <- TestMeta[, !(names(TestMeta) %in% low_impact_cols), drop = FALSE]

# Keep only the objects that have a hostgal_specz value.
TestMeta <- TestMeta[!is.na(TestMeta$hostgal_specz), ]

Step#4 - Create an 80/20 train/validation split & train a Random Forest model.

# Step 4: hold out 20% of the cleaned training rows and fit the forest on the
# remaining 80%.

# Fix the RNG so the partition and the forest can be reproduced.
set.seed(2018)
Trainset_cv <- createDataPartition(Trainset$target, p = 0.8, list = FALSE)
Train_cv <- Trainset[Trainset_cv, ]
Test_cv <- Trainset[-Trainset_cv, ]

# Every column except object_id is used as a predictor of target.
model2 <- randomForest(target ~ . - object_id, data = Train_cv,
                       importance = TRUE)
model2
saveRDS(model2, "model2.baseline.rds")
# model2 <- readRDS("model2.baseline.rds")

# Score the hold-out rows so the split is actually used for validation.
# NOTE(review): RMSE assumes target is numeric, which is how the prediction
# step treats it (it rounds the predictions) - confirm.
holdout_pred <- predict(model2, Test_cv, type = "response")
holdout_rmse <- sqrt(mean((holdout_pred - Test_cv$target)^2))
print(holdout_rmse)

importance(model2)

Step#5 - Process the test dataset so that it aligns with the train dataset. After that, run the prediction & write the submission results.

# Step 5: score test_set.csv chunk by chunk and mark the predicted class of
# every object in the submission file.

# Class labels in the order of the submission columns: label number k is
# written to column 1 + k of submitFile.
class_ids <- c(6, 15, 16, 42, 52, 53, 62, 64, 65, 67, 88, 90, 92, 95, 99)
chunk_size <- 10000
test_cols <- c("object_id", "mjd", "passband", "flux", "flux_err", "detected")

# Position (1..length(classes)) of the class label closest to each prediction.
# Ties are resolved in favour of the first label in `classes`.
nearest_class <- function(pred, classes = class_ids) {
  vapply(
    pred,
    function(p) which.min(abs(p - classes)),
    integer(1),
    USE.NAMES = FALSE
  )
}

submitFile <- read.csv(file = "mySubmission.csv", header = TRUE)
write(paste0(date(), "::", "Start!!!"), file = "log.txt", append = TRUE)
loopStop <- 330788055 # the loop also ends once an object_id reaches this value
flag <- 0
i <- 131 # first chunk to process; earlier chunks are skipped

# Read the file through one open connection so each chunk continues where the
# previous one stopped instead of re-scanning the file from the top.
test_con <- file("test_set.csv", open = "r")
tryCatch({
  invisible(readLines(test_con, n = 1)) # header line
  for (skipped in seq_len(i - 1)) {
    invisible(readLines(test_con, n = chunk_size))
  }

  while (flag < loopStop) {
    chunk_lines <- readLines(test_con, n = chunk_size)
    if (length(chunk_lines) == 0) {
      break # end of file
    }
    Testset <- read.csv(text = chunk_lines, header = FALSE,
                        col.names = test_cols)
    # Largest object_id read so far; compared against loopStop above.
    flag <- max(Testset$object_id)

    # Same cleaning as before: round mjd and discard rows whose flux is zero
    # or negative (negative flux used to be set to 0 and then removed).
    Testset$detected <- NULL
    Testset$mjd <- round(Testset$mjd)
    Testset <- Testset[!is.na(Testset$flux) & Testset$flux > 0, ]

    if (nrow(Testset) > 0) {
      # Attach hostgal_specz from the metadata; objects that are not listed
      # in TestMeta keep 0.
      meta_row <- match(Testset$object_id, TestMeta$object_id)
      has_meta <- !is.na(meta_row)
      Testset$hostgal_specz <- 0
      Testset$hostgal_specz[has_meta] <-
        TestMeta$hostgal_specz[meta_row[has_meta]]

      # Round the prediction and snap it to the nearest class label.
      pred <- round(as.numeric(predict(model2, Testset, type = "response")))
      usable <- !is.na(pred)
      hits <- unique(data.frame(
        object_id = Testset$object_id[usable],
        class_idx = nearest_class(pred[usable])
      ))

      # Set the matching class column to 1 for every object that has a row in
      # the submission file.
      sub_row <- match(hits$object_id, submitFile$object_id)
      found <- !is.na(sub_row)
      for (idx in unique(hits$class_idx[found])) {
        submitFile[sub_row[found & hits$class_idx == idx], 1 + idx] <- 1
      }
    }

    # Save after every chunk so an interrupted run loses at most one chunk;
    # the chunk number is logged so the run can be resumed from it.
    write.csv(submitFile, file = "mySubmission.csv", row.names = FALSE)
    write(paste0(date(), "::", i), file = "log.txt", append = TRUE)
    i <- i + 1
  }
}, finally = close(test_con))

Conclusion

This is my first time running machine learning for classification over time. Thank you Kaggle!!

Kaggle_PLAsTiCC

Gilad Shtern

November 8, 2018

Introduction

Data Handling

Conclusion