Our first glimpse of planets outside the solar system we call home came in 1992, when several terrestrial-mass planets were detected orbiting the pulsar PSR B1257+12. With this dataset you can become a space explorer too, analyzing the characteristics of all discovered exoplanets (plus some familiar faces like Mars, Saturn, and even Earth). Data fields include planet and host-star attributes, discovery methods, and (of course) the date of discovery.
The data was originally collected, and continues to be updated, by Hanno Rein at the Open Exoplanet Catalogue GitHub repository.
The aim of this code is to predict the type of planet (TypeFlag) from the recorded observations (the remaining columns). TypeFlag: 0 = no known stellar binary companion; 1 = P-type binary (circumbinary); 2 = S-type binary; 3 = orphan planet (no star).
### Reading the data
data<-read.csv("C:\\R-Programming\\Exoplanet\\oec.csv")
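Before any modelling it is worth checking what was read in and how the classes are distributed; a minimal sketch using base R:
str(data)              # column names and types as read from oec.csv
table(data$TypeFlag)   # class balance of the target variable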
library("reshape2")
library(ggplot2)
## Plotting some exploratory graphs
df.m <- melt(data, "TypeFlag")
## Warning: attributes are not identical across measure variables; they will
## be dropped
g<-ggplot(df.m,aes(x=value,y=TypeFlag))+geom_point(aes(col=TypeFlag))+facet_wrap(~variable)
library("reshape2")
##Plotting some graphs for explorations
df.m <- melt(data, "TypeFlag")
## Warning: attributes are not identical across measure variables; they will
## be dropped
g<-ggplot(df.m,aes(x=value,y=TypeFlag))+geom_point(aes(col=TypeFlag))+facet_wrap(~variable)
print(g)
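Because the melted value column mixes quantities with very different scales (masses, radii, dates), a variant of the same plot with a free x-axis per facet can be easier to read; a sketch, not run here:
g_free <- ggplot(df.m, aes(x = value, y = TypeFlag)) +
  geom_point(aes(col = TypeFlag)) +
  facet_wrap(~variable, scales = "free_x")
print(g_free)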
## Dropping the planet identifier, which carries no predictive information
temp<-data
temp$PlanetIdentifier<-NULL
library("caret")
## Loading required package: lattice
library("caTools")
### Splitting the data into training and testing sets
split<-sample.split(temp$TypeFlag,SplitRatio = 0.8)
train<-subset(temp,split==TRUE)
test<-subset(temp,split==FALSE)
temptest<-test        # keep a copy with the true labels for later evaluation
test$TypeFlag<-NULL   # remove the target from the test features
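sample.split stratifies on TypeFlag, but the draw itself is random; fixing a seed beforehand makes the split reproducible, and comparing class shares confirms the stratification. A sketch (the seed value is arbitrary):
set.seed(123)                             # arbitrary seed for a reproducible split
split <- sample.split(temp$TypeFlag, SplitRatio = 0.8)
prop.table(table(temp$TypeFlag))          # class shares in the full data
prop.table(table(temp$TypeFlag[split]))   # class shares in the training portion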
library("xgboost")
library("Matrix")
library(e1071)
library("dplyr")
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:xgboost':
##
## slice
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Converting factor variables to numeric type
train<- train %>% mutate_if(is.factor,as.numeric)
test<- test %>% mutate_if(is.factor,as.numeric)
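Coercing factors to their integer codes imposes an arbitrary ordering on categorical columns. A common alternative is one-hot encoding with Matrix::sparse.model.matrix (the Matrix package is loaded above); the sketch below would replace the mutate_if step, and note that rows containing NAs are dropped by model.matrix's default na.action:
# Sketch: one-hot encode factor columns instead of coercing them to integer codes
train_onehot <- sparse.model.matrix(TypeFlag ~ . - 1, data = subset(temp, split == TRUE))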
## Excluding the target variable (TypeFlag, here the first column) to build the feature matrix
data_variables <- as.matrix(train[,-1])
data_label <- train[,"TypeFlag"]
data_matrix <- xgb.DMatrix(data = data_variables, label = data_label)
numberOfClasses <- length(unique(train$TypeFlag))
xgb_params <- list("objective" = "multi:softmax",
                   eta = 0.04,
                   gamma = 0.6,
                   max_depth = 10,
                   eval_metric = "mlogloss",
                   "num_class" = numberOfClasses)
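multi:softmax makes predict() return a single class label per row; if class probabilities are wanted instead, the same parameter list can be reused with the objective swapped, as sketched below:
# Sketch: multi:softprob would make predict() return one probability per class
xgb_params_prob <- modifyList(xgb_params, list(objective = "multi:softprob"))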
xgbcv <- xgb.cv(params = xgb_params, data = data_matrix, nrounds = 400, nfold = 10,
                showsd = TRUE, stratified = TRUE, print_every_n = 10,
                early_stopping_rounds = 20, maximize = FALSE)
## [1] train-mlogloss:1.307975+0.000025 test-mlogloss:1.308010+0.000142
## Multiple eval metrics are present. Will use test_mlogloss for early stopping.
## Will train until test_mlogloss hasn't improved in 20 rounds.
##
## [11] train-mlogloss:0.782430+0.000189 test-mlogloss:0.782884+0.001207
## [21] train-mlogloss:0.496765+0.000277 test-mlogloss:0.497818+0.002020
## [31] train-mlogloss:0.323754+0.000312 test-mlogloss:0.325323+0.002692
## [41] train-mlogloss:0.214145+0.000328 test-mlogloss:0.216189+0.003284
## [51] train-mlogloss:0.143104+0.000325 test-mlogloss:0.145593+0.003835
## [61] train-mlogloss:0.096442+0.000304 test-mlogloss:0.099305+0.004397
## [71] train-mlogloss:0.065336+0.000243 test-mlogloss:0.068668+0.004895
## [81] train-mlogloss:0.044681+0.000205 test-mlogloss:0.048459+0.005309
## [91] train-mlogloss:0.031001+0.000186 test-mlogloss:0.035171+0.005707
## [101] train-mlogloss:0.021816+0.000196 test-mlogloss:0.026335+0.006103
## [111] train-mlogloss:0.015739+0.000202 test-mlogloss:0.020447+0.006330
## [121] train-mlogloss:0.011760+0.000196 test-mlogloss:0.016628+0.006603
## [131] train-mlogloss:0.009077+0.000192 test-mlogloss:0.014122+0.006857
## [141] train-mlogloss:0.007237+0.000179 test-mlogloss:0.012481+0.007096
## [151] train-mlogloss:0.006013+0.000173 test-mlogloss:0.011410+0.007290
## [161] train-mlogloss:0.005193+0.000183 test-mlogloss:0.010773+0.007537
## [171] train-mlogloss:0.004679+0.000199 test-mlogloss:0.010415+0.007787
## [181] train-mlogloss:0.004363+0.000201 test-mlogloss:0.010228+0.008008
## [191] train-mlogloss:0.004133+0.000199 test-mlogloss:0.010132+0.008207
## [201] train-mlogloss:0.004026+0.000189 test-mlogloss:0.010104+0.008397
## [211] train-mlogloss:0.003962+0.000195 test-mlogloss:0.010107+0.008549
## [221] train-mlogloss:0.003927+0.000194 test-mlogloss:0.010108+0.008639
## Stopping. Best iteration:
## [204] train-mlogloss:0.004003+0.000189 test-mlogloss:0.010098+0.008449
nround <- xgbcv$best_iteration  # best number of boosting rounds found by cross-validation
# Fit the final model on the full training data using that number of rounds
bst_model <- xgb.train(params = xgb_params,
data = data_matrix,
nrounds = nround)
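With the final model fitted, xgboost's importance helpers show which columns drive the predictions; a quick sketch:
importance <- xgb.importance(feature_names = colnames(data_variables), model = bst_model)
head(importance)                  # top features by gain
xgb.plot.importance(importance)   # bar chart of relative importance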
test_matrix<-xgb.DMatrix(data = as.matrix(test))
predictionsXGBoost<-predict(bst_model,newdata=test_matrix)
table(predictionsXGBoost,temptest$TypeFlag)
##
## predictionsXGBoost 0 1 2 3
## 0 679 0 0 0
## 1 0 6 0 1
## 2 0 0 31 0
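The confusion matrix above can be summarised as overall accuracy, or handed to caret's confusionMatrix (caret is already loaded) for per-class statistics; a sketch:
tab <- table(factor(predictionsXGBoost, levels = 0:3),
             factor(temptest$TypeFlag, levels = 0:3))
sum(diag(tab)) / sum(tab)   # overall accuracy on the held-out set
confusionMatrix(tab)        # per-class sensitivity and specificity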