SETUP ENVIRONMENT

Clear environment

# Start from an empty workspace: remove every object in the current
# environment, including hidden ones whose names begin with a dot.
rm(list = ls(all.names = TRUE))

Set working directory

# Make the data folder the working directory so the relative file names used
# by read.csv() / write.csv() below resolve against it.
# NOTE(review): this is an absolute, machine-specific path; setwd() fails on
# any machine where the directory does not exist.
setwd("C://Users//brbhatta//Desktop//INSOFE//PHD//1997B42_Broto//phd_data")

Load required R library

library(ROSE)
## Warning: package 'ROSE' was built under R version 3.5.1
## Loaded ROSE 0.0-3
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
library(plyr)
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:DMwR':
## 
##     join

Read Train and Test

# Load the training records and the records to be scored.
# stringsAsFactors is set explicitly: the later steps (stratified split,
# SMOTE, classification random forest) need ActionPoint as a factor, and
# since R 4.0.0 read.csv() no longer converts strings to factors by default.
Train <- read.csv("Train.csv", stringsAsFactors = TRUE)
Test <- read.csv("Test.csv", stringsAsFactors = TRUE)

Read MachinesDetails

# Load the per-machine attributes that are joined onto Train and Test.
# stringsAsFactors is set explicitly so that MachineModel is read as a factor
# (as the str() output in this report shows) on every R version; since
# R 4.0.0 read.csv() keeps strings as character by default.
MachinesDetails <- read.csv("MachinesDetails.csv", stringsAsFactors = TRUE)

Form TrainData and TestData by merging Train and Test with MachinesDetails on MachineID

# Attach each machine's attributes to the Train and Test rows, matching on
# MachineID. plyr's join() is called by its qualified name because DMwR also
# exports a join(); its default is a left join, so every row of Train / Test
# is kept.
TrainData <- plyr::join(Train, MachinesDetails, by = "MachineID")
TestData <- plyr::join(Test, MachinesDetails, by = "MachineID")

GENERAL VIEW OF DATA

First few rows of TrainData

# Peek at the first six rows of the merged training data.
head(TrainData, n = 6L)
##   MachineID     ActionPoint MachineModel ServicePeriod
## 1  MID11001 ComponentRepair       model4            21
## 2  MID11002 ComponentRepair       model1            21
## 3  MID11003         NoIssue       model3            22
## 4  MID11004         NoIssue       model4            22
## 5  MID11005 ComponentRepair       model3            22
## 6  MID11008 ComponentRepair       model3            21

Summary of TrainData

# Per-column summary: level counts for factors, quartiles for ServicePeriod.
summary(object = TrainData)
##     MachineID                 ActionPoint  MachineModel ServicePeriod  
##  MID11001:  1   ComponentRepair     :413   model1: 56   Min.   : 3.00  
##  MID11002:  1   ComponentReplacement: 70   model2:128   1st Qu.: 9.00  
##  MID11003:  1   NoIssue             :191   model3:276   Median :13.00  
##  MID11004:  1                              model4:214   Mean   :13.09  
##  MID11005:  1                                           3rd Qu.:18.00  
##  MID11008:  1                                           Max.   :22.00  
##  (Other) :668

Structure of TrainData

# Compact view of the column types and the first few values of each column.
str(object = TrainData)
## 'data.frame':    674 obs. of  4 variables:
##  $ MachineID    : Factor w/ 674 levels "MID11001","MID11002",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ ActionPoint  : Factor w/ 3 levels "ComponentRepair",..: 1 1 3 3 1 1 2 1 1 3 ...
##  $ MachineModel : Factor w/ 4 levels "model1","model2",..: 4 1 3 4 3 3 4 3 1 4 ...
##  $ ServicePeriod: int  21 21 22 22 22 21 21 21 21 21 ...

Check for NA values

# Count missing values per column; every count is zero, so TrainData has no
# NA values.
colSums(is.na(TrainData))
##     MachineID   ActionPoint  MachineModel ServicePeriod 
##             0             0             0             0

Remove MachineID from the training data, as it is a unique identifier and therefore useless for model building

# Drop the identifier column; every other column is kept in its original
# order.
TrainData$MachineID <- NULL

Distribution of Train data

# Class counts of the target: ComponentReplacement is clearly the rarest
# class.
table(TrainData[["ActionPoint"]])
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  413                   70                  191
# Confirm that the target is stored as a three-level factor.
str(TrainData[["ActionPoint"]])
##  Factor w/ 3 levels "ComponentRepair",..: 1 1 3 3 1 1 2 1 1 3 ...

TRAIN-VALIDATION SPLIT

Stratified sampling on ActionPoint, i.e. the target variable (60% training, 40% validation)

# Fix the RNG seed so the partition is reproducible.
set.seed(789)
# createDataPartition() samples within each ActionPoint level, so the 60/40
# split keeps the class proportions. list = FALSE returns the selected row
# numbers as a matrix instead of a list. FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
index_id <- createDataPartition(TrainData$ActionPoint, p = 0.6, list = FALSE)
# Selected rows form the training set; the remaining rows form the validation
# set.
training_data <- TrainData[index_id, ]
validation_data <- TrainData[-index_id, ]

Check the proportion of target variable in training and validation data

# Absolute class counts in the two partitions.
table(training_data[["ActionPoint"]])
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  248                   42                  115
table(validation_data[["ActionPoint"]])
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  165                   28                   76
# Relative class frequencies: both partitions show nearly identical
# proportions, as expected from the stratified split.
prop.table(table(training_data[["ActionPoint"]]))
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.6123457            0.1037037            0.2839506
prop.table(table(validation_data[["ActionPoint"]]))
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.6133829            0.1040892            0.2825279

SOLUTION FOR CLASS IMBALANCE: SMOTE

Apply SMOTE to the training data

# Rebalance the training partition with SMOTE.
# perc.over = 500 adds five synthetic cases per original case of the rarest
# class (ComponentReplacement: 42 -> 252); perc.under = 100 keeps one case of
# the other classes per synthetic case (210 in total), so the remaining
# classes are under-sampled.
training_data <- SMOTE(
  form = ActionPoint ~ .,
  data = training_data,
  perc.over = 500,
  perc.under = 100
)

Apply SMOTE to the validation data

# Apply the same SMOTE settings to the validation partition.
# NOTE(review): resampling the validation set replaces part of the real
# hold-out data with synthetic cases and discards real cases of the other
# classes, so accuracy measured on it does not reflect the original class
# distribution. Consider evaluating on the untouched partition instead; doing
# so changes every result reported after this point.
validation_data <- SMOTE(
  form = ActionPoint ~ .,
  data = validation_data,
  perc.over = 500,
  perc.under = 100
)

Check the proportion of the target variable in training and validation data after SMOTE

# Class counts after resampling: ComponentReplacement is now the largest
# class in both partitions.
table(training_data[["ActionPoint"]])
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  140                  252                   70
table(validation_data[["ActionPoint"]])
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                   97                  168                   43
# Relative class frequencies after resampling.
prop.table(table(training_data[["ActionPoint"]]))
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.3030303            0.5454545            0.1515152
prop.table(table(validation_data[["ActionPoint"]]))
## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.3149351            0.5454545            0.1396104

MODEL BUILDING

Random Forest

# Fit a 100-tree classification forest that predicts ActionPoint from all
# remaining columns. keep.forest = TRUE retains the trees so the model can be
# passed to predict() below.
model_rf <- randomForest(
  formula = ActionPoint ~ .,
  data = training_data,
  keep.forest = TRUE,
  ntree = 100
)
# Show the out-of-bag error estimate and the per-class confusion matrix.
print(model_rf)
## 
## Call:
##  randomForest(formula = ActionPoint ~ ., data = training_data,      keep.forest = TRUE, ntree = 100) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 38.74%
## Confusion matrix:
##                      ComponentRepair ComponentReplacement NoIssue
## ComponentRepair                   50                   77      13
## ComponentReplacement              27                  225       0
## NoIssue                           33                   29       8
##                      class.error
## ComponentRepair        0.6428571
## ComponentReplacement   0.1071429
## NoIssue                0.8857143

PREDICTION ON TRAINING DATA

# Predict the class label for every training row, passing only the predictor
# columns (everything except ActionPoint) as new data.
pred_Train <- predict(
  model_rf,
  training_data[names(training_data) != "ActionPoint"],
  type = "response",
  norm.votes = TRUE
)

Confusion matrix on Training data

# Cross-tabulate true labels (rows) against predicted labels (columns).
cm_Train <- table(
  actual = training_data$ActionPoint,
  predicted = pred_Train
)

Accuracy on Training data

# Accuracy = correctly classified cases (the diagonal of the confusion
# matrix) divided by all cases.
accu_Train <- sum(diag(cm_Train)) / sum(cm_Train)

PREDICTION ON VALIDATION DATA

# Predict the class label for every validation row, passing only the
# predictor columns (everything except ActionPoint) as new data.
pred_Val <- predict(
  model_rf,
  validation_data[names(validation_data) != "ActionPoint"],
  type = "response",
  norm.votes = TRUE
)

Confusion matrix on Validation data

# Cross-tabulate true labels (rows) against predicted labels (columns).
cm_Val <- table(
  actual = validation_data$ActionPoint,
  predicted = pred_Val
)

Accuracy on Validation data

# Accuracy = correctly classified cases (the diagonal of the confusion
# matrix) divided by all cases.
accu_Val <- sum(diag(cm_Val)) / sum(cm_Val)

PREDICTION ON TEST DATA

Removing MachineID from test data

# Copy of the test data without the identifier column. TestData itself is
# left intact because MachineID is needed again when the predictions are
# written out.
TestingData <- TestData[setdiff(names(TestData), "MachineID")]

Predict on Testing Data

# Predict the class label for every test row.
pred_Test <- predict(
  model_rf,
  TestingData,
  type = "response",
  norm.votes = TRUE
)

Write predictions into my_predictions.csv

# Pair every test machine with its predicted ActionPoint.
index_value <- data.frame(
  MachineID = TestData$MachineID,
  ActionPoint = pred_Test
)

# Save the predictions; NA values are written as empty fields.
# NOTE(review): write.csv() keeps its default row.names = TRUE here, so the
# file starts with an unnamed row-name column. Confirm whether the consumer
# of the file expects that.
write.csv(index_value, file = "my_predictions.csv", na = "")