SETUP ENVIRONMENT

Clear environment

# Remove every object from the current environment, including names that
# start with a dot (all.names = TRUE).
# NOTE(review): this also discards anything the caller had already loaded;
# confirm the script is only ever run in a dedicated session.
rm(list= ls(all.names = TRUE))

Set working directory

# Make the data folder the working directory so the relative CSV file names
# used later in the script resolve against it.
# NOTE(review): hard-coded absolute path for one specific machine; the script
# will fail elsewhere unless this directory exists.
setwd('C://Users//brbhatta//Desktop//INSOFE//PHD//1997B42_Broto//phd_data')

Load required R library

library(ROSE)

## Warning: package 'ROSE' was built under R version 3.5.1

## Loaded ROSE 0.0-3

library(DMwR)

## Loading required package: lattice

## Loading required package: grid

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

library(caret)

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

library(plyr)

## 
## Attaching package: 'plyr'

## The following object is masked from 'package:DMwR':
## 
##     join

Read Train and Test

# Load the training and test tables from the working directory.
# stringsAsFactors = TRUE is spelled out because the default changed to FALSE
# in R 4.0; the later steps (SMOTE, randomForest classification) need the
# text columns, in particular the ActionPoint target, to be factors.
Train <- read.csv("Train.csv", stringsAsFactors = TRUE)
Test <- read.csv("Test.csv", stringsAsFactors = TRUE)

Read MachinesDetails

# Load the per-machine attributes that are merged onto Train/Test by
# MachineID. stringsAsFactors = TRUE is explicit so that text columns stay
# factors on R >= 4.0, where the read.csv default is FALSE.
MachinesDetails <- read.csv("MachinesDetails.csv", stringsAsFactors = TRUE)

Form the TrainData and TestData by merging with MachineDetails

# Left-join the machine attributes onto each data set, keyed on MachineID.
# plyr:: is written out because DMwR also exports a `join` (see the masking
# message printed when plyr is attached); type/match are plyr's defaults.
TrainData <- plyr::join(
  Train, MachinesDetails,
  by = "MachineID", type = "left", match = "all"
)
TestData <- plyr::join(
  Test, MachinesDetails,
  by = "MachineID", type = "left", match = "all"
)

GENERAL VIEW OF DATA

First few rows of TrainData

# Peek at the first six merged training rows.
head(TrainData, n = 6L)

##   MachineID     ActionPoint MachineModel ServicePeriod
## 1  MID11001 ComponentRepair       model4            21
## 2  MID11002 ComponentRepair       model1            21
## 3  MID11003         NoIssue       model3            22
## 4  MID11004         NoIssue       model4            22
## 5  MID11005 ComponentRepair       model3            22
## 6  MID11008 ComponentRepair       model3            21

Summary of TrainData

# Per-column summary: level counts for factors, quantiles for numerics.
summary(object = TrainData)

##     MachineID                 ActionPoint  MachineModel ServicePeriod  
##  MID11001:  1   ComponentRepair     :413   model1: 56   Min.   : 3.00  
##  MID11002:  1   ComponentReplacement: 70   model2:128   1st Qu.: 9.00  
##  MID11003:  1   NoIssue             :191   model3:276   Median :13.00  
##  MID11004:  1                              model4:214   Mean   :13.09  
##  MID11005:  1                                           3rd Qu.:18.00  
##  MID11008:  1                                           Max.   :22.00  
##  (Other) :668

Structure of TrainData

# Compact overview of column types and the first few values of each column.
str(object = TrainData)

## 'data.frame':    674 obs. of  4 variables:
##  $ MachineID    : Factor w/ 674 levels "MID11001","MID11002",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ ActionPoint  : Factor w/ 3 levels "ComponentRepair",..: 1 1 3 3 1 1 2 1 1 3 ...
##  $ MachineModel : Factor w/ 4 levels "model1","model2",..: 4 1 3 4 3 3 4 3 1 4 ...
##  $ ServicePeriod: int  21 21 22 22 22 21 21 21 21 21 ...

Check for NA values

# Number of missing values per column; the recorded output is zero throughout.
colSums(x = is.na(TrainData))

##     MachineID   ActionPoint  MachineModel ServicePeriod 
##             0             0             0             0

Removing MachineID from the training data, as it is useless when building the model

# Keep every column except the MachineID identifier.
TrainData <- TrainData[, setdiff(names(TrainData), "MachineID"), drop = FALSE]

Distribution of Train data

# Class counts of the target variable.
table(TrainData[["ActionPoint"]])

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  413                   70                  191

# Confirm the target is stored as a factor and show its levels.
str(TrainData[["ActionPoint"]])

##  Factor w/ 3 levels "ComponentRepair",..: 1 1 3 3 1 1 2 1 1 3 ...

TRAIN-VALIDATION SPLIT

Stratified sampling on ActionPoint i.e. Target variable

# Stratified 60/40 split on the target so both parts keep its class mix.
# The seed fixes the partition (and the RNG state seen by later steps).
set.seed(789)
# list = FALSE returns the row indices as a matrix rather than a list.
# FALSE is written in full because `F` is an ordinary, reassignable variable.
index_id <- createDataPartition(TrainData$ActionPoint, p = 0.6, list = FALSE)
training_data <- TrainData[index_id, ]
validation_data <- TrainData[-index_id, ]

Check the proportion of target variable in training and validation data

# Target class counts in the training split.
table(training_data[["ActionPoint"]])

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  248                   42                  115

# Target class counts in the validation split.
table(validation_data[["ActionPoint"]])

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  165                   28                   76

# Target class shares in the training split.
prop.table(table(training_data[["ActionPoint"]]))

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.6123457            0.1037037            0.2839506

# Target class shares in the validation split.
prop.table(table(validation_data[["ActionPoint"]]))

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.6133829            0.1040892            0.2825279

SOLUTION FOR CLASS IMBALANCE : SMOTING

Smoting on Train data

# Rebalance the training split with SMOTE: perc.over = 500 controls how many
# synthetic minority rows are generated, perc.under = 100 how many rows of the
# other classes are kept relative to those synthetic rows.
training_data <- SMOTE(
  form = ActionPoint ~ .,
  data = training_data,
  perc.over = 500,
  perc.under = 100
)

Smoting on Validation data

# Apply the same SMOTE settings to the validation split.
# NOTE(review): resampling the hold-out set means the validation accuracy
# computed later is measured on synthetic, rebalanced rows rather than on the
# original class mix; consider evaluating on the untouched split instead.
validation_data <- SMOTE(
  form = ActionPoint ~ .,
  data = validation_data,
  perc.over = 500,
  perc.under = 100
)

Check the proportion of target variable in training and validation data after SMOTING

# Target class counts in the training split after SMOTE.
table(training_data[["ActionPoint"]])

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                  140                  252                   70

# Target class counts in the validation split after SMOTE.
table(validation_data[["ActionPoint"]])

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##                   97                  168                   43

# Target class shares in the training split after SMOTE.
prop.table(table(training_data[["ActionPoint"]]))

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.3030303            0.5454545            0.1515152

# Target class shares in the validation split after SMOTE.
prop.table(table(validation_data[["ActionPoint"]]))

## 
##      ComponentRepair ComponentReplacement              NoIssue 
##            0.3149351            0.5454545            0.1396104

MODEL BUILDING

Random Forest

# Fit a 100-tree classification forest on all remaining predictors.
# keep.forest = TRUE retains the trees so the model can be used by predict().
model_rf <- randomForest(
  ActionPoint ~ .,
  data = training_data,
  keep.forest = TRUE,
  ntree = 100
)
# Show the OOB error estimate and the OOB confusion matrix.
print(model_rf)

## 
## Call:
##  randomForest(formula = ActionPoint ~ ., data = training_data,      keep.forest = TRUE, ntree = 100) 
##                Type of random forest: classification
##                      Number of trees: 100
## No. of variables tried at each split: 1
## 
##         OOB estimate of  error rate: 38.74%
## Confusion matrix:
##                      ComponentRepair ComponentReplacement NoIssue
## ComponentRepair                   50                   77      13
## ComponentReplacement              27                  225       0
## NoIssue                           33                   29       8
##                      class.error
## ComponentRepair        0.6428571
## ComponentReplacement   0.1071429
## NoIssue                0.8857143

PREDICTION ON TRAINING DATA

# Predicted class labels for the training rows, using every column except
# the target as input.
pred_Train <- predict(
  object = model_rf,
  newdata = training_data[, names(training_data) != "ActionPoint"],
  type = "response",
  norm.votes = TRUE
)

Confusion matrix on Training data

# Cross-tabulate true labels (rows) against predicted labels (columns).
cm_Train <- table(
  actual = training_data[["ActionPoint"]],
  predicted = pred_Train
)

Accuracy on Training data

# Accuracy = correctly classified rows (the diagonal) over all rows.
accu_Train <- sum(diag(cm_Train)) / sum(cm_Train)

PREDICTION ON VALIDATION DATA

# Predicted class labels for the validation rows, using every column except
# the target as input.
pred_Val <- predict(
  object = model_rf,
  newdata = validation_data[, names(validation_data) != "ActionPoint"],
  type = "response",
  norm.votes = TRUE
)

Confusion matrix on Validation data

# Cross-tabulate true labels (rows) against predicted labels (columns).
cm_Val <- table(
  actual = validation_data[["ActionPoint"]],
  predicted = pred_Val
)

Accuracy on Validation data

# Accuracy = correctly classified rows (the diagonal) over all rows.
accu_Val <- sum(diag(cm_Val)) / sum(cm_Val)

Print accuracy of training and validation data

# Report the accuracy measured on the training split.
print(x = accu_Train)

## [1] 0.6991342

# Report the accuracy measured on the validation split.
print(x = accu_Val)

## [1] 0.5422078

Prediction on Test Data

Removing MachineID from test data

# Model input for the test set: every column except the MachineID identifier.
# TestData itself is left intact because its MachineID is needed for the
# output file.
TestingData <- TestData[, setdiff(names(TestData), "MachineID"), drop = FALSE]

Predict on Testing Data

# Predicted class labels for the test rows.
pred_Test <- predict(
  object = model_rf,
  newdata = TestingData,
  type = "response",
  norm.votes = TRUE
)

Write predictions into my_predictions.csv

# Pair each test machine with its predicted action.
index_value <- data.frame(
  MachineID = TestData[["MachineID"]],
  ActionPoint = pred_Test
)

# Save the predictions; missing values are written as empty fields.
# NOTE(review): row names are written too (write.csv default), so the file
# gets an extra unnamed first column; confirm that is wanted.
write.csv(x = index_value, file = "my_predictions.csv", na = "")

PHD

Broto Bhattacharjee

September 21, 2018

SETUP ENVIRONMENT

Clear environment

Set working directory

Load required R library

Read Train and Test

Read MachinesDetails

Form the TrainData and TestData by merging with MachineDetails

GENERAL VIEW OF DATA

First few rows of TrainData

Summary of TrainData

Structure of TrainData

Check for NA values

Removing MachineID from the training data, as it is useless when building the model

Distribution of Train data

TRAIN-VALIDATION SPLIT

Stratified sampling on ActionPoint i.e. Target variable

Check the proportion of target variable in training and validation data

SOLUTION FOR CLASS IMBALANCE : SMOTING

Smoting on Train data

Smoting on Validation data

Check the proportion of target variable in training and validation data after SMOTING

MODEL BUILDING

Random Forest

PREDICTION ON TRAINING DATA

Confusion matrix on Training data

Accuracy on Training data

PREDICTION ON VALIDATION DATA

Confusion matrix on Validation data

Accuracy on Validation data

Print accuracy of training and validation data

Prediction on Test Data

Removing MachineID from test data

Predict on Testing Data

Write predictions into my_predictions.csv