SETUP ENVIRONMENT
Clear environment
# Start from an empty workspace: remove every object, hidden (dot-prefixed) ones included.
rm(list = ls(all.names = TRUE))
Set working directory
# NOTE(review): absolute path under one user's Desktop; the relative file names
# used for reading and writing CSVs resolve against it, so the script only runs
# on a machine where this directory exists.
setwd("C://Users//brbhatta//Desktop//INSOFE//PHD//1997B42_Broto//phd_data")
Load required R library
library(ROSE)
## Warning: package 'ROSE' was built under R version 3.5.1
## Loaded ROSE 0.0-3
library(DMwR)
## Loading required package: lattice
## Loading required package: grid
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(caret)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:DMwR':
##
## join
Read Train and Test
# Load the training and test sets. The objects are named TrainData / TestData
# because every later statement refers to them by those names.
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 too (it was
# the default before 4.0); the classification steps need a factor target.
TrainData <- read.csv("Train.csv", stringsAsFactors = TRUE)
TestData <- read.csv("Test.csv", stringsAsFactors = TRUE)
Read MachinesDetails
# Loaded from MachinesDetails.csv; nothing later in the visible script reads this object.
MachinesDetails <- read.csv("MachinesDetails.csv")
GENERAL VIEW OF DATA
First few rows of TrainData
head(TrainData)
## MachineID ActionPoint MachineModel ServicePeriod
## 1 MID11001 ComponentRepair model4 21
## 2 MID11002 ComponentRepair model1 21
## 3 MID11003 NoIssue model3 22
## 4 MID11004 NoIssue model4 22
## 5 MID11005 ComponentRepair model3 22
## 6 MID11008 ComponentRepair model3 21
Summary of TrainData
summary(TrainData)
## MachineID ActionPoint MachineModel ServicePeriod
## MID11001: 1 ComponentRepair :413 model1: 56 Min. : 3.00
## MID11002: 1 ComponentReplacement: 70 model2:128 1st Qu.: 9.00
## MID11003: 1 NoIssue :191 model3:276 Median :13.00
## MID11004: 1 model4:214 Mean :13.09
## MID11005: 1 3rd Qu.:18.00
## MID11008: 1 Max. :22.00
## (Other) :668
Structure of TrainData
str(TrainData)
## 'data.frame': 674 obs. of 4 variables:
## $ MachineID : Factor w/ 674 levels "MID11001","MID11002",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ ActionPoint : Factor w/ 3 levels "ComponentRepair",..: 1 1 3 3 1 1 2 1 1 3 ...
## $ MachineModel : Factor w/ 4 levels "model1","model2",..: 4 1 3 4 3 3 4 3 1 4 ...
## $ ServicePeriod: int 21 21 22 22 22 21 21 21 21 21 ...
Check for NA values
colSums(is.na(TrainData)) # No NA values
## MachineID ActionPoint MachineModel ServicePeriod
## 0 0 0 0
Removing MachineID from the training data: it is a unique identifier for each row, so it is useless for building the model
# Drop the identifier column so it is not offered to the model as a predictor.
TrainData <- subset(TrainData, select = -MachineID)
Distribution of Train data
table(TrainData$ActionPoint)
##
## ComponentRepair ComponentReplacement NoIssue
## 413 70 191
str(TrainData$ActionPoint)
## Factor w/ 3 levels "ComponentRepair",..: 1 1 3 3 1 1 2 1 1 3 ...
TRAIN-VALIDATION SPLIT
Stratified sampling on ActionPoint, i.e. the target variable
# Fix the RNG seed so the partition below is reproducible.
set.seed(789)
# createDataPartition samples within each ActionPoint class, so about 60% of
# every class lands in the training rows. list = FALSE returns the selected row
# numbers as a matrix instead of a list, which can index the data frame directly.
# FALSE is spelled out because F is an ordinary variable that can be reassigned.
index_id <- createDataPartition(TrainData$ActionPoint, p = 0.6, list = FALSE)
training_data <- TrainData[index_id, ]
validation_data <- TrainData[-index_id, ]
Check the proportion of target variable in training and validation data
table(training_data$ActionPoint)
##
## ComponentRepair ComponentReplacement NoIssue
## 248 42 115
table(validation_data$ActionPoint)
##
## ComponentRepair ComponentReplacement NoIssue
## 165 28 76
prop.table(table(training_data$ActionPoint))
##
## ComponentRepair ComponentReplacement NoIssue
## 0.6123457 0.1037037 0.2839506
prop.table(table(validation_data$ActionPoint))
##
## ComponentRepair ComponentReplacement NoIssue
## 0.6133829 0.1040892 0.2825279
SOLUTION FOR CLASS IMBALANCE: SMOTE (SYNTHETIC MINORITY OVER-SAMPLING TECHNIQUE)
Smoting on Train data
# Oversample the rarest class with synthetic rows: perc.over = 500 adds five
# synthetic rows per original minority row, and perc.under = 100 keeps one row
# from the remaining classes per synthetic row.
training_data <- SMOTE(
  form = ActionPoint ~ .,
  data = training_data,
  perc.over = 500,
  perc.under = 100
)
Smoting on Validation data
# Apply the same resampling to the validation split.
# NOTE(review): this replaces the held-out rows with a resampled, partly
# synthetic set whose class mix differs from the original data, so accuracy
# measured on it does not describe performance on real, unresampled cases.
# Consider leaving validation_data untouched and resampling only the training split.
validation_data <- SMOTE(
  form = ActionPoint ~ .,
  data = validation_data,
  perc.over = 500,
  perc.under = 100
)
Check the proportion of target variable in training and validation data after SMOTING
table(training_data$ActionPoint)
##
## ComponentRepair ComponentReplacement NoIssue
## 140 252 70
table(validation_data$ActionPoint)
##
## ComponentRepair ComponentReplacement NoIssue
## 97 168 43
prop.table(table(training_data$ActionPoint))
##
## ComponentRepair ComponentReplacement NoIssue
## 0.3030303 0.5454545 0.1515152
prop.table(table(validation_data$ActionPoint))
##
## ComponentRepair ComponentReplacement NoIssue
## 0.3149351 0.5454545 0.1396104
MODEL BUILDING
Random Forest
# Fit a 100-tree random forest for ActionPoint using every other column of
# training_data as a predictor, then print its out-of-bag summary.
model_rf <- randomForest(
  ActionPoint ~ .,
  data = training_data,
  keep.forest = TRUE,
  ntree = 100
)
print(model_rf)
##
## Call:
## randomForest(formula = ActionPoint ~ ., data = training_data, keep.forest = TRUE, ntree = 100)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 1
##
## OOB estimate of error rate: 38.74%
## Confusion matrix:
## ComponentRepair ComponentReplacement NoIssue
## ComponentRepair 50 77 13
## ComponentReplacement 27 225 0
## NoIssue 33 29 8
## class.error
## ComponentRepair 0.6428571
## ComponentReplacement 0.1071429
## NoIssue 0.8857143
PREDICTION ON TRAINING DATA
# Predicted class labels for the same rows the forest was fitted on
# (target column excluded from the inputs).
pred_Train <- predict(
  model_rf,
  newdata = training_data[, setdiff(names(training_data), "ActionPoint")],
  type = "response",
  norm.votes = TRUE
)
Confusion matrix on Training data
# Cross-tabulate actual classes (rows) against predicted classes (columns).
cm_Train <- table(actual = training_data$ActionPoint, predicted = pred_Train)
Accuracy on Training data
# Share of rows on the diagonal, i.e. where the predicted class equals the actual class.
accu_Train <- sum(diag(cm_Train)) / sum(cm_Train)
PREDICTION ON VALIDATION DATA
# Predicted class labels for the validation rows (target column excluded from the inputs).
pred_Val <- predict(
  model_rf,
  newdata = validation_data[, setdiff(names(validation_data), "ActionPoint")],
  type = "response",
  norm.votes = TRUE
)
Confusion matrix on Validation data
# Cross-tabulate actual classes (rows) against predicted classes (columns).
cm_Val <- table(actual = validation_data$ActionPoint, predicted = pred_Val)
Accuracy on Validation data
# Share of rows on the diagonal, i.e. where the predicted class equals the actual class.
accu_Val <- sum(diag(cm_Val)) / sum(cm_Val)
Print accuracy of training and validation data
print(accu_Train)
## [1] 0.6991342
print(accu_Val)
## [1] 0.5422078
Prediction on Test Data
Removing MachineID from test data
# Model inputs for the test set: everything except the identifier column.
# TestData itself is kept because its MachineID is needed for the output file.
TestingData <- subset(TestData, select = -MachineID)
Predict on Testing Data
# Predicted class label for every test row.
pred_Test <- predict(
  model_rf,
  newdata = TestingData,
  type = "response",
  norm.votes = TRUE
)
Write predictions into my_predictions.csv
# Pair each test machine's ID with its predicted action and save the table.
# NOTE(review): write.csv keeps row names by default, so the file starts with an
# unnamed index column — confirm that is wanted.
index_value <- data.frame(
  MachineID = TestData$MachineID,
  ActionPoint = pred_Test
)
write.csv(index_value, file = "my_predictions.csv", na = "")