CASE: Building a predictive model using machine learning (ML) to predict the probability of a device failure. Objective: minimize false positives and false negatives. Data: http://aws-proserve-data-science.s3.amazonaws.com/predictive_maintenance.csv

Entering the Data from source:

# Load the raw predictive-maintenance data set.
# The directory is joined to the file name with file.path() so the session's
# working directory is left untouched (no setwd()).
data_dir <- "C:/Users/gredy/Downloads/data"
# Empty strings are read as NA. header = TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
data <- read.csv(
  file.path(data_dir, "predictive_maintenance.csv"),
  header = TRUE,
  na.strings = ""
)
head(data)

Next, we drop the Device and Time columns and any column with fewer than 100 values. There are no missing values, so NAs are under control.

# Keep the response (failure) and the four metric columns used as predictors.
datasub <- data[, c("failure", "metric4", "metric5", "metric7", "metric9")]
head(datasub)
# The response is turned into a factor so it is modelled as a class label.
datasub[["failure"]] <- factor(datasub[["failure"]])
str(datasub)
'data.frame':   124494 obs. of  5 variables:
 $ failure: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ metric4: int  52 0 0 0 0 41 0 1 0 9 ...
 $ metric5: int  6 6 12 6 15 6 8 19 14 9 ...
 $ metric7: int  0 0 0 0 0 0 0 16 0 0 ...
 $ metric9: int  7 0 0 0 3 1 0 3 0 164 ...

Partitioning the data into training and test sets

# Randomly assign each row to partition 1 (training, ~70%) or partition 2
# (test, ~30%). The seed is fixed so the split can be reproduced on a re-run.
set.seed(123)
ind <- sample(2, nrow(datasub), replace = TRUE, prob = c(0.7, 0.3))
intrain <- datasub[ind == 1, ]
test <- datasub[ind == 2, ]
str(intrain)
'data.frame':   87279 obs. of  5 variables:
 $ failure: Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
 $ metric4: int  52 0 0 0 41 0 1 0 9 0 ...
 $ metric5: int  6 12 6 15 6 8 19 14 9 7 ...
 $ metric7: int  0 0 0 0 0 0 16 0 0 0 ...
 $ metric9: int  7 0 0 3 1 0 3 0 164 1 ...
# Number of rows and columns in the training partition.
c(nrow(intrain), ncol(intrain))
[1] 87279     5
# Class counts of the response in the training partition.
table(intrain[["failure"]])

    0     1 
87204    75 

Because the data are imbalanced (far more “0” than “1”) and the objective of the model is to predict failures (1), a sampling procedure is used to draw over-sampled, under-sampled, and combined (both) samples.

library(ROSE)
library(randomForest)
# This chunk runs before the later chunks that load randomForest and create
# `both`, so both are provided here to keep it runnable from top to bottom.
# `both` is a combined over-/under-sample of the training data.
both <- ovun.sample(failure ~ ., data = intrain, method = "both",
                    p = 0.5, seed = 222, N = 212)$data
# Fix the RNG state so the fitted forest is repeatable.
set.seed(222)
rfboth <- randomForest(failure ~ ., data = both)

Summary of the model fitted on the combined (“both”) sample

# print() reports the out-of-bag error estimate and the confusion matrix;
# summary() on a randomForest object only lists the length, class and mode of
# its components.
print(rfboth)

Sampling Under

# Under-sample the majority class of the training data.
# A seed is passed, as for the other samplers in this notebook, so the random
# draw is reproducible.
under <- ovun.sample(failure ~ ., data = intrain, method = "under",
                     seed = 222)$data
table(under$failure)

 0  1 
73 75 
# Combined sampling of the training data: N = 212 rows in total, with a
# target minority-class share of p = 0.5 and a fixed seed.
both <- ovun.sample(
  formula = failure ~ .,
  data = intrain,
  method = "both",
  N = 212,
  p = 0.5,
  seed = 222
)[["data"]]
table(both[["failure"]])

  0   1 
 95 117 

Method= ROSE

# Draw a 500-row ROSE sample from the training data with a fixed seed.
drose <- ROSE(
  formula = failure ~ .,
  data = intrain,
  N = 500,
  seed = 111
)[["data"]]
table(drose[["failure"]])

  0   1 
234 266 

Summary of the ROSE sample

# Per-column summary of the ROSE sample. The rows are synthetic draws produced
# by ROSE(), which is why the metrics can show negative and fractional values
# even though the original columns are non-negative integers.
summary(drose)
 failure    metric4             metric5           metric7             metric9        
 0:234   Min.   :-331.4779   Min.   :-17.318   Min.   :-177.9983   Min.   :-179.648  
 1:266   1st Qu.:  -8.1676   1st Qu.:  6.321   1st Qu.:  -2.4084   1st Qu.: -36.133  
         Median :   0.7893   Median : 11.016   Median :   0.4295   Median :  -2.221  
         Mean   :  12.4010   Mean   : 16.272   Mean   :  12.2333   Mean   :   5.488  
         3rd Qu.:  14.2514   3rd Qu.: 17.554   3rd Qu.:  13.1202   3rd Qu.:  34.152  
         Max.   :1638.4847   Max.   :100.501   Max.   : 886.6524   Max.   :2674.522  

Predictive Models

###predictive  Models(Random Forest)
library(randomForest)
library(caret)
library(e1071)

It takes a while to process ………

# Baseline: random forest fitted on the original, imbalanced training data.
# failure is already a factor here; re-applying factor() only drops any level
# that is not present in the training partition.
intrain$failure <- factor(intrain$failure)
# Fix the RNG state so the fitted forest is repeatable.
set.seed(222)
rftrain <- randomForest(failure ~ ., data = intrain)
# Evaluate on the held-out test set with failure ("1") as the positive class.
confusionMatrix(
  data = predict(rftrain, newdata = test),
  reference = test$failure,
  positive = "1"
)
Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 37183    31
         1     1     0
                                          
               Accuracy : 0.9991          
                 95% CI : (0.9988, 0.9994)
    No Information Rate : 0.9992          
    P-Value [Acc > NIR] : 0.6168          
                                          
                  Kappa : -1e-04          
                                          
 Mcnemar's Test P-Value : 2.951e-07       
                                          
            Sensitivity : 0.000e+00       
            Specificity : 1.000e+00       
         Pos Pred Value : 0.000e+00       
         Neg Pred Value : 9.992e-01       
             Prevalence : 8.330e-04       
         Detection Rate : 0.000e+00       
   Detection Prevalence : 2.687e-05       
      Balanced Accuracy : 5.000e-01       
                                          
       'Positive' Class : 1               
                                          
# Over-sample the minority class of the training data.
# A seed is passed, as for the other samplers in this notebook, so the random
# draw is reproducible.
over <- ovun.sample(failure ~ ., data = intrain, method = "over",
                    seed = 222)$data
table(over$failure)

   No   Yes 
87003 87082 

Over Sampling Procedure

# The over-sampled set is about twice the size of the training data, and the
# default forest (500 fully grown trees) failed with "cannot allocate vector".
# Fewer trees and a cap on terminal nodes per tree bound the size of the
# forest that has to be held in memory.
# NOTE(review): ntree and maxnodes are a starting point; tune them against the
# memory available on the target machine.
set.seed(222)
rfover <- randomForest(failure ~ ., data = over, ntree = 100, maxnodes = 1024)
Error: cannot allocate vector of size 1.3 Gb

Rose Sample

# Random forest fitted on the ROSE sample.
# Fix the RNG state so the fitted forest is repeatable.
set.seed(222)
rfrose <- randomForest(failure ~ ., data = drose)
# Evaluate on the held-out test set with failure ("1") as the positive class.
confusionMatrix(
  data = predict(rfrose, newdata = test),
  reference = test$failure,
  positive = "1"
)
Confusion Matrix and Statistics

          Reference
Prediction     0     1
         0 36440    13
         1   744    18
                                          
               Accuracy : 0.9797          
                 95% CI : (0.9782, 0.9811)
    No Information Rate : 0.9992          
    P-Value [Acc > NIR] : 1               
                                          
                  Kappa : 0.0439          
                                          
 Mcnemar's Test P-Value : <2e-16          
                                          
            Sensitivity : 0.5806452       
            Specificity : 0.9799914       
         Pos Pred Value : 0.0236220       
         Neg Pred Value : 0.9996434       
             Prevalence : 0.0008330       
         Detection Rate : 0.0004837       
   Detection Prevalence : 0.0204756       
      Balanced Accuracy : 0.7803183       
                                          
       'Positive' Class : 1               
                                          
Conclusions
  1. The imbalanced data need to be oversampled; that is, it is necessary to draw a more balanced sample in order to train the model to detect a failure (1).

  2. Despite having obtained some models, other options still need to be estimated or studied.

  3. This short study only applies Random Forest because the dependent variable is categorical. Other options would be KNN, decision trees, and logistic regression.

  4. The model based on the ROSE sample shows the best prediction results, detecting 18 of the 31 failures in the test set. However, sensitivity = 0.581 and specificity = 0.980, and the model also raises 744 false alarms.

LS0tDQp0aXRsZTogIlByZWRpY3RpdmUgTW9kZWwgb2YgZmFpbHVyZXMiDQpieTogR3JlZHkgR2Fycmlkbw0Kb3V0cHV0OiBodG1sX25vdGVib29rDQotLS0NCkNBU0U6IEJ1aWxkaW5nIGEgcHJlZGljdGl2ZSBNb2RlbCB1c2luZyBtYWNoaW5lIGxlYXJuaW5nIChNTCkgdG8gcHJlZGljIHRoZSBwcm9iYWJpbGl0eSBvZiBhIGRldmljZSBGYWlsdXJlLg0KT2JqZWN0aXZlOiBNaW5pbWl6ZSBmYWxzZSBQb3NpdGl2ZXMgYW5kIEZhbHNlIE5lZ2F0aXZlcy4NCkRhdGE6IGh0dHA6Ly9hd3MtcHJvc2VydmUtZGF0YS1zY2llbmNlLnMzLmFtYXpvbmF3cy5jb20vcHJlZGljdGl2ZV9tYWludGVuYW5jZS5jc3YNCg0KDQpFbnRlcmluZyB0aGUgRGF0YSBmcm9tIHNvdXJjZToNCmBgYHtyfQ0Kc2V0d2QoIkM6L1VzZXJzL2dyZWR5L0Rvd25sb2Fkcy9kYXRhIikNCmRhdGE8LXJlYWQuY3N2KCdwcmVkaWN0aXZlX21haW50ZW5hbmNlLmNzdicsaGVhZGVyPVQsbmEuc3RyaW5ncyA9IGMoIiIpKQ0KaGVhZChkYXRhKQ0KYGBgDQpOZXh0IFdlIGRlY2lkZSB0byBEcm9wIG9yIGRlbGV0ZSB0aGUgdmFyaWFibGVzIG9yIENvbHVtbnMgRGV2aWNlIGFuZCBUaW1lIGFuZCBhbnkgY29sdW1uIHdpdGggbGVzcyB0aGFuIDEwMCB2YWx1ZXMuVGhlcmUgYXJlIG5vIG1pc3NpbmcgdmFsdWVzIGFuZCBuYXMgYXJlIHVuZGVyIGNvbnRyb2wuDQoNCmBgYHtyfQ0KZGF0YXN1Yjwtc3Vic2V0KGRhdGEsIHNlbGVjdD0gYyhmYWlsdXJlLG1ldHJpYzQsbWV0cmljNSxtZXRyaWM3LG1ldHJpYzkpKQ0KaGVhZChkYXRhc3ViKQ0KZGF0YXN1YiRmYWlsdXJlPC1mYWN0b3IoZGF0YXN1YiRmYWlsdXJlKQ0Kc3RyKGRhdGFzdWIpDQpgYGANClBhcnRpdGlvbmluZyB0aGUgZGF0YSBpbiB0cmFpbiBhbmQgdGVzdCBzZXRzDQoNCmBgYHtyfQ0KaW5kIDwtIHNhbXBsZSgyLCBucm93KGRhdGFzdWIpLCByZXBsYWNlID0gVCwgcHJvYiA9IGMoMC43LCAwLjMpKQ0KaW50cmFpbiA8LSBkYXRhc3ViW2luZCA9PSAxLF0NCnRlc3QgPC0gZGF0YXN1YltpbmQgPT0gMixdDQpzdHIoaW50cmFpbikNCmRpbShpbnRyYWluKQ0KdGFibGUoaW50cmFpbiRmYWlsdXJlKQ0KDQpgYGANCg0KRHVlIHRvIGEgaW1iYWxhbmNlIGluIERhdGEgTW9yZSAiMCIgdGhhbiAiMSIgDQogT2JqZWN0aXZlIG9mIHRoZSBtb2RlbCBpcyB0byBwcmVkaWMgImZhaWx1cmVzKDEpDQogQSBzYW1wbGluZyBwcm9jZWR1cmUgaXMgdXNlZCB0byBzZWxlY3Qgb3ZlciwgdW5kZXIgYW5kIGJvdGggdHlwZSBvZiBzYW1wbGVzDQpgYGB7cn0NCmxpYnJhcnkoUk9TRSkNCnJmYm90aCA8LXJhbmRvbUZvcmVzdChmYWlsdXJlfi4sIGRhdGE9Ym90aCkNCmBgYA0KDQpTdW1tYXJ5IG9mIE92ZXIgc2FtcGxpbmcNCmBgYHtyfQ0Kc3VtbWFyeShyZmJvdGgpDQpgYGANClNhbXBsaW5nIFVuZGVyDQoNCmBgYHtyfQ0KdW5kZXIgPC0gb3Z1bi5zYW1wbGUoZmFpbHVyZX4uLCBkYXRhPWludHJhaW4sIG1ldGhvZCA9ICJ1bmRlciIpJGRhdGENCnRhYmxl
KHVuZGVyJGZhaWx1cmUpDQpgYGANCg0KYGBge3J9DQpib3RoIDwtIG92dW4uc2FtcGxlKGZhaWx1cmV+LiwgZGF0YT1pbnRyYWluLCBtZXRob2QgPSAiYm90aCIsDQogICAgICAgICAgICAgICAgICAgIHAgPSAwLjUsDQogICAgICAgICAgICAgICAgICAgIHNlZWQgPSAyMjIsDQogICAgICAgICAgICAgICAgICAgIE4gPSAyMTIpJGRhdGENCnRhYmxlKGJvdGgkZmFpbHVyZSkNCg0KYGBgDQpNZXRob2Q9IFJPU0UNCmBgYHtyfQ0KZHJvc2UgPC0gUk9TRShmYWlsdXJlfi4sIGRhdGEgPSBpbnRyYWluLCBOID0gNTAwLCBzZWVkPTExMSkkZGF0YQ0KdGFibGUoZHJvc2UkZmFpbHVyZSkNCmBgYA0KDQpTdW1tYXJ5IG9mIFJvc2VgDQpgYGB7cn0NCnN1bW1hcnkoZHJvc2UpDQpgYGANCg0KUHJlZGljdGl2ZSBNT2RlbHMNCmBgYHtyfQ0KIyMjcHJlZGljdGl2ZSAgTW9kZWxzKFJhbmRvbSBGb3Jlc3QpDQpsaWJyYXJ5KHJhbmRvbUZvcmVzdCkNCmxpYnJhcnkoY2FyZXQpDQpsaWJyYXJ5KGUxMDcxKQ0KYGBgDQoNCkl0IHRha2VzIGEgd2hpbGUgdG8gcHJvY2VzcyAuLi4uLi4uLi4NCmBgYHtyfQ0KaW50cmFpbiRmYWlsdXJlPC1mYWN0b3IoaW50cmFpbiRmYWlsdXJlKQ0KcmZ0cmFpbiA8LSByYW5kb21Gb3Jlc3QoZmFpbHVyZX4uLCBkYXRhID0gaW50cmFpbikNCmNvbmZ1c2lvbk1hdHJpeChwcmVkaWN0KHJmdHJhaW4sIHRlc3QpLCB0ZXN0JGZhaWx1cmUsIHBvc2l0aXZlID0gJzEnKQ0KDQpgYGANCmBgYHtyfQ0Kb3ZlciA8LSBvdnVuLnNhbXBsZShmYWlsdXJlfi4sIGRhdGEgPSBpbnRyYWluLCBtZXRob2QgPSAib3ZlciIpJGRhdGENCnRhYmxlKG92ZXIkZmFpbHVyZSkNCmBgYA0KT3ZlciBTYW1wbGluZyBQcm9jZWR1cmUNCg0KYGBge3J9DQpyZm92ZXIgPC0gcmFuZG9tRm9yZXN0KGZhaWx1cmV+LiwgZGF0YSA9IG92ZXIpDQpgYGANCg0KUm9zZSBTYW1wbGUNCmBgYHtyfQ0KcmZyb3NlIDwtIHJhbmRvbUZvcmVzdChmYWlsdXJlfi4sIGRhdGE9ZHJvc2UpDQpgYGANCmBgYHtyfQ0KY29uZnVzaW9uTWF0cml4KHByZWRpY3QocmZyb3NlLCB0ZXN0KSwgdGVzdCRmYWlsdXJlLCBwb3NpdGl2ZSA9ICcxJykNCmBgYA0KDQoNCg0KYGBge3J9DQpDb25jbHVzaW9ucw0KYGBgDQoxLiBUaGUgaW1iYWxhbmNlIGRhdGEgaW1wb3NlIG9yIG5lZWQgdG8gYmUgT3ZlcnNhbXBsZSwgdGhhdCBpcyAsIEl0IGlzIG5lY2Nlc2FyeSB0byBzYW1wbGUgYSBtb3JlIGJhbGFuY2Ugc2FtcGxlIGluIG9yZGVyIHRvIHRyYWluIHRoZSBtb2RlbCB0byBkZXRlY3QgYSBmYWlsdXJlKDEpLg0KDQoyLiBEZXNwaXRlIG9mIG9idGFpbmluZyBzb21lIG1vZGVscyBzdGlsbCBpdCBpcyBuZWVkZWQgdG8gZXN0aW1hdGUgb3Igc3R1ZHkgc29tZSBvdGhlciBvcHRpb25zLg0KMy4gVGhpcyBzaG9ydCBzdHVkeSBvbmx5IGFwcGxpZXMgUmFuZG9tIEZvcmVzdCBkdWUgdG8gaGF2ZSBhIGNhdGVnb3JpY2FsIHZhcmlhYmxlIGFzIGRlcGVuZGVudCBvbmUuIFNvbWUgb3RoZXJzIG9wdGlvbnMg
d2lsbCBiZSBLTk4sIERlY2lzaW9uIFRyZWVzIGFuZCBMb2dpc3RpYyByZWdyZXNzaW9uLg0KNC4gVGhlIG1vZGVsIGJhc2VkIGluIFJPU0Ugc2FtcGxlIGlzIHNob3dpbmcgdGhlIGJlc3QgcmVzdWx0cyBvZiBwcmVkaWN0aW9uIHdpdGggMjMgIG9uZXMuIEhvd2V2ZXIgdGhlIHNlbnNpdGl2aXR5PS42OTYgYW5kIHRoZSBzcGVjaWZpdHk9LjkxNCANCg0K