knitr::opts_chunk$set(echo = TRUE,cache = TRUE,autodep = TRUE)

R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

Loading the data sets into a statistical programming language. In this case we are using R programming language.

R version 3.4.1 (2017-06-30) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows >= 8 x64 (build 9200)

We are using R Studio as an IDE for R. All the packages attached are listed with the code

setwd("D:/UIC/MyResearch")



library(readxl)
library(magrittr)
library(dplyr)

Attaching f in prefix because loading the name in dataset

path="D:/UIC/MyResearch/Pumps/KIRLOSKER PUMP BREAKDOWN DETAILS (1).xlsx"
for(i in 1:length(excel_sheets(path))){
assign(x =paste0("f",excel_sheets(path)[i]), value =read_excel(path = path,sheet = excel_sheets(path)[i]))  
}
# drop a column 

fP301A<-within(fP301A,rm("System status"))

Kirlosker<-rbind(fAGA951,fAGA952A,fAGA955,fCGA201A,fCGA302A,f11P19A,fP301A)

Let’s identify the equipments notifn

knitr::kable(table(Kirlosker %>% group_by(`Notifictn type` ) %>% select(`Notifictn type`,Equipment) ,dnn = c("Notification Type","Freq equipment")),caption="Matrix for Pumps and Notification")
Matrix for Pumps and Notification
11P19A AGA951 AGA952A AGA955 CGA201A P301A UGA302A
M1 12 5 12 11 1 3 0
Z1 0 0 0 0 7 8 8

Z1 category is preventice maintenance. Thus we see that if a pump is maintained preventively chances are less of suffering a fault.

Extracting data from the second excel sheet

# To automate the extraction of sheets from excel


path="D:/UIC/MyResearch/Pumps/KSB PUMPS BREAKDOWN RECORD (2).xlsx"
for(i in 1:length(excel_sheets(path))){
assign(x =excel_sheets(path)[i], value =read_excel(path = path,sheet = excel_sheets(path)[i]))  
  }

# Rbinding difficult due to different names

`11P24A`<-within(`11P24A`,rm("System status"))
P701A<-within(P701A,rm("System status"))
UGA1107A<-within(UGA1107A,rm("System status"))

colnames(P901A)<-names(P701A)
# Now again aggregating all the data from different sheets and putting them in a KSB dataset

KSB<-rbind(`11P24A`,P701A,P702A,P2001A,P901A,UGA1107A,UGA2201A)


knitr::kable(table(KSB %>% group_by(`Notifictn type` ) %>% select(`Notifictn type`,Equipment) ,dnn = c("Notification Type","Freq equipment")),caption="Matrix for Pumps and Notification")
Matrix for Pumps and Notification
21P24A AGA901A P2001A P701A P702A UGA1107A UGA2201A
M1 1 4 4 2 3 0 9
Z1 0 3 0 0 9 8 7
Z3 6 0 0 5 0 0 0

Extracting data from the third excel sheet

path="D:/UIC/MyResearch/Pumps/MAINTENANCE HISTORY - compressor and turbile- K1&TK1.xlsx"
for(i in 1:length(excel_sheets(path))){
  assign(x =paste0("c",excel_sheets(path)[i]), value =read_excel(path = path,sheet = excel_sheets(path)[i]))  
}


# All these pumps have extra atrributes which 1st are not common across all pumps and secondly 
# may not be relevant in analyzing the data





# What is important is the notification date because that might help us in knowing the frequency

# The third data set is of compressor and turbine. We will be stripping of unwanted columns 


# Let's us change the names of columns so they are in a readable format 
# X_3 is useful in 11ko1. We ll rename the column to Execution date

c11K01<-rename(c11K01,Execution.date=X__3)
cP607C<-rename(cP607C,Execution.date=X__1)

# In the equipment the equipment column should have a constant name throughout the dataset

c11K01$Equipment<-rep("11K01",nrow(c11K01))
cK431$Equipment<-rep("K431",nrow(cK431))


Compar<-data.frame(Ko1=colnames(c11K01))
Compar[14,]<-rep(x = NA,1)
Compar1<-data.frame(TKo1=colnames(c11TK01))
Compar1[11:14,]<-rep(x = NA,4)
Compar2<-data.frame(P607C=colnames(cP607C))
Compar3<-data.frame(K431=colnames(cK431))
Compar3[13:14,]<-rep(x = NA,2)

NetNames<-cbind(Compar,Compar1,Compar2,Compar3)


knitr::kable(NetNames,caption = "column names")
column names
Ko1 TKo1 P607C K431
Notifictn type Notifictn type Notifictn type Notifictn type
Notification Notification Notification Notification
Notif.date Notif.date Notif.date Notif.date
Order Order Order Order
Equipment Equipment Equipment Equipment
Description Description Description Description
Functional loc. Functional loc. Functional loc. Functional loc.
System status System status System status System status
X__1 User status Execution.date User status
X__2 Main WorkCtr X__2 Execution Date
Execution.date NA User status Job Details
X__4 NA PartnerResp. Main WorkCtr
PartnerResp. NA DescEmpl.Resp. NA
NA NA Main WorkCtr NA
rm(Compar,Compar1,Compar2,Compar3)
# We will delete the unwanted variables from the dataset
# From the netnames we see that first 8 variables are common along the dataset. We will delete the rest


for(i in 1:length(excel_sheets(path))){
assign(x=paste0("c",excel_sheets(path)[i]),value=get(paste0("c",excel_sheets(path)[i]))[,1:8])
}


# They have different date formats so we need to convert to a common date format

#The below code is not working. The dates format are not proper 

# ?c11K01$Notif.date<-as.Date(c11K01$Notif.date,format = "%d.%m.%y")
# ?c11TK01$Notif.date<-as.Date(c11TK01$Notif.date,format = "%d.%m.%y")

c11K01$Notif.date<-lubridate::dmy(c11K01$Notif.date)
c11TK01$Notif.date<-lubridate::dmy(c11TK01$Notif.date)
cP607C$Notif.date<-as.Date(cP607C$Notif.date,format = "%d.%m.%y")
cK431$Notif.date<-as.Date(cK431$Notif.date,format = "%d.%m.%y")

# Comnbining for analysis


CompTurbine<-rbind(get(paste0("c",excel_sheets(path)[1])),get(paste0("c",excel_sheets(path)[2])),get(paste0("c",excel_sheets(path)[3])),get(paste0("c",excel_sheets(path)[4])))


knitr::kable(table(CompTurbine %>% group_by(`Notifictn type` ) %>% select(`Notifictn type`,Equipment) ,dnn = c("Notification Type","Freq equipment")),caption="Matrix for Pumps and Notification")
Matrix for Pumps and Notification
11K01 11TK01 K431 P607C
M1 9 50 16 10
Z1 0 0 0 8
Z3 6 30 0 0
CompTurbine %>% group_by(`Notifictn type`,Equipment ) %>% select(`Notifictn type`,Equipment) %>% tally()
## # A tibble: 7 x 3
## # Groups:   Notifictn type [?]
##   `Notifictn type` Equipment     n
##   <chr>            <chr>     <int>
## 1 M1               11K01         9
## 2 M1               11TK01       50
## 3 M1               K431         16
## 4 M1               P607C        10
## 5 Z1               P607C         8
## 6 Z3               11K01         6
## 7 Z3               11TK01       30

It is assumed that installation date of every pump is to be considered the previous year’s date in April

4 dates were formatted in excel itself before loading. Their format was changed from %m/%d/%y and %m.%d.y

We will arrange the dates in descending order ( in the order in which they are listed ) and then group them according to the notification type

Kirlosker$Notif.date<-lubridate::dmy(Kirlosker$Notif.date)

KSB$Notif.date<-lubridate::dmy(KSB$Notif.date)

After converting all the dates in a common format, we will

Analyzing the Kirlosker dataset first

We know that start date of any pump is the previous year and april 1 which would be
dates_from_subtd<-Kirlosker %>% group_by(Equipment) %>% summarise(Date=min(Notif.date))

#For every pump we got the min date and hence subtracting one year from the date we can get our date for analysis 


date_installed<-as.data.frame(paste((lubridate::year(dates_from_subtd$Date)-1),"04-01",sep="-"))
colnames(date_installed)<-"Date Installed"

dates_from_subtd<-cbind(dates_from_subtd,date_installed)
rm(date_installed)

dates_from_subtd<-dates_from_subtd[,c(1,3)]


Kirlosker<-left_join(Kirlosker,dates_from_subtd)
## Joining, by = "Equipment"
rm(dates_from_subtd)

dates_from_subtd<-Kirlosker %>% select(Equipment,Notif.date,`Date Installed`)  

dates_from_subtd$`Date Installed`<-as.Date(dates_from_subtd$`Date Installed`)

dates_from_subtd<-dates_from_subtd %>% mutate(diff_date=(Notif.date - `Date Installed`))

dates_from_subtd %>% group_by(Equipment) %>% summarise(Date=min(diff_date))
## # A tibble: 7 x 2
##   Equipment     Date
##       <chr>   <time>
## 1    11P19A 330 days
## 2    AGA951 322 days
## 3   AGA952A 414 days
## 4    AGA955 365 days
## 5   CGA201A 556 days
## 6     P301A 308 days
## 7   UGA302A 576 days

The above table gives us the the days between which the pump had fault for the first time

Can use %in% operator to selectively filter some of the values

Kirlosker<-Kirlosker %>% select(`Notifictn type`,Equipment,Notif.date,`Date Installed`)
Kirlosker<-Kirlosker %>% group_by(Equipment,Notif.date) %>% arrange(Equipment,desc(Notif.date))
Kirlosker<-as.data.frame(Kirlosker)

Kirlosker<-Kirlosker %>% mutate(diff_days=Notif.date -lead(Notif.date))

Kirlosker$diff_days<-as.numeric(Kirlosker$diff_days)

for(i in 1:(nrow(Kirlosker)-1)){
if(Kirlosker$diff_days[i]<0)
{
Kirlosker$diff_days[i]<--1  
}
else{
Kirlosker$diff_days[i]<-Kirlosker$diff_days[i]
}
    
}

We need to manually put the dates which are set to zero because it has to be subtracted from the data installed

Although we can easily replace it with few rows I want to write code which is generic For this we have to know how many rows each category has and will then replace them will the last one because they are the ones which are zero or NA

# We get the count of each equipment and we know that last we have to calculate manually


Kirlosker[is.na(Kirlosker[,5]),5]<--1

# Now wherever there are zeroes we can replace them with the date installed subtracted from the first fault occurence

Kirlosker$`Date Installed`<-as.Date(Kirlosker$`Date Installed`)

for(i in 1:nrow(Kirlosker)){
  
  if(Kirlosker[i,5]==-1){
    Kirlosker[i,5]<-Kirlosker[i,3]-Kirlosker[i,4]
    
  }
}

We will be doing the same for KSB and CompTurbine

dates_from_subtd<-KSB%>% group_by(Equipment) %>% summarise(Date=min(Notif.date))

#For every pump we got the min date and hence subtracting one year from the date we can get our date for analysis 


date_installed<-as.data.frame(paste((lubridate::year(dates_from_subtd$Date)-1),"04-01",sep="-"))
colnames(date_installed)<-"Date Installed"

dates_from_subtd<-cbind(dates_from_subtd,date_installed)
rm(date_installed)

dates_from_subtd<-dates_from_subtd[,c(1,3)]


KSB<-left_join(KSB,dates_from_subtd)
## Joining, by = "Equipment"
rm(dates_from_subtd)

dates_from_subtd<-KSB %>% select(Equipment,Notif.date,`Date Installed`)  

dates_from_subtd$`Date Installed`<-as.Date(dates_from_subtd$`Date Installed`)

dates_from_subtd<-dates_from_subtd %>% mutate(diff_date=(Notif.date - `Date Installed`))

dates_from_subtd %>% group_by(Equipment) %>% summarise(Date=min(diff_date))
## # A tibble: 7 x 2
##   Equipment     Date
##       <chr>   <time>
## 1    21P24A 332 days
## 2   AGA901A 380 days
## 3    P2001A 377 days
## 4     P701A 330 days
## 5     P702A 287 days
## 6  UGA1107A 448 days
## 7  UGA2201A 321 days

The above table gives us the the days between which the pump had fault for the first time

Can use %in% operator to selectively filter some of the values

KSB<-KSB %>% select(`Notifictn type`,Equipment,Notif.date,`Date Installed`)
KSB<-KSB %>% group_by(Equipment,Notif.date) %>% arrange(Equipment,desc(Notif.date))

KSB<-as.data.frame(KSB)

# to calculate difference between successive defaults

KSB<-KSB %>% mutate(diff_days=Notif.date -lead(Notif.date))

KSB$diff_days<-as.numeric(KSB$diff_days)

# We are running the loop max-1 because there is no lead for the last item which generates NA's and we know NA's cause lot of troubles in R

for(i in 1:(nrow(KSB)-1)){
if(KSB$diff_days[i] < 0)
{
KSB$diff_days[i]<--1  
}
else{
KSB$diff_days[i]<-KSB$diff_days[i]
}
    
}

We need to manually put the dates which are set to zero because it has to be subtracted from the data installed

Although we can easily replace it with few rows I want to write code which is generic For this we have to know how many rows each category has and will then replace them will the last one because they are the ones which are zero or NA

# setting the last NA to zero

KSB[is.na(KSB[,5]),5]<--1

# Now wherever there are zeroes we can replace them with the date installed subtracted from the first fault occurence

KSB$`Date Installed`<-as.Date(KSB$`Date Installed`)

for(i in 1:nrow(KSB)){
  
  if(KSB[i,5]==-1){
    KSB[i,5]<-KSB[i,3]-KSB[i,4]
    
  }
}

NOTE: It might be a wrong practice to carry the dates installed as it is carrying the same data.

CompTurbine

We see that CompTurbine the dates are not arranged in a strictly decreasing order within a category.

dates_from_subtd<-CompTurbine%>% group_by(Equipment) %>% summarise(Date=min(Notif.date))

#For every pump we got the min date and hence subtracting one year from the date we can get our date for analysis 


date_installed<-as.data.frame(paste((lubridate::year(dates_from_subtd$Date)-1),"04-01",sep="-"))
colnames(date_installed)<-"Date Installed"

dates_from_subtd<-cbind(dates_from_subtd,date_installed)
rm(date_installed)

dates_from_subtd<-dates_from_subtd[,c(1,3)]


CompTurbine<-left_join(CompTurbine,dates_from_subtd)
## Joining, by = "Equipment"
rm(dates_from_subtd)

dates_from_subtd<-CompTurbine %>% select(Equipment,Notif.date,`Date Installed`)  

dates_from_subtd$`Date Installed`<-as.Date(dates_from_subtd$`Date Installed`)

dates_from_subtd<-dates_from_subtd %>% mutate(diff_date=(Notif.date - `Date Installed`))

dates_from_subtd %>% group_by(Equipment) %>% summarise(Date=min(diff_date))
## # A tibble: 4 x 2
##   Equipment Date  
##   <chr>     <time>
## 1 11K01     362   
## 2 11TK01    294   
## 3 K431      365   
## 4 P607C     440

The above table gives us the the days between which the pump had fault for the first time

Can use %in% operator to selectively filter some of the values

CompTurbine<-CompTurbine %>% select(`Notifictn type`,Equipment,Notif.date,`Date Installed`)
CompTurbine<-CompTurbine %>% group_by(Equipment,Notif.date) %>% arrange(Equipment,desc(Notif.date))
CompTurbine<-as.data.frame(CompTurbine)

# to calculate difference between successive defaults

CompTurbine<-CompTurbine %>% mutate(diff_days=Notif.date -lead(Notif.date))

CompTurbine$diff_days<-as.numeric(CompTurbine$diff_days)

# We are running the loop max-1 because there is no lead for the last item which generates NA's and we know NA's cause lot of troubles in R

for(i in 1:(nrow(CompTurbine)-1)){
if(CompTurbine$diff_days[i] < 0)
{
CompTurbine$diff_days[i]<--1  
}
else{
CompTurbine$diff_days[i]<-CompTurbine$diff_days[i]
}
    
}

We need to manually put the dates which are set to zero because it has to be subtracted from the data installed

Although we can easily replace it with few rows I want to write code which is generic For this we have to know how many rows each category has and will then replace them will the last one because they are the ones which are zero or NA

# setting the last NA to zero

CompTurbine[is.na(CompTurbine[,5]),5]<--1

# Now wherever there are zeroes we can replace them with the date installed subtracted from the first fault occurence

CompTurbine$`Date Installed`<-as.Date(CompTurbine$`Date Installed`)

for(i in 1:nrow(CompTurbine)){
  
  if(CompTurbine[i,5]==-1){
    CompTurbine[i,5]<-CompTurbine[i,3]-CompTurbine[i,4]
    
  }
}

NOTE: It might be a wrong practice to carry the dates installed as it is carrying the same data.

After calculating the fault frequency in terms of days for all the three equipments let us put the output in a nice format

knitr::kable(Kirlosker,caption = 'Kirlosker')
Kirlosker
Notifictn type Equipment Notif.date Date Installed diff_days
M1 11P19A 2017-05-18 2009-04-01 1074
M1 11P19A 2014-06-09 2009-04-01 154
M1 11P19A 2014-01-06 2009-04-01 113
M1 11P19A 2013-09-15 2009-04-01 52
M1 11P19A 2013-07-25 2009-04-01 172
M1 11P19A 2013-02-03 2009-04-01 85
M1 11P19A 2012-11-10 2009-04-01 101
M1 11P19A 2012-08-01 2009-04-01 14
M1 11P19A 2012-07-18 2009-04-01 65
M1 11P19A 2012-05-14 2009-04-01 389
M1 11P19A 2011-04-21 2009-04-01 420
M1 11P19A 2010-02-25 2009-04-01 330
M1 AGA951 2017-01-11 2010-04-01 391
M1 AGA951 2015-12-17 2010-04-01 576
M1 AGA951 2014-05-20 2010-04-01 664
M1 AGA951 2012-07-25 2010-04-01 524
M1 AGA951 2011-02-17 2010-04-01 322
M1 AGA952A 2016-10-28 2008-04-01 27
M1 AGA952A 2016-10-01 2008-04-01 90
M1 AGA952A 2016-07-03 2008-04-01 122
M1 AGA952A 2016-03-03 2008-04-01 629
M1 AGA952A 2014-06-13 2008-04-01 314
M1 AGA952A 2013-08-03 2008-04-01 9
M1 AGA952A 2013-07-25 2008-04-01 562
M1 AGA952A 2012-01-10 2008-04-01 113
M1 AGA952A 2011-09-19 2008-04-01 767
M1 AGA952A 2009-08-13 2008-04-01 73
M1 AGA952A 2009-06-01 2008-04-01 12
M1 AGA952A 2009-05-20 2008-04-01 414
M1 AGA955 2017-06-07 2008-04-01 300
M1 AGA955 2016-08-11 2008-04-01 246
M1 AGA955 2015-12-09 2008-04-01 195
M1 AGA955 2015-05-28 2008-04-01 114
M1 AGA955 2015-02-03 2008-04-01 159
M1 AGA955 2014-08-28 2008-04-01 1179
M1 AGA955 2011-06-06 2008-04-01 298
M1 AGA955 2010-08-12 2008-04-01 285
M1 AGA955 2009-10-31 2008-04-01 211
M1 AGA955 2009-04-03 2008-04-01 2
M1 AGA955 2009-04-01 2008-04-01 365
Z1 CGA201A 2017-01-02 2008-04-01 621
M1 CGA201A 2015-04-22 2008-04-01 194
Z1 CGA201A 2014-10-10 2008-04-01 240
Z1 CGA201A 2014-02-12 2008-04-01 544
Z1 CGA201A 2012-08-17 2008-04-01 302
Z1 CGA201A 2011-10-20 2008-04-01 367
Z1 CGA201A 2010-10-18 2008-04-01 374
Z1 CGA201A 2009-10-09 2008-04-01 556
M1 P301A 2017-02-16 2009-04-01 49
Z1 P301A 2016-12-29 2009-04-01 324
Z1 P301A 2016-02-09 2009-04-01 288
Z1 P301A 2015-04-27 2009-04-01 0
M1 P301A 2015-04-27 2009-04-01 441
Z1 P301A 2014-02-10 2009-04-01 368
Z1 P301A 2013-02-07 2009-04-01 372
M1 P301A 2012-02-01 2009-04-01 16
Z1 P301A 2012-01-16 2009-04-01 349
Z1 P301A 2011-02-01 2009-04-01 363
Z1 P301A 2010-02-03 2009-04-01 308
Z1 UGA302A 2016-05-24 2008-04-01 320
Z1 UGA302A 2015-07-09 2008-04-01 351
Z1 UGA302A 2014-07-23 2008-04-01 392
Z1 UGA302A 2013-06-26 2008-04-01 347
Z1 UGA302A 2012-07-14 2008-04-01 369
Z1 UGA302A 2011-07-11 2008-04-01 373
Z1 UGA302A 2010-07-03 2008-04-01 247
Z1 UGA302A 2009-10-29 2008-04-01 576
knitr::kable(KSB,caption = 'KSB')
KSB
Notifictn type Equipment Notif.date Date Installed diff_days
Z3 21P24A 2016-10-25 2008-04-01 976
Z3 21P24A 2014-02-22 2008-04-01 676
Z3 21P24A 2012-04-17 2008-04-01 353
Z3 21P24A 2011-04-30 2008-04-01 387
Z3 21P24A 2010-04-08 2008-04-01 382
M1 21P24A 2009-03-22 2008-04-01 23
Z3 21P24A 2009-02-27 2008-04-01 332
Z1 AGA901A 2017-04-07 2008-04-01 367
Z1 AGA901A 2016-04-05 2008-04-01 780
M1 AGA901A 2014-02-15 2008-04-01 304
M1 AGA901A 2013-04-17 2008-04-01 489
M1 AGA901A 2011-12-15 2008-04-01 358
M1 AGA901A 2010-12-22 2008-04-01 615
Z1 AGA901A 2009-04-16 2008-04-01 380
M1 P2001A 2017-05-29 2010-04-01 24
M1 P2001A 2017-05-05 2010-04-01 710
M1 P2001A 2015-05-26 2010-04-01 1504
M1 P2001A 2011-04-13 2010-04-01 377
Z3 P701A 2016-10-18 2008-04-01 711
M1 P701A 2014-11-07 2008-04-01 242
M1 P701A 2014-03-10 2008-04-01 703
Z3 P701A 2012-04-06 2008-04-01 343
Z3 P701A 2011-04-29 2008-04-01 384
Z3 P701A 2010-04-10 2008-04-01 409
Z3 P701A 2009-02-25 2008-04-01 330
Z1 P702A 2017-02-24 2008-04-01 232
M1 P702A 2016-07-07 2008-04-01 26
M1 P702A 2016-06-11 2008-04-01 5
M1 P702A 2016-06-06 2008-04-01 125
Z1 P702A 2016-02-02 2008-04-01 376
Z1 P702A 2015-01-22 2008-04-01 365
Z1 P702A 2014-01-22 2008-04-01 330
Z1 P702A 2013-02-26 2008-04-01 473
Z1 P702A 2011-11-11 2008-04-01 330
Z1 P702A 2010-12-16 2008-04-01 315
Z1 P702A 2010-02-04 2008-04-01 387
Z1 P702A 2009-01-13 2008-04-01 287
Z1 UGA1107A 2017-04-12 2008-04-01 190
Z1 UGA1107A 2016-10-04 2008-04-01 599
Z1 UGA1107A 2015-02-13 2008-04-01 663
Z1 UGA1107A 2013-04-21 2008-04-01 408
Z1 UGA1107A 2012-03-09 2008-04-01 283
Z1 UGA1107A 2011-05-31 2008-04-01 354
Z1 UGA1107A 2010-06-11 2008-04-01 353
Z1 UGA1107A 2009-06-23 2008-04-01 448
M1 UGA2201A 2017-06-01 2008-04-01 415
Z1 UGA2201A 2016-04-12 2008-04-01 357
Z1 UGA2201A 2015-04-21 2008-04-01 313
M1 UGA2201A 2014-06-12 2008-04-01 23
M1 UGA2201A 2014-05-20 2008-04-01 13
Z1 UGA2201A 2014-05-07 2008-04-01 332
M1 UGA2201A 2013-06-09 2008-04-01 16
Z1 UGA2201A 2013-05-24 2008-04-01 252
M1 UGA2201A 2012-09-14 2008-04-01 136
M1 UGA2201A 2012-05-01 2008-04-01 705
Z1 UGA2201A 2010-05-27 2008-04-01 5
Z1 UGA2201A 2010-05-22 2008-04-01 167
M1 UGA2201A 2009-12-06 2008-04-01 74
M1 UGA2201A 2009-09-23 2008-04-01 127
Z1 UGA2201A 2009-05-19 2008-04-01 92
M1 UGA2201A 2009-02-16 2008-04-01 321
knitr::kable(CompTurbine,caption = 'CompTurbine')
CompTurbine
Notifictn type Equipment Notif.date Date Installed diff_days
M1 11K01 2014-09-29 2009-04-01 183
M1 11K01 2014-03-30 2009-04-01 34
M1 11K01 2014-02-24 2009-04-01 74
M1 11K01 2013-12-12 2009-04-01 170
Z3 11K01 2013-06-25 2009-04-01 46
Z3 11K01 2013-05-10 2009-04-01 523
M1 11K01 2011-12-04 2009-04-01 244
Z3 11K01 2011-04-04 2009-04-01 0
Z3 11K01 2011-04-04 2009-04-01 217
M1 11K01 2010-08-30 2009-04-01 96
M1 11K01 2010-05-26 2009-04-01 22
M1 11K01 2010-05-04 2009-04-01 3
M1 11K01 2010-05-01 2009-04-01 33
Z3 11K01 2010-03-29 2009-04-01 0
Z3 11K01 2010-03-29 2009-04-01 362
Z3 11TK01 2017-02-02 2008-04-01 0
Z3 11TK01 2017-02-02 2008-04-01 69
M1 11TK01 2016-11-25 2008-04-01 9
M1 11TK01 2016-11-16 2008-04-01 4
M1 11TK01 2016-11-12 2008-04-01 115
M1 11TK01 2016-07-20 2008-04-01 37
Z3 11TK01 2016-06-13 2008-04-01 2
M1 11TK01 2016-06-11 2008-04-01 80
Z3 11TK01 2016-03-23 2008-04-01 0
Z3 11TK01 2016-03-23 2008-04-01 0
Z3 11TK01 2016-03-23 2008-04-01 5
Z3 11TK01 2016-03-18 2008-04-01 0
Z3 11TK01 2016-03-18 2008-04-01 12
Z3 11TK01 2016-03-06 2008-04-01 239
M1 11TK01 2015-07-11 2008-04-01 145
M1 11TK01 2015-02-16 2008-04-01 30
Z3 11TK01 2015-01-17 2008-04-01 39
Z3 11TK01 2014-12-09 2008-04-01 74
M1 11TK01 2014-09-26 2008-04-01 4
M1 11TK01 2014-09-22 2008-04-01 24
M1 11TK01 2014-08-29 2008-04-01 64
M1 11TK01 2014-06-26 2008-04-01 9
M1 11TK01 2014-06-17 2008-04-01 1
M1 11TK01 2014-06-16 2008-04-01 9
M1 11TK01 2014-06-07 2008-04-01 10
M1 11TK01 2014-05-28 2008-04-01 3
M1 11TK01 2014-05-25 2008-04-01 0
M1 11TK01 2014-05-25 2008-04-01 19
Z3 11TK01 2014-05-06 2008-04-01 1
Z3 11TK01 2014-05-05 2008-04-01 38
M1 11TK01 2014-03-28 2008-04-01 3
M1 11TK01 2014-03-25 2008-04-01 72
M1 11TK01 2014-01-12 2008-04-01 42
Z3 11TK01 2013-12-01 2008-04-01 33
Z3 11TK01 2013-10-29 2008-04-01 0
Z3 11TK01 2013-10-29 2008-04-01 46
M1 11TK01 2013-09-13 2008-04-01 0
Z3 11TK01 2013-09-13 2008-04-01 32
M1 11TK01 2013-08-12 2008-04-01 18
Z3 11TK01 2013-07-25 2008-04-01 26
Z3 11TK01 2013-06-29 2008-04-01 4
Z3 11TK01 2013-06-25 2008-04-01 19
Z3 11TK01 2013-06-06 2008-04-01 54
M1 11TK01 2013-04-13 2008-04-01 95
Z3 11TK01 2013-01-08 2008-04-01 0
Z3 11TK01 2013-01-08 2008-04-01 69
M1 11TK01 2012-10-31 2008-04-01 38
M1 11TK01 2012-09-23 2008-04-01 68
M1 11TK01 2012-07-17 2008-04-01 33
M1 11TK01 2012-06-14 2008-04-01 43
M1 11TK01 2012-05-02 2008-04-01 10
M1 11TK01 2012-04-22 2008-04-01 72
Z3 11TK01 2012-02-10 2008-04-01 9
M1 11TK01 2012-02-01 2008-04-01 23
Z3 11TK01 2012-01-09 2008-04-01 35
M1 11TK01 2011-12-05 2008-04-01 92
Z3 11TK01 2011-09-04 2008-04-01 55
M1 11TK01 2011-07-11 2008-04-01 0
M1 11TK01 2011-07-11 2008-04-01 20
Z3 11TK01 2011-06-21 2008-04-01 1
Z3 11TK01 2011-06-20 2008-04-01 25
M1 11TK01 2011-05-26 2008-04-01 7
M1 11TK01 2011-05-19 2008-04-01 10
M1 11TK01 2011-05-09 2008-04-01 39
Z3 11TK01 2011-03-31 2008-04-01 33
M1 11TK01 2011-02-26 2008-04-01 151
M1 11TK01 2010-09-28 2008-04-01 4
M1 11TK01 2010-09-24 2008-04-01 121
M1 11TK01 2010-05-26 2008-04-01 25
M1 11TK01 2010-05-01 2008-04-01 0
M1 11TK01 2010-05-01 2008-04-01 4
M1 11TK01 2010-04-27 2008-04-01 24
M1 11TK01 2010-04-03 2008-04-01 58
M1 11TK01 2010-02-04 2008-04-01 54
M1 11TK01 2009-12-12 2008-04-01 30
Z3 11TK01 2009-11-12 2008-04-01 111
M1 11TK01 2009-07-24 2008-04-01 15
M1 11TK01 2009-07-09 2008-04-01 76
M1 11TK01 2009-04-24 2008-04-01 94
M1 11TK01 2009-01-20 2008-04-01 294
M1 K431 2015-11-05 2009-04-01 622
M1 K431 2014-02-21 2009-04-01 155
M1 K431 2013-09-19 2009-04-01 24
M1 K431 2013-08-26 2009-04-01 65
M1 K431 2013-06-22 2009-04-01 78
M1 K431 2013-04-05 2009-04-01 228
M1 K431 2012-08-20 2009-04-01 31
M1 K431 2012-07-20 2009-04-01 0
M1 K431 2012-07-20 2009-04-01 86
M1 K431 2012-04-25 2009-04-01 44
M1 K431 2012-03-12 2009-04-01 265
M1 K431 2011-06-21 2009-04-01 205
M1 K431 2010-11-28 2009-04-01 0
M1 K431 2010-11-28 2009-04-01 146
M1 K431 2010-07-05 2009-04-01 95
M1 K431 2010-04-01 2009-04-01 365
M1 P607C 2016-10-13 2008-04-01 104
M1 P607C 2016-07-01 2008-04-01 88
Z1 P607C 2016-04-04 2008-04-01 365
Z1 P607C 2015-04-05 2008-04-01 43
Z1 P607C 2015-02-21 2008-04-01 189
M1 P607C 2014-08-16 2008-04-01 184
M1 P607C 2014-02-13 2008-04-01 188
M1 P607C 2013-08-09 2008-04-01 119
M1 P607C 2013-04-12 2008-04-01 8
Z1 P607C 2013-04-04 2008-04-01 0
Z1 P607C 2013-04-04 2008-04-01 336
M1 P607C 2012-05-03 2008-04-01 65
Z1 P607C 2012-02-28 2008-04-01 97
M1 P607C 2011-11-23 2008-04-01 156
M1 P607C 2011-06-20 2008-04-01 90
M1 P607C 2011-03-22 2008-04-01 45
Z1 P607C 2011-02-05 2008-04-01 600
Z1 P607C 2009-06-15 2008-04-01 440

Statistical Analysis

We will first observe the five number summary for Kirlosker, KSB and Compressor and Turbine

For some readers who do not know about Kirlosker and KSB following are the links https://kirloskar.com and https://www.ksb.com/ksb-en/

summary(Kirlosker$diff_days)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   118.0   314.0   313.2   390.0  1179.0
summary(KSB$diff_days)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     5.0   232.0   353.0   361.6   409.0  1504.0
summary(CompTurbine$diff_days)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   39.00   83.43   95.00  622.00

Using the Notification Type to analyze

Kirlosker %>% group_by(`Notifictn type`) %>% summarise(mean(diff_days),median(diff_days))
## # A tibble: 2 x 3
##   `Notifictn type` `mean(diff_days)` `median(diff_days)`
##   <chr>                        <dbl>               <dbl>
## 1 M1                             287                 203
## 2 Z1                             363                 363
KSB %>% group_by(`Notifictn type`) %>% summarise(mean(diff_days),median(diff_days))
## # A tibble: 3 x 3
##   `Notifictn type` `mean(diff_days)` `median(diff_days)`
##   <chr>                        <dbl>               <dbl>
## 1 M1                             319                 242
## 2 Z1                             350                 353
## 3 Z3                             480                 384
CompTurbine %>% group_by(`Notifictn type`) %>% summarise(mean(diff_days),median(diff_days))
## # A tibble: 3 x 3
##   `Notifictn type` `mean(diff_days)` `median(diff_days)`
##   <chr>                        <dbl>               <dbl>
## 1 M1                            76.6                43.0
## 2 Z1                           259                 262  
## 3 Z3                            60.5                29.0

As we can see that Z is preventive maintainance and M is natural fault that occurs We see that Z category has a mean difference greater than M

Converting into factor variables

Kirlosker$`Notifictn type`<-as.factor(Kirlosker$`Notifictn type`)
KSB$`Notifictn type`<-as.factor(KSB$`Notifictn type`)
CompTurbine$`Notifictn type`<-as.factor(CompTurbine$`Notifictn type`)

Now putting linear regression

Y(dependent variable) = Future date of failure , calculated based on the historical analysis . However dates which are quite close to each other having diff_days=0 have to be left out since MTBF will be wrong if we include those

X (independent variable) = Can be a factor variable which in this case can be notification type.

kker<-Kirlosker %>% filter(diff_days>0)

kker_model<-lm(formula=kker$diff_days~kker$`Notifictn type`,data=kker)
summary(kker_model)
## 
## Call:
## lm(formula = kker$diff_days ~ kker$`Notifictn type`, data = kker)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -285.09 -158.72  -29.59   95.91  891.91 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               287.09      34.70   8.273 1.07e-11 ***
## kker$`Notifictn type`Z1    92.50      60.11   1.539    0.129    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 230.2 on 64 degrees of freedom
## Multiple R-squared:  0.03568,    Adjusted R-squared:  0.02062 
## F-statistic: 2.368 on 1 and 64 DF,  p-value: 0.1288
ksber<-KSB %>% filter(diff_days>0)

ksber_model<-lm(formula=ksber$diff_days~ksber$`Notifictn type`,data=ksber)
summary(ksber_model)
## 
## Call:
## lm(formula = ksber$diff_days ~ ksber$`Notifictn type`, data = ksber)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -344.56 -150.27  -19.56   58.44 1185.09 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                318.91      54.05   5.900 1.99e-07 ***
## ksber$`Notifictn type`Z1    30.64      73.55   0.417   0.6785    
## ksber$`Notifictn type`Z3   161.36      95.02   1.698   0.0948 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 259.2 on 58 degrees of freedom
## Multiple R-squared:  0.04899,    Adjusted R-squared:  0.0162 
## F-statistic: 1.494 on 2 and 58 DF,  p-value: 0.233
CompTur<-CompTurbine %>% filter(diff_days>0)

CompTur_model<-lm(formula=CompTur$diff_days~CompTur$`Notifictn type`,data=CompTur)
summary(CompTur_model)
## 
## Call:
## lm(formula = CompTur$diff_days ~ CompTur$`Notifictn type`, data = CompTur)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -252.71  -63.19  -37.94   19.56  539.56 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  82.443     12.457   6.618 1.34e-09 ***
## CompTur$`Notifictn type`Z1  213.271     43.662   4.885 3.51e-06 ***
## CompTur$`Notifictn type`Z3   -4.622     24.351  -0.190     0.85    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 110.7 on 111 degrees of freedom
## Multiple R-squared:  0.182,  Adjusted R-squared:  0.1672 
## F-statistic: 12.35 on 2 and 111 DF,  p-value: 1.441e-05
library(ggplot2)
g1<-ggplot(aes(diff_days,`Notifictn type`),data = kker)+
geom_col(aes(`Notifictn type`,diff_days))
  plot(g1)

g3<-ggplot(aes(diff_days,`Notifictn type`),data = kker)+
geom_point(aes(diff_days,`Notifictn type`))
  plot(g3)

library(ggplot2)
g2<-ggplot(aes(diff_days,`Notifictn type`),data = ksber)+
geom_count(aes(`Notifictn type`,diff_days))
plot(g2)

g4<-ggplot(aes(diff_days,`Notifictn type`),data = ksber)+
geom_point(aes(diff_days,`Notifictn type`))
  plot(g4)

g5<-ggplot(aes(diff_days,`Notifictn type`),data = CompTur)+
geom_point(aes(diff_days,`Notifictn type`))
  plot(g5)

g6<-ggplot(aes(diff_days,`Notifictn type`),data = CompTur)+
geom_col(aes(`Notifictn type`,diff_days))
  plot(g6)

Let us bind all the datasets and see if we are able to predcit

Total<-rbind(Kirlosker %>% select(Equipment,diff_days,`Notifictn type`),KSB %>% select(Equipment,diff_days,`Notifictn type`),CompTurbine %>% select(Equipment,diff_days,`Notifictn type`))


g7<-ggplot(aes(diff_days,`Notifictn type`),data = Total)+
geom_point(aes(`Notifictn type`,diff_days))
  plot(g7)

If we try to group them based on the categories of the notification then we see that there is no feature in the dataset which can disect this dataset into three categories

Total<-Total %>% filter(diff_days >0)

results<-kmeans(Total$diff_days,3)

table(Total$`Notifictn type`,results$cluster)
##     
##        1   2   3
##   M1 105  13  28
##   Z1   7   8  41
##   Z3  24   3  12

How the overall data is behaving irrespective of any pumps.An attempt to figure out something from the plot

plot(Total$diff_days,type='o')

Are there any outliers

boxplot(Total$diff_days)

From the boxplot we see that observations that are below the median don’t have quite a variance but above the median the observations are quite far apart as illustrated from the box plot.

What is causing such behavior I don’t know. We may have to slice the data from different angles so that we can get a better perspective

We are not able to get good clusters as well.

After applying different techniques, I am thinking that a pump’s fault would be based on what kind/when repairing has been done.If pump has been repaired yesterday chances are less that it will need a repair within a month

Let us apply time series model

library(forecast)
mydata.arima101<-forecast::Arima(y = Total$diff_days,order = c(1,0,1))
mydata.pred1 <- predict(mydata.arima101, n.ahead=100)
plot (Total$diff_days)
lines(mydata.pred1$pred, col="blue")
lines(mydata.pred1$pred+2*mydata.pred1$se, col="red")
lines(mydata.pred1$pred-2*mydata.pred1$se, col="red")

Using the Arima 1,0,1 model and using the standard error , we are calculating a confidence interval for our prediction

mydata.pred1$pred+2*mydata.pred1$se
## Time Series:
## Start = 242 
## End = 341 
## Frequency = 1 
##   [1] 608.9806 611.1307 613.2139 615.2327 617.1894 619.0862 620.9254
##   [8] 622.7090 624.4389 626.1170 627.7452 629.3250 630.8583 632.3465
##  [15] 633.7912 635.1939 636.5558 637.8785 639.1631 640.4109 641.6231
##  [22] 642.8008 643.9452 645.0573 646.1381 647.1886 648.2098 649.2025
##  [29] 650.1677 651.1062 652.0189 652.9064 653.7697 654.6093 655.4261
##  [36] 656.2208 656.9939 657.7461 658.4782 659.1905 659.8838 660.5586
##  [43] 661.2155 661.8549 662.4774 663.0834 663.6735 664.2482 664.8077
##  [50] 665.3527 665.8834 666.4004 666.9039 667.3945 667.8724 668.3380
##  [57] 668.7916 669.2336 669.6644 670.0842 670.4933 670.8921 671.2807
##  [64] 671.6596 672.0290 672.3890 672.7401 673.0824 673.4162 673.7417
##  [71] 674.0591 674.3686 674.6706 674.9651 675.2523 675.5326 675.8060
##  [78] 676.0727 676.3329 676.5869 676.8347 677.0765 677.3125 677.5428
##  [85] 677.7676 677.9870 678.2012 678.4103 678.6145 678.8138 679.0084
##  [92] 679.1984 679.3840 679.5652 679.7422 679.9151 680.0839 680.2488
##  [99] 680.4099 680.5673
mydata.pred1$pred-2*mydata.pred1$se
## Time Series:
## Start = 242 
## End = 341 
## Frequency = 1 
##   [1] -179.4064 -180.9676 -182.4707 -183.9178 -185.3113 -186.6533 -187.9457
##   [8] -189.1906 -190.3898 -191.5450 -192.6580 -193.7303 -194.7634 -195.7589
##  [15] -196.7181 -197.6424 -198.5331 -199.3914 -200.2185 -201.0156 -201.7837
##  [22] -202.5238 -203.2371 -203.9244 -204.5867 -205.2249 -205.8398 -206.4323
##  [29] -207.0031 -207.5531 -208.0829 -208.5933 -209.0849 -209.5584 -210.0145
##  [36] -210.4537 -210.8766 -211.2838 -211.6759 -212.0533 -212.4166 -212.7663
##  [43] -213.1028 -213.4266 -213.7381 -214.0377 -214.3260 -214.6032 -214.8697
##  [50] -215.1259 -215.3722 -215.6089 -215.8363 -216.0547 -216.2646 -216.4661
##  [57] -216.6595 -216.8451 -217.0233 -217.1942 -217.3581 -217.5153 -217.6659
##  [64] -217.8102 -217.9485 -218.0809 -218.2077 -218.3290 -218.4450 -218.5559
##  [71] -218.6620 -218.7633 -218.8600 -218.9523 -219.0404 -219.1244 -219.2044
##  [78] -219.2806 -219.3532 -219.4221 -219.4877 -219.5500 -219.6091 -219.6652
##  [85] -219.7183 -219.7685 -219.8160 -219.8609 -219.9033 -219.9432 -219.9807
##  [92] -220.0160 -220.0491 -220.0800 -220.1090 -220.1360 -220.1612 -220.1845
##  [99] -220.2062 -220.2261

As we can see the red part is in negative days so we will have to adjust our prediction

In time series based approach we don’t have to calculate the differences between successive observations. Times series methods inherently take the differences and the differences can be of first order or second order or more depending on the stationariness of the data

There are various tests to check the stationarity of the data

I know of Dickey-Fuller test so I will be using it

Total<-Total %>% filter(diff_days>0)



tseries::adf.test(Total$diff_days,alternative = "stationary",k = 0)
## Warning in tseries::adf.test(Total$diff_days, alternative = "stationary", :
## p-value smaller than printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  Total$diff_days
## Dickey-Fuller = -11.718, Lag order = 0, p-value = 0.01
## alternative hypothesis: stationary
acf(Total$diff_days)

pacf(Total$diff_days)

ggplot(data = Total,aes(diff_days,Equipment))+
geom_line(aes(diff_days,color=Equipment))+
geom_point(aes(diff_days,color=Equipment))+
geom_vline(xintercept = 320)

The point of drawing such a plot is to see the distribution of the differences in the successive two faults. Drawing a line at h=320 because this line intersects with every other pump.It means that for every pumps there will probably be one observation

In conclusion, probabilty of occurence is maximum

We can take a reference point for dates. For every pump there is a date installed

ggplot(Kirlosker,aes(diff_days,Equipment))+
geom_point(aes(diff_days,Equipment),data = Kirlosker)  

We will be plotting the dates as a function of index

ggplot(Kirlosker,aes(diff_days))+
geom_line(aes(y=diff_days,x=seq(1:67)),data = Kirlosker)

ggplot(Kirlosker,aes(diff_days))+
geom_line(aes(y=diff_days,x=seq(1:61)),data = KSB)

ggplot(Kirlosker,aes(diff_days))+
geom_line(aes(y=diff_days,x=seq(1:129)),data = CompTurbine)