#Purpose of this exercise is to explore functions in R
#related to lists & date-time

# Directory that holds the input file. Building the full path with
# file.path() avoids setwd(), which would change the working directory of
# the whole R session as a side effect.
data_dir <- "C:/Users/Hai/Downloads"

# Load the machine utilization data set (columns Timestamp, Machine and
# Percent.Idle). read.csv() already reads blank cells in numeric columns
# as NA, so no na.strings argument is needed for Percent.Idle.
util <- read.csv(file.path(data_dir, "P3-Machine-Utilization.csv"))

# Inspect the raw import, starting with the column types.
str(util)
## 'data.frame':    3600 obs. of  3 variables:
##  $ Timestamp   : chr  "01/09/2016 00:00" "01/09/2016 01:00" "01/09/2016 02:00" "01/09/2016 03:00" ...
##  $ Machine     : chr  "RL1" "RL1" "RL1" "RL1" ...
##  $ Percent.Idle: num  NA NA NA NA NA ...
# Preview the first 12 rows.
head(util, n = 12)
##           Timestamp Machine Percent.Idle
## 1  01/09/2016 00:00     RL1           NA
## 2  01/09/2016 01:00     RL1           NA
## 3  01/09/2016 02:00     RL1           NA
## 4  01/09/2016 03:00     RL1           NA
## 5  01/09/2016 04:00     RL1           NA
## 6  01/09/2016 05:00     RL1           NA
## 7  01/09/2016 06:00     RL1           NA
## 8  01/09/2016 07:00     RL1   0.01994048
## 9  01/09/2016 08:00     RL1   0.01997024
## 10 01/09/2016 09:00     RL1   0.02119048
## 11 01/09/2016 10:00     RL1   0.01375000
## 12 01/09/2016 11:00     RL1   0.01916667
# Per-column summary of the raw data set.
summary(util)
##   Timestamp           Machine           Percent.Idle   
##  Length:3600        Length:3600        Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0262  
##  Mode  :character   Mode  :character   Median :0.0410  
##                                        Mean   :0.0431  
##                                        3rd Qu.:0.0576  
##                                        Max.   :0.1508  
##                                        NA's   :361
# Machine is a categorical identifier, so store it as a factor; summary()
# then reports a row count per machine instead of only "character".
util[["Machine"]] <- factor(util[["Machine"]])

# Summary again, now with the per-machine counts.
summary(util)
##   Timestamp         Machine     Percent.Idle   
##  Length:3600        RL1 :720   Min.   :0.0000  
##  Class :character   RL2 :720   1st Qu.:0.0262  
##  Mode  :character   SR1 :720   Median :0.0410  
##                     SR4A:720   Mean   :0.0431  
##                     SR6 :720   3rd Qu.:0.0576  
##                                Max.   :0.1508  
##                                NA's   :361
# Utilization is the complement of the idle share; rows where Percent.Idle
# is NA stay NA.
util[["Utilization"]] <- 1 - util[["Percent.Idle"]]

# Preview the first 12 rows, now including the Utilization column.
head(util, n = 12)
##           Timestamp Machine Percent.Idle Utilization
## 1  01/09/2016 00:00     RL1           NA          NA
## 2  01/09/2016 01:00     RL1           NA          NA
## 3  01/09/2016 02:00     RL1           NA          NA
## 4  01/09/2016 03:00     RL1           NA          NA
## 5  01/09/2016 04:00     RL1           NA          NA
## 6  01/09/2016 05:00     RL1           NA          NA
## 7  01/09/2016 06:00     RL1           NA          NA
## 8  01/09/2016 07:00     RL1   0.01994048   0.9800595
## 9  01/09/2016 08:00     RL1   0.01997024   0.9800298
## 10 01/09/2016 09:00     RL1   0.02119048   0.9788095
## 11 01/09/2016 10:00     RL1   0.01375000   0.9862500
## 12 01/09/2016 11:00     RL1   0.01916667   0.9808333
# The first 12 rows are all dated 01/09, so they cannot tell us whether the
# date format is day/month or month/day; look at the last 6 rows as well.
tail(util, n = 6)
##             Timestamp Machine Percent.Idle Utilization
## 3595 30/09/2016 18:00     SR6   0.03485507   0.9651449
## 3596 30/09/2016 19:00     SR6   0.04605073   0.9539493
## 3597 30/09/2016 20:00     SR6   0.04427536   0.9557246
## 3598 30/09/2016 21:00     SR6   0.04141304   0.9585870
## 3599 30/09/2016 22:00     SR6   0.06750000   0.9325000
## 3600 30/09/2016 23:00     SR6   0.05355073   0.9464493
# The tail output starts its dates with 30, so the format is day/month/year.
# Parse the Timestamp text into a POSIXct date-time and keep it in a new
# PosixTime column.
# NOTE(review): no tz is passed, so the session's local time zone is used
# (the recorded UnknownHours output later in this file shows EDT) --
# confirm that this is intended.
util[["PosixTime"]] <- as.POSIXct(
  util[["Timestamp"]],
  format = "%d/%m/%Y %H:%M"
)

# Preview the first 6 rows with the new column.
head(util, n = 6)
##          Timestamp Machine Percent.Idle Utilization           PosixTime
## 1 01/09/2016 00:00     RL1           NA          NA 2016-09-01 00:00:00
## 2 01/09/2016 01:00     RL1           NA          NA 2016-09-01 01:00:00
## 3 01/09/2016 02:00     RL1           NA          NA 2016-09-01 02:00:00
## 4 01/09/2016 03:00     RL1           NA          NA 2016-09-01 03:00:00
## 5 01/09/2016 04:00     RL1           NA          NA 2016-09-01 04:00:00
## 6 01/09/2016 05:00     RL1           NA          NA 2016-09-01 05:00:00
# Drop the raw Timestamp text and move PosixTime to the front. Selecting the
# columns by name does both in one step, does not depend on the current
# column positions, and gives the same result if this step is run twice.
util <- util[, c("PosixTime", "Machine", "Percent.Idle", "Utilization")]

# Preview the first 6 rows in the new column order.
head(util)
##             PosixTime Machine Percent.Idle Utilization
## 1 2016-09-01 00:00:00     RL1           NA          NA
## 2 2016-09-01 01:00:00     RL1           NA          NA
## 3 2016-09-01 02:00:00     RL1           NA          NA
## 4 2016-09-01 03:00:00     RL1           NA          NA
## 5 2016-09-01 04:00:00     RL1           NA          NA
## 6 2016-09-01 05:00:00     RL1           NA          NA
# Summary of the cleaned data set.
summary(util)
##    PosixTime                   Machine     Percent.Idle     Utilization    
##  Min.   :2016-09-01 00:00:00   RL1 :720   Min.   :0.0000   Min.   :0.8492  
##  1st Qu.:2016-09-08 11:45:00   RL2 :720   1st Qu.:0.0262   1st Qu.:0.9424  
##  Median :2016-09-15 23:30:00   SR1 :720   Median :0.0410   Median :0.9590  
##  Mean   :2016-09-15 23:30:00   SR4A:720   Mean   :0.0431   Mean   :0.9569  
##  3rd Qu.:2016-09-23 11:15:00   SR6 :720   3rd Qu.:0.0576   3rd Qu.:0.9738  
##  Max.   :2016-09-30 23:00:00              Max.   :0.1508   Max.   :1.0000  
##                                           NA's   :361      NA's   :361
# The rest of the analysis looks at machine RL1 only, so keep its rows in a
# separate data frame. which() keeps only the rows where the comparison is
# TRUE, so a missing Machine value cannot turn into an all-NA row here.
RL1 <- util[which(util$Machine == "RL1"), ]
# Rebuild the factor so the levels of the other machines are dropped.
RL1$Machine <- factor(RL1$Machine)

# Summary of the RL1 subset.
summary(RL1)
##    PosixTime                   Machine    Percent.Idle      Utilization    
##  Min.   :2016-09-01 00:00:00   RL1:720   Min.   :0.00500   Min.   :0.8492  
##  1st Qu.:2016-09-08 11:45:00             1st Qu.:0.03208   1st Qu.:0.9403  
##  Median :2016-09-15 23:30:00             Median :0.04613   Median :0.9539  
##  Mean   :2016-09-15 23:30:00             Mean   :0.04830   Mean   :0.9517  
##  3rd Qu.:2016-09-23 11:15:00             3rd Qu.:0.05967   3rd Qu.:0.9679  
##  Max.   :2016-09-30 23:00:00             Max.   :0.15077   Max.   :0.9950  
##                                          NA's   :7         NA's   :7
# Minimum, mean and maximum utilization of RL1, ignoring the unknown (NA)
# hours.
util_stats_RL1 <- c(
  min(RL1$Utilization, na.rm = TRUE),
  mean(RL1$Utilization, na.rm = TRUE),
  max(RL1$Utilization, na.rm = TRUE)
)

# Flag: did utilization ever drop below 90%? na.rm = TRUE leaves the NA
# hours out of the check.
util_under_90_flag <- any(RL1$Utilization < 0.9, na.rm = TRUE)

# Bundle the machine name, the three statistics and the flag into one list.
list_RL1 <- list(
  Machine = "RL1",
  Stats = util_stats_RL1,
  LoadThreshold = util_under_90_flag
)

# Print the list.
list_RL1
## $Machine
## [1] "RL1"
## 
## $Stats
## [1] 0.8492262 0.9516976 0.9950000
## 
## $LoadThreshold
## [1] TRUE
# Add the timestamps of the hours whose utilization is unknown (NA) as a
# new list component.
list_RL1$UnknownHours <- RL1$PosixTime[is.na(RL1$Utilization)]

# Print the extended list.
list_RL1
## $Machine
## [1] "RL1"
## 
## $Stats
## [1] 0.8492262 0.9516976 0.9950000
## 
## $LoadThreshold
## [1] TRUE
## 
## $UnknownHours
## [1] "2016-09-01 00:00:00 EDT" "2016-09-01 01:00:00 EDT"
## [3] "2016-09-01 02:00:00 EDT" "2016-09-01 03:00:00 EDT"
## [5] "2016-09-01 04:00:00 EDT" "2016-09-01 05:00:00 EDT"
## [7] "2016-09-01 06:00:00 EDT"
# Store the whole RL1 data frame in the list as the component "data".
list_RL1[["data"]] <- RL1

# One line per list component: its length, class and mode.
summary(list_RL1)
##               Length Class      Mode     
## Machine       1      -none-     character
## Stats         3      -none-     numeric  
## LoadThreshold 1      -none-     logical  
## UnknownHours  7      POSIXct    numeric  
## data          4      data.frame list
#activate ggplot package
library(ggplot2)

# Time-series plot of utilization: one panel per machine, a dotted gray
# reference line at the 90% level, and a centred title.
# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line widths in favour
# of `linewidth`; the recorded warning text below suggests an older version,
# so `size` is kept -- revisit when upgrading ggplot2.
ggplot(data = util) +
  geom_line(
    mapping = aes(x = PosixTime, y = Utilization, color = Machine),
    size = 1.2
  ) +
  facet_grid(Machine ~ .) +
  geom_hline(yintercept = 0.9, color = "Gray", size = 1.2, linetype = 3) +
  ggtitle("Machine Utilization over Time") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 155 row(s) containing missing values (geom_path).