#Purpose of this exercise is to explore functions in R
#related to lists & date-time
#point the session at the folder that holds the CSV
#NOTE(review): absolute, machine-specific path -- adjust before running elsewhere
setwd(dir = "C:/Users/Hai/Downloads")
#read the machine-utilization log (one row per timestamp and machine);
#read.csv() is called with its defaults, i.e. no na.strings override is passed
util <- read.csv(file = "P3-Machine-Utilization.csv")
#inspect the column names and types of the loaded data frame
str(object = util)
## 'data.frame': 3600 obs. of 3 variables:
## $ Timestamp : chr "01/09/2016 00:00" "01/09/2016 01:00" "01/09/2016 02:00" "01/09/2016 03:00" ...
## $ Machine : chr "RL1" "RL1" "RL1" "RL1" ...
## $ Percent.Idle: num NA NA NA NA NA ...
#preview the first 12 rows of the raw data
head(util, n = 12)
## Timestamp Machine Percent.Idle
## 1 01/09/2016 00:00 RL1 NA
## 2 01/09/2016 01:00 RL1 NA
## 3 01/09/2016 02:00 RL1 NA
## 4 01/09/2016 03:00 RL1 NA
## 5 01/09/2016 04:00 RL1 NA
## 6 01/09/2016 05:00 RL1 NA
## 7 01/09/2016 06:00 RL1 NA
## 8 01/09/2016 07:00 RL1 0.01994048
## 9 01/09/2016 08:00 RL1 0.01997024
## 10 01/09/2016 09:00 RL1 0.02119048
## 11 01/09/2016 10:00 RL1 0.01375000
## 12 01/09/2016 11:00 RL1 0.01916667
#per-column summary of the raw data (Machine is still character here)
summary(object = util)
## Timestamp Machine Percent.Idle
## Length:3600 Length:3600 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0262
## Mode :character Mode :character Median :0.0410
## Mean :0.0431
## 3rd Qu.:0.0576
## Max. :0.1508
## NA's :361
#treat Machine as categorical so summary() reports a row count per machine
util[["Machine"]] <- factor(x = util[["Machine"]])
#summary after the conversion
summary(object = util)
## Timestamp Machine Percent.Idle
## Length:3600 RL1 :720 Min. :0.0000
## Class :character RL2 :720 1st Qu.:0.0262
## Mode :character SR1 :720 Median :0.0410
## SR4A:720 Mean :0.0431
## SR6 :720 3rd Qu.:0.0576
## Max. :0.1508
## NA's :361
#utilization is the complement of the idle fraction; NA idle values stay NA
util[["Utilization"]] <- with(util, 1 - Percent.Idle)
#first 12 rows, now including the derived Utilization column
head(util, n = 12)
## Timestamp Machine Percent.Idle Utilization
## 1 01/09/2016 00:00 RL1 NA NA
## 2 01/09/2016 01:00 RL1 NA NA
## 3 01/09/2016 02:00 RL1 NA NA
## 4 01/09/2016 03:00 RL1 NA NA
## 5 01/09/2016 04:00 RL1 NA NA
## 6 01/09/2016 05:00 RL1 NA NA
## 7 01/09/2016 06:00 RL1 NA NA
## 8 01/09/2016 07:00 RL1 0.01994048 0.9800595
## 9 01/09/2016 08:00 RL1 0.01997024 0.9800298
## 10 01/09/2016 09:00 RL1 0.02119048 0.9788095
## 11 01/09/2016 10:00 RL1 0.01375000 0.9862500
## 12 01/09/2016 11:00 RL1 0.01916667 0.9808333
#the first 12 rows all fall on day 01 of month 09, so they cannot tell us
#whether the date format is day/month or month/day; look at the last
#6 rows instead
tail(util, n = 6L)
## Timestamp Machine Percent.Idle Utilization
## 3595 30/09/2016 18:00 SR6 0.03485507 0.9651449
## 3596 30/09/2016 19:00 SR6 0.04605073 0.9539493
## 3597 30/09/2016 20:00 SR6 0.04427536 0.9557246
## 3598 30/09/2016 21:00 SR6 0.04141304 0.9585870
## 3599 30/09/2016 22:00 SR6 0.06750000 0.9325000
## 3600 30/09/2016 23:00 SR6 0.05355073 0.9464493
#the tail shows day 30, so the format is day/month/year hour:minute;
#parse Timestamp into a new POSIXct date-time column
#NOTE(review): no tz is supplied, so the session's local time zone is used
util[["PosixTime"]] <- as.POSIXct(
  x = util[["Timestamp"]],
  format = "%d/%m/%Y %H:%M"
)
#show first 6 rows with the new column
head(util, n = 6L)
## Timestamp Machine Percent.Idle Utilization PosixTime
## 1 01/09/2016 00:00 RL1 NA NA 2016-09-01 00:00:00
## 2 01/09/2016 01:00 RL1 NA NA 2016-09-01 01:00:00
## 3 01/09/2016 02:00 RL1 NA NA 2016-09-01 02:00:00
## 4 01/09/2016 03:00 RL1 NA NA 2016-09-01 03:00:00
## 5 01/09/2016 04:00 RL1 NA NA 2016-09-01 04:00:00
## 6 01/09/2016 05:00 RL1 NA NA 2016-09-01 05:00:00
#drop the raw Timestamp column and put PosixTime first.
#Columns are selected by name rather than by position so the result does not
#silently change if the column layout of the input ever differs; leaving
#Timestamp out of the selection is what removes it.
util <- util[, c("PosixTime", "Machine", "Percent.Idle", "Utilization")]
#show first 6 rows with the columns re-arranged
head(util)
## PosixTime Machine Percent.Idle Utilization
## 1 2016-09-01 00:00:00 RL1 NA NA
## 2 2016-09-01 01:00:00 RL1 NA NA
## 3 2016-09-01 02:00:00 RL1 NA NA
## 4 2016-09-01 03:00:00 RL1 NA NA
## 5 2016-09-01 04:00:00 RL1 NA NA
## 6 2016-09-01 05:00:00 RL1 NA NA
#per-column summary of the cleaned data set
summary(object = util)
## PosixTime Machine Percent.Idle Utilization
## Min. :2016-09-01 00:00:00 RL1 :720 Min. :0.0000 Min. :0.8492
## 1st Qu.:2016-09-08 11:45:00 RL2 :720 1st Qu.:0.0262 1st Qu.:0.9424
## Median :2016-09-15 23:30:00 SR1 :720 Median :0.0410 Median :0.9590
## Mean :2016-09-15 23:30:00 SR4A:720 Mean :0.0431 Mean :0.9569
## 3rd Qu.:2016-09-23 11:15:00 SR6 :720 3rd Qu.:0.0576 3rd Qu.:0.9738
## Max. :2016-09-30 23:00:00 Max. :0.1508 Max. :1.0000
## NA's :361 NA's :361
#the rest of the analysis looks at machine RL1 only,
#so keep just its rows in a separate data frame
RL1 <- util[util[["Machine"]] == "RL1", ]
#rebuild the factor so that only the level still present (RL1) remains
RL1[["Machine"]] <- factor(x = RL1[["Machine"]])
#summary of the RL1 subset
summary(object = RL1)
## PosixTime Machine Percent.Idle Utilization
## Min. :2016-09-01 00:00:00 RL1:720 Min. :0.00500 Min. :0.8492
## 1st Qu.:2016-09-08 11:45:00 1st Qu.:0.03208 1st Qu.:0.9403
## Median :2016-09-15 23:30:00 Median :0.04613 Median :0.9539
## Mean :2016-09-15 23:30:00 Mean :0.04830 Mean :0.9517
## 3rd Qu.:2016-09-23 11:15:00 3rd Qu.:0.05967 3rd Qu.:0.9679
## Max. :2016-09-30 23:00:00 Max. :0.15077 Max. :0.9950
## NA's :7 NA's :7
#vector of minimum, mean and maximum RL1 utilization (NA hours excluded)
util_stats_RL1 <- c(
  min(RL1$Utilization, na.rm = TRUE),
  mean(RL1$Utilization, na.rm = TRUE),
  max(RL1$Utilization, na.rm = TRUE)
)
#flag - did utilization ever drop below 90%?
#any(..., na.rm = TRUE) skips the NA hours and always yields a single
#TRUE/FALSE (FALSE when no known value is below the threshold)
util_under_90_flag <- any(RL1$Utilization < 0.9, na.rm = TRUE)
#bundle the machine name, the stats vector and the under-90% flag into a list
list_RL1 <- list(
  Machine = "RL1",
  Stats = util_stats_RL1,
  LoadThreshold = util_under_90_flag
)
#show list
list_RL1
## $Machine
## [1] "RL1"
##
## $Stats
## [1] 0.8492262 0.9516976 0.9950000
##
## $LoadThreshold
## [1] TRUE
#add a component holding the timestamps at which RL1 utilization is NA
list_RL1[["UnknownHours"]] <- RL1[["PosixTime"]][is.na(RL1[["Utilization"]])]
#show the list with the new component
list_RL1
## $Machine
## [1] "RL1"
##
## $Stats
## [1] 0.8492262 0.9516976 0.9950000
##
## $LoadThreshold
## [1] TRUE
##
## $UnknownHours
## [1] "2016-09-01 00:00:00 EDT" "2016-09-01 01:00:00 EDT"
## [3] "2016-09-01 02:00:00 EDT" "2016-09-01 03:00:00 EDT"
## [5] "2016-09-01 04:00:00 EDT" "2016-09-01 05:00:00 EDT"
## [7] "2016-09-01 06:00:00 EDT"
#store the whole RL1 data frame inside the list as component "data"
list_RL1[["data"]] <- RL1
#one line per list component: its length, class and mode
summary(object = list_RL1)
## Length Class Mode
## Machine 1 -none- character
## Stats 3 -none- numeric
## LoadThreshold 1 -none- logical
## UnknownHours 7 POSIXct numeric
## data 4 data.frame list
#activate ggplot package
library(ggplot2)
#time-series plot of utilization: one facet row and one line colour per
#machine, a dotted gray reference line at 0.9, and a horizontally centred title
#NOTE(review): newer ggplot2 releases prefer linewidth over size for lines;
#size is kept here -- confirm the installed ggplot2 version before switching
ggplot(data = util) +
  geom_line(
    mapping = aes(x = PosixTime, y = Utilization, color = Machine),
    size = 1.2
  ) +
  facet_grid(Machine ~ .) +
  geom_hline(yintercept = 0.9, color = "Gray", size = 1.2, linetype = 3) +
  labs(title = "Machine Utilization over Time") +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 155 row(s) containing missing values (geom_path).
