Tools

#This package will allow us to interpolate between missing time series values
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
#These packagew provide functions for easy data wrangling
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(reshape2)

#This package provided automated forecasting of time series data
library(forecast)
## Warning: package 'forecast' was built under R version 3.5.2
#This package allows us to create pretty plots
library(ggplot2)

#This package will allow us to create javascript widgets for use on webpages
library(htmlwidgets)

#This package uses htmlwidgets to make time series widgets
library(dygraphs)

Read data and take care of NAs and ? in the data

We will now look at the structure of the data to see that the data has been read in correctly and the data we are set to analyze

## 'data.frame':    2075259 obs. of  9 variables:
##  $ Date                 : Factor w/ 1442 levels "1/1/2007","1/1/2008",..: 342 342 342 342 342 342 342 342 342 342 ...
##  $ Time                 : Factor w/ 1440 levels "00:00:00","00:01:00",..: 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 ...
##  $ Global_active_power  : num  4.22 5.36 5.37 5.39 3.67 ...
##  $ Global_reactive_power: num  0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
##  $ Voltage              : num  235 234 233 234 236 ...
##  $ Global_intensity     : num  18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  1 1 2 1 1 2 1 1 1 2 ...
##  $ Sub_metering_3       : num  17 16 17 17 17 17 17 17 17 16 ...

Data Wrangling

The Date and Time variables were read in a characters so we wil convert them to data & time classes

#Convert date to an ISO date
power$Date = as.Date(power$Date, format = "%d/%m/%Y")

#Create a DateTime object
power$DateTime = as.POSIXct(paste(power$Date, power$Time))

#Obtain the Month and Year for each data point
power$Month = format(power$Date, "%Y-%m")

#Add the first to each Y-m combo and convert back to an ISO date
power$Month = as.Date(paste0(power$Month, "-01"))

#Verify that changes have been made
str(power)
## 'data.frame':    2075259 obs. of  11 variables:
##  $ Date                 : Date, format: "2006-12-16" "2006-12-16" ...
##  $ Time                 : Factor w/ 1440 levels "00:00:00","00:01:00",..: 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 ...
##  $ Global_active_power  : num  4.22 5.36 5.37 5.39 3.67 ...
##  $ Global_reactive_power: num  0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
##  $ Voltage              : num  235 234 233 234 236 ...
##  $ Global_intensity     : num  18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  1 1 2 1 1 2 1 1 1 2 ...
##  $ Sub_metering_3       : num  17 16 17 17 17 17 17 17 17 16 ...
##  $ DateTime             : POSIXct, format: "2006-12-16" "2006-12-16" ...
##  $ Month                : Date, format: "2006-12-01" "2006-12-01" ...
#No we will call a summary of the variables
summary(power)
##       Date                  Time         Global_active_power
##  Min.   :2006-12-16   17:24:00:   1442   Min.   : 0.076     
##  1st Qu.:2007-12-12   17:25:00:   1442   1st Qu.: 0.308     
##  Median :2008-12-06   17:26:00:   1442   Median : 0.602     
##  Mean   :2008-12-05   17:27:00:   1442   Mean   : 1.092     
##  3rd Qu.:2009-12-01   17:28:00:   1442   3rd Qu.: 1.528     
##  Max.   :2010-11-26   17:29:00:   1442   Max.   :11.122     
##                       (Other) :2066607   NA's   :25979      
##  Global_reactive_power    Voltage      Global_intensity Sub_metering_1  
##  Min.   :0.000         Min.   :223.2   Min.   : 0.200   Min.   : 0.000  
##  1st Qu.:0.048         1st Qu.:239.0   1st Qu.: 1.400   1st Qu.: 0.000  
##  Median :0.100         Median :241.0   Median : 2.600   Median : 0.000  
##  Mean   :0.124         Mean   :240.8   Mean   : 4.628   Mean   : 1.122  
##  3rd Qu.:0.194         3rd Qu.:242.9   3rd Qu.: 6.400   3rd Qu.: 0.000  
##  Max.   :1.390         Max.   :254.2   Max.   :48.400   Max.   :88.000  
##  NA's   :25979         NA's   :25979   NA's   :25979    NA's   :25979   
##  Sub_metering_2   Sub_metering_3      DateTime                  
##  Min.   : 0.000   Min.   : 0.000   Min.   :2006-12-16 00:00:00  
##  1st Qu.: 0.000   1st Qu.: 0.000   1st Qu.:2007-12-12 00:00:00  
##  Median : 0.000   Median : 1.000   Median :2008-12-06 00:00:00  
##  Mean   : 1.299   Mean   : 6.458   Mean   :2008-12-05 18:33:49  
##  3rd Qu.: 1.000   3rd Qu.:17.000   3rd Qu.:2009-12-01 00:00:00  
##  Max.   :80.000   Max.   :31.000   Max.   :2010-11-26 00:00:00  
##  NA's   :25979    NA's   :25979                                 
##      Month           
##  Min.   :2006-12-01  
##  1st Qu.:2007-12-01  
##  Median :2008-12-01  
##  Mean   :2008-11-21  
##  3rd Qu.:2009-12-01  
##  Max.   :2010-11-01  
## 

Of the total data, there are 25,979 missing values out of 2,075,259 observations or 1.25% of the data. We can create a visual to assess missing data over time using ifelse and dplyr

#Implement ifelse to count each minute that is NA
power$Missing = ifelse(is.na(power$Global_active_power), 1,0)

#We are now going to use dplyr's group_by function to group the data by date
power_group_day = group_by(power, Date)

#We will now use dplyr's summarize function to summarize by our NA indicator
power_day_missing = summarize(power_group_day, Count_Missing = sum(Missing))

#Now we will download the calendarHeat function from revolutionanalytics.com
source("http://blog.revolutionanalytics.com/downloads/calendarHeat.R")

#Now we will plot the calendar graph to view the missing data
calendarHeat(power_day_missing$Date, power_day_missing$Count_Missing, varname="Missing Data", color="w2b")
## Loading required package: lattice
## Loading required package: grid
## Loading required package: chron
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'chron'

To address missing values, we will carry the last value forward using na.locf function in the zoo package. This will be the first attempt at addressing missing values

#Use zoo to perform the interpolation for missing time series values
power$Global_active_power_locf = na.locf(power$Global_active_power)

#Compare the original and interpolated distributions.  Then we will reshape the two variables into long form for ggplot
power_long = melt(power, id.vars = "DateTime", measure.vars = c("Global_active_power", "Global_active_power_locf"))

#Next create density plot
density_plot = ggplot(power_long, aes(value, fill=variable, color=variable)) + geom_density(alpha=0.75) + facet_wrap(~variable)

#Display plot
density_plot
## Warning: Removed 25979 rows containing non-finite values (stat_density).

Since the overall shape of the data is the same, we can use the complete time series using the carry forward method

Now that we have a complete time series, we can now determine total monthly usage (kWh) in addition to calculating max demand for a given month over the period.

#Use dplyr to group by month
power_group = group_by(power, Month)

#Use dplyr to get monthly max demand and total use results
power_monthly = summarize(power_group, Max_Demand_kw = max(Global_active_power_locf), Total_Use_kWh = sum(Global_active_power_locf)/60)

#Remove partial months from data frame
power_monthly = power_monthly[2:47,]

#Convert Month to date
power_monthly$Month = as.Date(paste0(power_monthly$Month, "-01"))

#Observe the structure of the result
str(power_monthly)
## Classes 'tbl_df', 'tbl' and 'data.frame':    46 obs. of  3 variables:
##  $ Month        : Date, format: "2007-01-01" "2007-02-01" ...
##  $ Max_Demand_kw: num  9.27 9.41 10.67 8.16 7.67 ...
##  $ Total_Use_kWh: num  1150 942 981 617 733 ...

Exploratory Data Analysis

#Create plot of total use by month
total_use_plot = ggplot(power_monthly, aes(Month, Total_Use_kWh)) + geom_line(col="blue", lwd=1)

total_use_plot

Forecase Model

Next I will create a forecast for total usage over the next 6 months using the forecast function from the forecast package.

#Create time series object of monthly total use

total_use_ts = ts(power_monthly$Total_Use_kWh, start = c(2007,1), frequency = 12)

#Automatically obtain the forecase for the next 6 months use the forecast function

total_use_fc = forecast(total_use_ts, h=6)

summary(total_use_ts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   205.7   637.9   815.9   785.4   926.1  1210.1
plot(total_use_fc)

Creation of report for decision making