Tools
#This package will allow us to interpolate between missing time series values
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
#These packagew provide functions for easy data wrangling
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.1
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(reshape2)
#This package provided automated forecasting of time series data
library(forecast)
## Warning: package 'forecast' was built under R version 3.5.2
#This package allows us to create pretty plots
library(ggplot2)
#This package will allow us to create javascript widgets for use on webpages
library(htmlwidgets)
#This package uses htmlwidgets to make time series widgets
library(dygraphs)
We will now look at the structure of the data to see that the data has been read in correctly and the data we are set to analyze
## 'data.frame': 2075259 obs. of 9 variables:
## $ Date : Factor w/ 1442 levels "1/1/2007","1/1/2008",..: 342 342 342 342 342 342 342 342 342 342 ...
## $ Time : Factor w/ 1440 levels "00:00:00","00:01:00",..: 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 ...
## $ Global_active_power : num 4.22 5.36 5.37 5.39 3.67 ...
## $ Global_reactive_power: num 0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
## $ Voltage : num 235 234 233 234 236 ...
## $ Global_intensity : num 18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ...
The Date and Time variables were read in a characters so we wil convert them to data & time classes
#Convert date to an ISO date
power$Date = as.Date(power$Date, format = "%d/%m/%Y")
#Create a DateTime object
power$DateTime = as.POSIXct(paste(power$Date, power$Time))
#Obtain the Month and Year for each data point
power$Month = format(power$Date, "%Y-%m")
#Add the first to each Y-m combo and convert back to an ISO date
power$Month = as.Date(paste0(power$Month, "-01"))
#Verify that changes have been made
str(power)
## 'data.frame': 2075259 obs. of 11 variables:
## $ Date : Date, format: "2006-12-16" "2006-12-16" ...
## $ Time : Factor w/ 1440 levels "00:00:00","00:01:00",..: 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 ...
## $ Global_active_power : num 4.22 5.36 5.37 5.39 3.67 ...
## $ Global_reactive_power: num 0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
## $ Voltage : num 235 234 233 234 236 ...
## $ Global_intensity : num 18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ...
## $ DateTime : POSIXct, format: "2006-12-16" "2006-12-16" ...
## $ Month : Date, format: "2006-12-01" "2006-12-01" ...
#No we will call a summary of the variables
summary(power)
## Date Time Global_active_power
## Min. :2006-12-16 17:24:00: 1442 Min. : 0.076
## 1st Qu.:2007-12-12 17:25:00: 1442 1st Qu.: 0.308
## Median :2008-12-06 17:26:00: 1442 Median : 0.602
## Mean :2008-12-05 17:27:00: 1442 Mean : 1.092
## 3rd Qu.:2009-12-01 17:28:00: 1442 3rd Qu.: 1.528
## Max. :2010-11-26 17:29:00: 1442 Max. :11.122
## (Other) :2066607 NA's :25979
## Global_reactive_power Voltage Global_intensity Sub_metering_1
## Min. :0.000 Min. :223.2 Min. : 0.200 Min. : 0.000
## 1st Qu.:0.048 1st Qu.:239.0 1st Qu.: 1.400 1st Qu.: 0.000
## Median :0.100 Median :241.0 Median : 2.600 Median : 0.000
## Mean :0.124 Mean :240.8 Mean : 4.628 Mean : 1.122
## 3rd Qu.:0.194 3rd Qu.:242.9 3rd Qu.: 6.400 3rd Qu.: 0.000
## Max. :1.390 Max. :254.2 Max. :48.400 Max. :88.000
## NA's :25979 NA's :25979 NA's :25979 NA's :25979
## Sub_metering_2 Sub_metering_3 DateTime
## Min. : 0.000 Min. : 0.000 Min. :2006-12-16 00:00:00
## 1st Qu.: 0.000 1st Qu.: 0.000 1st Qu.:2007-12-12 00:00:00
## Median : 0.000 Median : 1.000 Median :2008-12-06 00:00:00
## Mean : 1.299 Mean : 6.458 Mean :2008-12-05 18:33:49
## 3rd Qu.: 1.000 3rd Qu.:17.000 3rd Qu.:2009-12-01 00:00:00
## Max. :80.000 Max. :31.000 Max. :2010-11-26 00:00:00
## NA's :25979 NA's :25979
## Month
## Min. :2006-12-01
## 1st Qu.:2007-12-01
## Median :2008-12-01
## Mean :2008-11-21
## 3rd Qu.:2009-12-01
## Max. :2010-11-01
##
Of the total data, there are 25,979 missing values out of 2,075,259 observations or 1.25% of the data. We can create a visual to assess missing data over time using ifelse and dplyr
#Implement ifelse to count each minute that is NA
power$Missing = ifelse(is.na(power$Global_active_power), 1,0)
#We are now going to use dplyr's group_by function to group the data by date
power_group_day = group_by(power, Date)
#We will now use dplyr's summarize function to summarize by our NA indicator
power_day_missing = summarize(power_group_day, Count_Missing = sum(Missing))
#Now we will download the calendarHeat function from revolutionanalytics.com
source("http://blog.revolutionanalytics.com/downloads/calendarHeat.R")
#Now we will plot the calendar graph to view the missing data
calendarHeat(power_day_missing$Date, power_day_missing$Count_Missing, varname="Missing Data", color="w2b")
## Loading required package: lattice
## Loading required package: grid
## Loading required package: chron
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'chron'
To address missing values, we will carry the last value forward using na.locf function in the zoo package. This will be the first attempt at addressing missing values
#Use zoo to perform the interpolation for missing time series values
power$Global_active_power_locf = na.locf(power$Global_active_power)
#Compare the original and interpolated distributions. Then we will reshape the two variables into long form for ggplot
power_long = melt(power, id.vars = "DateTime", measure.vars = c("Global_active_power", "Global_active_power_locf"))
#Next create density plot
density_plot = ggplot(power_long, aes(value, fill=variable, color=variable)) + geom_density(alpha=0.75) + facet_wrap(~variable)
#Display plot
density_plot
## Warning: Removed 25979 rows containing non-finite values (stat_density).
Since the overall shape of the data is the same, we can use the complete time series using the carry forward method
Now that we have a complete time series, we can now determine total monthly usage (kWh) in addition to calculating max demand for a given month over the period.
#Use dplyr to group by month
power_group = group_by(power, Month)
#Use dplyr to get monthly max demand and total use results
power_monthly = summarize(power_group, Max_Demand_kw = max(Global_active_power_locf), Total_Use_kWh = sum(Global_active_power_locf)/60)
#Remove partial months from data frame
power_monthly = power_monthly[2:47,]
#Convert Month to date
power_monthly$Month = as.Date(paste0(power_monthly$Month, "-01"))
#Observe the structure of the result
str(power_monthly)
## Classes 'tbl_df', 'tbl' and 'data.frame': 46 obs. of 3 variables:
## $ Month : Date, format: "2007-01-01" "2007-02-01" ...
## $ Max_Demand_kw: num 9.27 9.41 10.67 8.16 7.67 ...
## $ Total_Use_kWh: num 1150 942 981 617 733 ...
Exploratory Data Analysis
#Create plot of total use by month
total_use_plot = ggplot(power_monthly, aes(Month, Total_Use_kWh)) + geom_line(col="blue", lwd=1)
total_use_plot
Forecase Model
Next I will create a forecast for total usage over the next 6 months using the forecast function from the forecast package.
#Create time series object of monthly total use
total_use_ts = ts(power_monthly$Total_Use_kWh, start = c(2007,1), frequency = 12)
#Automatically obtain the forecase for the next 6 months use the forecast function
total_use_fc = forecast(total_use_ts, h=6)
summary(total_use_ts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 205.7 637.9 815.9 785.4 926.1 1210.1
plot(total_use_fc)
Creation of report for decision making