Objective: Create a statistical model to predict elecricity usage in a month
1. Get electricity usage data from smartmetertexas.com starting Feb 2018. Only past 24 months are allowed
2. Get daily temperature/humidity information off the web for the same duration for my area
3. Build a model with minimal error

rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 480817 25.7    1052965 56.3   641594 34.3
## Vcells 903989  6.9    8388608 64.0  1752684 13.4
setwd('/home/sumit/Data_Science/R/Projects/electricity')
usage = read.csv("DailyData.csv", stringsAsFactors = FALSE)
usage = usage[,c(-1,-3,-4,-5)]
usage$USAGE_DATE = as.Date(usage$USAGE_DATE, format="%m/%d/%Y")
colnames(usage) = c("date","used")
usage = na.omit(usage)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(rvest)
## Loading required package: xml2
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
monthly_usage <- usage %>%
                 filter(date < as.Date("01/01/2020","%m/%d/%Y")) %>%  #Removed Jan 2020 data as we do not have data for the whole month
                 group_by(month_year=format(date,"%b-%Y")) %>%
                 summarize(used=sum(used))

my = c(paste(month.abb, "-2018",sep=''), paste(month.abb, "-2019",sep=''), paste(month.abb, "-2020",sep='') )

monthly_usage$month_year = factor(monthly_usage$month_year, levels = my)
monthly_usage$high = ifelse(monthly_usage$used>1000,"Yes","No")
ggplot(monthly_usage, aes(x=month_year, y= used, fill=high)) + geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle=90, vjust = -0.005 ))  +
  labs(x="",y="Electricity used in KWH", fill = "Above 1000") 


There have been only three month in 23 month period when usage was more than 1000 KWH. Electricity pricing usually changes around 1000KWH. Based on the plot above, I should be able to pick a plan which offers cheap electricity if usage is below 1000KWH.

#Lets scrape the temperature information for each day from almanac.com so we can build a model to predict electricity usage
get_temp_info <- function(date_value){
url = paste("https://www.almanac.com/weather/history/zipcode/78665/",date_value,sep='')
html = read_html(url)

values <- html %>%
         html_node("body") %>%
         xml_find_all("//tr[contains(@class, 'weatherhistory_results_datavalue temp')]") %>%
         xml_text()
min_temp  = str_replace(values[1],"Minimum Temperature",'')
mean_temp = str_replace(values[2],"Mean Temperature",'')
max_temp  = str_replace(values[3],"Maximum Temperature",'')
temp_info = c(date_value, min_temp, mean_temp, max_temp)
return(temp_info)
}
if(file.exists("temperature_info.csv")==TRUE){
  temperature_info = read.csv("temperature_info.csv", header = TRUE) 
  temperature_info$date = as.Date(temperature_info$date,"%Y-%m-%d")
} else {
  print('Get Temperature info from Almanac')
  temperature_info = data.frame(matrix(nrow=0, ncol=4))
  for(i in 1:nrow(usage)){
    date_value =  format(usage$date[i],'%Y-%m-%d')
    Sys.sleep(5)
    temperature_info = rbind(temperature_info, get_temp_info(date_value), stringsAsFactors=FALSE)
  }
  colnames(temperature_info) = c("date", "minimum_temperature", "mean_temperature", "maximum_temperature")
  temperature_info$date = as.Date(temperature_info$date,"%Y-%m-%d")
  temperature_info = separate(temperature_info, "minimum_temperature",c("minimum_temperature","min_unit"), sep=' ')
  temperature_info = separate(temperature_info, "maximum_temperature",c("maximum_temperature","max_unit"), sep=' ')
  temperature_info = separate(temperature_info, "mean_temperature"   ,c("mean_temperature"   ,"mean_unit"), sep=' ')

  #min, mean and max unit columns can be dropped as they are all in degree fahrenheit
  temperature_info = temperature_info[,c(-3,-5,-7)]
  temperature_info$minimum_temperature = as.numeric(temperature_info$minimum_temperature)
  temperature_info$mean_temperature    = as.numeric(temperature_info$mean_temperature)
  temperature_info$maximum_temperature = as.numeric(temperature_info$maximum_temperature)
  write.csv(temperature_info,"temperature_info.csv", row.names = FALSE)
}  
electricity_data = inner_join(usage, temperature_info, by = c("date" = "date"))
electricity_data$hdd = 65 - electricity_data$mean_temperature  #Heating Degree Days
electricity_data$cdd = electricity_data$mean_temperature -65   #Cooling Degree Days
electricity_data$hdd = ifelse(electricity_data$hdd <=0, 0, electricity_data$hdd)
electricity_data$cdd = ifelse(electricity_data$cdd <=0, 0, electricity_data$cdd)
cor(electricity_data[,-1])
##                           used minimum_temperature mean_temperature
## used                 1.0000000           0.5414552        0.5617088
## minimum_temperature  0.5414552           1.0000000        0.9667652
## mean_temperature     0.5617088           0.9667652        1.0000000
## maximum_temperature  0.5424843           0.8820000        0.9611770
## hdd                 -0.3188666          -0.8537065       -0.8872614
## cdd                  0.6728197           0.8824062        0.9089385
##                     maximum_temperature        hdd        cdd
## used                          0.5424843 -0.3188666  0.6728197
## minimum_temperature           0.8820000 -0.8537065  0.8824062
## mean_temperature              0.9611770 -0.8872614  0.9089385
## maximum_temperature           1.0000000 -0.8547371  0.8719137
## hdd                          -0.8547371  1.0000000 -0.6141499
## cdd                           0.8719137 -0.6141499  1.0000000


mean temperature is highly correlated with minimum and maximum temperature. I will use mean_temperature only for modeling.

ggplot(electricity_data, aes(x=mean_temperature, y=used)) + geom_point(alpha=0.4) +
  labs(x="Mean Temperature in Degrees Fahrenheit", y="Electricity Usage in KWH")


There is a non linear relation between mean temperature and electricity usage. Lets summarize the data by month as the goal is to predict electricity usage by month based on available forecast. Using Temperature information alone to see if this alone can build a decent predictor.

monthly = electricity_data %>%
   group_by(month_year=format(date, "%b-%Y")) %>%
   summarize(used = sum(used), average_temp=mean(mean_temperature), hdd=sum(hdd), cdd=sum(cdd)) 
monthly$month_year = factor(monthly$month_year, levels = my)  
#Humidity information from web
humidity = data.frame("month"=month.abb, humidity=c(62, 62, 63, 62, 66, 65, 60, 58, 59, 61, 64, 63))
monthly$month = substring(as.character(monthly$month_year),1,3)
monthly = inner_join(monthly, humidity, by = c( "month"= "month"))
## Warning: Column `month` joining character vector and factor, coercing into
## character vector


Lets plot our data

ggplot(monthly, aes(x=average_temp, y=used)) + geom_point() + labs(x="Average Monthly Temperature", y = "Monthly Electricity Usage in KWH" )


There appears to be a non-linear relation between Average monthly temperature and electricity usage

ggplot(monthly, aes(x=hdd, y=used)) + geom_point() + labs(x="Monthly Heating Degree Days", y = "Monthly Electricity Usage in KWH" )


There does not appear to be a pattern/relation here.

ggplot(monthly, aes(x=cdd, y=used)) + geom_point() + labs(x="Monthly Cooling Degree Days", y = "Monthly Electricity Usage in KWH" )


it seems like there is a linear relation between CDD and electricity usage

ggplot(monthly, aes(x=humidity, y=used)) + geom_point() + labs(x="Average monthly humidity", y = "Average monthly electricity usage in KWH")


There does appear to be a non-linear relation

#Lets check correlation amongst predictors
cor(monthly[,c(-1, -6)])
##                    used average_temp        hdd        cdd   humidity
## used          1.0000000    0.8743780 -0.6787245  0.9215087 -0.4317350
## average_temp  0.8743780    1.0000000 -0.9218472  0.9646804 -0.3372755
## hdd          -0.6787245   -0.9218472  1.0000000 -0.7952499  0.2292216
## cdd           0.9215087    0.9646804 -0.7952499  1.0000000 -0.4176455
## humidity     -0.4317350   -0.3372755  0.2292216 -0.4176455  1.0000000


There is high correlation amongst some of the predictors. Lets use cdd and humidity build a couple models

set.seed(1)   
train = sample(1:nrow(monthly), nrow(monthly)*0.6)
test = -train

lm.fit = lm(used ~ cdd + poly(humidity,2), data=monthly[train,])
summary(lm.fit)
## 
## Call:
## lm(formula = used ~ cdd + poly(humidity, 2), data = monthly[train, 
##     ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -158.79  -36.15   10.61   42.15   67.36 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         500.8508    34.6585  14.451    5e-08 ***
## cdd                   0.6199     0.1382   4.484  0.00117 ** 
## poly(humidity, 2)1 -170.8679    89.0778  -1.918  0.08406 .  
## poly(humidity, 2)2  169.3673   108.9898   1.554  0.15124    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 69.03 on 10 degrees of freedom
## Multiple R-squared:  0.9234, Adjusted R-squared:  0.9005 
## F-statistic: 40.21 on 3 and 10 DF,  p-value: 6.888e-06
lm.pred1 = predict(lm.fit, type="response", monthly[test,], interval = "prediction")
(mse_linear = mean((lm.pred1[,1] - monthly$used[test])^2)   )
## [1] 10351.76


Adjusted R-Squared is 0.905 meaning 90.5% of the variability in the training data can be explained by this regression model. This model also indicates that p value is large for squared(humidity). Lets build a model without squared(humidity).

lm.fit = lm(used ~ cdd + humidity, data=monthly[train,])
summary(lm.fit)
## 
## Call:
## lm(formula = used ~ cdd + humidity, data = monthly[train, ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -160.22  -48.18   39.91   49.25   54.14 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1281.72217  662.55085   1.935   0.0792 .  
## cdd            0.78615    0.09302   8.451 3.86e-06 ***
## humidity     -13.13348   10.50754  -1.250   0.2373    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 73.34 on 11 degrees of freedom
## Multiple R-squared:  0.905,  Adjusted R-squared:  0.8877 
## F-statistic: 52.37 on 2 and 11 DF,  p-value: 2.391e-06
lm.pred2 = predict(lm.fit, type="response", monthly[test,], interval = "prediction")
(mse_linear = mean((lm.pred2[,1] - monthly$used[test])^2)   )
## [1] 11591.04


Adjusted R-squared decreased slightly which is expected as we remove predictors from the model but mse has gone up a bit.

library(randomForest)
set.seed(1)
rf.fit = randomForest(used ~ cdd + humidity, data=monthly[train,])
rf.pred = predict(rf.fit, type="response", monthly[test,])
rf.fit
## 
## Call:
##  randomForest(formula = used ~ cdd + humidity, data = monthly[train,      ]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 9986.658
##                     % Var explained: 77.54
(mean((rf.pred - monthly$used[test])^2))
## [1] 15498.23


MSE is higher for randomForest model

#Lets try KNN
monthly_scaled = scale(monthly[,c(-1,-6, -2)])
scale_attr = list(center=attr(monthly_scaled,"scaled:center"), scale=attr(monthly_scaled,"scaled:scale"))

x = cbind(monthly_scaled[,"cdd"], monthly_scaled[,"humidity"])
y = monthly$used

train.x = x[train,]
test.x  = x[test,]
train.y = y[train]

knn_mse = c()
for(i in seq(1:10)){
pred = FNN::knn.reg(train.x, test.x, train.y, k=i)
knn_mse[i]=mean((pred$pred - monthly$used[test])^2)
}
plot(knn_mse, type="l", xlab = "K value", ylab = "MSE")

(knn_mse[which.min(knn_mse)])
## [1] 16763.7


Minimum MSE is obtained at K=3 but it is still higher than MSE obtained by linear model

#Lets compare predicted usage and actual usage for the linear model
test_result = data.frame("Month_Year"=monthly$month_year[test], Predicted= lm.pred1[,1], Actual= monthly$used[test])
test_result = gather(test_result, "Type","Value",2:3)
pi_result = data.frame("Month_Year"=monthly$month_year[test], Pred_Lower= lm.pred1[,2], Pred_Higher= lm.pred1[,3])
ggplot(test_result) + geom_bar(aes(x=Month_Year, y=Value, fill=Type), stat = "identity", position="dodge")  +
  geom_line(data = pi_result, aes(x=Month_Year, y=Pred_Lower, group=1), linetype="dotted") +
  geom_line(data = pi_result, aes(x=Month_Year, y=Pred_Higher, group=1), linetype="dotted") +
labs(x='',y='Electricity Usage in KWH',title = 'Predicted vs. Actual electricity usage for test periods', caption = "Dotted black lines represent prediction interval")  +
theme(axis.text.x = element_text(angle=90, vjust = -0.002))


This model did a pretty decent job on predicting electricity usage within [sqrt(10351)] 100 units (approx.) of actual usage on test data not used for training. Prediction for July is off because of our trip to India in that month. A better model could have been built if I had the data on days we were out of the house.