Energy consumption task using prophet, lapply and map function
pacman::p_load(dplyr,readr,lubridate,ggplot2,reshape,forecast, zoo, tseries, padr, RMySQL, prophet, purrr,
data.table, stringr, Rserve, readthedown)
#Rserve()
#list.functions.in.file(filename = "code")Data import
## Create a database connection
con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!',
dbname='dataanalytics2018',
host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')
# Load data-----
j <- c("yr_2006", "yr_2007", "yr_2008", "yr_2009", "yr_2010")
HHPC <- c()
for (i in 1:length(j)) {
X <- dbGetQuery(con,
paste("SELECT * FROM ",
j[i]))
HHPC <- rbind(HHPC,X)
}
rm(X, j)Fill NAs with data
example of sql connection
| id | Date | Time | Global_active_power | Global_reactive_power | Global_intensity | Voltage | Sub_metering_1 | Sub_metering_2 | Sub_metering_3 |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 2007-01-01 | 00:00:00 | 2.580 | 0.136 | 10.6 | 241.97 | 0 | 0 | 0 |
| 2 | 2007-01-01 | 00:01:00 | 2.552 | 0.100 | 10.4 | 241.75 | 0 | 0 | 0 |
| 3 | 2007-01-01 | 00:02:00 | 2.550 | 0.100 | 10.4 | 241.64 | 0 | 0 | 0 |
| 4 | 2007-01-01 | 00:03:00 | 2.550 | 0.100 | 10.4 | 241.71 | 0 | 0 | 0 |
| 5 | 2007-01-01 | 00:04:00 | 2.554 | 0.100 | 10.4 | 241.98 | 0 | 0 | 0 |
| 6 | 2007-01-01 | 00:05:00 | 2.550 | 0.100 | 10.4 | 241.83 | 0 | 0 | 0 |
| 7 | 2007-01-01 | 00:06:00 | 2.534 | 0.096 | 10.4 | 241.07 | 0 | 0 | 0 |
| 8 | 2007-01-01 | 00:07:00 | 2.484 | 0.000 | 10.2 | 241.29 | 0 | 0 | 0 |
| 9 | 2007-01-01 | 00:08:00 | 2.468 | 0.000 | 10.2 | 241.23 | 0 | 0 | 0 |
| 10 | 2007-01-01 | 00:09:00 | 2.486 | 0.000 | 10.2 | 242.18 | 0 | 0 | 0 |
Exploratory analysis
# Scale conversion
HHPC$kitchen_kwh <- HHPC$Sub_metering_1/1000
HHPC$laundry_kwh <- HHPC$Sub_metering_2/1000
HHPC$waterheat_aircond_kwh <- HHPC$Sub_metering_3/1000
HHPC$Global_active_power_kwh <- HHPC$Global_active_power/60
HHPC <- HHPC[,-which(names(HHPC) %in% c("Sub_metering_1","Sub_metering_2","Sub_metering_3"))] # delete old columns (sub 1, 2, 3)
HHPC$Other_kwh <- HHPC$Global_active_power_kwh - HHPC$kitchen_kwh - HHPC$laundry_kwh -
HHPC$waterheat_aircond_kwh
##### Daylight saving time treatment######
for (i in c("2007-03-25 02:00:00", "2008-03-30 02:00:00", "2009-03-29 02:00:00", "2010-03-28 02:00:00")) {
HHPC_dst <- HHPC %>%
mutate(DateTime = ifelse(between(DateTime,as_datetime(i),
as_datetime("2007-10-28 01:59:00")),
DateTime+3600,DateTime))
HHPC_dst$DateTime <- as.POSIXct(HHPC_dst$DateTime, origin = "1970-01-01 00:00:00",
"%d-%m-%Y %H:%M:%S")
}
HHPC_dst$Date <- as.Date(HHPC_dst$DateTime)
# New time variables
HHPC_dst$Year <- floor_date(HHPC_dst$Date, unit = "year")
HHPC_dst$MonthYear <- floor_date(HHPC_dst$Date, unit = "month")
HHPC_dst$WeekYear <- floor_date(HHPC_dst$Date, "week")Data by year, month, week, day
#write.csv(HHPC_dst, file = "HHPC_dst.csv")
granularity <- list()
group <- as.list(c("Year","MonthYear","WeekYear","Date"))
#group <- as.list(c("Year","MonthYear","WeekYear","Date", "DateTime"))
for(i in group) {
granularity[[i]] <- HHPC_dst %>% group_by_at(i) %>%
summarise(Global_reactive_power = sum(Global_reactive_power),
Global_active_power_kwh = sum(Global_active_power_kwh),
kitchen_kwh = sum(kitchen_kwh), laundry_kwh = sum(laundry_kwh),
waterheat_aircond_kwh = sum(waterheat_aircond_kwh),
Other_kwh = sum(Other_kwh),
Voltage = mean(Voltage),
Global_intensity = mean(Global_intensity))
}
granularity$Year## # A tibble: 5 x 9
## Year Global_reactive~ Global_active_p~ kitchen_kwh laundry_kwh
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2006-01-01 2890. 697. 27.5 48.8
## 2 2007-01-01 61231. 9712. 643. 854.
## 3 2008-01-01 61707. 9415. 585. 662.
## 4 2009-01-01 68509. 9372. 593. 592.
## 5 2010-01-01 59201. 8089. 452. 504.
## # ... with 4 more variables: waterheat_aircond_kwh <dbl>, Other_kwh <dbl>,
## # Voltage <dbl>, Global_intensity <dbl>
granularity$MonthYear$Month <- lubridate::month(granularity$MonthYear$MonthYear, label = FALSE, abbr = FALSE, locale = "English")
granularity$Date$week <- lubridate::week(granularity$Date$Date)
granularity$MonthYear <- granularity$MonthYear %>% mutate(id = seq.int(nrow(granularity$MonthYear)))
write.csv(granularity$MonthYear, file = "MonthYear.csv")
#granularity$Year <- filter(granularity$Year, Year > 2007)
granularity$Year## # A tibble: 5 x 9
## Year Global_reactive~ Global_active_p~ kitchen_kwh laundry_kwh
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 2006-01-01 2890. 697. 27.5 48.8
## 2 2007-01-01 61231. 9712. 643. 854.
## 3 2008-01-01 61707. 9415. 585. 662.
## 4 2009-01-01 68509. 9372. 593. 592.
## 5 2010-01-01 59201. 8089. 452. 504.
## # ... with 4 more variables: waterheat_aircond_kwh <dbl>, Other_kwh <dbl>,
## # Voltage <dbl>, Global_intensity <dbl>
# granularity$MonthYear <- granularity$MonthYear[!(granularity$MonthYear$MonthYear == "12 - 2006"),] # excluding incomplete month
# granularity$MonthYear <- granularity$MonthYear[!(granularity$MonthYear$MonthYear == "11 - 2010"),]
#
# granularity$WeekYear <- granularity$WeekYear[!(granularity$WeekYear$Year == "2006"),]Forecast
Prophet
df_prophet <- granularity
df_prophet$Date <- setnames(df_prophet$Date, c("Date","Global_active_power_kwh"),c("ds","y"))
df_prophet$MonthYear <- setnames(df_prophet$MonthYear, c("MonthYear","Global_active_power_kwh"),c("ds","y"))
df_prophet$Year <- setnames(df_prophet$Year, c("Year","Global_active_power_kwh"),c("ds","y"))
df_prophet$WeekYear <- setnames(df_prophet$WeekYear, c("WeekYear","Global_active_power_kwh"),c("ds","y"))
df_prophet$Date$type <- "day"
df_prophet$WeekYear$type <- "week"
df_prophet$MonthYear$type <- "month"
df_prophet$Year$type <- "year"m <- lapply(df_prophet, function(x) prophet(df = x, daily.seasonality = T,
yearly.seasonality = T,
weekly.seasonality = T))## n.changepoints greater than number of observations. Using 3
# Extend dataframe 100 days into the future
future <- lapply(m,function(x) make_future_dataframe(m = x,periods = 24, freq = paste(x$history[1,"type"]))) # x$history in this case means m$Year$history# Generate forecast for next 100 days
forecast <- map2(m, future, predict) # lapply doesn't work in these cases
map2(m,forecast, dyplot.prophet)## $Year
##
## $MonthYear
##
## $WeekYear
##
## $Date
## Making 2 forecasts with cutoffs between 2009-05-30 and 2009-11-26
## Warning: Unknown or uninitialised column: 'y'.
## Warning: Unknown or uninitialised column: 'y'.
## Skipping MAPE because y close to 0
# export tables for power bi analysis
#granularity$MonthYear <- select(granularity$MonthYear, Global_active_power_kwh) %>% mutate(ID = seq.int(nrow(granularity$MonthYear)))
#write.csv(granularity$MonthYear, file = "MonthYear.csv")
#write.csv(granularity$Date, file = "Date.csv")
#write.csv(forecast$MonthYear, file = "Month_forecast.csv")