Install packages

install.packages("readr", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("skimr", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("dplyr", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("tidyverse", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("lubridate", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("RMySQL", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("plotly", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("circular", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("ggeasy", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("RColorBrewer", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("viridis", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("reshape2", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
options(scipen = 999) # force R not to use exponential notation

Load Libraries

library("readr")
## Warning: package 'readr' was built under R version 4.1.2
library("skimr")
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.1.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ stringr 1.4.0
## ✓ tidyr   1.2.0     ✓ forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library("lubridate")
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library("RMySQL")
## Loading required package: DBI
library("plotly")
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library("circular")
## 
## Attaching package: 'circular'
## The following object is masked from 'package:plotly':
## 
##     wind
## The following objects are masked from 'package:stats':
## 
##     sd, var
library("ggeasy")
library("RColorBrewer")
library("viridis")
## Loading required package: viridisLite
library("viridisLite")
library("reshape2")
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

Obtain data using SQL querry

# Create db connection
con <- dbConnect(MySQL(),
                 user='deepAnalytics',
                 password='Sqltask1234!',
                 dbname='dataanalytics2018',
                 host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')

# List the tables contained in the database
dbListTables(con)
## [1] "iris"    "yr_2006" "yr_2007" "yr_2008" "yr_2009" "yr_2010"

Yearly Electricity records

Attributes

dbListFields(con, "yr_2006")
##  [1] "id"                    "Date"                  "Time"                 
##  [4] "Global_active_power"   "Global_reactive_power" "Global_intensity"     
##  [7] "Voltage"               "Sub_metering_1"        "Sub_metering_2"       
## [10] "Sub_metering_3"
dbListFields(con, "yr_2007")
##  [1] "id"                    "Date"                  "Time"                 
##  [4] "Global_active_power"   "Global_reactive_power" "Global_intensity"     
##  [7] "Voltage"               "Sub_metering_1"        "Sub_metering_2"       
## [10] "Sub_metering_3"
dbListFields(con, "yr_2008")
##  [1] "id"                    "Date"                  "Time"                 
##  [4] "Global_active_power"   "Global_reactive_power" "Global_intensity"     
##  [7] "Voltage"               "Sub_metering_1"        "Sub_metering_2"       
## [10] "Sub_metering_3"
dbListFields(con, "yr_2009")
##  [1] "id"                    "Date"                  "Time"                 
##  [4] "Global_active_power"   "Global_reactive_power" "Global_intensity"     
##  [7] "Voltage"               "Sub_metering_1"        "Sub_metering_2"       
## [10] "Sub_metering_3"
dbListFields(con, "yr_2010")
##  [1] "id"                    "Date"                  "Time"                 
##  [4] "Global_active_power"   "Global_reactive_power" "Global_intensity"     
##  [7] "Voltage"               "Sub_metering_1"        "Sub_metering_2"       
## [10] "Sub_metering_3"

Download tables

yr_2006 <- dbGetQuery(con, "SELECT * FROM yr_2006")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2007 <- dbGetQuery(con, "SELECT * FROM yr_2007")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2008 <- dbGetQuery(con, "SELECT * FROM yr_2008")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2009 <- dbGetQuery(con, "SELECT * FROM yr_2009")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2010 <- dbGetQuery(con, "SELECT * FROM yr_2010")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
str(yr_2006)
## 'data.frame':    21992 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
##  $ Time                 : chr  "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
##  $ Global_active_power  : num  4.22 5.36 5.37 5.39 3.67 ...
##  $ Global_reactive_power: num  0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
##  $ Global_intensity     : num  18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
##  $ Voltage              : num  235 234 233 234 236 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  1 1 2 1 1 2 1 1 1 2 ...
##  $ Sub_metering_3       : num  17 16 17 17 17 17 17 17 17 16 ...
str(yr_2007)
## 'data.frame':    521669 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  2.58 2.55 2.55 2.55 2.55 ...
##  $ Global_reactive_power: num  0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
##  $ Global_intensity     : num  10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
##  $ Voltage              : num  242 242 242 242 242 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  0 0 0 0 0 0 0 0 0 0 ...
str(yr_2008)
## 'data.frame':    526905 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2008-01-01" "2008-01-01" "2008-01-01" "2008-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  1.62 1.63 1.62 1.61 1.61 ...
##  $ Global_reactive_power: num  0.07 0.072 0.072 0.07 0.07 0 0 0 0 0 ...
##  $ Global_intensity     : num  6.6 6.6 6.6 6.6 6.6 6.4 6.4 6.4 6.4 6.4 ...
##  $ Voltage              : num  241 242 242 241 241 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  18 18 18 18 18 17 18 18 18 18 ...
str(yr_2009)
## 'data.frame':    521320 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2009-01-01" "2009-01-01" "2009-01-01" "2009-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  0.484 0.484 0.482 0.482 0.482 0.57 0.59 0.588 0.586 0.586 ...
##  $ Global_reactive_power: num  0.062 0.062 0.062 0.06 0.062 0 0.078 0.078 0.078 0.078 ...
##  $ Global_intensity     : num  2.2 2.2 2.2 2.2 2.2 2.6 2.6 2.6 2.6 2.6 ...
##  $ Voltage              : num  248 248 248 248 247 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  0 0 0 0 0 0 0 0 0 0 ...
str(yr_2010)
## 'data.frame':    457394 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2010-01-01" "2010-01-01" "2010-01-01" "2010-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  1.79 1.78 1.78 1.75 1.69 ...
##  $ Global_reactive_power: num  0.236 0.234 0.234 0.186 0.102 0.1 0.1 0.102 0.072 0 ...
##  $ Global_intensity     : num  7.4 7.4 7.4 7.2 7 7 7 7 6.8 6.6 ...
##  $ Voltage              : num  241 240 240 240 240 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  18 18 19 18 18 19 18 18 19 18 ...
summary(yr_2006)
##        id            Date               Time           Global_active_power
##  Min.   :    1   Length:21992       Length:21992       Min.   :0.194      
##  1st Qu.: 5499   Class :character   Class :character   1st Qu.:0.496      
##  Median :10998   Mode  :character   Mode  :character   Median :1.708      
##  Mean   :10998                                         Mean   :1.901      
##  3rd Qu.:16496                                         3rd Qu.:2.692      
##  Max.   :21996                                         Max.   :9.132      
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1  
##  Min.   :0.0000        Min.   : 0.80    Min.   :228.2   Min.   : 0.000  
##  1st Qu.:0.0000        1st Qu.: 2.20    1st Qu.:238.8   1st Qu.: 0.000  
##  Median :0.1140        Median : 7.20    Median :241.7   Median : 0.000  
##  Mean   :0.1314        Mean   : 8.03    Mean   :241.4   Mean   : 1.249  
##  3rd Qu.:0.1980        3rd Qu.:11.40    3rd Qu.:244.4   3rd Qu.: 0.000  
##  Max.   :0.8000        Max.   :39.40    Max.   :251.7   Max.   :77.000  
##  Sub_metering_2   Sub_metering_3 
##  Min.   : 0.000   Min.   : 0.00  
##  1st Qu.: 0.000   1st Qu.: 0.00  
##  Median : 0.000   Median : 0.00  
##  Mean   : 2.215   Mean   : 7.41  
##  3rd Qu.: 1.000   3rd Qu.:17.00  
##  Max.   :74.000   Max.   :20.00
summary(yr_2007)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:521669      Length:521669      Min.   : 0.082     
##  1st Qu.:130423   Class :character   Class :character   1st Qu.: 0.278     
##  Median :264606   Mode  :character   Mode  :character   Median : 0.504     
##  Mean   :263456                                         Mean   : 1.117     
##  3rd Qu.:395178                                         3rd Qu.: 1.548     
##  Max.   :525600                                         Max.   :10.670     
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1  
##  Min.   :0.0000        Min.   : 0.400   Min.   :223.5   Min.   : 0.000  
##  1st Qu.:0.0000        1st Qu.: 1.200   1st Qu.:236.9   1st Qu.: 0.000  
##  Median :0.1000        Median : 2.400   Median :239.7   Median : 0.000  
##  Mean   :0.1174        Mean   : 4.764   Mean   :239.4   Mean   : 1.232  
##  3rd Qu.:0.1860        3rd Qu.: 6.400   3rd Qu.:241.8   3rd Qu.: 0.000  
##  Max.   :1.1480        Max.   :46.400   Max.   :252.1   Max.   :78.000  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 0.000  
##  Mean   : 1.638   Mean   : 5.795  
##  3rd Qu.: 1.000   3rd Qu.:17.000  
##  Max.   :78.000   Max.   :20.000
summary(yr_2008)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:526905      Length:526905      Min.   : 0.076     
##  1st Qu.:131732   Class :character   Class :character   1st Qu.: 0.300     
##  Median :263461   Mode  :character   Mode  :character   Median : 0.566     
##  Mean   :263474                                         Mean   : 1.072     
##  3rd Qu.:395191                                         3rd Qu.: 1.518     
##  Max.   :527040                                         Max.   :10.348     
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1 
##  Min.   :0.0000        Min.   : 0.200   Min.   :224.6   Min.   : 0.00  
##  1st Qu.:0.0460        1st Qu.: 1.400   1st Qu.:238.9   1st Qu.: 0.00  
##  Median :0.0940        Median : 2.400   Median :240.7   Median : 0.00  
##  Mean   :0.1171        Mean   : 4.552   Mean   :240.6   Mean   : 1.11  
##  3rd Qu.:0.1840        3rd Qu.: 6.400   3rd Qu.:242.5   3rd Qu.: 0.00  
##  Max.   :1.3900        Max.   :44.600   Max.   :250.9   Max.   :80.00  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 1.000  
##  Mean   : 1.256   Mean   : 6.034  
##  3rd Qu.: 1.000   3rd Qu.:17.000  
##  Max.   :76.000   Max.   :31.000
summary(yr_2009)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:521320      Length:521320      Min.   : 0.122     
##  1st Qu.:130398   Class :character   Class :character   1st Qu.: 0.318     
##  Median :264038   Mode  :character   Mode  :character   Median : 0.622     
##  Mean   :262890                                         Mean   : 1.079     
##  3rd Qu.:395266                                         3rd Qu.: 1.514     
##  Max.   :525600                                         Max.   :11.122     
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1  
##  Min.   :0.0000        Min.   : 0.400   Min.   :223.2   Min.   : 0.000  
##  1st Qu.:0.0520        1st Qu.: 1.400   1st Qu.:240.1   1st Qu.: 0.000  
##  Median :0.1060        Median : 2.800   Median :241.9   Median : 0.000  
##  Mean   :0.1314        Mean   : 4.555   Mean   :241.9   Mean   : 1.137  
##  3rd Qu.:0.2060        3rd Qu.: 6.200   3rd Qu.:243.6   3rd Qu.: 0.000  
##  Max.   :1.2400        Max.   :48.400   Max.   :254.2   Max.   :82.000  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 0.000   Median : 1.000  
##  Mean   : 1.136   Mean   : 6.823  
##  3rd Qu.: 1.000   3rd Qu.:18.000  
##  Max.   :77.000   Max.   :31.000
summary(yr_2010)
##        id             Date               Time           Global_active_power
##  Min.   :     1   Length:457394      Length:457394      Min.   :0.138      
##  1st Qu.:119509   Class :character   Class :character   1st Qu.:0.336      
##  Median :233860   Mode  :character   Mode  :character   Median :0.700      
##  Mean   :236335                                         Mean   :1.061      
##  3rd Qu.:355437                                         3rd Qu.:1.512      
##  Max.   :475023                                         Max.   :9.724      
##  Global_reactive_power Global_intensity    Voltage      Sub_metering_1   
##  Min.   :0.0000        Min.   : 0.600   Min.   :225.3   Min.   : 0.0000  
##  1st Qu.:0.0540        1st Qu.: 1.400   1st Qu.:239.8   1st Qu.: 0.0000  
##  Median :0.1000        Median : 3.000   Median :241.5   Median : 0.0000  
##  Mean   :0.1294        Mean   : 4.478   Mean   :241.5   Mean   : 0.9875  
##  3rd Qu.:0.2000        3rd Qu.: 6.200   3rd Qu.:243.2   3rd Qu.: 0.0000  
##  Max.   :1.1240        Max.   :43.000   Max.   :253.5   Max.   :88.0000  
##  Sub_metering_2   Sub_metering_3  
##  Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 0.000   1st Qu.: 1.000  
##  Median : 0.000   Median : 1.000  
##  Mean   : 1.102   Mean   : 7.244  
##  3rd Qu.: 1.000   3rd Qu.:18.000  
##  Max.   :80.000   Max.   :31.000
head(yr_2006)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2006-12-16 17:24:00               4.216                 0.418
## 2  2 2006-12-16 17:25:00               5.360                 0.436
## 3  3 2006-12-16 17:26:00               5.374                 0.498
## 4  4 2006-12-16 17:27:00               5.388                 0.502
## 5  5 2006-12-16 17:28:00               3.666                 0.528
## 6  6 2006-12-16 17:29:00               3.520                 0.522
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1             18.4  234.84              0              1             17
## 2             23.0  233.63              0              1             16
## 3             23.0  233.29              0              2             17
## 4             23.0  233.74              0              1             17
## 5             15.8  235.68              0              1             17
## 6             15.0  235.02              0              2             17
tail(yr_2006)
##          id       Date     Time Global_active_power Global_reactive_power
## 21987 21991 2006-12-31 23:54:00               2.576                 0.132
## 21988 21992 2006-12-31 23:55:00               2.574                 0.132
## 21989 21993 2006-12-31 23:56:00               2.576                 0.132
## 21990 21994 2006-12-31 23:57:00               2.586                 0.134
## 21991 21995 2006-12-31 23:58:00               2.648                 0.212
## 21992 21996 2006-12-31 23:59:00               2.646                 0.236
##       Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987             10.6  241.90              0              0              0
## 21988             10.6  241.89              0              0              0
## 21989             10.6  242.06              0              0              0
## 21990             10.6  242.61              0              0              0
## 21991             11.0  241.93              0              0              0
## 21992             11.0  241.89              0              0              0
head(yr_2007)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2007-01-01 00:00:00               2.580                 0.136
## 2  2 2007-01-01 00:01:00               2.552                 0.100
## 3  3 2007-01-01 00:02:00               2.550                 0.100
## 4  4 2007-01-01 00:03:00               2.550                 0.100
## 5  5 2007-01-01 00:04:00               2.554                 0.100
## 6  6 2007-01-01 00:05:00               2.550                 0.100
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1             10.6  241.97              0              0              0
## 2             10.4  241.75              0              0              0
## 3             10.4  241.64              0              0              0
## 4             10.4  241.71              0              0              0
## 5             10.4  241.98              0              0              0
## 6             10.4  241.83              0              0              0
tail(yr_2007)
##            id       Date     Time Global_active_power Global_reactive_power
## 521664 525595 2007-12-31 23:54:00               1.648                 0.102
## 521665 525596 2007-12-31 23:55:00               1.746                 0.204
## 521666 525597 2007-12-31 23:56:00               1.732                 0.210
## 521667 525598 2007-12-31 23:57:00               1.732                 0.210
## 521668 525599 2007-12-31 23:58:00               1.684                 0.144
## 521669 525600 2007-12-31 23:59:00               1.628                 0.072
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521664              6.8  241.56              0              0             18
## 521665              7.2  242.41              0              0             18
## 521666              7.2  242.42              0              0             18
## 521667              7.2  242.50              0              0             18
## 521668              7.0  242.18              0              0             18
## 521669              6.6  241.79              0              0             18
head(yr_2008)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2008-01-01 00:00:00               1.620                 0.070
## 2  2 2008-01-01 00:01:00               1.626                 0.072
## 3  3 2008-01-01 00:02:00               1.622                 0.072
## 4  4 2008-01-01 00:03:00               1.612                 0.070
## 5  5 2008-01-01 00:04:00               1.612                 0.070
## 6  6 2008-01-01 00:05:00               1.546                 0.000
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1              6.6  241.25              0              0             18
## 2              6.6  241.74              0              0             18
## 3              6.6  241.52              0              0             18
## 4              6.6  240.82              0              0             18
## 5              6.6  240.80              0              0             18
## 6              6.4  240.66              0              0             17
tail(yr_2008)
##            id       Date     Time Global_active_power Global_reactive_power
## 526900 527035 2008-12-31 23:54:00               0.484                 0.064
## 526901 527036 2008-12-31 23:55:00               0.484                 0.064
## 526902 527037 2008-12-31 23:56:00               0.482                 0.064
## 526903 527038 2008-12-31 23:57:00               0.482                 0.064
## 526904 527039 2008-12-31 23:58:00               0.480                 0.064
## 526905 527040 2008-12-31 23:59:00               0.482                 0.062
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 526900              2.2  248.15              0              0              0
## 526901              2.2  247.69              0              0              0
## 526902              2.2  247.35              0              0              0
## 526903              2.2  246.99              0              0              0
## 526904              2.2  246.52              0              0              0
## 526905              2.2  246.97              0              0              0
head(yr_2009)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2009-01-01 00:00:00               0.484                 0.062
## 2  2 2009-01-01 00:01:00               0.484                 0.062
## 3  3 2009-01-01 00:02:00               0.482                 0.062
## 4  4 2009-01-01 00:03:00               0.482                 0.060
## 5  5 2009-01-01 00:04:00               0.482                 0.062
## 6  6 2009-01-01 00:05:00               0.570                 0.000
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1              2.2  247.86              0              0              0
## 2              2.2  247.72              0              0              0
## 3              2.2  247.75              0              0              0
## 4              2.2  247.52              0              0              0
## 5              2.2  246.94              0              0              0
## 6              2.6  246.94              0              0              0
tail(yr_2009)
##            id       Date     Time Global_active_power Global_reactive_power
## 521315 525595 2009-12-31 23:54:00               1.704                 0.128
## 521316 525596 2009-12-31 23:55:00               1.746                 0.158
## 521317 525597 2009-12-31 23:56:00               1.786                 0.234
## 521318 525598 2009-12-31 23:57:00               1.784                 0.232
## 521319 525599 2009-12-31 23:58:00               1.792                 0.236
## 521320 525600 2009-12-31 23:59:00               1.792                 0.238
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521315              7.0  239.43              0              0             18
## 521316              7.2  239.95              0              0             18
## 521317              7.4  240.09              0              0             19
## 521318              7.4  239.99              0              0             18
## 521319              7.4  240.62              0              0             18
## 521320              7.4  240.82              0              0             19
head(yr_2010)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2010-01-01 00:00:00               1.790                 0.236
## 2  2 2010-01-01 00:01:00               1.780                 0.234
## 3  3 2010-01-01 00:02:00               1.780                 0.234
## 4  4 2010-01-01 00:03:00               1.746                 0.186
## 5  5 2010-01-01 00:04:00               1.686                 0.102
## 6  6 2010-01-01 00:05:00               1.686                 0.100
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1              7.4  240.65              0              0             18
## 2              7.4  240.07              0              0             18
## 3              7.4  240.15              0              0             19
## 4              7.2  240.26              0              0             18
## 5              7.0  240.12              0              0             18
## 6              7.0  240.10              0              0             19
tail(yr_2010)
##            id       Date     Time Global_active_power Global_reactive_power
## 457389 475018 2010-11-26 20:57:00               0.946                     0
## 457390 475019 2010-11-26 20:58:00               0.946                     0
## 457391 475020 2010-11-26 20:59:00               0.944                     0
## 457392 475021 2010-11-26 21:00:00               0.938                     0
## 457393 475022 2010-11-26 21:01:00               0.934                     0
## 457394 475023 2010-11-26 21:02:00               0.932                     0
##        Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 457389              4.0  240.33              0              0              0
## 457390              4.0  240.43              0              0              0
## 457391              4.0  240.00              0              0              0
## 457392              3.8  239.82              0              0              0
## 457393              3.8  239.70              0              0              0
## 457394              3.8  239.55              0              0              0

The years 2006 and 2010 do not have full records for the entire year, so moving on we’re going to keep 2007, 2008 and 2009 for the analysis. Date and Time appear as characters. Will need to transform.

Create Primary DataFrame

#combine tables into one df using dplyr

df789 <- bind_rows(yr_2007, yr_2008, yr_2009)

Inspect df789

str(df789)
## 'data.frame':    1569894 obs. of  10 variables:
##  $ id                   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Date                 : chr  "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
##  $ Time                 : chr  "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
##  $ Global_active_power  : num  2.58 2.55 2.55 2.55 2.55 ...
##  $ Global_reactive_power: num  0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
##  $ Global_intensity     : num  10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
##  $ Voltage              : num  242 242 242 242 242 ...
##  $ Sub_metering_1       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_2       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Sub_metering_3       : num  0 0 0 0 0 0 0 0 0 0 ...
head(df789)
##   id       Date     Time Global_active_power Global_reactive_power
## 1  1 2007-01-01 00:00:00               2.580                 0.136
## 2  2 2007-01-01 00:01:00               2.552                 0.100
## 3  3 2007-01-01 00:02:00               2.550                 0.100
## 4  4 2007-01-01 00:03:00               2.550                 0.100
## 5  5 2007-01-01 00:04:00               2.554                 0.100
## 6  6 2007-01-01 00:05:00               2.550                 0.100
##   Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1             10.6  241.97              0              0              0
## 2             10.4  241.75              0              0              0
## 3             10.4  241.64              0              0              0
## 4             10.4  241.71              0              0              0
## 5             10.4  241.98              0              0              0
## 6             10.4  241.83              0              0              0
tail(df789)
##             id       Date     Time Global_active_power Global_reactive_power
## 1569889 525595 2009-12-31 23:54:00               1.704                 0.128
## 1569890 525596 2009-12-31 23:55:00               1.746                 0.158
## 1569891 525597 2009-12-31 23:56:00               1.786                 0.234
## 1569892 525598 2009-12-31 23:57:00               1.784                 0.232
## 1569893 525599 2009-12-31 23:58:00               1.792                 0.236
## 1569894 525600 2009-12-31 23:59:00               1.792                 0.238
##         Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1569889              7.0  239.43              0              0             18
## 1569890              7.2  239.95              0              0             18
## 1569891              7.4  240.09              0              0             19
## 1569892              7.4  239.99              0              0             18
## 1569893              7.4  240.62              0              0             18
## 1569894              7.4  240.82              0              0             19

Date-Time Preprocessing

# bind together Date and Time, which are now chr.

df789 <- cbind(df789, paste(df789$Date,df789$Time), stringsAsFactors=FALSE)
## Give the new attribute in the 9th column a header name
colnames(df789)[11] <-"DateTime"

# since we're at renaming columns, will rename the sub_meters too

colnames(df789)[8] <-"Kitchen"
colnames(df789)[9] <-"Laundromat"
colnames(df789)[10] <-"Temp_control"
#move the DateTime attribute within the dataset

df789 <- df789[,c(ncol(df789),1:(ncol(df789)-1))]
# drop original date and time columns

df789 <- subset(df789, select = -c(Date, Time, id))
#convert from chr to POSIXct
df789$DateTime <- as.POSIXct(df789$DateTime, "%Y/%m/%d %H:%M:%S") 
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y/%m/%d %H:%M:%S'
#set time-zone
attr(df789$DateTime, "tzone") <- "Europe/Paris"

#inspect new data types
str(df789)
## 'data.frame':    1569894 obs. of  8 variables:
##  $ DateTime             : POSIXct, format: "2007-01-01 01:00:00" "2007-01-01 01:01:00" ...
##  $ Global_active_power  : num  2.58 2.55 2.55 2.55 2.55 ...
##  $ Global_reactive_power: num  0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
##  $ Global_intensity     : num  10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
##  $ Voltage              : num  242 242 242 242 242 ...
##  $ Kitchen              : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Laundromat           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Temp_control         : num  0 0 0 0 0 0 0 0 0 0 ...
df789$year <- year(df789$DateTime)
df789$month <- month(df789$DateTime, label=TRUE)
df789$week <- week(df789$DateTime)
df789$day <- day(df789$DateTime)
df789$hour <- hour(df789$DateTime)
df789$minute <- minute(df789$DateTime)
df789$quarter <- quarter(df789$DateTime)
df789$weekDay <- wday(df789$DateTime, label=TRUE, week_start=getOption("lubridate.week.start", 1))
# sum sub meters
df789$sumMeters <- df789$Kitchen + df789$Laundromat + df789$Temp_control

# compute total consumption
df789$gapW <- df789$Global_active_power*1000/60

# compute other electricals
df789$otherEE <- df789$gapW - df789$sumMeters

Initial exploration

summary(df789)
##     DateTime                   Global_active_power Global_reactive_power
##  Min.   :2007-01-01 01:00:00   Min.   : 0.076      Min.   :0.0000       
##  1st Qu.:2007-10-03 08:39:15   1st Qu.: 0.300      1st Qu.:0.0460       
##  Median :2008-07-01 22:05:30   Median : 0.566      Median :0.1000       
##  Mean   :2008-07-02 03:54:14   Mean   : 1.089      Mean   :0.1219       
##  3rd Qu.:2009-03-31 14:32:45   3rd Qu.: 1.524      3rd Qu.:0.1920       
##  Max.   :2010-01-01 00:59:00   Max.   :11.122      Max.   :1.3900       
##                                                                         
##  Global_intensity    Voltage         Kitchen         Laundromat    
##  Min.   : 0.200   Min.   :223.2   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 1.400   1st Qu.:238.7   1st Qu.: 0.000   1st Qu.: 0.000  
##  Median : 2.600   Median :240.8   Median : 0.000   Median : 0.000  
##  Mean   : 4.624   Mean   :240.6   Mean   : 1.159   Mean   : 1.343  
##  3rd Qu.: 6.400   3rd Qu.:242.8   3rd Qu.: 0.000   3rd Qu.: 1.000  
##  Max.   :48.400   Max.   :254.2   Max.   :82.000   Max.   :78.000  
##                                                                    
##   Temp_control         year          month             week      
##  Min.   : 0.000   Min.   :2007   Oct    :134054   Min.   : 1.00  
##  1st Qu.: 0.000   1st Qu.:2007   Jan    :133916   1st Qu.:13.00  
##  Median : 1.000   Median :2008   May    :133914   Median :27.00  
##  Mean   : 6.216   Mean   :2008   Dec    :133846   Mean   :26.62  
##  3rd Qu.:17.000   3rd Qu.:2009   Jul    :133783   3rd Qu.:40.00  
##  Max.   :31.000   Max.   :2010   Mar    :133736   Max.   :53.00  
##                                  (Other):766645                  
##       day             hour          minute         quarter     weekDay     
##  Min.   : 1.00   Min.   : 0.0   Min.   : 0.00   Min.   :1.00   Mon:224448  
##  1st Qu.: 8.00   1st Qu.: 5.0   1st Qu.:14.25   1st Qu.:2.00   Tue:226049  
##  Median :16.00   Median :12.0   Median :30.00   Median :3.00   Wed:225980  
##  Mean   :15.71   Mean   :11.5   Mean   :29.50   Mean   :2.51   Thu:225184  
##  3rd Qu.:23.00   3rd Qu.:18.0   3rd Qu.:44.00   3rd Qu.:4.00   Fri:224694  
##  Max.   :31.00   Max.   :23.0   Max.   :59.00   Max.   :4.00   Sat:221960  
##                                                                Sun:221579  
##    sumMeters            gapW            otherEE       
##  Min.   :  0.000   Min.   :  1.267   Min.   : -2.400  
##  1st Qu.:  0.000   1st Qu.:  5.000   1st Qu.:  3.767  
##  Median :  1.000   Median :  9.433   Median :  5.500  
##  Mean   :  8.719   Mean   : 18.153   Mean   :  9.435  
##  3rd Qu.: 18.000   3rd Qu.: 25.400   3rd Qu.: 10.367  
##  Max.   :134.000   Max.   :185.367   Max.   :122.167  
## 
#df789 <- df789%>%
#  filter(year<2010) # some 2010 records have snuck into the data, so I'm dropping them here.

#summary(df789)

!!! There are negative values on Other Electrical Appliances, which is odd, unless they have a solar panel or something.

df789plus <- df789 %>%
  filter(otherEE<0)
  
str(df789plus)
## 'data.frame':    1011 obs. of  19 variables:
##  $ DateTime             : POSIXct, format: "2007-04-18 10:24:00" "2007-04-18 10:26:00" ...
##  $ Global_active_power  : num  2.992 1.554 2.014 0.338 0.828 ...
##  $ Global_reactive_power: num  0 0 0.056 0.106 0.104 0 0.096 0 0.07 0.1 ...
##  $ Global_intensity     : num  13 8 9.8 2.6 5.2 5 5.4 11.6 13.2 1.2 ...
##  $ Voltage              : num  236 238 229 239 236 ...
##  $ Kitchen              : num  33 12 18 4 12 2 2 26 0 0 ...
##  $ Laundromat           : num  0 0 0 2 2 0 2 0 34 5 ...
##  $ Temp_control         : num  17 15 16 0 0 18 18 17 18 0 ...
##  $ year                 : num  2007 2007 2007 2007 2007 ...
##  $ month                : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 4 5 5 6 6 6 6 6 ...
##  $ week                 : num  16 16 17 18 18 25 25 25 26 26 ...
##  $ day                  : int  18 18 23 5 6 20 20 20 27 27 ...
##  $ hour                 : int  10 10 10 2 17 10 10 11 9 16 ...
##  $ minute               : int  24 26 58 15 35 0 4 7 30 45 ...
##  $ quarter              : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ weekDay              : Ord.factor w/ 7 levels "Mon"<"Tue"<"Wed"<..: 3 3 1 6 7 3 3 3 3 3 ...
##  $ sumMeters            : num  50 27 34 6 14 20 22 43 52 5 ...
##  $ gapW                 : num  49.87 25.9 33.57 5.63 13.8 ...
##  $ otherEE              : num  -0.133 -1.1 -0.433 -0.367 -0.2 ...

1011 observations show more consumption than available. That’s still odd.

Yearly breakdown

yearly_summary <- df789 %>%
  group_by(year)%>%
  summarise(yearly_kitchen=sum(Kitchen),
            yearly_laundromat= sum(Laundromat),
            yearly_TC=sum(Temp_control),
            yearly_oEE=sum(otherEE),
            yearly_total=sum(gapW))

yearly_summary
## # A tibble: 4 × 6
##    year yearly_kitchen yearly_laundromat yearly_TC yearly_oEE yearly_total
##   <dbl>          <dbl>             <dbl>     <dbl>      <dbl>        <dbl>
## 1  2007         642548            854356   3021773   5191509.     9710186.
## 2  2008         584784            661877   3180254   4989011.     9415926.
## 3  2009         592657            592177   3555717   4630254.     9370805.
## 4  2010              0                 0      1099       592.        1690.
#install.packages("reshape2")
#library("reshape2")

# drop month total before wide to long
yearlies <- yearly_summary[,!names(yearly_summary) %in% c("yearly_total")]
yearlies <- yearlies %>%
  rename(Kitchen = yearly_kitchen)%>%
  rename(Laundromat = yearly_laundromat)%>%
  rename(Temp_Control = yearly_TC)%>%
  rename(Other_Appliances = yearly_oEE)

# wide to long
yearly_long <- melt(yearlies, id.vars=c("year"))
yearly_long <- yearly_long%>%
  rename(Consumption = variable)

# Stacked barplot using ggplot2
ggplot(yearly_long,                  
       aes(x = year,
           y = value,
           fill = Consumption)) +
  geom_bar(stat="identity")+
  scale_fill_viridis(option="turbo",discrete = TRUE)+
  #scale_y_continuous(limits=c(0,3500000), expand=c(0,0))+
  labs(title="Consumption by Year", x="Year", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()

Quarterly overview

quarterly <- df789%>%
  group_by(quarter)%>%
  summarise(sumK = sum(Kitchen),
            sumL = sum(Laundromat),
            sumT = sum(Temp_control),
            sumoEE=sum(otherEE),
            sumTotal=sum(gapW))

ggplot(quarterly, aes(x=quarter, y=sumTotal, fill=sumTotal))+
  geom_col()+
  scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,8706418), expand=c(0,0))+
  labs(title="Tota Global Consumption by Quarter", x="Quarter", y="Global Consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#install.packages("reshape2")
#ibrary("reshape2")

# drop  total before wide to long
quarterlies <- quarterly[,!names(quarterly) %in% c("sumTotal")]
quarterlies <- quarterlies %>%
  rename(Kitchen = sumK)%>%
  rename(Laundromat = sumL)%>%
  rename(Temp_Control = sumT)%>%
  rename(Other_Appliances = sumoEE)

# wide to long
quarter_long <- melt(quarterlies, id.vars=c("quarter"))
quarter_long <- quarter_long%>%
  rename(Consumption = variable)

# Stacked barplot using ggplot2
ggplot(quarter_long,                  
       aes(x = quarter,
           y = value,
           fill = Consumption)) +
  geom_bar(stat="identity")+
  scale_fill_viridis(option="turbo",discrete = TRUE)+
  scale_y_continuous(limits=c(0,8706418), expand=c(0,0))+
  labs(title="Consumption by Quarter", x="Quarter", y="Average consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()

Monthtly breakdown

monthly_summary <- df789%>%
  group_by(month)%>%
  summarise(month_kitchen=sum(Kitchen),
            month_laundromat=sum(Laundromat),
            month_TC=sum(Temp_control),
            month_oEE=sum(otherEE),
            month_total=sum(gapW))

monthly_summary
## # A tibble: 12 × 6
##    month month_kitchen month_laundromat month_TC month_oEE month_total
##    <ord>         <dbl>            <dbl>    <dbl>     <dbl>       <dbl>
##  1 Jan          192844           215111   971362  1903977.    3283294.
##  2 Feb          134875           171989   824005  1473836.    2604705.
##  3 Mar          187971           232242   894985  1497473.    2812671.
##  4 Apr          143719           161953   795170  1112430.    2213272.
##  5 May          172366           181133   830840  1063423.    2247762.
##  6 Jun          160285           163270   739991   809510.    1873056.
##  7 Jul          108591           141705   569979   724366.    1544641.
##  8 Aug           73187           111008   495795   577916.    1257906.
##  9 Sep          156767           165924   806763   988552.    2118006.
## 10 Oct          143051           205184   862518  1314660.    2525413.
## 11 Nov          169335           183018   914937  1579651.    2846941.
## 12 Dec          176998           175873  1052498  1765573.    3170942.
summary(monthly_summary)
##      month   month_kitchen    month_laundromat    month_TC      
##  Jan    :1   Min.   : 73187   Min.   :111008   Min.   : 495795  
##  Feb    :1   1st Qu.:141007   1st Qu.:162941   1st Qu.: 781375  
##  Mar    :1   Median :158526   Median :173931   Median : 827422  
##  Apr    :1   Mean   :151666   Mean   :175701   Mean   : 813237  
##  May    :1   3rd Qu.:173524   3rd Qu.:188560   3rd Qu.: 899973  
##  Jun    :1   Max.   :192844   Max.   :232242   Max.   :1052498  
##  (Other):6                                                      
##    month_oEE        month_total     
##  Min.   : 577916   Min.   :1257906  
##  1st Qu.: 943791   1st Qu.:2056768  
##  Median :1213545   Median :2386587  
##  Mean   :1234280   Mean   :2374884  
##  3rd Qu.:1518017   3rd Qu.:2821238  
##  Max.   :1903977   Max.   :3283294  
## 
#total
monthly_summary %>%
  ggplot(aes(x=month, y=month_total, fill=month_total))+
  geom_col()+
  scale_fill_viridis(option = "cividis", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,3500000), expand=c(0,0))+
  labs(title="Total Global Consumption by Month", x="Month", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#kitchen
monthly_summary %>%
  ggplot(aes(x=month, y=month_kitchen, fill=month_kitchen))+
  geom_col()+
  scale_fill_viridis(option = "magma", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,200000), expand=c(0,0))+
  labs(title="Total Kitchen Consumption by Month", x="Month", y="Kitchen consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#laundromat
monthly_summary %>%
  ggplot(aes(x=month, y=month_laundromat, fill=month_laundromat))+
  geom_col()+
  scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,250000), expand=c(0,0))+
  labs(title="Total Laundromat Consumption by Month", x="Month", y="Laundromat consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#temperature control
monthly_summary %>%
  ggplot(aes(x=month, y=month_TC, fill=month_TC))+
  geom_col()+
  scale_fill_viridis(option = "plasma", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,1200000), expand=c(0,0))+
  labs(title="Total Temp Control Consumption by Month", x="Month", y="Temp Control consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

# other appliances
monthly_summary %>%
  ggplot(aes(x=month, y=month_oEE, fill=month_oEE))+
  geom_col()+
  scale_fill_viridis(option = "mako", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,2000000), expand=c(0,0))+
  labs(title="Total Other Appliances Consumption by Month", x="Month", y="Other Appliances consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#install.packages("reshape2")
#ibrary("reshape2")

# drop month total before wide to long
monthlies <- monthly_summary[,!names(monthly_summary) %in% c("month_total")]
monthlies <- monthlies %>%
  rename(Kitchen = month_kitchen)%>%
  rename(Laundromat = month_laundromat)%>%
  rename(Temp_Control = month_TC)%>%
  rename(Other_Appliances = month_oEE)

# wide to long
monthly_long <- melt(monthlies, id.vars=c("month"))
monthly_long <- monthly_long%>%
  rename(Consumption = variable)

# Stacked barplot using ggplot2
ggplot(monthly_long,                  
       aes(x = month,
           y = value,
           fill = Consumption)) +
  geom_bar(stat="identity")+
  scale_fill_viridis(option="turbo",discrete = TRUE)+
  scale_y_continuous(limits=c(0,3500000), expand=c(0,0))+
  labs(title="Consumption by Month", x="Month", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()

Weekly breakdown

weekly_summary <- df789%>%
  group_by(week)%>%
  summarise(week_kitchen=sum(Kitchen),
            week_laundromat=sum(Laundromat),
            week_TC=sum(Temp_control),
            week_oEE=sum(otherEE),
            week_total=sum(gapW))

weekly_summary
## # A tibble: 53 × 6
##     week week_kitchen week_laundromat week_TC week_oEE week_total
##    <dbl>        <dbl>           <dbl>   <dbl>    <dbl>      <dbl>
##  1     1        31069           52432  186088  428059.    697648.
##  2     2        52702           37775  227499  423693.    741669.
##  3     3        50650           54580  240533  433080.    778843.
##  4     4        43626           48758  215638  439292.    747314.
##  5     5        40231           46734  250321  434579.    771865.
##  6     6        44496           51349  231532  405994.    733371.
##  7     7        32703           37630  213297  370327.    653957.
##  8     8        29943           46243  179498  353210.    608894.
##  9     9        19352           35884  113406  234767.    403409.
## 10    10        50092           62833  196837  386395.    696157.
## # … with 43 more rows
summary(weekly_summary)
##       week     week_kitchen   week_laundromat    week_TC          week_oEE     
##  Min.   : 1   Min.   : 2975   Min.   : 4332   Min.   : 50458   Min.   : 66493  
##  1st Qu.:14   1st Qu.:29943   1st Qu.:32892   1st Qu.:158848   1st Qu.:197449  
##  Median :27   Median :35237   Median :39398   Median :191320   Median :264975  
##  Mean   :27   Mean   :34339   Mean   :39781   Mean   :184129   Mean   :279460  
##  3rd Qu.:40   3rd Qu.:41139   3rd Qu.:48758   3rd Qu.:217066   3rd Qu.:377351  
##  Max.   :53   Max.   :52702   Max.   :62833   Max.   :251723   Max.   :439292  
##    week_total    
##  Min.   :124258  
##  1st Qu.:417885  
##  Median :537872  
##  Mean   :537710  
##  3rd Qu.:680882  
##  Max.   :778843
#total
weekly_summary %>%
  ggplot(aes(x=week, y=week_total, fill=week_total))+
  geom_col()+
  scale_fill_viridis(option = "cividis", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,778843), expand=c(0,0))+
  labs(title="Total Global Consumption by week", x="week", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()
## Warning: Removed 1 rows containing missing values (position_stack).

#kitchen
weekly_summary %>%
  ggplot(aes(x=week, y=week_kitchen, fill=week_kitchen))+
  geom_col()+
  scale_fill_viridis(option = "magma", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,52702), expand=c(0,0))+
  labs(title="Total Kitchen Consumption by week", x="Week", y="Kitchen consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#laundromat
weekly_summary %>%
  ggplot(aes(x=week, y=week_laundromat, fill=week_laundromat))+
  geom_col()+
  scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,62833), expand=c(0,0))+
  labs(title="Total Laundromat Consumption by week", x="Week", y="Laundromat consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#temperature control
weekly_summary %>%
  ggplot(aes(x=week, y=week_TC, fill=week_TC))+
  geom_col()+
  scale_fill_viridis(option = "plasma", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,251723), expand=c(0,0))+
  labs(title="Total Temp Control Consumption by week", x="Week", y="Temp Control consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

# other appliances
weekly_summary %>%
  ggplot(aes(x=week, y=week_oEE, fill=week_oEE))+
  geom_col()+
  scale_fill_viridis(option = "mako", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,439292), expand=c(0,0))+
  labs(title="Total Other Appliances Consumption by week", x="Week", y="Other Appliances consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()

#install.packages("reshape2")
#ibrary("reshape2")

# drop month total before wide to long
weeklies <- weekly_summary[,!names(weekly_summary) %in% c("week_total")]
weeklies <- weeklies %>%
  rename(Kitchen = week_kitchen)%>%
  rename(Laundromat = week_laundromat)%>%
  rename(Temp_Control = week_TC)%>%
  rename(Other_Appliances = week_oEE)

# wide to long
weekly_long <- melt(weeklies, id.vars=c("week"))
weekly_long <- weekly_long%>%
  rename(Consumption = variable)

# Stacked barplot using ggplot2
ggplot(weekly_long,                  
       aes(x = week,
           y = value,
           fill = Consumption)) +
  geom_bar(stat="identity")+
  scale_fill_viridis(option="turbo",discrete = TRUE)+
  scale_y_continuous(limits=c(0,500000), expand=c(0,0))+
  labs(title="Consumption by Week", x="Week", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()
## Warning: Removed 81 rows containing missing values (geom_bar).

Day of the week breakdown

weekDay_summary <- df789%>%
  group_by(week,weekDay)%>%
  summarise(weekDay_kitchen=sum(Kitchen),
            weekDay_laundromat=sum(Laundromat),
            weekDay_TC=sum(Temp_control),
            weekDay_oEE=sum(otherEE),
            weekDay_total=sum(gapW))
## `summarise()` has grouped output by 'week'. You can override using the
## `.groups` argument.
weekDay_summary
## # A tibble: 368 × 7
## # Groups:   week [53]
##     week weekDay weekDay_kitchen weekDay_laundromat weekDay_TC weekDay_oEE
##    <dbl> <ord>             <dbl>              <dbl>      <dbl>       <dbl>
##  1     1 Mon                2535                956      23740      72071.
##  2     1 Tue                2461               3338      31755      58010 
##  3     1 Wed                2687               5625      23904      44513.
##  4     1 Thu                2946              13284      20166      67108.
##  5     1 Fri                5141              12339      31647      75366.
##  6     1 Sat                9889               1025      30173      51913.
##  7     1 Sun                5410              15865      24703      59078.
##  8     2 Mon                6369               1145      40181      51171.
##  9     2 Tue                5025               6873      35742      52442 
## 10     2 Wed                4209               6321      31395      55818.
## # … with 358 more rows, and 1 more variable: weekDay_total <dbl>
summary(weekDay_summary)
##       week       weekDay  weekDay_kitchen weekDay_laundromat   weekDay_TC   
##  Min.   : 1.00   Mon:53   Min.   :    0   Min.   :  214      Min.   : 7145  
##  1st Qu.:14.00   Tue:53   1st Qu.: 2514   1st Qu.: 2686      1st Qu.:21442  
##  Median :27.00   Wed:53   Median : 3922   Median : 5014      Median :27076  
##  Mean   :26.79   Thu:53   Mean   : 4946   Mean   : 5729      Mean   :26519  
##  3rd Qu.:40.00   Fri:52   3rd Qu.: 6397   3rd Qu.: 7901      3rd Qu.:31661  
##  Max.   :53.00   Sat:52   Max.   :19606   Max.   :21946      Max.   :47045  
##                  Sun:52                                                     
##   weekDay_oEE    weekDay_total   
##  Min.   :11559   Min.   : 19697  
##  1st Qu.:28230   1st Qu.: 58665  
##  Median :38701   Median : 77773  
##  Mean   :40248   Mean   : 77442  
##  3rd Qu.:51516   3rd Qu.: 93020  
##  Max.   :87395   Max.   :167695  
## 
#total
weekDay_summary %>%
  ggplot(aes(x=weekDay, y=weekDay_total, fill=weekDay_total))+
  geom_col()+
  scale_fill_viridis(option = "cividis", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,167695), expand=c(0,0))+
  labs(title="Total Global Consumption by weekDay", x="Week Day", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 360 rows containing missing values (geom_col).

#kitchen
weekDay_summary %>%
  ggplot(aes(x=weekDay, y=weekDay_kitchen, fill=weekDay_kitchen))+
  geom_col()+
  scale_fill_viridis(option = "magma", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,19606), expand=c(0,0))+
  labs(title="Total Kitchen Consumption by weekDay", x="Week Day", y="Kitchen consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()
## Warning: Removed 343 rows containing missing values (geom_col).

#laundromat
weekDay_summary %>%
  ggplot(aes(x=weekDay, y=weekDay_laundromat, fill=weekDay_laundromat))+
  geom_col()+
  scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,21946), expand=c(0,0))+
  labs(title="Total Laundromat Consumption by weekDay", x="Week Day", y="Laundromat consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()
## Warning: Removed 341 rows containing missing values (geom_col).

#temperature control
weekDay_summary %>%
  ggplot(aes(x=weekDay, y=weekDay_TC, fill=weekDay_TC))+
  geom_col()+
  scale_fill_viridis(option = "plasma", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,47045), expand=c(0,0))+
  labs(title="Total Temp Control Consumption by weekDay", x="Week Day", y="Temp Control consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()
## Warning: Removed 361 rows containing missing values (geom_col).

# other appliances
weekDay_summary %>%
  ggplot(aes(x=weekDay, y=weekDay_oEE, fill=weekDay_oEE))+
  geom_col()+
  scale_fill_viridis(option = "mako", discrete=FALSE, direction = -1)+
  scale_y_continuous(limits=c(0,87395), expand=c(0,0))+
  labs(title="Total Other Appliances Consumption by weekDay", x="Week Day", y="Other Appliances consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
  theme_minimal()+
  easy_remove_legend()
## Warning: Removed 1 rows containing missing values (position_stack).
## Removed 360 rows containing missing values (geom_col).

Daily breakdown

tod <- filter(df789, year==2008 & month=="Jan" & day==9)

plot_ly(tod, x=~tod$DateTime, y=~tod$Kitchen, type="scatter",mode='lines')
plot_ly(tod, x=tod$DateTime, y=~tod$Kitchen, name="Kitchen", type="scatter", mode='lines')%>%
  add_trace(y=~tod$Laundromat, name="Laundromat", mode='lines')%>%
  add_trace(y=~tod$Temp_control, name="Temp Control", mode='lines')%>%
  add_trace(y=~tod$otherEE, name="Other appliances", mode='lines')%>%
  #add_trace(y=~tod$gapW, name="All consumption", mode='lines')%>%
  layout(title="Power Consumption January 9th, 2008", xaxis=list(title="Time"), yaxis=list(title="Power(watt-hours)"))
eventsK_tod <- tod %>%
  filter(Kitchen>0)

eventsL_tod <- tod %>%
  filter(Laundromat>0)

eventsT_tod <- tod %>%
  filter(Temp_control>0)

eventsE_tod <- tod %>%
  filter(otherEE>0)

eventsAll_tod <- tod %>%
  filter(gapW>0)

# make our data into circular class from package circular
eventsK_tod$Kitchen <- circular(eventsK_tod$hour%%24, # convert to 24 hrs
      units="hours", template="clock24")
eventsL_tod$Laundromat <- circular(eventsL_tod$hour%%24, # convert to 24 hrs
      units="hours", template="clock24")
eventsT_tod$Temp_control <- circular(eventsT_tod$hour%%24, # convert to 24 hrs
      units="hours", template="clock24")
eventsE_tod$otherEE <- circular(eventsE_tod$hour%%24, units="hours", template="clock24")
eventsAll_tod$allW <- circular(eventsAll_tod$hour%%24, units="hours", template="clock24")



# plot a rose diagram, setting prop(ortion) argument after trial-n-error
rose.diag(eventsK_tod$Kitchen, bin = 24, col = "lightblue", main = "Kitchen Events by Hour", prop = 1.3)

rose.diag(eventsL_tod$Laundromat, bin = 24, col = "orange", main = "Laundromat Events by Hour", prop = 1.3)

rose.diag(eventsT_tod$Temp_control, bin = 24, col = "green ", main = "Temperature control Events by Hour", prop = 1.3)

rose.diag(eventsE_tod$otherEE, bin = 24, col = "red ", main = "Other appliances Events by Hour", prop = 1.3)

rose.diag(eventsAll_tod$allW, bin = 24, col = "gray ", main = "All consumption Events by Hour", prop = 1.3)