install.packages("readr", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("skimr", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("dplyr", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("tidyverse", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("lubridate", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("RMySQL", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("plotly", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("circular", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("ggeasy", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("RColorBrewer", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("viridis", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
install.packages("reshape2", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/4g/91cbwfqd3b3_n7gm9tn8vtp00000gn/T//Rtmp3IsVN4/downloaded_packages
options(scipen = 999) # force R not to use exponential notation
library("readr")
## Warning: package 'readr' was built under R version 4.1.2
library("skimr")
library("dplyr")
## Warning: package 'dplyr' was built under R version 4.1.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library("tidyverse")
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ stringr 1.4.0
## ✓ tidyr 1.2.0 ✓ forcats 0.5.1
## Warning: package 'tidyr' was built under R version 4.1.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library("lubridate")
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library("RMySQL")
## Loading required package: DBI
library("plotly")
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library("circular")
##
## Attaching package: 'circular'
## The following object is masked from 'package:plotly':
##
## wind
## The following objects are masked from 'package:stats':
##
## sd, var
library("ggeasy")
library("RColorBrewer")
library("viridis")
## Loading required package: viridisLite
library("viridisLite")
library("reshape2")
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
# Create db connection
con <- dbConnect(MySQL(),
user='deepAnalytics',
password='Sqltask1234!',
dbname='dataanalytics2018',
host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')
# List the tables contained in the database
dbListTables(con)
## [1] "iris" "yr_2006" "yr_2007" "yr_2008" "yr_2009" "yr_2010"
Attributes
dbListFields(con, "yr_2006")
## [1] "id" "Date" "Time"
## [4] "Global_active_power" "Global_reactive_power" "Global_intensity"
## [7] "Voltage" "Sub_metering_1" "Sub_metering_2"
## [10] "Sub_metering_3"
dbListFields(con, "yr_2007")
## [1] "id" "Date" "Time"
## [4] "Global_active_power" "Global_reactive_power" "Global_intensity"
## [7] "Voltage" "Sub_metering_1" "Sub_metering_2"
## [10] "Sub_metering_3"
dbListFields(con, "yr_2008")
## [1] "id" "Date" "Time"
## [4] "Global_active_power" "Global_reactive_power" "Global_intensity"
## [7] "Voltage" "Sub_metering_1" "Sub_metering_2"
## [10] "Sub_metering_3"
dbListFields(con, "yr_2009")
## [1] "id" "Date" "Time"
## [4] "Global_active_power" "Global_reactive_power" "Global_intensity"
## [7] "Voltage" "Sub_metering_1" "Sub_metering_2"
## [10] "Sub_metering_3"
dbListFields(con, "yr_2010")
## [1] "id" "Date" "Time"
## [4] "Global_active_power" "Global_reactive_power" "Global_intensity"
## [7] "Voltage" "Sub_metering_1" "Sub_metering_2"
## [10] "Sub_metering_3"
Download tables
yr_2006 <- dbGetQuery(con, "SELECT * FROM yr_2006")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2007 <- dbGetQuery(con, "SELECT * FROM yr_2007")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2008 <- dbGetQuery(con, "SELECT * FROM yr_2008")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2009 <- dbGetQuery(con, "SELECT * FROM yr_2009")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
yr_2010 <- dbGetQuery(con, "SELECT * FROM yr_2010")
## Warning in .local(conn, statement, ...): Unsigned INTEGER in col 0 imported as
## numeric
str(yr_2006)
## 'data.frame': 21992 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
## $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
## $ Global_active_power : num 4.22 5.36 5.37 5.39 3.67 ...
## $ Global_reactive_power: num 0.418 0.436 0.498 0.502 0.528 0.522 0.52 0.52 0.51 0.51 ...
## $ Global_intensity : num 18.4 23 23 23 15.8 15 15.8 15.8 15.8 15.8 ...
## $ Voltage : num 235 234 233 234 236 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3 : num 17 16 17 17 17 17 17 17 17 16 ...
str(yr_2007)
## 'data.frame': 521669 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 2.58 2.55 2.55 2.55 2.55 ...
## $ Global_reactive_power: num 0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
## $ Global_intensity : num 10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
## $ Voltage : num 242 242 242 242 242 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 0 0 0 0 0 0 0 0 0 0 ...
str(yr_2008)
## 'data.frame': 526905 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2008-01-01" "2008-01-01" "2008-01-01" "2008-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 1.62 1.63 1.62 1.61 1.61 ...
## $ Global_reactive_power: num 0.07 0.072 0.072 0.07 0.07 0 0 0 0 0 ...
## $ Global_intensity : num 6.6 6.6 6.6 6.6 6.6 6.4 6.4 6.4 6.4 6.4 ...
## $ Voltage : num 241 242 242 241 241 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 18 18 18 18 18 17 18 18 18 18 ...
str(yr_2009)
## 'data.frame': 521320 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2009-01-01" "2009-01-01" "2009-01-01" "2009-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 0.484 0.484 0.482 0.482 0.482 0.57 0.59 0.588 0.586 0.586 ...
## $ Global_reactive_power: num 0.062 0.062 0.062 0.06 0.062 0 0.078 0.078 0.078 0.078 ...
## $ Global_intensity : num 2.2 2.2 2.2 2.2 2.2 2.6 2.6 2.6 2.6 2.6 ...
## $ Voltage : num 248 248 248 248 247 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 0 0 0 0 0 0 0 0 0 0 ...
str(yr_2010)
## 'data.frame': 457394 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2010-01-01" "2010-01-01" "2010-01-01" "2010-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 1.79 1.78 1.78 1.75 1.69 ...
## $ Global_reactive_power: num 0.236 0.234 0.234 0.186 0.102 0.1 0.1 0.102 0.072 0 ...
## $ Global_intensity : num 7.4 7.4 7.4 7.2 7 7 7 7 6.8 6.6 ...
## $ Voltage : num 241 240 240 240 240 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 18 18 19 18 18 19 18 18 19 18 ...
summary(yr_2006)
## id Date Time Global_active_power
## Min. : 1 Length:21992 Length:21992 Min. :0.194
## 1st Qu.: 5499 Class :character Class :character 1st Qu.:0.496
## Median :10998 Mode :character Mode :character Median :1.708
## Mean :10998 Mean :1.901
## 3rd Qu.:16496 3rd Qu.:2.692
## Max. :21996 Max. :9.132
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.80 Min. :228.2 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 2.20 1st Qu.:238.8 1st Qu.: 0.000
## Median :0.1140 Median : 7.20 Median :241.7 Median : 0.000
## Mean :0.1314 Mean : 8.03 Mean :241.4 Mean : 1.249
## 3rd Qu.:0.1980 3rd Qu.:11.40 3rd Qu.:244.4 3rd Qu.: 0.000
## Max. :0.8000 Max. :39.40 Max. :251.7 Max. :77.000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.00
## 1st Qu.: 0.000 1st Qu.: 0.00
## Median : 0.000 Median : 0.00
## Mean : 2.215 Mean : 7.41
## 3rd Qu.: 1.000 3rd Qu.:17.00
## Max. :74.000 Max. :20.00
summary(yr_2007)
## id Date Time Global_active_power
## Min. : 1 Length:521669 Length:521669 Min. : 0.082
## 1st Qu.:130423 Class :character Class :character 1st Qu.: 0.278
## Median :264606 Mode :character Mode :character Median : 0.504
## Mean :263456 Mean : 1.117
## 3rd Qu.:395178 3rd Qu.: 1.548
## Max. :525600 Max. :10.670
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.400 Min. :223.5 Min. : 0.000
## 1st Qu.:0.0000 1st Qu.: 1.200 1st Qu.:236.9 1st Qu.: 0.000
## Median :0.1000 Median : 2.400 Median :239.7 Median : 0.000
## Mean :0.1174 Mean : 4.764 Mean :239.4 Mean : 1.232
## 3rd Qu.:0.1860 3rd Qu.: 6.400 3rd Qu.:241.8 3rd Qu.: 0.000
## Max. :1.1480 Max. :46.400 Max. :252.1 Max. :78.000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 0.000
## Mean : 1.638 Mean : 5.795
## 3rd Qu.: 1.000 3rd Qu.:17.000
## Max. :78.000 Max. :20.000
summary(yr_2008)
## id Date Time Global_active_power
## Min. : 1 Length:526905 Length:526905 Min. : 0.076
## 1st Qu.:131732 Class :character Class :character 1st Qu.: 0.300
## Median :263461 Mode :character Mode :character Median : 0.566
## Mean :263474 Mean : 1.072
## 3rd Qu.:395191 3rd Qu.: 1.518
## Max. :527040 Max. :10.348
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.200 Min. :224.6 Min. : 0.00
## 1st Qu.:0.0460 1st Qu.: 1.400 1st Qu.:238.9 1st Qu.: 0.00
## Median :0.0940 Median : 2.400 Median :240.7 Median : 0.00
## Mean :0.1171 Mean : 4.552 Mean :240.6 Mean : 1.11
## 3rd Qu.:0.1840 3rd Qu.: 6.400 3rd Qu.:242.5 3rd Qu.: 0.00
## Max. :1.3900 Max. :44.600 Max. :250.9 Max. :80.00
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 1.000
## Mean : 1.256 Mean : 6.034
## 3rd Qu.: 1.000 3rd Qu.:17.000
## Max. :76.000 Max. :31.000
summary(yr_2009)
## id Date Time Global_active_power
## Min. : 1 Length:521320 Length:521320 Min. : 0.122
## 1st Qu.:130398 Class :character Class :character 1st Qu.: 0.318
## Median :264038 Mode :character Mode :character Median : 0.622
## Mean :262890 Mean : 1.079
## 3rd Qu.:395266 3rd Qu.: 1.514
## Max. :525600 Max. :11.122
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.400 Min. :223.2 Min. : 0.000
## 1st Qu.:0.0520 1st Qu.: 1.400 1st Qu.:240.1 1st Qu.: 0.000
## Median :0.1060 Median : 2.800 Median :241.9 Median : 0.000
## Mean :0.1314 Mean : 4.555 Mean :241.9 Mean : 1.137
## 3rd Qu.:0.2060 3rd Qu.: 6.200 3rd Qu.:243.6 3rd Qu.: 0.000
## Max. :1.2400 Max. :48.400 Max. :254.2 Max. :82.000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 0.000 Median : 1.000
## Mean : 1.136 Mean : 6.823
## 3rd Qu.: 1.000 3rd Qu.:18.000
## Max. :77.000 Max. :31.000
summary(yr_2010)
## id Date Time Global_active_power
## Min. : 1 Length:457394 Length:457394 Min. :0.138
## 1st Qu.:119509 Class :character Class :character 1st Qu.:0.336
## Median :233860 Mode :character Mode :character Median :0.700
## Mean :236335 Mean :1.061
## 3rd Qu.:355437 3rd Qu.:1.512
## Max. :475023 Max. :9.724
## Global_reactive_power Global_intensity Voltage Sub_metering_1
## Min. :0.0000 Min. : 0.600 Min. :225.3 Min. : 0.0000
## 1st Qu.:0.0540 1st Qu.: 1.400 1st Qu.:239.8 1st Qu.: 0.0000
## Median :0.1000 Median : 3.000 Median :241.5 Median : 0.0000
## Mean :0.1294 Mean : 4.478 Mean :241.5 Mean : 0.9875
## 3rd Qu.:0.2000 3rd Qu.: 6.200 3rd Qu.:243.2 3rd Qu.: 0.0000
## Max. :1.1240 Max. :43.000 Max. :253.5 Max. :88.0000
## Sub_metering_2 Sub_metering_3
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 0.000 1st Qu.: 1.000
## Median : 0.000 Median : 1.000
## Mean : 1.102 Mean : 7.244
## 3rd Qu.: 1.000 3rd Qu.:18.000
## Max. :80.000 Max. :31.000
head(yr_2006)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2006-12-16 17:24:00 4.216 0.418
## 2 2 2006-12-16 17:25:00 5.360 0.436
## 3 3 2006-12-16 17:26:00 5.374 0.498
## 4 4 2006-12-16 17:27:00 5.388 0.502
## 5 5 2006-12-16 17:28:00 3.666 0.528
## 6 6 2006-12-16 17:29:00 3.520 0.522
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 18.4 234.84 0 1 17
## 2 23.0 233.63 0 1 16
## 3 23.0 233.29 0 2 17
## 4 23.0 233.74 0 1 17
## 5 15.8 235.68 0 1 17
## 6 15.0 235.02 0 2 17
tail(yr_2006)
## id Date Time Global_active_power Global_reactive_power
## 21987 21991 2006-12-31 23:54:00 2.576 0.132
## 21988 21992 2006-12-31 23:55:00 2.574 0.132
## 21989 21993 2006-12-31 23:56:00 2.576 0.132
## 21990 21994 2006-12-31 23:57:00 2.586 0.134
## 21991 21995 2006-12-31 23:58:00 2.648 0.212
## 21992 21996 2006-12-31 23:59:00 2.646 0.236
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987 10.6 241.90 0 0 0
## 21988 10.6 241.89 0 0 0
## 21989 10.6 242.06 0 0 0
## 21990 10.6 242.61 0 0 0
## 21991 11.0 241.93 0 0 0
## 21992 11.0 241.89 0 0 0
head(yr_2007)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2007-01-01 00:00:00 2.580 0.136
## 2 2 2007-01-01 00:01:00 2.552 0.100
## 3 3 2007-01-01 00:02:00 2.550 0.100
## 4 4 2007-01-01 00:03:00 2.550 0.100
## 5 5 2007-01-01 00:04:00 2.554 0.100
## 6 6 2007-01-01 00:05:00 2.550 0.100
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 10.6 241.97 0 0 0
## 2 10.4 241.75 0 0 0
## 3 10.4 241.64 0 0 0
## 4 10.4 241.71 0 0 0
## 5 10.4 241.98 0 0 0
## 6 10.4 241.83 0 0 0
tail(yr_2007)
## id Date Time Global_active_power Global_reactive_power
## 521664 525595 2007-12-31 23:54:00 1.648 0.102
## 521665 525596 2007-12-31 23:55:00 1.746 0.204
## 521666 525597 2007-12-31 23:56:00 1.732 0.210
## 521667 525598 2007-12-31 23:57:00 1.732 0.210
## 521668 525599 2007-12-31 23:58:00 1.684 0.144
## 521669 525600 2007-12-31 23:59:00 1.628 0.072
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521664 6.8 241.56 0 0 18
## 521665 7.2 242.41 0 0 18
## 521666 7.2 242.42 0 0 18
## 521667 7.2 242.50 0 0 18
## 521668 7.0 242.18 0 0 18
## 521669 6.6 241.79 0 0 18
head(yr_2008)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2008-01-01 00:00:00 1.620 0.070
## 2 2 2008-01-01 00:01:00 1.626 0.072
## 3 3 2008-01-01 00:02:00 1.622 0.072
## 4 4 2008-01-01 00:03:00 1.612 0.070
## 5 5 2008-01-01 00:04:00 1.612 0.070
## 6 6 2008-01-01 00:05:00 1.546 0.000
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 6.6 241.25 0 0 18
## 2 6.6 241.74 0 0 18
## 3 6.6 241.52 0 0 18
## 4 6.6 240.82 0 0 18
## 5 6.6 240.80 0 0 18
## 6 6.4 240.66 0 0 17
tail(yr_2008)
## id Date Time Global_active_power Global_reactive_power
## 526900 527035 2008-12-31 23:54:00 0.484 0.064
## 526901 527036 2008-12-31 23:55:00 0.484 0.064
## 526902 527037 2008-12-31 23:56:00 0.482 0.064
## 526903 527038 2008-12-31 23:57:00 0.482 0.064
## 526904 527039 2008-12-31 23:58:00 0.480 0.064
## 526905 527040 2008-12-31 23:59:00 0.482 0.062
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 526900 2.2 248.15 0 0 0
## 526901 2.2 247.69 0 0 0
## 526902 2.2 247.35 0 0 0
## 526903 2.2 246.99 0 0 0
## 526904 2.2 246.52 0 0 0
## 526905 2.2 246.97 0 0 0
head(yr_2009)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2009-01-01 00:00:00 0.484 0.062
## 2 2 2009-01-01 00:01:00 0.484 0.062
## 3 3 2009-01-01 00:02:00 0.482 0.062
## 4 4 2009-01-01 00:03:00 0.482 0.060
## 5 5 2009-01-01 00:04:00 0.482 0.062
## 6 6 2009-01-01 00:05:00 0.570 0.000
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2.2 247.86 0 0 0
## 2 2.2 247.72 0 0 0
## 3 2.2 247.75 0 0 0
## 4 2.2 247.52 0 0 0
## 5 2.2 246.94 0 0 0
## 6 2.6 246.94 0 0 0
tail(yr_2009)
## id Date Time Global_active_power Global_reactive_power
## 521315 525595 2009-12-31 23:54:00 1.704 0.128
## 521316 525596 2009-12-31 23:55:00 1.746 0.158
## 521317 525597 2009-12-31 23:56:00 1.786 0.234
## 521318 525598 2009-12-31 23:57:00 1.784 0.232
## 521319 525599 2009-12-31 23:58:00 1.792 0.236
## 521320 525600 2009-12-31 23:59:00 1.792 0.238
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 521315 7.0 239.43 0 0 18
## 521316 7.2 239.95 0 0 18
## 521317 7.4 240.09 0 0 19
## 521318 7.4 239.99 0 0 18
## 521319 7.4 240.62 0 0 18
## 521320 7.4 240.82 0 0 19
head(yr_2010)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2010-01-01 00:00:00 1.790 0.236
## 2 2 2010-01-01 00:01:00 1.780 0.234
## 3 3 2010-01-01 00:02:00 1.780 0.234
## 4 4 2010-01-01 00:03:00 1.746 0.186
## 5 5 2010-01-01 00:04:00 1.686 0.102
## 6 6 2010-01-01 00:05:00 1.686 0.100
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 7.4 240.65 0 0 18
## 2 7.4 240.07 0 0 18
## 3 7.4 240.15 0 0 19
## 4 7.2 240.26 0 0 18
## 5 7.0 240.12 0 0 18
## 6 7.0 240.10 0 0 19
tail(yr_2010)
## id Date Time Global_active_power Global_reactive_power
## 457389 475018 2010-11-26 20:57:00 0.946 0
## 457390 475019 2010-11-26 20:58:00 0.946 0
## 457391 475020 2010-11-26 20:59:00 0.944 0
## 457392 475021 2010-11-26 21:00:00 0.938 0
## 457393 475022 2010-11-26 21:01:00 0.934 0
## 457394 475023 2010-11-26 21:02:00 0.932 0
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 457389 4.0 240.33 0 0 0
## 457390 4.0 240.43 0 0 0
## 457391 4.0 240.00 0 0 0
## 457392 3.8 239.82 0 0 0
## 457393 3.8 239.70 0 0 0
## 457394 3.8 239.55 0 0 0
The years 2006 and 2010 do not have full records for the entire year, so moving on we’re going to keep 2007, 2008 and 2009 for the analysis. Date and Time appear as characters. Will need to transform.
Create Primary DataFrame
#combine tables into one df using dplyr
df789 <- bind_rows(yr_2007, yr_2008, yr_2009)
Inspect df789
str(df789)
## 'data.frame': 1569894 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Global_active_power : num 2.58 2.55 2.55 2.55 2.55 ...
## $ Global_reactive_power: num 0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
## $ Global_intensity : num 10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
## $ Voltage : num 242 242 242 242 242 ...
## $ Sub_metering_1 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3 : num 0 0 0 0 0 0 0 0 0 0 ...
head(df789)
## id Date Time Global_active_power Global_reactive_power
## 1 1 2007-01-01 00:00:00 2.580 0.136
## 2 2 2007-01-01 00:01:00 2.552 0.100
## 3 3 2007-01-01 00:02:00 2.550 0.100
## 4 4 2007-01-01 00:03:00 2.550 0.100
## 5 5 2007-01-01 00:04:00 2.554 0.100
## 6 6 2007-01-01 00:05:00 2.550 0.100
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 10.6 241.97 0 0 0
## 2 10.4 241.75 0 0 0
## 3 10.4 241.64 0 0 0
## 4 10.4 241.71 0 0 0
## 5 10.4 241.98 0 0 0
## 6 10.4 241.83 0 0 0
tail(df789)
## id Date Time Global_active_power Global_reactive_power
## 1569889 525595 2009-12-31 23:54:00 1.704 0.128
## 1569890 525596 2009-12-31 23:55:00 1.746 0.158
## 1569891 525597 2009-12-31 23:56:00 1.786 0.234
## 1569892 525598 2009-12-31 23:57:00 1.784 0.232
## 1569893 525599 2009-12-31 23:58:00 1.792 0.236
## 1569894 525600 2009-12-31 23:59:00 1.792 0.238
## Global_intensity Voltage Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1569889 7.0 239.43 0 0 18
## 1569890 7.2 239.95 0 0 18
## 1569891 7.4 240.09 0 0 19
## 1569892 7.4 239.99 0 0 18
## 1569893 7.4 240.62 0 0 18
## 1569894 7.4 240.82 0 0 19
# bind together Date and Time, which are now chr.
df789 <- cbind(df789, paste(df789$Date,df789$Time), stringsAsFactors=FALSE)
## Give the new attribute in the 9th column a header name
colnames(df789)[11] <-"DateTime"
# since we're at renaming columns, will rename the sub_meters too
colnames(df789)[8] <-"Kitchen"
colnames(df789)[9] <-"Laundromat"
colnames(df789)[10] <-"Temp_control"
#move the DateTime attribute within the dataset
df789 <- df789[,c(ncol(df789),1:(ncol(df789)-1))]
# drop original date and time columns
df789 <- subset(df789, select = -c(Date, Time, id))
#convert from chr to POSIXct
df789$DateTime <- as.POSIXct(df789$DateTime, "%Y/%m/%d %H:%M:%S")
## Warning in strptime(xx, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(x): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in strptime(x, f, tz = tz): unknown timezone '%Y/%m/%d %H:%M:%S'
## Warning in as.POSIXct.POSIXlt(as.POSIXlt(x, tz, ...), tz, ...): unknown timezone
## '%Y/%m/%d %H:%M:%S'
#set time-zone
attr(df789$DateTime, "tzone") <- "Europe/Paris"
#inspect new data types
str(df789)
## 'data.frame': 1569894 obs. of 8 variables:
## $ DateTime : POSIXct, format: "2007-01-01 01:00:00" "2007-01-01 01:01:00" ...
## $ Global_active_power : num 2.58 2.55 2.55 2.55 2.55 ...
## $ Global_reactive_power: num 0.136 0.1 0.1 0.1 0.1 0.1 0.096 0 0 0 ...
## $ Global_intensity : num 10.6 10.4 10.4 10.4 10.4 10.4 10.4 10.2 10.2 10.2 ...
## $ Voltage : num 242 242 242 242 242 ...
## $ Kitchen : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Laundromat : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Temp_control : num 0 0 0 0 0 0 0 0 0 0 ...
df789$year <- year(df789$DateTime)
df789$month <- month(df789$DateTime, label=TRUE)
df789$week <- week(df789$DateTime)
df789$day <- day(df789$DateTime)
df789$hour <- hour(df789$DateTime)
df789$minute <- minute(df789$DateTime)
df789$quarter <- quarter(df789$DateTime)
df789$weekDay <- wday(df789$DateTime, label=TRUE, week_start=getOption("lubridate.week.start", 1))
# sum sub meters
df789$sumMeters <- df789$Kitchen + df789$Laundromat + df789$Temp_control
# compute total consumption
df789$gapW <- df789$Global_active_power*1000/60
# compute other electricals
df789$otherEE <- df789$gapW - df789$sumMeters
Initial exploration
summary(df789)
## DateTime Global_active_power Global_reactive_power
## Min. :2007-01-01 01:00:00 Min. : 0.076 Min. :0.0000
## 1st Qu.:2007-10-03 08:39:15 1st Qu.: 0.300 1st Qu.:0.0460
## Median :2008-07-01 22:05:30 Median : 0.566 Median :0.1000
## Mean :2008-07-02 03:54:14 Mean : 1.089 Mean :0.1219
## 3rd Qu.:2009-03-31 14:32:45 3rd Qu.: 1.524 3rd Qu.:0.1920
## Max. :2010-01-01 00:59:00 Max. :11.122 Max. :1.3900
##
## Global_intensity Voltage Kitchen Laundromat
## Min. : 0.200 Min. :223.2 Min. : 0.000 Min. : 0.000
## 1st Qu.: 1.400 1st Qu.:238.7 1st Qu.: 0.000 1st Qu.: 0.000
## Median : 2.600 Median :240.8 Median : 0.000 Median : 0.000
## Mean : 4.624 Mean :240.6 Mean : 1.159 Mean : 1.343
## 3rd Qu.: 6.400 3rd Qu.:242.8 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :48.400 Max. :254.2 Max. :82.000 Max. :78.000
##
## Temp_control year month week
## Min. : 0.000 Min. :2007 Oct :134054 Min. : 1.00
## 1st Qu.: 0.000 1st Qu.:2007 Jan :133916 1st Qu.:13.00
## Median : 1.000 Median :2008 May :133914 Median :27.00
## Mean : 6.216 Mean :2008 Dec :133846 Mean :26.62
## 3rd Qu.:17.000 3rd Qu.:2009 Jul :133783 3rd Qu.:40.00
## Max. :31.000 Max. :2010 Mar :133736 Max. :53.00
## (Other):766645
## day hour minute quarter weekDay
## Min. : 1.00 Min. : 0.0 Min. : 0.00 Min. :1.00 Mon:224448
## 1st Qu.: 8.00 1st Qu.: 5.0 1st Qu.:14.25 1st Qu.:2.00 Tue:226049
## Median :16.00 Median :12.0 Median :30.00 Median :3.00 Wed:225980
## Mean :15.71 Mean :11.5 Mean :29.50 Mean :2.51 Thu:225184
## 3rd Qu.:23.00 3rd Qu.:18.0 3rd Qu.:44.00 3rd Qu.:4.00 Fri:224694
## Max. :31.00 Max. :23.0 Max. :59.00 Max. :4.00 Sat:221960
## Sun:221579
## sumMeters gapW otherEE
## Min. : 0.000 Min. : 1.267 Min. : -2.400
## 1st Qu.: 0.000 1st Qu.: 5.000 1st Qu.: 3.767
## Median : 1.000 Median : 9.433 Median : 5.500
## Mean : 8.719 Mean : 18.153 Mean : 9.435
## 3rd Qu.: 18.000 3rd Qu.: 25.400 3rd Qu.: 10.367
## Max. :134.000 Max. :185.367 Max. :122.167
##
#df789 <- df789%>%
# filter(year<2010) # some 2010 records have snuck into the data, so I'm dropping them here.
#summary(df789)
!!! There are negative values on Other Electrical Appliances, which is odd, unless they have a solar panel or something.
df789plus <- df789 %>%
filter(otherEE<0)
str(df789plus)
## 'data.frame': 1011 obs. of 19 variables:
## $ DateTime : POSIXct, format: "2007-04-18 10:24:00" "2007-04-18 10:26:00" ...
## $ Global_active_power : num 2.992 1.554 2.014 0.338 0.828 ...
## $ Global_reactive_power: num 0 0 0.056 0.106 0.104 0 0.096 0 0.07 0.1 ...
## $ Global_intensity : num 13 8 9.8 2.6 5.2 5 5.4 11.6 13.2 1.2 ...
## $ Voltage : num 236 238 229 239 236 ...
## $ Kitchen : num 33 12 18 4 12 2 2 26 0 0 ...
## $ Laundromat : num 0 0 0 2 2 0 2 0 34 5 ...
## $ Temp_control : num 17 15 16 0 0 18 18 17 18 0 ...
## $ year : num 2007 2007 2007 2007 2007 ...
## $ month : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 4 5 5 6 6 6 6 6 ...
## $ week : num 16 16 17 18 18 25 25 25 26 26 ...
## $ day : int 18 18 23 5 6 20 20 20 27 27 ...
## $ hour : int 10 10 10 2 17 10 10 11 9 16 ...
## $ minute : int 24 26 58 15 35 0 4 7 30 45 ...
## $ quarter : int 2 2 2 2 2 2 2 2 2 2 ...
## $ weekDay : Ord.factor w/ 7 levels "Mon"<"Tue"<"Wed"<..: 3 3 1 6 7 3 3 3 3 3 ...
## $ sumMeters : num 50 27 34 6 14 20 22 43 52 5 ...
## $ gapW : num 49.87 25.9 33.57 5.63 13.8 ...
## $ otherEE : num -0.133 -1.1 -0.433 -0.367 -0.2 ...
1011 observations show more consumption than available. That’s still odd.
yearly_summary <- df789 %>%
group_by(year)%>%
summarise(yearly_kitchen=sum(Kitchen),
yearly_laundromat= sum(Laundromat),
yearly_TC=sum(Temp_control),
yearly_oEE=sum(otherEE),
yearly_total=sum(gapW))
yearly_summary
## # A tibble: 4 × 6
## year yearly_kitchen yearly_laundromat yearly_TC yearly_oEE yearly_total
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2007 642548 854356 3021773 5191509. 9710186.
## 2 2008 584784 661877 3180254 4989011. 9415926.
## 3 2009 592657 592177 3555717 4630254. 9370805.
## 4 2010 0 0 1099 592. 1690.
#install.packages("reshape2")
#library("reshape2")
# drop month total before wide to long
yearlies <- yearly_summary[,!names(yearly_summary) %in% c("yearly_total")]
yearlies <- yearlies %>%
rename(Kitchen = yearly_kitchen)%>%
rename(Laundromat = yearly_laundromat)%>%
rename(Temp_Control = yearly_TC)%>%
rename(Other_Appliances = yearly_oEE)
# wide to long
yearly_long <- melt(yearlies, id.vars=c("year"))
yearly_long <- yearly_long%>%
rename(Consumption = variable)
# Stacked barplot using ggplot2
ggplot(yearly_long,
aes(x = year,
y = value,
fill = Consumption)) +
geom_bar(stat="identity")+
scale_fill_viridis(option="turbo",discrete = TRUE)+
#scale_y_continuous(limits=c(0,3500000), expand=c(0,0))+
labs(title="Consumption by Year", x="Year", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()
quarterly <- df789%>%
group_by(quarter)%>%
summarise(sumK = sum(Kitchen),
sumL = sum(Laundromat),
sumT = sum(Temp_control),
sumoEE=sum(otherEE),
sumTotal=sum(gapW))
ggplot(quarterly, aes(x=quarter, y=sumTotal, fill=sumTotal))+
geom_col()+
scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,8706418), expand=c(0,0))+
labs(title="Tota Global Consumption by Quarter", x="Quarter", y="Global Consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#install.packages("reshape2")
#ibrary("reshape2")
# drop total before wide to long
quarterlies <- quarterly[,!names(quarterly) %in% c("sumTotal")]
quarterlies <- quarterlies %>%
rename(Kitchen = sumK)%>%
rename(Laundromat = sumL)%>%
rename(Temp_Control = sumT)%>%
rename(Other_Appliances = sumoEE)
# wide to long
quarter_long <- melt(quarterlies, id.vars=c("quarter"))
quarter_long <- quarter_long%>%
rename(Consumption = variable)
# Stacked barplot using ggplot2
ggplot(quarter_long,
aes(x = quarter,
y = value,
fill = Consumption)) +
geom_bar(stat="identity")+
scale_fill_viridis(option="turbo",discrete = TRUE)+
scale_y_continuous(limits=c(0,8706418), expand=c(0,0))+
labs(title="Consumption by Quarter", x="Quarter", y="Average consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()
monthly_summary <- df789%>%
group_by(month)%>%
summarise(month_kitchen=sum(Kitchen),
month_laundromat=sum(Laundromat),
month_TC=sum(Temp_control),
month_oEE=sum(otherEE),
month_total=sum(gapW))
monthly_summary
## # A tibble: 12 × 6
## month month_kitchen month_laundromat month_TC month_oEE month_total
## <ord> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Jan 192844 215111 971362 1903977. 3283294.
## 2 Feb 134875 171989 824005 1473836. 2604705.
## 3 Mar 187971 232242 894985 1497473. 2812671.
## 4 Apr 143719 161953 795170 1112430. 2213272.
## 5 May 172366 181133 830840 1063423. 2247762.
## 6 Jun 160285 163270 739991 809510. 1873056.
## 7 Jul 108591 141705 569979 724366. 1544641.
## 8 Aug 73187 111008 495795 577916. 1257906.
## 9 Sep 156767 165924 806763 988552. 2118006.
## 10 Oct 143051 205184 862518 1314660. 2525413.
## 11 Nov 169335 183018 914937 1579651. 2846941.
## 12 Dec 176998 175873 1052498 1765573. 3170942.
summary(monthly_summary)
## month month_kitchen month_laundromat month_TC
## Jan :1 Min. : 73187 Min. :111008 Min. : 495795
## Feb :1 1st Qu.:141007 1st Qu.:162941 1st Qu.: 781375
## Mar :1 Median :158526 Median :173931 Median : 827422
## Apr :1 Mean :151666 Mean :175701 Mean : 813237
## May :1 3rd Qu.:173524 3rd Qu.:188560 3rd Qu.: 899973
## Jun :1 Max. :192844 Max. :232242 Max. :1052498
## (Other):6
## month_oEE month_total
## Min. : 577916 Min. :1257906
## 1st Qu.: 943791 1st Qu.:2056768
## Median :1213545 Median :2386587
## Mean :1234280 Mean :2374884
## 3rd Qu.:1518017 3rd Qu.:2821238
## Max. :1903977 Max. :3283294
##
#total
monthly_summary %>%
ggplot(aes(x=month, y=month_total, fill=month_total))+
geom_col()+
scale_fill_viridis(option = "cividis", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,3500000), expand=c(0,0))+
labs(title="Total Global Consumption by Month", x="Month", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#kitchen
monthly_summary %>%
ggplot(aes(x=month, y=month_kitchen, fill=month_kitchen))+
geom_col()+
scale_fill_viridis(option = "magma", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,200000), expand=c(0,0))+
labs(title="Total Kitchen Consumption by Month", x="Month", y="Kitchen consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#laundromat
monthly_summary %>%
ggplot(aes(x=month, y=month_laundromat, fill=month_laundromat))+
geom_col()+
scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,250000), expand=c(0,0))+
labs(title="Total Laundromat Consumption by Month", x="Month", y="Laundromat consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#temperature control
monthly_summary %>%
ggplot(aes(x=month, y=month_TC, fill=month_TC))+
geom_col()+
scale_fill_viridis(option = "plasma", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,1200000), expand=c(0,0))+
labs(title="Total Temp Control Consumption by Month", x="Month", y="Temp Control consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
# other appliances
monthly_summary %>%
ggplot(aes(x=month, y=month_oEE, fill=month_oEE))+
geom_col()+
scale_fill_viridis(option = "mako", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,2000000), expand=c(0,0))+
labs(title="Total Other Appliances Consumption by Month", x="Month", y="Other Appliances consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#install.packages("reshape2")
#ibrary("reshape2")
# drop month total before wide to long
monthlies <- monthly_summary[,!names(monthly_summary) %in% c("month_total")]
monthlies <- monthlies %>%
rename(Kitchen = month_kitchen)%>%
rename(Laundromat = month_laundromat)%>%
rename(Temp_Control = month_TC)%>%
rename(Other_Appliances = month_oEE)
# wide to long
monthly_long <- melt(monthlies, id.vars=c("month"))
monthly_long <- monthly_long%>%
rename(Consumption = variable)
# Stacked barplot using ggplot2
ggplot(monthly_long,
aes(x = month,
y = value,
fill = Consumption)) +
geom_bar(stat="identity")+
scale_fill_viridis(option="turbo",discrete = TRUE)+
scale_y_continuous(limits=c(0,3500000), expand=c(0,0))+
labs(title="Consumption by Month", x="Month", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()
weekly_summary <- df789%>%
group_by(week)%>%
summarise(week_kitchen=sum(Kitchen),
week_laundromat=sum(Laundromat),
week_TC=sum(Temp_control),
week_oEE=sum(otherEE),
week_total=sum(gapW))
weekly_summary
## # A tibble: 53 × 6
## week week_kitchen week_laundromat week_TC week_oEE week_total
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 31069 52432 186088 428059. 697648.
## 2 2 52702 37775 227499 423693. 741669.
## 3 3 50650 54580 240533 433080. 778843.
## 4 4 43626 48758 215638 439292. 747314.
## 5 5 40231 46734 250321 434579. 771865.
## 6 6 44496 51349 231532 405994. 733371.
## 7 7 32703 37630 213297 370327. 653957.
## 8 8 29943 46243 179498 353210. 608894.
## 9 9 19352 35884 113406 234767. 403409.
## 10 10 50092 62833 196837 386395. 696157.
## # … with 43 more rows
summary(weekly_summary)
## week week_kitchen week_laundromat week_TC week_oEE
## Min. : 1 Min. : 2975 Min. : 4332 Min. : 50458 Min. : 66493
## 1st Qu.:14 1st Qu.:29943 1st Qu.:32892 1st Qu.:158848 1st Qu.:197449
## Median :27 Median :35237 Median :39398 Median :191320 Median :264975
## Mean :27 Mean :34339 Mean :39781 Mean :184129 Mean :279460
## 3rd Qu.:40 3rd Qu.:41139 3rd Qu.:48758 3rd Qu.:217066 3rd Qu.:377351
## Max. :53 Max. :52702 Max. :62833 Max. :251723 Max. :439292
## week_total
## Min. :124258
## 1st Qu.:417885
## Median :537872
## Mean :537710
## 3rd Qu.:680882
## Max. :778843
#total
weekly_summary %>%
ggplot(aes(x=week, y=week_total, fill=week_total))+
geom_col()+
scale_fill_viridis(option = "cividis", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,778843), expand=c(0,0))+
labs(title="Total Global Consumption by week", x="week", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
## Warning: Removed 1 rows containing missing values (position_stack).
#kitchen
weekly_summary %>%
ggplot(aes(x=week, y=week_kitchen, fill=week_kitchen))+
geom_col()+
scale_fill_viridis(option = "magma", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,52702), expand=c(0,0))+
labs(title="Total Kitchen Consumption by week", x="Week", y="Kitchen consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#laundromat
weekly_summary %>%
ggplot(aes(x=week, y=week_laundromat, fill=week_laundromat))+
geom_col()+
scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,62833), expand=c(0,0))+
labs(title="Total Laundromat Consumption by week", x="Week", y="Laundromat consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#temperature control
weekly_summary %>%
ggplot(aes(x=week, y=week_TC, fill=week_TC))+
geom_col()+
scale_fill_viridis(option = "plasma", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,251723), expand=c(0,0))+
labs(title="Total Temp Control Consumption by week", x="Week", y="Temp Control consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
# other appliances
weekly_summary %>%
ggplot(aes(x=week, y=week_oEE, fill=week_oEE))+
geom_col()+
scale_fill_viridis(option = "mako", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,439292), expand=c(0,0))+
labs(title="Total Other Appliances Consumption by week", x="Week", y="Other Appliances consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
#install.packages("reshape2")
#ibrary("reshape2")
# drop month total before wide to long
weeklies <- weekly_summary[,!names(weekly_summary) %in% c("week_total")]
weeklies <- weeklies %>%
rename(Kitchen = week_kitchen)%>%
rename(Laundromat = week_laundromat)%>%
rename(Temp_Control = week_TC)%>%
rename(Other_Appliances = week_oEE)
# wide to long
weekly_long <- melt(weeklies, id.vars=c("week"))
weekly_long <- weekly_long%>%
rename(Consumption = variable)
# Stacked barplot using ggplot2
ggplot(weekly_long,
aes(x = week,
y = value,
fill = Consumption)) +
geom_bar(stat="identity")+
scale_fill_viridis(option="turbo",discrete = TRUE)+
scale_y_continuous(limits=c(0,500000), expand=c(0,0))+
labs(title="Consumption by Week", x="Week", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()
## Warning: Removed 81 rows containing missing values (geom_bar).
weekDay_summary <- df789%>%
group_by(week,weekDay)%>%
summarise(weekDay_kitchen=sum(Kitchen),
weekDay_laundromat=sum(Laundromat),
weekDay_TC=sum(Temp_control),
weekDay_oEE=sum(otherEE),
weekDay_total=sum(gapW))
## `summarise()` has grouped output by 'week'. You can override using the
## `.groups` argument.
weekDay_summary
## # A tibble: 368 × 7
## # Groups: week [53]
## week weekDay weekDay_kitchen weekDay_laundromat weekDay_TC weekDay_oEE
## <dbl> <ord> <dbl> <dbl> <dbl> <dbl>
## 1 1 Mon 2535 956 23740 72071.
## 2 1 Tue 2461 3338 31755 58010
## 3 1 Wed 2687 5625 23904 44513.
## 4 1 Thu 2946 13284 20166 67108.
## 5 1 Fri 5141 12339 31647 75366.
## 6 1 Sat 9889 1025 30173 51913.
## 7 1 Sun 5410 15865 24703 59078.
## 8 2 Mon 6369 1145 40181 51171.
## 9 2 Tue 5025 6873 35742 52442
## 10 2 Wed 4209 6321 31395 55818.
## # … with 358 more rows, and 1 more variable: weekDay_total <dbl>
summary(weekDay_summary)
## week weekDay weekDay_kitchen weekDay_laundromat weekDay_TC
## Min. : 1.00 Mon:53 Min. : 0 Min. : 214 Min. : 7145
## 1st Qu.:14.00 Tue:53 1st Qu.: 2514 1st Qu.: 2686 1st Qu.:21442
## Median :27.00 Wed:53 Median : 3922 Median : 5014 Median :27076
## Mean :26.79 Thu:53 Mean : 4946 Mean : 5729 Mean :26519
## 3rd Qu.:40.00 Fri:52 3rd Qu.: 6397 3rd Qu.: 7901 3rd Qu.:31661
## Max. :53.00 Sat:52 Max. :19606 Max. :21946 Max. :47045
## Sun:52
## weekDay_oEE weekDay_total
## Min. :11559 Min. : 19697
## 1st Qu.:28230 1st Qu.: 58665
## Median :38701 Median : 77773
## Mean :40248 Mean : 77442
## 3rd Qu.:51516 3rd Qu.: 93020
## Max. :87395 Max. :167695
##
#total
weekDay_summary %>%
ggplot(aes(x=weekDay, y=weekDay_total, fill=weekDay_total))+
geom_col()+
scale_fill_viridis(option = "cividis", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,167695), expand=c(0,0))+
labs(title="Total Global Consumption by weekDay", x="Week Day", y="Total consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
## Warning: Removed 1 rows containing missing values (position_stack).
## Warning: Removed 360 rows containing missing values (geom_col).
#kitchen
weekDay_summary %>%
ggplot(aes(x=weekDay, y=weekDay_kitchen, fill=weekDay_kitchen))+
geom_col()+
scale_fill_viridis(option = "magma", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,19606), expand=c(0,0))+
labs(title="Total Kitchen Consumption by weekDay", x="Week Day", y="Kitchen consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
## Warning: Removed 343 rows containing missing values (geom_col).
#laundromat
weekDay_summary %>%
ggplot(aes(x=weekDay, y=weekDay_laundromat, fill=weekDay_laundromat))+
geom_col()+
scale_fill_viridis(option = "viridis", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,21946), expand=c(0,0))+
labs(title="Total Laundromat Consumption by weekDay", x="Week Day", y="Laundromat consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
## Warning: Removed 341 rows containing missing values (geom_col).
#temperature control
weekDay_summary %>%
ggplot(aes(x=weekDay, y=weekDay_TC, fill=weekDay_TC))+
geom_col()+
scale_fill_viridis(option = "plasma", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,47045), expand=c(0,0))+
labs(title="Total Temp Control Consumption by weekDay", x="Week Day", y="Temp Control consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
## Warning: Removed 361 rows containing missing values (geom_col).
# other appliances
weekDay_summary %>%
ggplot(aes(x=weekDay, y=weekDay_oEE, fill=weekDay_oEE))+
geom_col()+
scale_fill_viridis(option = "mako", discrete=FALSE, direction = -1)+
scale_y_continuous(limits=c(0,87395), expand=c(0,0))+
labs(title="Total Other Appliances Consumption by weekDay", x="Week Day", y="Other Appliances consumption", caption="Data from: https://archive.ics.uci.edu/ml/datasets/Individual+household+electric+power+consumption")+
theme_minimal()+
easy_remove_legend()
## Warning: Removed 1 rows containing missing values (position_stack).
## Removed 360 rows containing missing values (geom_col).
tod <- filter(df789, year==2008 & month=="Jan" & day==9)
plot_ly(tod, x=~tod$DateTime, y=~tod$Kitchen, type="scatter",mode='lines')
plot_ly(tod, x=tod$DateTime, y=~tod$Kitchen, name="Kitchen", type="scatter", mode='lines')%>%
add_trace(y=~tod$Laundromat, name="Laundromat", mode='lines')%>%
add_trace(y=~tod$Temp_control, name="Temp Control", mode='lines')%>%
add_trace(y=~tod$otherEE, name="Other appliances", mode='lines')%>%
#add_trace(y=~tod$gapW, name="All consumption", mode='lines')%>%
layout(title="Power Consumption January 9th, 2008", xaxis=list(title="Time"), yaxis=list(title="Power(watt-hours)"))
eventsK_tod <- tod %>%
filter(Kitchen>0)
eventsL_tod <- tod %>%
filter(Laundromat>0)
eventsT_tod <- tod %>%
filter(Temp_control>0)
eventsE_tod <- tod %>%
filter(otherEE>0)
eventsAll_tod <- tod %>%
filter(gapW>0)
# make our data into circular class from package circular
eventsK_tod$Kitchen <- circular(eventsK_tod$hour%%24, # convert to 24 hrs
units="hours", template="clock24")
eventsL_tod$Laundromat <- circular(eventsL_tod$hour%%24, # convert to 24 hrs
units="hours", template="clock24")
eventsT_tod$Temp_control <- circular(eventsT_tod$hour%%24, # convert to 24 hrs
units="hours", template="clock24")
eventsE_tod$otherEE <- circular(eventsE_tod$hour%%24, units="hours", template="clock24")
eventsAll_tod$allW <- circular(eventsAll_tod$hour%%24, units="hours", template="clock24")
# plot a rose diagram, setting prop(ortion) argument after trial-n-error
rose.diag(eventsK_tod$Kitchen, bin = 24, col = "lightblue", main = "Kitchen Events by Hour", prop = 1.3)
rose.diag(eventsL_tod$Laundromat, bin = 24, col = "orange", main = "Laundromat Events by Hour", prop = 1.3)
rose.diag(eventsT_tod$Temp_control, bin = 24, col = "green ", main = "Temperature control Events by Hour", prop = 1.3)
rose.diag(eventsE_tod$otherEE, bin = 24, col = "red ", main = "Other appliances Events by Hour", prop = 1.3)
rose.diag(eventsAll_tod$allW, bin = 24, col = "gray ", main = "All consumption Events by Hour", prop = 1.3)