# Load libraries
library(RMySQL)
library(dplyr)
# Create a database connection
con = dbConnect(MySQL(), user='deepAnalytics', password='Sqltask1234!',
dbname='dataanalytics2018',
host='data-analytics-2018.cbrosir2cswx.us-east-1.rds.amazonaws.com')
## List the tables contained in the database
dbListTables(con)
## [1] "iris" "yr_2006" "yr_2007" "yr_2008" "yr_2009" "yr_2010"
# Using the dbListFields() to learn the attributes associated with the yr_2006 table.
dbListFields(con,'yr_2006')
## [1] "id" "Date" "Time"
## [4] "Global_active_power" "Global_reactive_power" "Global_intensity"
## [7] "Voltage" "Sub_metering_1" "Sub_metering_2"
## [10] "Sub_metering_3"
# Select Date, Time and the 3 sub-meter attributes to gather tables from 2006 to 2010 using dbGetQuery funciton
yr_2006 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,
Sub_metering_2, Sub_metering_3 FROM yr_2006")
yr_2007 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,
Sub_metering_2, Sub_metering_3 FROM yr_2007")
yr_2008 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,
Sub_metering_2, Sub_metering_3 FROM yr_2008")
yr_2009 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,
Sub_metering_2, Sub_metering_3 FROM yr_2009")
yr_2010 <- dbGetQuery(con, "SELECT Date, Time, Sub_metering_1,
Sub_metering_2, Sub_metering_3 FROM yr_2010")
# Investigate each new data frame
# Use str(), summary(), head() and tail() with each data frame
str(yr_2006)
## 'data.frame': 21992 obs. of 5 variables:
## $ Date : chr "2006-12-16" "2006-12-16" "2006-12-16" "2006-12-16" ...
## $ Time : chr "17:24:00" "17:25:00" "17:26:00" "17:27:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 1 1 2 1 1 2 1 1 1 2 ...
## $ Sub_metering_3: num 17 16 17 17 17 17 17 17 17 16 ...
summary(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2
## Length:21992 Length:21992 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.000 Median : 0.000
## Mean : 1.249 Mean : 2.215
## 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :77.000 Max. :74.000
## Sub_metering_3
## Min. : 0.00
## 1st Qu.: 0.00
## Median : 0.00
## Mean : 7.41
## 3rd Qu.:17.00
## Max. :20.00
head(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 1 2006-12-16 17:24:00 0 1 17
## 2 2006-12-16 17:25:00 0 1 16
## 3 2006-12-16 17:26:00 0 2 17
## 4 2006-12-16 17:27:00 0 1 17
## 5 2006-12-16 17:28:00 0 1 17
## 6 2006-12-16 17:29:00 0 2 17
tail(yr_2006)
## Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## 21987 2006-12-31 23:54:00 0 0 0
## 21988 2006-12-31 23:55:00 0 0 0
## 21989 2006-12-31 23:56:00 0 0 0
## 21990 2006-12-31 23:57:00 0 0 0
## 21991 2006-12-31 23:58:00 0 0 0
## 21992 2006-12-31 23:59:00 0 0 0
# Combine tables into one dataframe using dplyr
newDF <- bind_rows(yr_2007, yr_2008, yr_2009)
# Examine the new dataframe
str(newDF)
## 'data.frame': 1569894 obs. of 5 variables:
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
summary(newDF)
## Date Time Sub_metering_1 Sub_metering_2
## Length:1569894 Length:1569894 Min. : 0.000 Min. : 0.000
## Class :character Class :character 1st Qu.: 0.000 1st Qu.: 0.000
## Mode :character Mode :character Median : 0.000 Median : 0.000
## Mean : 1.159 Mean : 1.343
## 3rd Qu.: 0.000 3rd Qu.: 1.000
## Max. :82.000 Max. :78.000
## Sub_metering_3
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 1.000
## Mean : 6.216
## 3rd Qu.:17.000
## Max. :31.000
# Combine Date and Time attribute values in a new attribute column
updated_newDF <-cbind(newDF,paste(newDF$Date,newDF$Time), stringsAsFactors=FALSE)
# Give the new attribute in the 6th column a header name
colnames(updated_newDF)[6] <-"DateTime"
# Move the DateTime attribute within the dataset
updated_newDF <- updated_newDF[,c(ncol(updated_newDF), 1:(ncol(updated_newDF)-1))]
# Convert DateTime from character to POSIXct
updated_newDF$DateTime <- as.POSIXct(updated_newDF$DateTime, "%Y%m/%d %H:%M:%S")
# Add the time zone
attr(updated_newDF$DateTime, "tzone") <- "Europe/Paris"
# Inspect the data types
str(updated_newDF)
## 'data.frame': 1569894 obs. of 6 variables:
## $ DateTime : POSIXct, format: "2007-01-01 01:00:00" "2007-01-01 01:01:00" ...
## $ Date : chr "2007-01-01" "2007-01-01" "2007-01-01" "2007-01-01" ...
## $ Time : chr "00:00:00" "00:01:00" "00:02:00" "00:03:00" ...
## $ Sub_metering_1: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_2: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Sub_metering_3: num 0 0 0 0 0 0 0 0 0 0 ...
# Extract and create "Year" information from DateTime using the Lubridate
# Create "quarter", "month" "week", "weekDay", "day", "hour", "minute" with lubridate
library(lubridate)
updated_newDF$year <- year(updated_newDF$DateTime)
updated_newDF$quarter <- quarter(updated_newDF$DateTime)
updated_newDF$month <- month(updated_newDF$DateTime)
updated_newDF$week <- week(updated_newDF$DateTime)
updated_newDF$weekDay <- wday(updated_newDF$DateTime)
updated_newDF$day <- day(updated_newDF$DateTime)
updated_newDF$hour <- hour(updated_newDF$DateTime)
updated_newDF$minute <- minute(updated_newDF$DateTime)
Power Consumption
houseWeek <- filter(updated_newDF, year == 2008 & week == 2)
# Plot subset houseWeek
plot(houseWeek$Sub_metering_1)
# Reducing Granularity
## Subset the 9th day of January 2008 - 10 Minute frequency
library(plotly)
houseDay10 <- filter(updated_newDF, year == 2008 & month == 1 & day == 9 &
(minute == 0 | minute == 10 | minute == 20 | minute == 30 | minute == 40 | minute == 50))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(houseDay10, x = ~houseDay10$DateTime, y = ~houseDay10$Sub_metering_1,
name = 'Kitchen', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~houseDay10$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption January 9th, 2008",
xaxis = list(title = "Time"),
yaxis = list (title = "Power (watt-hours)"))
# Subset week 2 from year 2008 with 30 Minutes frequency
week2 <- filter(updated_newDF, year == 2008 & month == 1 & week == 2 & (minute == 0 | minute == 30 ))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 30 Minute frequency
plot_ly(week2, x = ~week2$DateTime, y = ~week2$Sub_metering_1, name = 'Kitchen',
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~week2$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~week2$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption of Week 2, 2008",
xaxis = list(title = "Time in every 30 minutes"),
yaxis = list (title = "Power (watt-hours)"))
# Subset the summmer 4 month for 2008 - 60 Minutes frequency
summer2008 <- filter(updated_newDF, year == 2008 & month > 5 & month < 10 & (minute == 0 | minute == 60))
## Plot sub-meter 1, 2 and 3 with title, legend and labels - 10 Minute frequency
plot_ly(summer2008, x = ~summer2008$DateTime, y = ~summer2008$Sub_metering_1, name = 'Kitchen',
type = 'scatter', mode = 'lines') %>%
add_trace(y = ~summer2008$Sub_metering_2, name = 'Laundry Room', mode = 'lines') %>%
add_trace(y = ~summer2008$Sub_metering_3, name = 'Water Heater & AC', mode = 'lines') %>%
layout(title = "Power Consumption for Summer 2008",
xaxis = list(title = "Time in every 60 minutes"),
yaxis = list (title = "Power (watt-hours)"))
# Subset to one observation per week on Mondays at 8:00pm for 2007, 2008 and 2009
house070809weekly <- filter(updated_newDF, weekDay == "2" & hour == 20 & minute == 0)
# Create TS object with SubMeter3
tsSM3_070809weekly <- ts(house070809weekly$Sub_metering_3, frequency=52,
start=c(2007,1))
# Import Libraries
library(ggplot2)
library(ggfortify)
library(labeling)
# Plot sub-meter 3 with plot.ts
plot.ts(tsSM3_070809weekly, main = "Sub-meter 3", col="red")
# My trial (weekly--"Saturday") for SubMeter 1
# Subset to 1 observation per week on Saturday at 8:00pm for 2007, 2008 and 2009
house070809weekly_Sat <- filter(updated_newDF, weekDay == 7 & hour == 20 & minute == 0)
# Create TS object with SubMeter 1
tsSM1_070809weekly_Sat <- ts(house070809weekly_Sat$Sub_metering_1, frequency=52, start=c(2007,1))
# Plot sub-meter 1 with plot.ts
plot.ts(tsSM1_070809weekly_Sat, main = "Sub-meter 1-Kitchen", col="red")
# My trial (Daily-365 days) for Sub-meter 2
# Subset to one observation per day at 8:00pm for 2007, 2008 and 2009
house070809daily <- filter(updated_newDF, hour ==20 & minute == 0)
# Create TS object with SubMeter 2
tsSM2_070809daily <- ts(house070809daily$Sub_metering_2, frequency=365, start=c(2007,1))
# Plot sub-meter 2 with plot.ts
plot.ts(tsSM2_070809daily, main = "Sub-meter 2-Laundry Room", col="red")
# My trail Sub-Meter 2 (one month of data, taken at 18:00pm per day)
# Reduce granularity: change the object from daily to monthly, from minute data to hourly.
# Import 'tibble' & subset to 1 hour interval
library(tibbletime)
updated_newDF <- as_tbl_time(updated_newDF, index = DateTime)
newDF_1h <- as_period(updated_newDF, '1 hour')
newDF_1h
## # A time tibble: 26,174 x 14
## # Index: DateTime
## DateTime Date Time Sub_metering_1 Sub_metering_2 Sub_metering_3
## <dttm> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2007-01-01 01:00:00 2007~ 00:0~ 0 0 0
## 2 2007-01-01 02:00:00 2007~ 01:0~ 0 0 0
## 3 2007-01-01 03:00:00 2007~ 02:0~ 0 0 0
## 4 2007-01-01 04:00:00 2007~ 03:0~ 0 2 0
## 5 2007-01-01 05:00:00 2007~ 04:0~ 0 0 0
## 6 2007-01-01 06:00:00 2007~ 05:0~ 0 0 0
## 7 2007-01-01 07:00:00 2007~ 06:0~ 0 0 0
## 8 2007-01-01 08:00:00 2007~ 07:0~ 0 0 0
## 9 2007-01-01 09:00:00 2007~ 08:0~ 0 1 0
## 10 2007-01-01 10:00:00 2007~ 09:0~ 0 0 0
## # ... with 26,164 more rows, and 8 more variables: year <dbl>, quarter <int>,
## # month <dbl>, week <dbl>, weekDay <dbl>, day <int>, hour <int>, minute <int>
# March 2007
house07Monthly <- filter(newDF_1h, year == 2007 & month ==3)
tsSM2_07Monthly <- ts(house07Monthly$Sub_metering_2, frequency =24)
autoplot(tsSM2_07Monthly, ts.colour = 'red', xlab = "Time",
ylab = "Watt Hours", main = "Sub-meter 2 for March 2007" )
# March 2008
house08Monthly <- filter(newDF_1h, year == 2008 & month ==3)
tsSM2_08Monthly <- ts(house08Monthly$Sub_metering_2, frequency =24)
autoplot(tsSM2_08Monthly, ts.colour ='red', xlab = "Time",
ylab = "Watt Hours", main = "Sub-meter 2 for March 2008")
# March 2009
house09Monthly <- filter(newDF_1h, year == 2009 & month ==3)
tsSM2_09Monthly <- ts(house09Monthly$Sub_metering_2, frequency=24)
autoplot(tsSM2_09Monthly, ts.colour = 'red', xlab = "Time",
ylab = "Watt Hours", main = "Sub-meter 2 for March 2009")