INTRODUCTION:
The goal is to visualize how the power consumed evolves over different periods of time.
Suppressing warnings and messages in the rendered output:
# Hide package messages and warnings in every chunk of the rendered report.
knitr::opts_chunk$set(
  message = FALSE,
  warning = FALSE
)
Loading the libraries:
library(dplyr) # to transform data frames
library(ggplot2) # to plot
library(skimr) # to summarise data frames
library(extrafont) # to make additional fonts available to the graphs
library(lubridate) # to deal with date-time variables
# Show the package information and index of lubridate.
library(help=lubridate)
# Load the raw measurements: a semicolon-separated text file with a header row.
dades <- read.csv(
  file = "household_power_consumption.txt",
  header = TRUE,
  sep = ";"
)
# Summarise every column (type, missing values, distribution).
skim(dades)
| Name | dades |
| Number of rows | 2075259 |
| Number of columns | 9 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Date | 0 | 1 | 8 | 10 | 0 | 1442 | 0 |
| Time | 0 | 1 | 8 | 8 | 0 | 1440 | 0 |
| Global_active_power | 0 | 1 | 1 | 6 | 0 | 4187 | 0 |
| Global_reactive_power | 0 | 1 | 1 | 5 | 0 | 533 | 0 |
| Voltage | 0 | 1 | 1 | 7 | 0 | 2838 | 0 |
| Global_intensity | 0 | 1 | 1 | 6 | 0 | 222 | 0 |
| Sub_metering_1 | 0 | 1 | 1 | 6 | 0 | 89 | 0 |
| Sub_metering_2 | 0 | 1 | 1 | 6 | 0 | 82 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Sub_metering_3 | 25979 | 0.99 | 6.46 | 8.44 | 0 | 0 | 1 | 17 | 31 | ▇▁▃▁▁ |
# Print the type and the first values of every column.
glimpse(dades)
## Rows: 2,075,259
## Columns: 9
## $ Date <chr> "16/12/2006", "16/12/2006", "16/12/2006", "16...
## $ Time <chr> "17:24:00", "17:25:00", "17:26:00", "17:27:00...
## $ Global_active_power <chr> "4.216", "5.360", "5.374", "5.388", "3.666", ...
## $ Global_reactive_power <chr> "0.418", "0.436", "0.498", "0.502", "0.528", ...
## $ Voltage <chr> "234.840", "233.630", "233.290", "233.740", "...
## $ Global_intensity <chr> "18.400", "23.000", "23.000", "23.000", "15.8...
## $ Sub_metering_1 <chr> "0.000", "0.000", "0.000", "0.000", "0.000", ...
## $ Sub_metering_2 <chr> "1.000", "1.000", "2.000", "1.000", "1.000", ...
## $ Sub_metering_3 <dbl> 17, 16, 17, 17, 17, 17, 17, 17, 17, 16, 17, 1...
Percentage of missing values in Sub_metering_3:
# Share of missing values in Sub_metering_3, computed from the data itself
# instead of copying the count from the printed summary.
paste(round(mean(is.na(dades$Sub_metering_3)) * 100, 1), "%")
## [1] "1.3 %"
# Show the first rows of the raw data.
head(dades)
Only the following columns are taken into account; the three unused ones are dropped:
# Drop the three measurements that are not used in the rest of the analysis.
dades <- dades %>%
  select(-Global_reactive_power, -Global_intensity, -Voltage)
# Show the first ten rows of the reduced data frame.
head(dades, n = 10)
Converting the measurement columns that were read as text to numeric values:
# Convert the measurement columns that were read as text into numbers.
# Strings that are not valid numbers become NA.
text_columns <- c("Global_active_power", "Sub_metering_1", "Sub_metering_2")
dades[text_columns] <- lapply(dades[text_columns], as.numeric)
The conversion produces missing values (NA) wherever a string is not a valid number. Replacing the missing values with the column mean in the following columns:
# Replace every missing value of the four measurement columns with the mean
# of the observed values of the same column. The columns are addressed by
# name so the step does not depend on their position in the data frame.
impute_columns <- c(
  "Global_active_power",
  "Sub_metering_1",
  "Sub_metering_2",
  "Sub_metering_3"
)
for (column in impute_columns) {
  values <- dades[[column]]
  dades[[column]][is.na(values)] <- mean(values, na.rm = TRUE)
}
skim(dades)
| Name | dades |
| Number of rows | 2075259 |
| Number of columns | 6 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 4 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Date | 0 | 1 | 8 | 10 | 0 | 1442 | 0 |
| Time | 0 | 1 | 8 | 8 | 0 | 1440 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| Global_active_power | 0 | 1 | 1.09 | 1.05 | 0.08 | 0.31 | 0.63 | 1.52 | 11.12 | ▇▁▁▁▁ |
| Sub_metering_1 | 0 | 1 | 1.12 | 6.11 | 0.00 | 0.00 | 0.00 | 0.00 | 88.00 | ▇▁▁▁▁ |
| Sub_metering_2 | 0 | 1 | 1.30 | 5.79 | 0.00 | 0.00 | 0.00 | 1.00 | 80.00 | ▇▁▁▁▁ |
| Sub_metering_3 | 0 | 1 | 6.46 | 8.38 | 0.00 | 0.00 | 1.00 | 17.00 | 31.00 | ▇▁▃▁▁ |
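Setting Datetime and multiplying Global active power by 100/6 (equivalent to 1000/60, which turns a one-minute average in kilowatts into watt-hours, the unit used on the plots):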
# Build a single timestamp from Date and Time and rescale Global_active_power.
# The timestamps are parsed with a fixed time zone: in a session time zone
# with daylight saving time, wall-clock times that fall into the
# spring-forward gap would otherwise be parsed as NA.
dades <- dades %>%
  mutate(
    Datetime = as.POSIXct(
      paste(Date, Time),
      format = "%d/%m/%Y %H:%M:%S",
      tz = "UTC"
    ),
    Global_active_power = Global_active_power * 100 / 6
  )
Creating Dark_power as the difference between Global_active_power and the total of the sub metering ones
# Power not accounted for by the three sub-meters. Columns are referenced by
# name so the result does not depend on their position in the data frame.
dades$Dark_power <- dades$Global_active_power -
  (dades$Sub_metering_1 + dades$Sub_metering_2 + dades$Sub_metering_3)
First, creating a helper function so that the same plotting code can be reused for every graph of the series:
# Draw the five consumption series of a data frame as lines over time.
#
# Args:
#   df:    data frame with the columns Datetime, Global_active_power,
#          Sub_metering_1, Sub_metering_2, Sub_metering_3 and Dark_power.
#   title: title shown above the plot.
#
# Returns:
#   The ggplot object.
graph <- function(df, title) {
  series_colours <- c(
    "Total power" = "lightgray",
    "Sub 1" = "orange4",
    "Sub 2" = "thistle4",
    "Sub 3" = "paleturquoise4",
    "Unknown power" = "lightsteelblue3"
  )
  typeface <- "Comic Sans MS"

  # One line layer per series; the colour label doubles as the legend entry.
  series_layers <- list(
    geom_line(aes(y = Global_active_power, colour = "Total power")),
    geom_line(aes(y = Sub_metering_1, colour = "Sub 1")),
    geom_line(aes(y = Sub_metering_2, colour = "Sub 2")),
    geom_line(aes(y = Sub_metering_3, colour = "Sub 3")),
    geom_line(aes(y = Dark_power, colour = "Unknown power"))
  )

  legend_theme <- theme(
    legend.position = "bottom",
    legend.direction = "horizontal"
  )
  # Plain white canvas: only the axis lines are kept.
  canvas_theme <- theme(
    axis.line = element_line(size = 0.5, color = "black"),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank(),
    panel.background = element_blank()
  )
  text_theme <- theme(
    plot.title = element_text(family = typeface, hjust = 0.5),
    text = element_text(family = typeface),
    axis.text.x = element_text(colour = "black", size = 9),
    axis.text.y = element_text(colour = "black", size = 9),
    legend.key = element_rect(fill = "white", colour = "white")
  )

  ggplot(df, aes(x = Datetime)) +
    series_layers +
    labs(y = "watt-h", x = "", title = title) +
    scale_colour_manual("Metering", values = series_colours) +
    legend_theme +
    canvas_theme +
    text_theme
}
# Plot the minute-level series of the whole data set.
graph(dades,'Power Consumption per minute')
Adding Hour as a new column
# Hour of the day (0-23) of every measurement, used to aggregate by hour.
dades <- dades %>%
  mutate(Hour = hour(Datetime))
Grouping by Date and Hour
# Total of every series per date and hour.
hourly_dades <- dades %>%
  group_by(Date, Hour) %>%
  summarise(
    Sub_metering_1 = sum(Sub_metering_1),
    Sub_metering_2 = sum(Sub_metering_2),
    Sub_metering_3 = sum(Sub_metering_3),
    Dark_power = sum(Dark_power),
    Global_active_power = sum(Global_active_power)
  )
# Setting Datetime to the start of each hour. A fixed time zone is used so
# that hours falling into a daylight-saving gap of the session time zone
# are not parsed as NA.
hourly_dades <- hourly_dades %>%
  mutate(
    Datetime = as.POSIXct(
      paste0(Date, " ", Hour, ":00:00"),
      format = "%d/%m/%Y %H:%M:%S",
      tz = "UTC"
    )
  )
graph(hourly_dades, "Hourly Power Consumption")
Grouping by Day
# Total of every series per date, built from the hourly totals.
daily_dades <- hourly_dades %>%
  group_by(Date) %>%
  summarise(
    Sub_metering_1 = sum(Sub_metering_1),
    Sub_metering_2 = sum(Sub_metering_2),
    Sub_metering_3 = sum(Sub_metering_3),
    Dark_power = sum(Dark_power),
    Global_active_power = sum(Global_active_power)
  )
# Setting Datetime to midnight of each date.
daily_dades <- daily_dades %>%
  mutate(
    Datetime = as.POSIXct(
      paste(Date, "00:00:00"),
      format = "%d/%m/%Y %H:%M:%S"
    )
  )
graph(daily_dades, "Daily Power Consumption")
Adding Year and Month as new columns
# Calendar year and month number of every day, used to aggregate by month.
daily_dades$Year <- year(daily_dades$Datetime)
daily_dades$Month <- month(daily_dades$Datetime)
Grouping by Year and Month
# Total of every series per calendar month, built from the daily totals.
monthly_dades <- daily_dades %>%
  group_by(Year, Month) %>%
  summarise(
    Sub_metering_1 = sum(Sub_metering_1),
    Sub_metering_2 = sum(Sub_metering_2),
    Sub_metering_3 = sum(Sub_metering_3),
    Dark_power = sum(Dark_power),
    Global_active_power = sum(Global_active_power)
  )
# Setting Datetime: every month is placed at midnight of its 15th day.
monthly_dades <- monthly_dades %>%
  mutate(
    Datetime = as.POSIXct(
      paste0("15/", Month, "/", Year, " 00:00:00"),
      format = "%d/%m/%Y %H:%M:%S"
    )
  )
graph(monthly_dades, "Monthly Power Consumption")
# Kday: 'w' = weekday (Monday to Friday), 'h' = holiday (Saturday, Sunday)
# Season: 1 = January-March, 2 = April-June, 3 = July-September,
#         4 = October-December
dades <- dades %>%
  mutate(
    Month = month(Datetime),
    Wday = wday(Datetime, label = TRUE, abbr = FALSE),
    # Test the numeric day of the week (1 = Monday ... 7 = Sunday) rather
    # than the day name, which changes with the locale of the session.
    Kday = ifelse(wday(Datetime, week_start = 1) >= 6, "h", "w")
  )

# Average of every series for each minute of the day, computed separately
# for every season and kind of day. Rows without a month (unparsed
# timestamps) belong to no season and are left out.
dades_a_day <- dades %>%
  filter(!is.na(Month)) %>%
  mutate(Season = as.integer((Month - 1) %/% 3) + 1L) %>%
  group_by(Season, Kday, Time) %>%
  summarize(
    Sub_metering_1 = mean(Sub_metering_1, na.rm = TRUE),
    Sub_metering_2 = mean(Sub_metering_2, na.rm = TRUE),
    Sub_metering_3 = mean(Sub_metering_3, na.rm = TRUE),
    Dark_power = mean(Dark_power, na.rm = TRUE),
    Global_active_power = mean(Global_active_power, na.rm = TRUE)
  ) %>%
  ungroup()
# Turn the time of day into a date-time on an arbitrary fixed date so it can
# be drawn on a continuous date-time axis.
dades_a_day <- dades_a_day %>%
  mutate(
    Time = as.POSIXct(
      paste("01/01/2010", Time),
      format = "%d/%m/%Y %H:%M:%S"
    )
  )
Plotting
# Facet strip labels, keyed by the values of Season (1 to 4) and Kday
# ('w', 'h'). Season 1 holds January to March, so the seasons start with
# winter.
names <- list(
  "1" = "Winter", "2" = "Spring", "3" = "Summer", "4" = "Fall",
  "w" = "weekday", "h" = "holiday"
)
# as_labeller() builds a labeller that looks every facet value up by name in
# the table above; it replaces the deprecated function(variable, value) form.
labeller <- as_labeller(unlist(names))
# Average daily profile of every series: one panel per kind of day (rows)
# and season (columns), each row with its own y scale.
ggplot(dades_a_day, aes(Time)) +
  geom_line(aes(y = Global_active_power, colour = "Total power")) +
  geom_line(aes(y = Sub_metering_1, colour = "Sub 1")) +
  geom_line(aes(y = Sub_metering_2, colour = "Sub 2")) +
  geom_line(aes(y = Sub_metering_3, colour = "Sub 3")) +
  geom_line(aes(y = Dark_power, colour = "Unknown power")) +
  scale_colour_manual(
    "Metering",
    values = c(
      "Total power" = "lightgray",
      "Sub 1" = "orange4",
      "Sub 2" = "thistle4",
      "Sub 3" = "paleturquoise4",
      "Unknown power" = "lightsteelblue3"
    )
  ) +
  # The argument is spelled out in full (`scales`) instead of relying on
  # partial matching of `scale`.
  facet_grid(Kday ~ Season, labeller = labeller, scales = "free_y") +
  theme_bw() +
  theme(
    strip.background = element_rect(
      colour = "paleturquoise",
      fill = "paleturquoise"
    )
  ) +
  theme(
    legend.position = "bottom",
    legend.direction = "horizontal",
    legend.title = element_blank()
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, family = "Comic Sans MS"),
    axis.line = element_line(size = 0.5, colour = "black")
  ) +
  theme(panel.grid.minor = element_blank()) +
  theme(text = element_text(family = "Comic Sans MS")) +
  theme(
    axis.text.x = element_text(colour = "black", size = 7),
    axis.text.y = element_text(colour = "black", size = 10)
  ) +
  theme(legend.text = element_text(colour = "black", size = 10)) +
  # Show only the hour on the x axis.
  scale_x_datetime(date_labels = "%H") +
  ggtitle("Averaged Power Consumption in a day") +
  ylab("watt-h") +
  xlab(" ")