NSM & Weekend & WeekdaysThis is EDA performed over energy use of appliances.Data used include measurements of temperature and humidity sensors from a wireless network.he data set is at 10 min for about 4.5 months. The house temperature and humidity conditions.The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station.
Lets Read Train data
df_train <- read.csv("data//training.csv")
head(df_train,12)## date Appliances lights T1 RH_1 T2
## 1 2016-01-11 17:00:00 60 30 19.89000 47.59667 19.20000
## 2 2016-01-11 17:10:00 60 30 19.89000 46.69333 19.20000
## 3 2016-01-11 17:20:00 50 30 19.89000 46.30000 19.20000
## 4 2016-01-11 17:40:00 60 40 19.89000 46.33333 19.20000
## 5 2016-01-11 17:50:00 50 40 19.89000 46.02667 19.20000
## 6 2016-01-11 18:10:00 60 50 19.85667 45.56000 19.20000
## 7 2016-01-11 18:20:00 60 40 19.79000 45.59750 19.20000
## 8 2016-01-11 18:30:00 70 40 19.85667 46.09000 19.23000
## 9 2016-01-11 19:00:00 430 50 20.13333 48.00000 19.56667
## 10 2016-01-11 19:10:00 250 40 20.26000 52.72667 19.73000
## 11 2016-01-11 19:20:00 100 10 20.42667 55.89333 19.85667
## 12 2016-01-11 19:40:00 90 10 20.73000 52.66000 20.16667
## RH_2 T3 RH_3 T4 RH_4 T5 RH_5 T6
## 1 44.79000 19.79000 44.73000 19.00000 45.56667 17.16667 55.20000 7.026667
## 2 44.72250 19.79000 44.79000 19.00000 45.99250 17.16667 55.20000 6.833333
## 3 44.62667 19.79000 44.93333 18.92667 45.89000 17.16667 55.09000 6.560000
## 4 44.53000 19.79000 45.00000 18.89000 45.53000 17.20000 55.09000 6.366667
## 5 44.50000 19.79000 44.93333 18.89000 45.73000 17.13333 55.03000 6.300000
## 6 44.50000 19.73000 44.90000 18.89000 45.86333 17.10000 54.90000 6.190000
## 7 44.43333 19.73000 44.79000 18.89000 45.79000 17.16667 55.00000 6.123333
## 8 44.40000 19.79000 44.86333 18.89000 46.09667 17.10000 55.00000 6.190000
## 9 44.40000 19.89000 44.90000 19.00000 46.36333 17.10000 55.09000 6.123333
## 10 45.10000 19.89000 45.49333 19.00000 47.22333 17.10000 55.16333 6.067500
## 11 45.83333 20.03333 47.52667 19.00000 48.69667 17.10000 55.50000 5.900000
## 12 47.22333 20.20000 48.53000 18.92667 48.15667 17.16667 56.49000 5.726667
## RH_6 T7 RH_7 T8 RH_8 T9 RH_9 T_out
## 1 84.25667 17.20000 41.62667 18.20000 48.90000 17.03333 45.53000 6.600000
## 2 84.06333 17.20000 41.56000 18.20000 48.86333 17.06667 45.56000 6.483333
## 3 83.15667 17.20000 41.43333 18.20000 48.73000 17.00000 45.50000 6.366667
## 4 84.89333 17.20000 41.23000 18.10000 48.59000 17.00000 45.40000 6.133333
## 5 85.76667 17.13333 41.26000 18.10000 48.59000 17.00000 45.29000 6.016667
## 6 86.42333 17.10000 41.20000 18.10000 48.59000 17.00000 45.29000 5.916667
## 7 87.22667 17.16667 41.40000 18.10000 48.59000 17.00000 45.29000 5.933333
## 8 87.62667 17.20000 41.50000 18.10000 48.59000 17.00000 45.29000 5.950000
## 9 88.59000 17.82333 45.49333 18.06667 48.56000 16.96333 45.29000 6.000000
## 10 88.21500 17.96333 46.16000 18.03333 48.66667 16.89000 45.32667 6.000000
## 11 88.15667 17.96333 45.53333 18.10000 49.19333 16.89000 45.34500 6.000000
## 12 88.16000 17.76000 44.26667 18.23000 49.63333 16.89000 45.29000 6.000000
## Press_mm_hg RH_out Windspeed Visibility Tdewpoint rv1 rv2
## 1 733.5000 92.00000 7.000000 63.00000 5.300000 13.275433 13.275433
## 2 733.6000 92.00000 6.666667 59.16667 5.200000 18.606195 18.606195
## 3 733.7000 92.00000 6.333333 55.33333 5.100000 28.642668 28.642668
## 4 733.9000 92.00000 5.666667 47.66667 4.900000 10.084097 10.084097
## 5 734.0000 92.00000 5.333333 43.83333 4.800000 44.919484 44.919484
## 6 734.1667 91.83333 5.166667 40.00000 4.683333 33.039890 33.039890
## 7 734.2333 91.66667 5.333333 40.00000 4.666667 31.455702 31.455702
## 8 734.3000 91.50000 5.500000 40.00000 4.650000 3.089314 3.089314
## 9 734.5000 91.00000 6.000000 40.00000 4.600000 34.351142 34.351142
## 10 734.6167 90.50000 6.000000 40.00000 4.516667 19.205186 19.205186
## 11 734.7333 90.00000 6.000000 40.00000 4.433333 38.492071 38.492071
## 12 734.9667 89.00000 6.000000 40.00000 4.266667 35.880925 35.880925
## NSM WeekStatus Day_of_week
## 1 61200 Weekday Monday
## 2 61800 Weekday Monday
## 3 62400 Weekday Monday
## 4 63600 Weekday Monday
## 5 64200 Weekday Monday
## 6 65400 Weekday Monday
## 7 66000 Weekday Monday
## 8 66600 Weekday Monday
## 9 68400 Weekday Monday
## 10 69000 Weekday Monday
## 11 69600 Weekday Monday
## 12 70800 Weekday Monday
Feature Name
names(df_train)## [1] "date" "Appliances" "lights" "T1" "RH_1"
## [6] "T2" "RH_2" "T3" "RH_3" "T4"
## [11] "RH_4" "T5" "RH_5" "T6" "RH_6"
## [16] "T7" "RH_7" "T8" "RH_8" "T9"
## [21] "RH_9" "T_out" "Press_mm_hg" "RH_out" "Windspeed"
## [26] "Visibility" "Tdewpoint" "rv1" "rv2" "NSM"
## [31] "WeekStatus" "Day_of_week"
Dimension of data
cat("\n Features:",dim(df_train)[2],
"\n Observation:",dim(df_train)[1])##
## Features: 32
## Observation: 14803
Lets Read Test data
df_test <- read.csv("data//testing.csv");(head(df_test,5))## date Appliances lights T1 RH_1 T2
## 1 2016-01-11 17:30:00 50 40 19.89000 46.06667 19.20000
## 2 2016-01-11 18:00:00 60 50 19.89000 45.76667 19.20000
## 3 2016-01-11 18:40:00 230 70 19.92667 45.86333 19.35667
## 4 2016-01-11 18:50:00 580 60 20.06667 46.39667 19.42667
## 5 2016-01-11 19:30:00 100 10 20.56667 53.89333 20.03333
## RH_2 T3 RH_3 T4 RH_4 T5 RH_5 T6
## 1 44.59000 19.79 45.00000 18.89 45.72333 17.16667 55.09000 6.433333
## 2 44.50000 19.79 44.90000 18.89 45.79000 17.10000 54.96667 6.263333
## 3 44.40000 19.79 44.90000 18.89 46.43000 17.10000 55.00000 6.190000
## 4 44.40000 19.79 44.82667 19.00 46.43000 17.10000 55.00000 6.123333
## 5 46.75667 20.10 48.46667 19.00 48.49000 17.15000 56.04250 5.800000
## RH_6 T7 RH_7 T8 RH_8 T9 RH_9 T_out
## 1 83.42333 17.13333 41.29000 18.10000 48.59000 17.00 45.40000 6.250000
## 2 86.09000 17.13333 41.20000 18.10000 48.59000 17.00 45.29000 5.900000
## 3 87.86667 17.24750 42.71750 18.10000 48.59000 17.00 45.29000 5.966667
## 4 87.99333 17.53000 44.26333 18.06667 48.63333 16.89 45.29000 5.983333
## 5 88.36667 17.89000 44.92667 18.15000 49.20000 16.89 45.32667 6.000000
## Press_mm_hg RH_out Windspeed Visibility Tdewpoint rv1 rv2
## 1 733.8000 92.00000 6.000000 51.5 5.000000 45.410389 45.410389
## 2 734.1000 92.00000 5.000000 40.0 4.700000 47.233763 47.233763
## 3 734.3667 91.33333 5.666667 40.0 4.633333 10.298729 10.298729
## 4 734.4333 91.16667 5.833333 40.0 4.616667 8.827838 8.827838
## 5 734.8500 89.50000 6.000000 40.0 4.350000 24.884962 24.884962
## NSM WeekStatus Day_of_week
## 1 63000 Weekday Monday
## 2 64800 Weekday Monday
## 3 67200 Weekday Monday
## 4 67800 Weekday Monday
## 5 70200 Weekday Monday
Feature Name
names(df_test)## [1] "date" "Appliances" "lights" "T1" "RH_1"
## [6] "T2" "RH_2" "T3" "RH_3" "T4"
## [11] "RH_4" "T5" "RH_5" "T6" "RH_6"
## [16] "T7" "RH_7" "T8" "RH_8" "T9"
## [21] "RH_9" "T_out" "Press_mm_hg" "RH_out" "Windspeed"
## [26] "Visibility" "Tdewpoint" "rv1" "rv2" "NSM"
## [31] "WeekStatus" "Day_of_week"
Dimension of data
cat("\n Features:",dim(df_test)[2],
"\n Observation:",dim(df_test)[1])##
## Features: 32
## Observation: 4932
There are basically to create the data for creating Model one is pre-process the train and test specifically and create the model all combine train and test together and then create the model according , to normalise the result lets combine train and test together.
df <- rbind(df_train,df_test)names(df)## [1] "date" "Appliances" "lights" "T1" "RH_1"
## [6] "T2" "RH_2" "T3" "RH_3" "T4"
## [11] "RH_4" "T5" "RH_5" "T6" "RH_6"
## [16] "T7" "RH_7" "T8" "RH_8" "T9"
## [21] "RH_9" "T_out" "Press_mm_hg" "RH_out" "Windspeed"
## [26] "Visibility" "Tdewpoint" "rv1" "rv2" "NSM"
## [31] "WeekStatus" "Day_of_week"
df$date <- strptime(as.character(df$date),format="%Y-%m-%d %H:%M:%S")
df$date <- as.POSIXct(df$date , tz="UTC")NSM & Weekend & Weekdayssecond_day <- function(x){
s <- hour(x)+ minute(x)*60+second(x)
}
weekend_weekday <- function(x){
ind <- weekdays(x)
if (ind == 'Saturday' | ind == "Sunday") {
ind2 = 'Weekend'
}
else {
ind2 = 'Weekday'
}
return(ind2)
}# unlist(lapply(df$date,weekend_weekday)) # Applying week days function on a column date to extract weekday and weekends
# weekdays(df$date) # Extracting day name for date column
# unique(df$Day_of_week)
# unique(df$WeekStatus)as.data.frame(round(describe(df),1))## vars n mean sd median trimmed mad min
## date* 1 19735 NaN NA NA NaN NA Inf
## Appliances 2 19735 97.7 102.5 60.0 72.8 29.7 10.0
## lights 3 19735 3.8 7.9 0.0 1.8 0.0 0.0
## T1 4 19735 21.7 1.6 21.6 21.7 1.3 16.8
## RH_1 5 19735 40.3 4.0 39.7 40.1 4.0 27.0
## T2 6 19735 20.3 2.2 20.0 20.2 1.9 16.1
## RH_2 7 19735 40.4 4.1 40.5 40.5 4.0 20.5
## T3 8 19735 22.3 2.0 22.1 22.2 1.9 17.2
## RH_3 9 19735 39.2 3.3 38.5 39.1 3.1 28.8
## T4 10 19735 20.9 2.0 20.7 20.8 1.9 15.1
## RH_4 11 19735 39.0 4.3 38.4 38.8 4.7 27.7
## T5 12 19735 19.6 1.8 19.4 19.5 1.8 15.3
## RH_5 13 19735 50.9 9.0 49.1 49.6 5.9 29.8
## T6 14 19735 7.9 6.1 7.3 7.5 5.6 -6.1
## RH_6 15 19735 54.6 31.1 55.3 55.8 40.1 1.0
## T7 16 19735 20.3 2.1 20.0 20.2 2.1 15.4
## RH_7 17 19735 35.4 5.1 34.9 35.3 5.5 23.2
## T8 18 19735 22.0 2.0 22.1 22.1 1.9 16.3
## RH_8 19 19735 42.9 5.2 42.4 42.8 5.4 29.6
## T9 20 19735 19.5 2.0 19.4 19.4 1.9 14.9
## RH_9 21 19735 41.6 4.2 40.9 41.4 4.1 29.2
## T_out 22 19735 7.4 5.3 6.9 7.1 5.0 -5.0
## Press_mm_hg 23 19735 755.5 7.4 756.1 755.9 7.5 729.3
## RH_out 24 19735 79.8 14.9 83.7 81.4 14.1 24.0
## Windspeed 25 19735 4.0 2.5 3.7 3.8 2.5 0.0
## Visibility 26 19735 38.3 11.8 40.0 37.6 9.6 1.0
## Tdewpoint 27 19735 3.8 4.2 3.4 3.7 4.1 -6.6
## rv1 28 19735 25.0 14.5 24.9 25.0 18.6 0.0
## rv2 29 19735 25.0 14.5 24.9 25.0 18.6 0.0
## NSM 30 19735 42907.1 24940.0 43200.0 42908.9 32024.2 0.0
## WeekStatus* 31 19735 1.3 0.4 1.0 1.2 0.0 1.0
## Day_of_week* 32 19735 4.0 2.0 4.0 4.0 3.0 1.0
## max range skew kurtosis se
## date* -Inf -Inf NA NA NA
## Appliances 1080.0 1070.0 3.4 13.7 0.7
## lights 70.0 70.0 2.2 4.5 0.1
## T1 26.3 9.5 0.1 0.2 0.0
## RH_1 63.4 36.3 0.5 0.1 0.0
## T2 29.9 13.8 0.9 0.9 0.0
## RH_2 56.0 35.6 -0.3 0.7 0.0
## T3 29.2 12.0 0.5 0.0 0.0
## RH_3 50.2 21.4 0.5 -0.6 0.0
## T4 26.2 11.1 0.2 0.0 0.0
## RH_4 51.1 23.4 0.4 -0.6 0.0
## T5 25.8 10.5 0.6 0.1 0.0
## RH_5 96.3 66.5 1.9 4.5 0.1
## T6 28.3 34.4 0.6 0.4 0.0
## RH_6 99.9 98.9 -0.2 -1.1 0.2
## T7 26.0 10.6 0.3 -0.5 0.0
## RH_7 51.4 28.2 0.2 -0.5 0.0
## T8 27.2 10.9 -0.3 -0.2 0.0
## RH_8 58.8 29.2 0.3 -0.5 0.0
## T9 24.5 9.6 0.4 -0.3 0.0
## RH_9 53.3 24.2 0.4 -0.4 0.0
## T_out 26.1 31.1 0.5 0.4 0.0
## Press_mm_hg 772.3 43.0 -0.4 0.1 0.1
## RH_out 100.0 76.0 -0.9 0.3 0.1
## Windspeed 14.0 14.0 0.9 0.2 0.0
## Visibility 66.0 65.0 0.4 0.2 0.1
## Tdewpoint 15.5 22.1 0.2 -0.1 0.0
## rv1 50.0 50.0 0.0 -1.2 0.1
## rv2 50.0 50.0 0.0 -1.2 0.1
## NSM 85800.0 85800.0 0.0 -1.2 177.5
## WeekStatus* 2.0 1.0 1.0 -1.0 0.0
## Day_of_week* 7.0 6.0 0.0 -1.3 0.0
i=1
for(i in 1:ncol(df)){
if( (is.numeric(df[,i]))== FALSE )next
else{
plot(density(df[,i]),main=paste("",names(df[i])))
}
}cat("\n=----------------Numeric Variables-------------------\n")##
## =----------------Numeric Variables-------------------
df_num <- df[,sapply(df , is.numeric)];names(df_num)## [1] "Appliances" "lights" "T1" "RH_1" "T2"
## [6] "RH_2" "T3" "RH_3" "T4" "RH_4"
## [11] "T5" "RH_5" "T6" "RH_6" "T7"
## [16] "RH_7" "T8" "RH_8" "T9" "RH_9"
## [21] "T_out" "Press_mm_hg" "RH_out" "Windspeed" "Visibility"
## [26] "Tdewpoint" "rv1" "rv2" "NSM"
cat("\n=----------------Factor Variables-------------------\n")##
## =----------------Factor Variables-------------------
df_cat <- df[,sapply(df , is.factor )];names(df_cat)## [1] "WeekStatus" "Day_of_week"
i=1
for(i in 1:ncol(df)){
if( (is.numeric(df[,i]))== FALSE )next
else{
boxplot(df[,i],main=paste("",names(df[i]),"",length(boxplot(df[,i])$out),plot =FALSE), col = "lightgray")
}
}count_outliers <- function(x, na.rm = TRUE) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = TRUE)
H <- 1.5 * IQR(x, na.rm = TRUE)
y <- x
length(y[x < (qnt[1] - H)])
length(y[x > (qnt[2] + H)])
a <- data.frame(
Max_outlier = as.numeric(length(y[x > (qnt[2] + H)])) ,
Min_outlier = as.numeric(length(y[x < (qnt[2] - H)])))
a$Total_outlier = a$Max_outlier + a$Min_outlier
return(a)
}
a <-sapply(df_num , function(x)count_outliers(x))
rname<- rownames(t(a))
a <- as.matrix.data.frame(t(a))
a <- as.data.frame(a)
a$COlnames <- rname
names(a) <- c("Q3_Above","Q1_Below","Total","Columnames")
a <- a[,c(4,1:3)]
a$Total <- round((a$Total/dim(df)[1])*100,4)
a <- arrange(a ,desc(Total));a## Warning: package 'bindrcpp' was built under R version 3.3.3
## Columnames Q3_Above Q1_Below Total
## 1 lights 4483 0 22.7160
## 2 Visibility 2356 1904 21.5860
## 3 RH_5 1314 1233 12.9060
## 4 T1 266 2226 12.6273
## 5 Appliances 2138 352 12.6172
## 6 RH_out 0 2390 12.1105
## 7 T8 0 2040 10.3370
## 8 Press_mm_hg 0 2025 10.2610
## 9 RH_2 68 1910 10.0228
## 10 T_out 436 1381 9.2070
## 11 T6 515 1267 9.0296
## 12 RH_6 0 1766 8.9486
## 13 T2 546 1140 8.5432
## 14 T4 41 1532 7.9706
## 15 Tdewpoint 10 1489 7.5956
## 16 T5 179 1068 6.3187
## 17 T3 217 1016 6.2478
## 18 T9 0 1157 5.8627
## 19 T7 2 1125 5.7107
## 20 RH_8 17 1039 5.3509
## 21 RH_7 42 1013 5.3458
## 22 RH_1 126 833 4.8594
## 23 RH_9 4 898 4.5706
## 24 RH_3 9 682 3.5014
## 25 RH_4 0 377 1.9103
## 26 Windspeed 214 15 1.1604
## 27 NSM 0 137 0.6942
## 28 rv1 0 0 0.0000
## 29 rv2 0 0 0.0000
Flooring with Month and Date
df$my <- floor_date(df$date,"month")
df$mhr <- floor_date(df$date,"hour")hmap<- function(df , indi){
#names(df[as.numeric(indi)])
df1 <- df%>%
mutate(Year = year(mhr),
Month = month(mhr),
week = wday(mhr),
wdays = weekdays(mhr),
Hour = hour(mhr),
Day = day(mhr))
df2 <- df1%>%
group_by(Month,Day)%>%
summarise_at(.vars = colnames(.)[indi],mean,na.rm=TRUE)%>%
#summarise(Appl = sum(paste("", names(df[as.numeric(colum_nam)]),sep=""),na.rm=TRUE))%>%
arrange(Month,Day)
df2 <- as.data.frame(df2)
library(tidyr)
df3 <- t(spread(data = df2,
key = Day,
value = paste("",names(df2)[3],sep="")))
df3 <- as.data.frame(df3)
df3 <- as.matrix.data.frame(df3)
library(plotly)
plot_ly(x=colnames(df3), y=rownames(df3), z = df3, type = "heatmap", colors = colorRamp(c("yellow","orange","red")))%>%
layout(title = paste("",names(df2)[3]),
xaxis = list(title = 'Month'),
yaxis = list(title = 'Days'))
}names(df)## [1] "date" "Appliances" "lights" "T1" "RH_1"
## [6] "T2" "RH_2" "T3" "RH_3" "T4"
## [11] "RH_4" "T5" "RH_5" "T6" "RH_6"
## [16] "T7" "RH_7" "T8" "RH_8" "T9"
## [21] "RH_9" "T_out" "Press_mm_hg" "RH_out" "Windspeed"
## [26] "Visibility" "Tdewpoint" "rv1" "rv2" "NSM"
## [31] "WeekStatus" "Day_of_week" "my" "mhr"
indi <- grep("Appliances", colnames(df))
hmap(df , indi)indi <- grep("lights", colnames(df))
hmap(df , indi)indi <- grep("T1", colnames(df))
hmap(df , indi)indi <- grep("RH_1", colnames(df))
hmap(df , indi)indi <- grep("T2", colnames(df))
hmap(df , indi)indi <- grep("RH_2", colnames(df))
hmap(df , indi)indi <- grep("T3", colnames(df))
hmap(df , indi)indi <- grep("RH_3", colnames(df))
hmap(df , indi)indi <- grep("T4", colnames(df))
hmap(df , indi)indi <- grep("RH_4", colnames(df))
hmap(df , indi)indi <- grep("Windspeed", colnames(df))
hmap(df , indi)indi <- grep("Tdewpoint", colnames(df))
hmap(df , indi)indi <- grep("Visibility", colnames(df))
hmap(df,indi)indi <- grep("T_out", colnames(df))
hmap(df , indi)indi <- grep("Press_mm_hg", colnames(df))
hmap(df , indi)