1 Problem statement:

This is EDA performed over energy use of appliances.Data used include measurements of temperature and humidity sensors from a wireless network.he data set is at 10 min for about 4.5 months. The house temperature and humidity conditions.The energy data was logged every 10 minutes with m-bus energy meters. Weather from the nearest airport weather station.

2 Peek at Train/Test data

2.1 Train Data

Lets Read Train data

df_train <- read.csv("data//training.csv")
head(df_train,12)
##                   date Appliances lights       T1     RH_1       T2
## 1  2016-01-11 17:00:00         60     30 19.89000 47.59667 19.20000
## 2  2016-01-11 17:10:00         60     30 19.89000 46.69333 19.20000
## 3  2016-01-11 17:20:00         50     30 19.89000 46.30000 19.20000
## 4  2016-01-11 17:40:00         60     40 19.89000 46.33333 19.20000
## 5  2016-01-11 17:50:00         50     40 19.89000 46.02667 19.20000
## 6  2016-01-11 18:10:00         60     50 19.85667 45.56000 19.20000
## 7  2016-01-11 18:20:00         60     40 19.79000 45.59750 19.20000
## 8  2016-01-11 18:30:00         70     40 19.85667 46.09000 19.23000
## 9  2016-01-11 19:00:00        430     50 20.13333 48.00000 19.56667
## 10 2016-01-11 19:10:00        250     40 20.26000 52.72667 19.73000
## 11 2016-01-11 19:20:00        100     10 20.42667 55.89333 19.85667
## 12 2016-01-11 19:40:00         90     10 20.73000 52.66000 20.16667
##        RH_2       T3     RH_3       T4     RH_4       T5     RH_5       T6
## 1  44.79000 19.79000 44.73000 19.00000 45.56667 17.16667 55.20000 7.026667
## 2  44.72250 19.79000 44.79000 19.00000 45.99250 17.16667 55.20000 6.833333
## 3  44.62667 19.79000 44.93333 18.92667 45.89000 17.16667 55.09000 6.560000
## 4  44.53000 19.79000 45.00000 18.89000 45.53000 17.20000 55.09000 6.366667
## 5  44.50000 19.79000 44.93333 18.89000 45.73000 17.13333 55.03000 6.300000
## 6  44.50000 19.73000 44.90000 18.89000 45.86333 17.10000 54.90000 6.190000
## 7  44.43333 19.73000 44.79000 18.89000 45.79000 17.16667 55.00000 6.123333
## 8  44.40000 19.79000 44.86333 18.89000 46.09667 17.10000 55.00000 6.190000
## 9  44.40000 19.89000 44.90000 19.00000 46.36333 17.10000 55.09000 6.123333
## 10 45.10000 19.89000 45.49333 19.00000 47.22333 17.10000 55.16333 6.067500
## 11 45.83333 20.03333 47.52667 19.00000 48.69667 17.10000 55.50000 5.900000
## 12 47.22333 20.20000 48.53000 18.92667 48.15667 17.16667 56.49000 5.726667
##        RH_6       T7     RH_7       T8     RH_8       T9     RH_9    T_out
## 1  84.25667 17.20000 41.62667 18.20000 48.90000 17.03333 45.53000 6.600000
## 2  84.06333 17.20000 41.56000 18.20000 48.86333 17.06667 45.56000 6.483333
## 3  83.15667 17.20000 41.43333 18.20000 48.73000 17.00000 45.50000 6.366667
## 4  84.89333 17.20000 41.23000 18.10000 48.59000 17.00000 45.40000 6.133333
## 5  85.76667 17.13333 41.26000 18.10000 48.59000 17.00000 45.29000 6.016667
## 6  86.42333 17.10000 41.20000 18.10000 48.59000 17.00000 45.29000 5.916667
## 7  87.22667 17.16667 41.40000 18.10000 48.59000 17.00000 45.29000 5.933333
## 8  87.62667 17.20000 41.50000 18.10000 48.59000 17.00000 45.29000 5.950000
## 9  88.59000 17.82333 45.49333 18.06667 48.56000 16.96333 45.29000 6.000000
## 10 88.21500 17.96333 46.16000 18.03333 48.66667 16.89000 45.32667 6.000000
## 11 88.15667 17.96333 45.53333 18.10000 49.19333 16.89000 45.34500 6.000000
## 12 88.16000 17.76000 44.26667 18.23000 49.63333 16.89000 45.29000 6.000000
##    Press_mm_hg   RH_out Windspeed Visibility Tdewpoint       rv1       rv2
## 1     733.5000 92.00000  7.000000   63.00000  5.300000 13.275433 13.275433
## 2     733.6000 92.00000  6.666667   59.16667  5.200000 18.606195 18.606195
## 3     733.7000 92.00000  6.333333   55.33333  5.100000 28.642668 28.642668
## 4     733.9000 92.00000  5.666667   47.66667  4.900000 10.084097 10.084097
## 5     734.0000 92.00000  5.333333   43.83333  4.800000 44.919484 44.919484
## 6     734.1667 91.83333  5.166667   40.00000  4.683333 33.039890 33.039890
## 7     734.2333 91.66667  5.333333   40.00000  4.666667 31.455702 31.455702
## 8     734.3000 91.50000  5.500000   40.00000  4.650000  3.089314  3.089314
## 9     734.5000 91.00000  6.000000   40.00000  4.600000 34.351142 34.351142
## 10    734.6167 90.50000  6.000000   40.00000  4.516667 19.205186 19.205186
## 11    734.7333 90.00000  6.000000   40.00000  4.433333 38.492071 38.492071
## 12    734.9667 89.00000  6.000000   40.00000  4.266667 35.880925 35.880925
##      NSM WeekStatus Day_of_week
## 1  61200    Weekday      Monday
## 2  61800    Weekday      Monday
## 3  62400    Weekday      Monday
## 4  63600    Weekday      Monday
## 5  64200    Weekday      Monday
## 6  65400    Weekday      Monday
## 7  66000    Weekday      Monday
## 8  66600    Weekday      Monday
## 9  68400    Weekday      Monday
## 10 69000    Weekday      Monday
## 11 69600    Weekday      Monday
## 12 70800    Weekday      Monday

Feature Name

names(df_train)
##  [1] "date"        "Appliances"  "lights"      "T1"          "RH_1"       
##  [6] "T2"          "RH_2"        "T3"          "RH_3"        "T4"         
## [11] "RH_4"        "T5"          "RH_5"        "T6"          "RH_6"       
## [16] "T7"          "RH_7"        "T8"          "RH_8"        "T9"         
## [21] "RH_9"        "T_out"       "Press_mm_hg" "RH_out"      "Windspeed"  
## [26] "Visibility"  "Tdewpoint"   "rv1"         "rv2"         "NSM"        
## [31] "WeekStatus"  "Day_of_week"

Dimension of data

cat("\n Features:",dim(df_train)[2],
    "\n Observation:",dim(df_train)[1])
## 
##  Features: 32 
##  Observation: 14803

2.2 Test Data

Lets Read Test data

df_test <- read.csv("data//testing.csv");(head(df_test,5))
##                  date Appliances lights       T1     RH_1       T2
## 1 2016-01-11 17:30:00         50     40 19.89000 46.06667 19.20000
## 2 2016-01-11 18:00:00         60     50 19.89000 45.76667 19.20000
## 3 2016-01-11 18:40:00        230     70 19.92667 45.86333 19.35667
## 4 2016-01-11 18:50:00        580     60 20.06667 46.39667 19.42667
## 5 2016-01-11 19:30:00        100     10 20.56667 53.89333 20.03333
##       RH_2    T3     RH_3    T4     RH_4       T5     RH_5       T6
## 1 44.59000 19.79 45.00000 18.89 45.72333 17.16667 55.09000 6.433333
## 2 44.50000 19.79 44.90000 18.89 45.79000 17.10000 54.96667 6.263333
## 3 44.40000 19.79 44.90000 18.89 46.43000 17.10000 55.00000 6.190000
## 4 44.40000 19.79 44.82667 19.00 46.43000 17.10000 55.00000 6.123333
## 5 46.75667 20.10 48.46667 19.00 48.49000 17.15000 56.04250 5.800000
##       RH_6       T7     RH_7       T8     RH_8    T9     RH_9    T_out
## 1 83.42333 17.13333 41.29000 18.10000 48.59000 17.00 45.40000 6.250000
## 2 86.09000 17.13333 41.20000 18.10000 48.59000 17.00 45.29000 5.900000
## 3 87.86667 17.24750 42.71750 18.10000 48.59000 17.00 45.29000 5.966667
## 4 87.99333 17.53000 44.26333 18.06667 48.63333 16.89 45.29000 5.983333
## 5 88.36667 17.89000 44.92667 18.15000 49.20000 16.89 45.32667 6.000000
##   Press_mm_hg   RH_out Windspeed Visibility Tdewpoint       rv1       rv2
## 1    733.8000 92.00000  6.000000       51.5  5.000000 45.410389 45.410389
## 2    734.1000 92.00000  5.000000       40.0  4.700000 47.233763 47.233763
## 3    734.3667 91.33333  5.666667       40.0  4.633333 10.298729 10.298729
## 4    734.4333 91.16667  5.833333       40.0  4.616667  8.827838  8.827838
## 5    734.8500 89.50000  6.000000       40.0  4.350000 24.884962 24.884962
##     NSM WeekStatus Day_of_week
## 1 63000    Weekday      Monday
## 2 64800    Weekday      Monday
## 3 67200    Weekday      Monday
## 4 67800    Weekday      Monday
## 5 70200    Weekday      Monday

Feature Name

names(df_test)
##  [1] "date"        "Appliances"  "lights"      "T1"          "RH_1"       
##  [6] "T2"          "RH_2"        "T3"          "RH_3"        "T4"         
## [11] "RH_4"        "T5"          "RH_5"        "T6"          "RH_6"       
## [16] "T7"          "RH_7"        "T8"          "RH_8"        "T9"         
## [21] "RH_9"        "T_out"       "Press_mm_hg" "RH_out"      "Windspeed"  
## [26] "Visibility"  "Tdewpoint"   "rv1"         "rv2"         "NSM"        
## [31] "WeekStatus"  "Day_of_week"

Dimension of data

cat("\n Features:",dim(df_test)[2],
    "\n Observation:",dim(df_test)[1])
## 
##  Features: 32 
##  Observation: 4932

3

4 Combining Test & Train data for pre-processing

There are basically to create the data for creating Model one is pre-process the train and test specifically and create the model all combine train and test together and then create the model according , to normalise the result lets combine train and test together.

df <- rbind(df_train,df_test)
names(df)
##  [1] "date"        "Appliances"  "lights"      "T1"          "RH_1"       
##  [6] "T2"          "RH_2"        "T3"          "RH_3"        "T4"         
## [11] "RH_4"        "T5"          "RH_5"        "T6"          "RH_6"       
## [16] "T7"          "RH_7"        "T8"          "RH_8"        "T9"         
## [21] "RH_9"        "T_out"       "Press_mm_hg" "RH_out"      "Windspeed"  
## [26] "Visibility"  "Tdewpoint"   "rv1"         "rv2"         "NSM"        
## [31] "WeekStatus"  "Day_of_week"

5 Changing date and time format from character

df$date <- strptime(as.character(df$date),format="%Y-%m-%d %H:%M:%S")
df$date <- as.POSIXct(df$date , tz="UTC")

6 Function to Create NSM & Weekend & Weekdays

second_day <- function(x){
  s <- hour(x)+ minute(x)*60+second(x)
}
weekend_weekday <- function(x){
    ind <- weekdays(x)
    if (ind == 'Saturday' | ind == "Sunday") {
        ind2 = 'Weekend'
    }
    else {
        ind2 = 'Weekday'
    }
    return(ind2)
}

7 Important function for feature Engineering

# unlist(lapply(df$date,weekend_weekday)) # Applying week days function on a column date to extract weekday and weekends
# weekdays(df$date) # Extracting day name for date column
# unique(df$Day_of_week)
# unique(df$WeekStatus)

8 Describing Data

as.data.frame(round(describe(df),1))
##              vars     n    mean      sd  median trimmed     mad   min
## date*           1 19735     NaN      NA      NA     NaN      NA   Inf
## Appliances      2 19735    97.7   102.5    60.0    72.8    29.7  10.0
## lights          3 19735     3.8     7.9     0.0     1.8     0.0   0.0
## T1              4 19735    21.7     1.6    21.6    21.7     1.3  16.8
## RH_1            5 19735    40.3     4.0    39.7    40.1     4.0  27.0
## T2              6 19735    20.3     2.2    20.0    20.2     1.9  16.1
## RH_2            7 19735    40.4     4.1    40.5    40.5     4.0  20.5
## T3              8 19735    22.3     2.0    22.1    22.2     1.9  17.2
## RH_3            9 19735    39.2     3.3    38.5    39.1     3.1  28.8
## T4             10 19735    20.9     2.0    20.7    20.8     1.9  15.1
## RH_4           11 19735    39.0     4.3    38.4    38.8     4.7  27.7
## T5             12 19735    19.6     1.8    19.4    19.5     1.8  15.3
## RH_5           13 19735    50.9     9.0    49.1    49.6     5.9  29.8
## T6             14 19735     7.9     6.1     7.3     7.5     5.6  -6.1
## RH_6           15 19735    54.6    31.1    55.3    55.8    40.1   1.0
## T7             16 19735    20.3     2.1    20.0    20.2     2.1  15.4
## RH_7           17 19735    35.4     5.1    34.9    35.3     5.5  23.2
## T8             18 19735    22.0     2.0    22.1    22.1     1.9  16.3
## RH_8           19 19735    42.9     5.2    42.4    42.8     5.4  29.6
## T9             20 19735    19.5     2.0    19.4    19.4     1.9  14.9
## RH_9           21 19735    41.6     4.2    40.9    41.4     4.1  29.2
## T_out          22 19735     7.4     5.3     6.9     7.1     5.0  -5.0
## Press_mm_hg    23 19735   755.5     7.4   756.1   755.9     7.5 729.3
## RH_out         24 19735    79.8    14.9    83.7    81.4    14.1  24.0
## Windspeed      25 19735     4.0     2.5     3.7     3.8     2.5   0.0
## Visibility     26 19735    38.3    11.8    40.0    37.6     9.6   1.0
## Tdewpoint      27 19735     3.8     4.2     3.4     3.7     4.1  -6.6
## rv1            28 19735    25.0    14.5    24.9    25.0    18.6   0.0
## rv2            29 19735    25.0    14.5    24.9    25.0    18.6   0.0
## NSM            30 19735 42907.1 24940.0 43200.0 42908.9 32024.2   0.0
## WeekStatus*    31 19735     1.3     0.4     1.0     1.2     0.0   1.0
## Day_of_week*   32 19735     4.0     2.0     4.0     4.0     3.0   1.0
##                  max   range skew kurtosis    se
## date*           -Inf    -Inf   NA       NA    NA
## Appliances    1080.0  1070.0  3.4     13.7   0.7
## lights          70.0    70.0  2.2      4.5   0.1
## T1              26.3     9.5  0.1      0.2   0.0
## RH_1            63.4    36.3  0.5      0.1   0.0
## T2              29.9    13.8  0.9      0.9   0.0
## RH_2            56.0    35.6 -0.3      0.7   0.0
## T3              29.2    12.0  0.5      0.0   0.0
## RH_3            50.2    21.4  0.5     -0.6   0.0
## T4              26.2    11.1  0.2      0.0   0.0
## RH_4            51.1    23.4  0.4     -0.6   0.0
## T5              25.8    10.5  0.6      0.1   0.0
## RH_5            96.3    66.5  1.9      4.5   0.1
## T6              28.3    34.4  0.6      0.4   0.0
## RH_6            99.9    98.9 -0.2     -1.1   0.2
## T7              26.0    10.6  0.3     -0.5   0.0
## RH_7            51.4    28.2  0.2     -0.5   0.0
## T8              27.2    10.9 -0.3     -0.2   0.0
## RH_8            58.8    29.2  0.3     -0.5   0.0
## T9              24.5     9.6  0.4     -0.3   0.0
## RH_9            53.3    24.2  0.4     -0.4   0.0
## T_out           26.1    31.1  0.5      0.4   0.0
## Press_mm_hg    772.3    43.0 -0.4      0.1   0.1
## RH_out         100.0    76.0 -0.9      0.3   0.1
## Windspeed       14.0    14.0  0.9      0.2   0.0
## Visibility      66.0    65.0  0.4      0.2   0.1
## Tdewpoint       15.5    22.1  0.2     -0.1   0.0
## rv1             50.0    50.0  0.0     -1.2   0.1
## rv2             50.0    50.0  0.0     -1.2   0.1
## NSM          85800.0 85800.0  0.0     -1.2 177.5
## WeekStatus*      2.0     1.0  1.0     -1.0   0.0
## Day_of_week*     7.0     6.0  0.0     -1.3   0.0
i=1
for(i in 1:ncol(df)){
  if( (is.numeric(df[,i]))== FALSE )next
  else{
    plot(density(df[,i]),main=paste("",names(df[i])))
  }
   
}

9 List of Numeric & categorical columns.

cat("\n=----------------Numeric Variables-------------------\n")
## 
## =----------------Numeric Variables-------------------
df_num <- df[,sapply(df , is.numeric)];names(df_num)
##  [1] "Appliances"  "lights"      "T1"          "RH_1"        "T2"         
##  [6] "RH_2"        "T3"          "RH_3"        "T4"          "RH_4"       
## [11] "T5"          "RH_5"        "T6"          "RH_6"        "T7"         
## [16] "RH_7"        "T8"          "RH_8"        "T9"          "RH_9"       
## [21] "T_out"       "Press_mm_hg" "RH_out"      "Windspeed"   "Visibility" 
## [26] "Tdewpoint"   "rv1"         "rv2"         "NSM"
cat("\n=----------------Factor Variables-------------------\n")
## 
## =----------------Factor Variables-------------------
df_cat <- df[,sapply(df , is.factor )];names(df_cat)
## [1] "WeekStatus"  "Day_of_week"

10 Outlier Visuals

i=1
for(i in 1:ncol(df)){
  if( (is.numeric(df[,i]))== FALSE )next
  else{
    boxplot(df[,i],main=paste("",names(df[i]),"",length(boxplot(df[,i])$out),plot =FALSE), col = "lightgray")
   
  }
   
}

10.1 Outlier in Data frame

count_outliers <- function(x, na.rm = TRUE) {
qnt <- quantile(x, probs=c(.25, .75), na.rm = TRUE)
      H <- 1.5 * IQR(x, na.rm = TRUE)
      y <- x
       length(y[x < (qnt[1] - H)]) 
      length(y[x > (qnt[2] + H)]) 
      
      
      
      a <- data.frame(
                 Max_outlier = as.numeric(length(y[x > (qnt[2] + H)])) ,
                 Min_outlier = as.numeric(length(y[x < (qnt[2] - H)])))
      a$Total_outlier = a$Max_outlier + a$Min_outlier
      return(a)
}

a <-sapply(df_num , function(x)count_outliers(x))
rname<- rownames(t(a))
a <- as.matrix.data.frame(t(a))
a <- as.data.frame(a)
a$COlnames <- rname
names(a) <- c("Q3_Above","Q1_Below","Total","Columnames")
a <- a[,c(4,1:3)]
a$Total <- round((a$Total/dim(df)[1])*100,4)
a <- arrange(a ,desc(Total));a
## Warning: package 'bindrcpp' was built under R version 3.3.3
##     Columnames Q3_Above Q1_Below   Total
## 1       lights     4483        0 22.7160
## 2   Visibility     2356     1904 21.5860
## 3         RH_5     1314     1233 12.9060
## 4           T1      266     2226 12.6273
## 5   Appliances     2138      352 12.6172
## 6       RH_out        0     2390 12.1105
## 7           T8        0     2040 10.3370
## 8  Press_mm_hg        0     2025 10.2610
## 9         RH_2       68     1910 10.0228
## 10       T_out      436     1381  9.2070
## 11          T6      515     1267  9.0296
## 12        RH_6        0     1766  8.9486
## 13          T2      546     1140  8.5432
## 14          T4       41     1532  7.9706
## 15   Tdewpoint       10     1489  7.5956
## 16          T5      179     1068  6.3187
## 17          T3      217     1016  6.2478
## 18          T9        0     1157  5.8627
## 19          T7        2     1125  5.7107
## 20        RH_8       17     1039  5.3509
## 21        RH_7       42     1013  5.3458
## 22        RH_1      126      833  4.8594
## 23        RH_9        4      898  4.5706
## 24        RH_3        9      682  3.5014
## 25        RH_4        0      377  1.9103
## 26   Windspeed      214       15  1.1604
## 27         NSM        0      137  0.6942
## 28         rv1        0        0  0.0000
## 29         rv2        0        0  0.0000

Flooring with Month and Date

df$my <- floor_date(df$date,"month")
df$mhr <- floor_date(df$date,"hour")

11 Heat Map

hmap<- function(df , indi){
  
  #names(df[as.numeric(indi)])
  df1 <- df%>%
  mutate(Year = year(mhr),
         Month = month(mhr),
         week = wday(mhr),
         wdays = weekdays(mhr),
         Hour = hour(mhr),
         Day = day(mhr))

df2  <- df1%>%
  group_by(Month,Day)%>%
   summarise_at(.vars = colnames(.)[indi],mean,na.rm=TRUE)%>%
  #summarise(Appl =  sum(paste("", names(df[as.numeric(colum_nam)]),sep=""),na.rm=TRUE))%>%
  arrange(Month,Day)
df2 <- as.data.frame(df2)
library(tidyr)
df3 <- t(spread(data = df2, 
             key = Day,
             value = paste("",names(df2)[3],sep="")))
df3 <- as.data.frame(df3)
df3 <- as.matrix.data.frame(df3)



library(plotly)
plot_ly(x=colnames(df3), y=rownames(df3), z = df3, type = "heatmap", colors = colorRamp(c("yellow","orange","red")))%>%
  layout(title = paste("",names(df2)[3]),
         xaxis = list(title = 'Month'),
         yaxis = list(title = 'Days'))

}
names(df)
##  [1] "date"        "Appliances"  "lights"      "T1"          "RH_1"       
##  [6] "T2"          "RH_2"        "T3"          "RH_3"        "T4"         
## [11] "RH_4"        "T5"          "RH_5"        "T6"          "RH_6"       
## [16] "T7"          "RH_7"        "T8"          "RH_8"        "T9"         
## [21] "RH_9"        "T_out"       "Press_mm_hg" "RH_out"      "Windspeed"  
## [26] "Visibility"  "Tdewpoint"   "rv1"         "rv2"         "NSM"        
## [31] "WeekStatus"  "Day_of_week" "my"          "mhr"

11.1 Apliance

indi <- grep("Appliances", colnames(df))
hmap(df , indi)

11.2 lights

indi <- grep("lights", colnames(df))
hmap(df , indi)

11.3 T1

indi <- grep("T1", colnames(df))
hmap(df , indi)

11.4 RH_1

indi <- grep("RH_1", colnames(df))
hmap(df , indi)

11.5 T2

indi <- grep("T2", colnames(df))
hmap(df , indi)

11.6 RH_2

indi <- grep("RH_2", colnames(df))
hmap(df , indi)

11.7 T3

indi <- grep("T3", colnames(df))
hmap(df , indi)

11.8 RH_3

indi <- grep("RH_3", colnames(df))
hmap(df , indi)

11.9 T4

indi <- grep("T4", colnames(df))
hmap(df , indi)

11.10 RH_4

indi <- grep("RH_4", colnames(df))
hmap(df , indi)

11.11 Windspeed

indi <- grep("Windspeed", colnames(df))
hmap(df , indi)

11.12 Tdewpoint

indi <- grep("Tdewpoint", colnames(df))
hmap(df , indi)

11.13 Visibility

indi <- grep("Visibility", colnames(df))
hmap(df,indi)

11.14 T_out

indi <- grep("T_out", colnames(df))
hmap(df , indi)

11.15 Press_mm_hg

indi <- grep("Press_mm_hg", colnames(df))
hmap(df , indi)

11.16 T1

11.17 RH_1

11.18 T1

11.19 RH_1

12