library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000     ✔ purrr   0.2.4     
## ✔ tibble  1.4.2          ✔ dplyr   0.7.4     
## ✔ tidyr   0.7.2          ✔ stringr 1.2.0     
## ✔ readr   1.1.1          ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
## 
##     date
library(scales)
## 
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
## 
##     discard
## The following object is masked from 'package:readr':
## 
##     col_factor
source('func.R')
setwd('data_for_Pistoia')
#read mapping between instruments (freezers, incubators, water pumps, etc.) and TetraScience devices. An instrument can have multiple devices attached, and a device can have multiple sensors. Sensor data live in one CSV file per device
device1=read_csv('device.csv')
## Parsed with column specification:
## cols(
##   file = col_character(),
##   DeviceIdentified = col_character(),
##   name = col_character(),
##   Type = col_character(),
##   Manufacturer = col_integer(),
##   Room = col_integer(),
##   Asset = col_integer(),
##   ParentAsset = col_integer(),
##   Class = col_character(),
##   variables = col_character(),
##   nrow_million = col_double(),
##   epoch_sec = col_integer()
## )
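#quick sanity check (sketch) of the one-instrument-to-many-devices mapping described
#above: count device rows per parent asset, assuming ParentAsset links a device row
#to its instrument (that linkage is an assumption, not stated in the file)
device1 %>% count(ParentAsset, sort = TRUE) %>% head()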
#read sensor data for one specific freezer, FRZ-130048, into "skinny" format (i.e. with one column for sensor values)
system.time({
  d1 = device1 %>%
    filter(grepl('FRZ-130048', name)) %>%
    rowwise() %>%
    do(cbind(filter_skinny(.$file), DeviceIdentified = .$DeviceIdentified)) %>%
    ungroup()
})
## [1] "58cc30edcb746e3b5532b67b - -80 Freezer - FRZ-130048 - Temperature.csv"
## [1] "58d55353cb746e3b558d6977 - -80C Freezer Power - FRZ-130048 #1 - Power.csv"
## [1] "58d55353cb746e3b558d6978 - -80C Freezer Power - FRZ-130048 #2- Power.csv"
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
##    user  system elapsed 
##  54.380   1.076  55.579
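#for reference, a hypothetical sketch of a filter_skinny-style reader (the real
#filter_skinny lives in func.R and is not shown here): read one device CSV and
#gather all value columns into (recorded_at, var, val); column names are assumptions
read_skinny = function(file) {
  read_csv(file) %>%
    gather(var, val, -recorded_at)
}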
d1 %>% mutate(var=factor(var),DeviceIdentified=factor(DeviceIdentified)) %>% summary()
##   recorded_at                     var                val         
##  Min.   :2016-11-24 23:29:00   power:11161559   Min.   :-166.53  
##  1st Qu.:2017-06-15 23:41:57   temp : 5127872   1st Qu.: -77.71  
##  Median :2017-08-09 15:30:30                    Median :   0.00  
##  Mean   :2017-08-13 16:14:57                    Mean   : -11.20  
##  3rd Qu.:2017-10-10 19:31:40                    3rd Qu.:  34.10  
##  Max.   :2018-02-13 20:28:09                    Max.   :  92.90  
##                  DeviceIdentified  
##  58cc30edcb746e3b5532b67b:5127872  
##  58d55353cb746e3b558d6977:5580691  
##  58d55353cb746e3b558d6978:5580868  
##                                    
##                                    
## 
#Beware of missing timepoints
#Let's check for missing observations by hour (i.e. no observation during a given hour)
d12 = d1 %>%
  filter(recorded_at > '2017-05-01' & recorded_at < '2018-01-30') %>%
  summarise_by_minute(minute = 60) %>%
  ungroup()
 
expand.grid(recorded_at = seq(min(d12$recorded_at), max(d12$recorded_at), by = 3600),
            var = with(d12, unique(paste(var, DeviceIdentified, sep = ':')))) %>%
  separate(var, c('var', 'DeviceIdentified')) %>%
  left_join(d12) %>%
  mutate(missing = ifelse(is.na(val), 1, NA)) %>%
  ggplot(aes(recorded_at, missing, col = DeviceIdentified)) +
  geom_point() +
  facet_grid(var + DeviceIdentified ~ ., scales = 'free') +
  scale_x_datetime(labels = date_format("%m/%d", tz = "America/New_York"),
                   date_breaks = '1 weeks', date_minor_breaks = '1 day') +
  theme(axis.text.x = element_text(angle = 45), legend.position = "none") +
  ggtitle('Hours with missing sensor data')
## Joining, by = c("recorded_at", "var", "DeviceIdentified")
## Warning: Removed 18562 rows containing missing values (geom_point).
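#sketch: the same gaps as numbers rather than a plot -- observed hourly slots per
#sensor stream, to compare against the length of the full hourly grid
d12 %>% group_by(var, DeviceIdentified) %>% summarise(observed_hours = n())
length(seq(min(d12$recorded_at), max(d12$recorded_at), by = 3600))  #expected slots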

#read output from the Numenta anomaly detection algorithm https://github.com/numenta/nupic. Configuration is in search_def.json. It was run on the temperature data aggregated by mean over 10 minutes, using the first 20000 aggregated points (i.e. data up to 2017-08-29) for model parameter estimation.
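#hypothetical sketch of producing that NuPIC input from d1 (the actual
#preprocessing step is not shown here); summarise_by_minute comes from func.R
nupic_in = d1 %>% filter(var == 'temp') %>% summarise_by_minute(minute = 10, fun = "mean") %>% ungroup()
#write_csv(transmute(nupic_in, recorded_at, temp = val), 'nupic_input.csv')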
d2=numic_read(file.path('../anal/nupic4','model_0/inference/DefaultTask.TemporalAnomaly.predictionLog.csv'))
## Parsed with column specification:
## cols(
##   X1 = col_integer(),
##   X2 = col_datetime(format = ""),
##   X3 = col_double(),
##   X4 = col_double(),
##   X5 = col_character(),
##   X6 = col_character(),
##   X7 = col_double(),
##   X8 = col_character(),
##   X9 = col_double(),
##   X10 = col_character(),
##   X11 = col_double(),
##   X12 = col_double()
## )
## Parsed with column specification:
## cols(
##   reset = col_character(),
##   recorded_at = col_character(),
##   temp = col_character(),
##   multiStepPredictions.actual = col_character(),
##   multiStepPredictions.1 = col_character(),
##   multiStepBucketLikelihoods.1 = col_character(),
##   multiStepBestPredictions.actual = col_character(),
##   multiStepBestPredictions.1 = col_character(),
##   anomalyScore = col_character(),
##   anomalyLabel = col_character(),
##   `multiStepBestPredictions:multiStep:errorMetric='aae':steps=[1]:window=1000:field=temp` = col_character(),
##   `multiStepBestPredictions:multiStep:errorMetric='altMAPE':steps=[1]:window=1000:field=temp` = col_character()
## )
## Warning in numic_read(file.path("../anal/nupic4", "model_0/inference/
## DefaultTask.TemporalAnomaly.predictionLog.csv")): NAs introduced by
## coercion
summary(d2)
##      reset    recorded_at                       temp       
##  Min.   :0   Min.   :2017-03-20 19:46:04   Min.   :-85.42  
##  1st Qu.:0   1st Qu.:2017-06-24 17:46:04   1st Qu.:-81.99  
##  Median :0   Median :2017-09-06 00:46:04   Median :-79.60  
##  Mean   :0   Mean   :2017-09-05 18:35:55   Mean   :-79.91  
##  3rd Qu.:0   3rd Qu.:2017-11-18 04:26:04   3rd Qu.:-78.11  
##  Max.   :0   Max.   :2018-01-30 16:46:04   Max.   : 27.79  
##                                                            
##  multiStepPredictions.actual multiStepPredictions.1
##  Min.   :-85.42              Length:42105          
##  1st Qu.:-81.99              Class :character      
##  Median :-79.60              Mode  :character      
##  Mean   :-79.91                                    
##  3rd Qu.:-78.11                                    
##  Max.   : 27.79                                    
##                                                    
##  multiStepBucketLikelihoods.1 multiStepBestPredictions.actual
##  Length:42105                 Min.   :-85.42                 
##  Class :character             1st Qu.:-81.99                 
##  Mode  :character             Median :-79.60                 
##                               Mean   :-79.91                 
##                               3rd Qu.:-78.11                 
##                               Max.   : 27.79                 
##                                                              
##  multiStepBestPredictions.1  anomalyScore      anomalyLabel      
##  Min.   :-85.19             Min.   :0.000000   Length:42105      
##  1st Qu.:-81.85             1st Qu.:0.000000   Class :character  
##  Median :-79.77             Median :0.000000   Mode  :character  
##  Mean   :-79.93             Mean   :0.008488                     
##  3rd Qu.:-78.17             3rd Qu.:0.000000                     
##  Max.   : 27.79             Max.   :1.000000                     
##  NA's   :1                                                       
##  multiStepBestPredictions:multiStep:errorMetric='aae':steps=[1]:window=1000:field=temp
##  Min.   : 0.000                                                                       
##  1st Qu.: 1.078                                                                       
##  Median : 1.366                                                                       
##  Mean   : 1.413                                                                       
##  3rd Qu.: 1.553                                                                       
##  Max.   :43.926                                                                       
##                                                                                       
##  multiStepBestPredictions:multiStep:errorMetric='altMAPE':steps=[1]:window=1000:field=temp
##  Min.   :  0.00000                                                                        
##  1st Qu.:  0.02997                                                                        
##  Median :  0.03744                                                                        
##  Mean   :  0.25302                                                                        
##  3rd Qu.:  0.07936                                                                        
##  Max.   :106.21690                                                                        
## 
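#sketch: pull the slots the model flags most strongly (the 0.9 cutoff is
#arbitrary, not part of the original analysis)
d2 %>% filter(anomalyScore > 0.9) %>% select(recorded_at, temp, anomalyScore) %>% head()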
#Multi-month view. Actual deployment started at the end of April 2017; earlier records might be irrelevant
plot_data(data = d1, anomaly_score = d2, start = '2017-04-30', end = '2018-01-30',
          over_minute = 240, format = '%m/%d', breaks = "1 weeks", breaks_minor = '1 day') %>%
  print()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

#Zooming in on the first anomaly region
plot_data(data = d1, anomaly_score = d2, start = '2017-04-30 00:00', end = '2017-05-15',
          over_minute = 60, format = '%m/%d %H:%M', breaks = "1 day", breaks_minor = '6 hours') %>%
  print()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

#Zooming in on the non-aggregated data
plot_data(data = d1, anomaly_score = d2, start = '2017-05-04 01:00', end = '2017-05-05 01:00',
          over_minute = NULL, format = '%m/%d %H:%M', breaks = "4 hour", breaks_minor = '1 hour') %>%
  print()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

#prepare a multi-variable (wide) data frame
d3 = d1 %>%
  filter(recorded_at > '2017-05-01') %>%
  summarise_by_minute(minute = 10, fun = "mean") %>%
  ungroup() %>%
  transmute(recorded_at = recorded_at, var = paste(var, DeviceIdentified, sep = '.'), val = val) %>%
  spread(var, val)

#fraction of missing variable values per record (an upper bound on the share of records with at least one variable missing)
sum(rowSums(is.na(d3)))/nrow(d3)
## [1] 0.3449544
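#per-column breakdown (sketch): share of 10-minute slots missing for each variable
colMeans(is.na(select(d3, -recorded_at)))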
#optional drill-down (commented out): list the raw records behind the missing 10-minute slots
#%>% gather(var,val,-recorded_at) %>% filter(is.na(val)) %>% select(-val) %>% separate(var,c('var','DeviceIdentified')) %>% inner_join(mutate(d1,recorded_at=as_datetime(600*round(as.numeric(recorded_at)/600)))) %>% as.data.frame()