library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.7.2 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
source('func.R')
setwd('data_for_Pistoia')
#Read the mapping between instruments (freezers, incubators, water pumps, etc.) and TetraScience devices. An instrument can have multiple devices attached, and a device can have multiple sensors. Sensor data are stored in one CSV file per device.
device1=read_csv('device.csv')
## Parsed with column specification:
## cols(
## file = col_character(),
## DeviceIdentified = col_character(),
## name = col_character(),
## Type = col_character(),
## Manufacturer = col_integer(),
## Room = col_integer(),
## Asset = col_integer(),
## ParentAsset = col_integer(),
## Class = col_character(),
## variables = col_character(),
## nrow_million = col_double(),
## epoch_sec = col_integer()
## )
#Read sensor data for one specific freezer, FRZ-130048, into "skinny" (long) format, i.e. with a single column holding all sensor values
system.time({
  d1=device1 %>%
    filter(grepl('FRZ-130048',name)) %>%
    rowwise() %>%
    do(cbind(filter_skinny(.$file),DeviceIdentified=.$DeviceIdentified)) %>%
    ungroup()
})
## [1] "58cc30edcb746e3b5532b67b - -80 Freezer - FRZ-130048 - Temperature.csv"
## [1] "58d55353cb746e3b558d6977 - -80C Freezer Power - FRZ-130048 #1 - Power.csv"
## [1] "58d55353cb746e3b558d6978 - -80C Freezer Power - FRZ-130048 #2- Power.csv"
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## user system elapsed
## 54.380 1.076 55.579
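#filter_skinny() is defined in func.R (not shown here). A minimal sketch of a comparable reader,
#assuming each device CSV holds a recorded_at timestamp plus one or more sensor-value columns
#(the column layout and the factor conversion are assumptions, not the real file format):
filter_skinny_sketch <- function(file) {
  print(file)                                  # echo the file being read, as in the log above
  read_csv(file) %>%
    gather(var, val, -recorded_at) %>%         # "skinny": one row per (timestamp, sensor)
    mutate(var = factor(var), val = as.numeric(val))
}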
d1 %>% mutate(var=factor(var),DeviceIdentified=factor(DeviceIdentified)) %>% summary()
## recorded_at var val
## Min. :2016-11-24 23:29:00 power:11161559 Min. :-166.53
## 1st Qu.:2017-06-15 23:41:57 temp : 5127872 1st Qu.: -77.71
## Median :2017-08-09 15:30:30 Median : 0.00
## Mean :2017-08-13 16:14:57 Mean : -11.20
## 3rd Qu.:2017-10-10 19:31:40 3rd Qu.: 34.10
## Max. :2018-02-13 20:28:09 Max. : 92.90
## DeviceIdentified
## 58cc30edcb746e3b5532b67b:5127872
## 58d55353cb746e3b558d6977:5580691
## 58d55353cb746e3b558d6978:5580868
##
##
##
#Beware of missing timepoints.
#Check for missing observations by hour, i.e. hours with no observation at all
d12=d1 %>% filter(recorded_at>'2017-05-01'& recorded_at<'2018-01-30') %>% summarise_by_minute(minute = 60) %>% ungroup()
expand.grid(recorded_at=seq(min(d12$recorded_at), max(d12$recorded_at), by=3600),
            var=with(d12,unique(paste(var,DeviceIdentified,sep=':')))) %>%
  separate(var,c('var','DeviceIdentified')) %>%
  left_join(d12) %>%
  mutate(missing=ifelse(is.na(val),1,NA)) %>%
  ggplot(aes(recorded_at,missing,col=DeviceIdentified)) +
  geom_point() +
  facet_grid(var+DeviceIdentified~.,scales = 'free') +
  scale_x_datetime(labels = date_format("%m/%d", tz = "America/New_York"),date_breaks = '1 weeks',date_minor_breaks = '1 day') +
  theme(axis.text.x = element_text(angle=45),legend.position="none") +
  ggtitle('Hours with missing sensor data')
## Joining, by = c("recorded_at", "var", "DeviceIdentified")
## Warning: Removed 18562 rows containing missing values (geom_point).

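#summarise_by_minute() also comes from func.R (not shown). A plausible sketch, assuming it rounds
#recorded_at into minute-wide buckets and aggregates val per device and variable (the rounding rule
#and the default aggregation function are assumptions):
summarise_by_minute_sketch <- function(data, minute = 10, fun = "mean") {
  data %>%
    mutate(recorded_at = as_datetime(60 * minute * round(as.numeric(recorded_at) / (60 * minute)))) %>%
    group_by(recorded_at, var, DeviceIdentified) %>%
    summarise(val = match.fun(fun)(val, na.rm = TRUE))
}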
#Read the output of the Numenta anomaly detection algorithm (NuPIC, https://github.com/numenta/nupic); its configuration is in search_def.json. It was run on the temperature data aggregated by mean over 10 minutes, using the first 20000 aggregated points (i.e. data up to 2017-08-29) for model parameter estimation.
d2=numic_read(file.path('../anal/nupic4','model_0/inference/DefaultTask.TemporalAnomaly.predictionLog.csv'))
## Parsed with column specification:
## cols(
## X1 = col_integer(),
## X2 = col_datetime(format = ""),
## X3 = col_double(),
## X4 = col_double(),
## X5 = col_character(),
## X6 = col_character(),
## X7 = col_double(),
## X8 = col_character(),
## X9 = col_double(),
## X10 = col_character(),
## X11 = col_double(),
## X12 = col_double()
## )
## Parsed with column specification:
## cols(
## reset = col_character(),
## recorded_at = col_character(),
## temp = col_character(),
## multiStepPredictions.actual = col_character(),
## multiStepPredictions.1 = col_character(),
## multiStepBucketLikelihoods.1 = col_character(),
## multiStepBestPredictions.actual = col_character(),
## multiStepBestPredictions.1 = col_character(),
## anomalyScore = col_character(),
## anomalyLabel = col_character(),
## `multiStepBestPredictions:multiStep:errorMetric='aae':steps=[1]:window=1000:field=temp` = col_character(),
## `multiStepBestPredictions:multiStep:errorMetric='altMAPE':steps=[1]:window=1000:field=temp` = col_character()
## )
## Warning in numic_read(file.path("../anal/nupic4", "model_0/inference/
## DefaultTask.TemporalAnomaly.predictionLog.csv")): NAs introduced by
## coercion
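#numic_read() is defined in func.R (not shown). NuPIC's predictionLog.csv carries extra header rows
#(field names, field types, special flags), so a comparable reader could look roughly like the sketch
#below; the skip count and the set of columns coerced to numeric are assumptions:
numic_read_sketch <- function(file) {
  col_names <- names(read_csv(file, n_max = 0))          # first row: field names
  read_csv(file, skip = 3, col_names = col_names) %>%    # skip the header block
    mutate(recorded_at = as_datetime(recorded_at),
           temp = as.numeric(temp),
           anomalyScore = as.numeric(anomalyScore))      # may warn: NAs introduced by coercion
}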
summary(d2)
## reset recorded_at temp
## Min. :0 Min. :2017-03-20 19:46:04 Min. :-85.42
## 1st Qu.:0 1st Qu.:2017-06-24 17:46:04 1st Qu.:-81.99
## Median :0 Median :2017-09-06 00:46:04 Median :-79.60
## Mean :0 Mean :2017-09-05 18:35:55 Mean :-79.91
## 3rd Qu.:0 3rd Qu.:2017-11-18 04:26:04 3rd Qu.:-78.11
## Max. :0 Max. :2018-01-30 16:46:04 Max. : 27.79
##
## multiStepPredictions.actual multiStepPredictions.1
## Min. :-85.42 Length:42105
## 1st Qu.:-81.99 Class :character
## Median :-79.60 Mode :character
## Mean :-79.91
## 3rd Qu.:-78.11
## Max. : 27.79
##
## multiStepBucketLikelihoods.1 multiStepBestPredictions.actual
## Length:42105 Min. :-85.42
## Class :character 1st Qu.:-81.99
## Mode :character Median :-79.60
## Mean :-79.91
## 3rd Qu.:-78.11
## Max. : 27.79
##
## multiStepBestPredictions.1 anomalyScore anomalyLabel
## Min. :-85.19 Min. :0.000000 Length:42105
## 1st Qu.:-81.85 1st Qu.:0.000000 Class :character
## Median :-79.77 Median :0.000000 Mode :character
## Mean :-79.93 Mean :0.008488
## 3rd Qu.:-78.17 3rd Qu.:0.000000
## Max. : 27.79 Max. :1.000000
## NA's :1
## multiStepBestPredictions:multiStep:errorMetric='aae':steps=[1]:window=1000:field=temp
## Min. : 0.000
## 1st Qu.: 1.078
## Median : 1.366
## Mean : 1.413
## 3rd Qu.: 1.553
## Max. :43.926
##
## multiStepBestPredictions:multiStep:errorMetric='altMAPE':steps=[1]:window=1000:field=temp
## Min. : 0.00000
## 1st Qu.: 0.02997
## Median : 0.03744
## Mean : 0.25302
## 3rd Qu.: 0.07936
## Max. :106.21690
##
#Multi-month view. Actual deployment started at the end of April 2017, so earlier records might be irrelevant
plot_data(data=d1,anomaly_score = d2,start='2017-04-30',end='2018-01-30',over_minute = 240,format = '%m/%d',breaks = "1 weeks",breaks_minor = '1 day') %>% print()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

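#plot_data() is another func.R helper (not shown). A rough sketch of what it might do: aggregate the
#sensor data over over_minute minutes, stack the NuPIC anomaly score as an extra facet and format the
#time axis. Every detail below is an assumption about the real function, not its actual implementation:
plot_data_sketch <- function(data, anomaly_score, start, end, over_minute,
                             format, breaks, breaks_minor) {
  d <- data %>% filter(recorded_at > start, recorded_at < end)
  if (!is.null(over_minute)) d <- d %>% summarise_by_minute(minute = over_minute) %>% ungroup()
  a <- anomaly_score %>%
    filter(recorded_at > start, recorded_at < end) %>%
    transmute(recorded_at, var = 'anomalyScore', DeviceIdentified = 'nupic', val = anomalyScore)
  bind_rows(d, a) %>%
    ggplot(aes(recorded_at, val, col = DeviceIdentified)) +
    geom_line() +
    facet_grid(var ~ ., scales = 'free_y') +
    scale_x_datetime(labels = date_format(format, tz = "America/New_York"),
                     date_breaks = breaks, date_minor_breaks = breaks_minor) +
    theme(axis.text.x = element_text(angle = 45), legend.position = "none")
}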
#Zoom in to the first anomaly region
plot_data(data=d1,anomaly_score = d2,start='2017-04-30 00:00',end='2017-05-15',over_minute = 60,format = '%m/%d %H:%M',breaks = "1 day",breaks_minor = '6 hours') %>% print()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

#Zoom in further, to the non-aggregated (raw) data
plot_data(data=d1,anomaly_score = d2,start='2017-05-04 01:00',end='2017-05-05 01:00',over_minute = NULL,format = '%m/%d %H:%M',breaks = "4 hour",breaks_minor = '1 hour') %>% print()
## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.

#Prepare a multi-variable (wide) data frame: one column per sensor variable and device, aggregated to 10-minute means
d3=d1 %>%
  filter(recorded_at>'2017-05-01') %>%
  summarise_by_minute(minute=10,fun="mean") %>%
  ungroup() %>%
  transmute(recorded_at=recorded_at,var=paste(var,DeviceIdentified,sep='.'),val=val) %>%
  spread(var,val)
#Average number of missing variable values per 10-minute record (total NA cells divided by the number of records)
sum(rowSums(is.na(d3)))/nrow(d3)
## [1] 0.3449544
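#The ratio above counts NA cells, so a record missing more than one variable is counted more than once.
#The share of records with at least one missing variable (not evaluated here) would instead be:
#mean(rowSums(is.na(d3)) > 0)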
#Optional follow-up (commented out), continuing the d3 pipeline above to inspect which variable/device is missing in each interval:
#%>% gather(var,val,-recorded_at) %>% filter(is.na(val)) %>% select(-val) %>% separate(var,c('var','DeviceIdentified')) %>% inner_join(mutate(d1,recorded_at=as_datetime(600*round(as.numeric(recorded_at)/600)))) %>% as.data.frame()