Introduction

These are datasets dowloaded directly from the elasticsearch repository living at http://138.100.72.5. They are regular patterns from: * Several times the same user along different scenarios * Different users along the same scenarios than before.

Our main target is to read those datasets and to look for valid features able to help the system in distinguish what are the main actions being carried out by the user at different periods of time:

Reading data

#
library(zoo)
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(plyr)
library(fftw)
library(calibrate)
## Loading required package: MASS
#
path="~/git/zheng_14/data/"
setwd(path)
lf=list.files(path=path,pattern="*.csv",recursive=TRUE,full.names=TRUE)
for ( i in lf){
  d0 = read.csv(file=i,sep=",",stringsAsFactors=FALSE,header=TRUE,colClasses=rep("character",20))
  if ( ! exists("dd")) {
    dd=d0
  } else {
    dd = rbind(dd,d0)
  }
}
# Función para convertir una cadena ascii sin separación en una fecha completa
# con op<-options(digits.secs=3) podemos sacar los ms
rm(d0,i)
numeros2fecha=function(d) {
  strptime(paste(as.character(strptime(substr(d,1,14),"%Y%m%d%H%M%S")),
                 substr(d,15,17),sep="."),"%Y-%m-%d %H:%M:%OS")
}
gps2fecha=function(d) {
  strptime(paste(as.character(as.POSIXct(as.numeric(substr(d,1,10)),origin="1970-01-01")),
                 substr(d,11,13),sep="."),"%Y-%m-%d %H:%M:%OS")
}
ss=mutate(dd[,-c(1:4)],fecha=numeros2fecha(X_source.peb_time))
ss=mutate(ss,fechaGPS=gps2fecha(X_source.t_gps))
ss=mutate(ss,fechaPHONE=gps2fecha(X_source.phone_acc_time))
ss=ss[,-which(colnames(ss) %in% c("X_source.peb_time",
                                  "X_source.t_gps",
                                  "X_source.phone_acc_time"))]
idsPHONE=unique(sort(ss$X_source.user_phone))
idsWATCH=unique(sort(ss$X_source.user_peb))
#
setwd("~/git/zheng_14/")
#

Signal segmentation

Let’s look for the segmentation of the signal:

#
# Savitzky-Golay is not only a good method for chemical engineering, it can
# successfully be applied to smooth process data. One approach is to determine
# the noise level in a time series (ACF, winGamma, ...) and then choose the
# parameter fl such that the difference between the time series and its
# Savitzky-Golay approximation reflects the noise level.
# 
# I would be glad to hear about comments and improvements.
# 
# Hans W. Borchers
# ABB Corporate Research
# 
# P. S.:   Example:
# 
#     t  <- sin(2*pi*(1:1000)/200)
#     t1 <- t + rnorm(1000)/10
#     t2 <- sav.gol(t1, 51)
#     plot(1:1000, t1)
#     lines(1:1000, t,  col="blue")
#     lines(1:1000, t2, col="red")
#
# ----------------------------------------------------------------------
#   Savitzky-Golay Algorithm
# ----------------------------------------------------------------------
# T2 <- sav.gol(T, fl, forder=4, dorder=0);
#
# Polynomial filtering method of Savitzky and Golay
# See Numerical Recipes, 1992, Chapter 14.8, for details.
#
# T      = vector of signals to be filtered
#          (the derivative is calculated for each ROW)
# fl     = filter length (for instance fl = 51..151)
# forder = filter order (2 = quadratic filter, 4= quartic)
# dorder = derivative order (0 = smoothing, 1 = first derivative, etc.)
#
sav.gol = function(T, fl, forder=4, dorder=0) {
    m = length(T)
    dorder = dorder + 1

    # -- calculate filter coefficients --
    fc = (fl-1)/2                          # index: window left and right
    X  = outer(-fc:fc, 0:forder, FUN="^")  # polynomial terms and coefficients
    Y  = pinv(X);                          # pseudoinverse

    # -- filter via convolution and take care of the end points --
    T2 = convolve(T, rev(Y[dorder,]), type="o")    # convolve(...)
    T2 = T2[(fc+1):(length(T2)-fc)]
    return(T2)
}
#-----------------------------------------------------------------------
#   *** PseudoInvers of a Matrix ***
#   using singular value decomposition
#
pinv <- function (A) {
    s <- svd(A)
    # D <- diag(s$d); Dinv <- diag(1/s$d)
    # U <- s$u; V <- s$v
    # A = U D V'
    # X = V Dinv U'
    return(s$v %*% diag(1/s$d) %*% t(s$u))
}
FD=function(x) {
  y=as.numeric(x)
  n=length(x)
  d=max(abs(y-y[1]))
  L=sum(abs(diff(y[-1],y[-n])))
  return(log(n,10)/(log(n,10)+log(d/L,10)))
}
chunk=function(x,n){
  return(split(x,factor((0:(length(x) -1))%/%n)))
}
#
# n  = nunber of samples 2^f
# dt = time step
# t  = (0:(n-1)) * dt
# dw = 2*pi/(n*dt)
# w  = (0:(n-1)) * dw
#
# g=fft(x)
# power=abs(g)^2
# dw = 2* pi/(n*dt)
#
fpow=function(d,twop=6,nfea=7) {
  n   =2^twop
  p   =planFFT(n)
  s   =chunk(d,n)
  if (length(s[[length(s)]]) < n) s[[length(s)]] = NULL
  fc  =lapply(s,FFT)
  fc2 =lapply(fc,function(x,m){x[(2+m):(length(x)-m)]=0+0i; return(x)},nfea) 
  res =ldply(.data=fc2,.fun=function(x){return(Mod(x[2:(nfea+1)]))})
  return(res)
}
fover=function(d,twop=6,nfea,step=10) {
  j=seq(1,2^twop,step)
  for (i in 1:length(j)) {
    d0 = d
    if ( j[i] > 1 ) d0 = d[-(1:(j[i]-1))]
    rs=fpow(d0,twop,nfea)
    rs[,1]=paste(j[i],":",rs[,1],sep="")
    if ( exists("res")) {
      res=rbind(res,rs)
    } else {
      res=rs
    }
  }
  rownames(res)=res[,1]
  res=res[,-1]
  return(res)
}
#
G=function(d,wd=3,eps=0.000001){
  d0=rollapply(d,(2*wd+1),WMA,align="center",partial=TRUE)
  return(abs(diff(rollapply(d0,10,FD,eps,align="right",partial=TRUE))))
}
#
fcx=fover(as.numeric(ss$X_source.peb_a_x),5,7,5)
fcy=fover(as.numeric(ss$X_source.peb_a_y),5,7,5)
fcz=fover(as.numeric(ss$X_source.peb_a_z),5,7,5)
colnames(fcx)=paste("X",1:ncol(fcx),sep="")
colnames(fcy)=paste("Y",1:ncol(fcy),sep="")
colnames(fcz)=paste("Z",1:ncol(fcz),sep="")
fcx[,(ncol(fcx)+1)]=rownames(fcx)
fcy[,(ncol(fcy)+1)]=rownames(fcy)
fcz[,(ncol(fcz)+1)]=rownames(fcz)
colnames(fcx)[ncol(fcx)]="IDX"
colnames(fcy)[ncol(fcy)]="IDX"
colnames(fcz)[ncol(fcz)]="IDX"
fc=merge(fcx,fcy,by="IDX")
fc=merge(fc,fcz,by="IDX")
rownames(fc)=fc[,1]
fc=fc[,-1]
#
pcafc=prcomp(fc)
plot(pcafc)

# Plot of PCA
plot(pcafc$x[,1],pcafc$x[,2])
textxy(pcafc$x[,1],pcafc$x[,2],rownames(pcafc$x))

# chunk number 
ckn=chunk(as.numeric(ss$X_source.peb_a_x),2^5)
plot(ckn[[27]]-mean(ckn[[27]]),type="l") # Plots the 1/27 data chunk CENTERED

ckn16=chunk(as.numeric(ss$X_source.peb_a_x[-c(1:(3*5))]),2^5)
plot(ckn16[[27]]-mean(ckn16[[27]]),type="l") # Plots the 16/47 data chunk  CENTERED

#

Foreseen work TBD

Now it’s time to: * Explore the right value for fourier transform (2^4, 2^5, 2^6, …) * Explore the right values for number of features (the lower the better) * To identify the cutting ranges on PC1 for different behaviors * To write the paper.