Read in the data, and tidy it
library(tidyr); library(dplyr); library(ggplot2)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
dam <- read.csv("timeseries_DanceRandom_RAW_forRClub_28Apr2015.csv")
dwells.wide <- dam[ , c(1,3:722)]
dim(dwells.wide)
## [1] 40 721
dwells.wide$SubjID <- as.factor(dwells.wide$SubjID)
library(tidyr)
dwells <- dwells.wide %>%
gather(slide, dwell.time, -SubjID) %>%
separate(slide, into=c("x", "slide"), sep=1) %>%
select(-x) # get rid of the extra column
dwells$slide <- as.numeric(dwells$slide) - 17 # make slide number numeric, and subtract 17 so it starts at 1
dwells <- dplyr::arrange(dwells, slide)
# add position (1st, 2nd, 3rd, 4th slide in element)
slides <- data.frame(slide=unique(dwells$slide))
slides$position <- 1:4 # this will repeat for the length of the dataframe
slides$element <- gl(n=nrow(slides)/4, k=4)
head(slides); message("..."); tail(slides)
## slide position element
## 1 1 1 1
## 2 2 2 1
## 3 3 3 1
## 4 4 4 1
## 5 5 1 2
## 6 6 2 2
## ...
## slide position element
## 715 715 3 179
## 716 716 4 179
## 717 717 1 180
## 718 718 2 180
## 719 719 3 180
## 720 720 4 180
# merge slide info back into dataframe
dwells <- merge(dwells, slides)
xtabs(~slide+position, data=dwells)[1:10,]; message("...") # check that it worked
## position
## slide 1 2 3 4
## 1 40 0 0 0
## 2 0 40 0 0
## 3 0 0 40 0
## 4 0 0 0 40
## 5 40 0 0 0
## 6 0 40 0 0
## 7 0 0 40 0
## 8 0 0 0 40
## 9 40 0 0 0
## 10 0 40 0 0
## ...
dwells$position <- factor(dwells$position)
summary(dwells)
## slide SubjID dwell.time position
## Min. : 1.0 10 : 720 Min. : 0.0160 1:7200
## 1st Qu.:180.8 21 : 720 1st Qu.: 0.2350 2:7200
## Median :360.5 26 : 720 Median : 0.3310 3:7200
## Mean :360.5 27 : 720 Mean : 0.5016 4:7200
## 3rd Qu.:540.2 30 : 720 3rd Qu.: 0.5220
## Max. :720.0 32 : 720 Max. :37.1580
## (Other):24480
## element
## 1 : 160
## 2 : 160
## 3 : 160
## 4 : 160
## 5 : 160
## 6 : 160
## (Other):27840
head(dwells); message("..."); tail(dwells)
## slide SubjID dwell.time position element
## 1 1 10 0.186 1 1
## 2 1 21 0.876 1 1
## 3 1 26 1.636 1 1
## 4 1 27 1.457 1 1
## 5 1 30 1.387 1 1
## 6 1 32 0.868 1 1
## ...
## slide SubjID dwell.time position element
## 28795 720 171 0.191 4 180
## 28796 720 174 0.358 4 180
## 28797 720 175 0.180 4 180
## 28798 720 177 0.245 4 180
## 28799 720 182 0.481 4 180
## 28800 720 183 0.408 4 180
Exploratory Data Analysis
ggplot(dwells, aes(y=dwell.time, x=slide, group=SubjID)) +
geom_line(alpha=.2) +
labs(title="Raw Dwell Times \n(lines by participant)") # add a title

# holy outliers, batman!
dwells <- dplyr::filter(dwells, dwell.time < 10) # remove cases where they looked longer than 10 seconds to one slide (!)
ggplot(dwells, aes(y=dwell.time, x=slide, group=SubjID)) +
geom_line(alpha=.2) +
labs(title="Raw Dwell Times \n outliers (>10sec) removed") # add a title

# summarize across participants
dwells.sum <- summarize(group_by(dwells, position, element, slide), dwell.mean=mean(dwell.time), se=sd(dwell.time), ci=se*2)
ggplot(dwells.sum, aes(x=slide, y=dwell.mean)) +
geom_errorbar(aes(ymin=dwell.mean-ci, ymax=dwell.mean+ci), colour="black", width=.1) +
geom_line() +
geom_point() +
ggtitle("Mean dwell time by slide, \nwith +-2 SE error bars")

dwells.sum.pos <- summarize(group_by(dwells, position, SubjID), dwell.mean=mean(dwell.time), se=sd(dwell.time), ci=se*2)
ggplot(dwells.sum.pos, aes(x=position, y=dwell.mean)) +
geom_boxplot() +
ggtitle("Mean dwell time by position, \nwith +-2 SE error bars")

Basic Time Series
# remember, this is with the outliers removed (from EDA above)
# basic time series is design for data from only one soursce (i.e. one participant)
# it gets used a lot in economics, where your one participant might be your company, or a country
example.participant <- base::sample(dwells$SubjID, size=1)
example <- dplyr::filter(dwells, SubjID==example.participant)
ggplot(example, aes(y=dwell.time, x=slide)) +
geom_line() +
labs(title="Example Participant Dwell Times") # add a title

ggplot(example, aes(y=dwell.time, x=position, group=element)) +
geom_line(aes(color=element)) + # color lines by the group factor
labs(title="Example Participant Dwell Times \nby position within element") + # add a title
scale_colour_discrete(name = "Element") # this changes the name of the legend

ggplot(example, aes(y=dwell.time, x=position)) +
geom_boxplot() +
labs(title="Example Participant Dwell Times \nby position within element")

model1 <- lm(dwell.time ~ slide, data=example)
model2 <- lm(dwell.time ~ slide + element, data=example)
anova(model1, model2)
## Analysis of Variance Table
##
## Model 1: dwell.time ~ slide
## Model 2: dwell.time ~ slide + element
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 718 1.9850
## 2 539 1.1399 179 0.84511 2.2325 1.397e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(x = example$dwell.time, main="ACF of example participant dwell times")

# R gives you lag=0 for ACF in addition to the rest. lag=0 is observations with themselves (will always =1)
pacf(x = example$dwell.time, main="Partial ACF of example participant dwell times")

pacf(x = example$dwell.time, plot=F)
##
## Partial autocorrelations of series 'example$dwell.time', by lag
##
## 1 2 3 4 5 6 7 8 9 10
## 0.439 0.416 0.224 0.258 0.197 0.062 0.055 0.151 0.052 0.086
## 11 12 13 14 15 16 17 18 19 20
## 0.036 0.069 0.080 -0.056 -0.023 0.041 -0.038 0.036 -0.019 0.012
## 21 22 23 24 25 26 27 28
## 0.002 0.000 0.000 0.018 0.045 0.031 -0.044 0.040
which.max(pacf(x = example$dwell.time, plot=F)$acf) # the largest lag
## [1] 1
max(pacf(x = example$dwell.time, plot=F)$acf)
## [1] 0.4386834
NSubjs <- length(unique(dwells$SubjID))
combined.lags <- as.data.frame(matrix(nrow=NSubjs, ncol=28)) # storage variable
colnames(combined.lags) <- paste("lag", 1:28, sep="")
for(i in 1:NSubjs){
this.participant <- as.numeric(as.character(unique(dwells$SubjID)))[i]
sample <- dplyr::filter(dwells, SubjID==this.participant)
combined.lags[i, ]<- as.vector(pacf(x = sample$dwell.time, plot=F)$acf)
}
combined.lags.long <- gather(combined.lags)
boxplot(value ~ key, data=combined.lags.long, main="Combined PACF lags \n(calculated within each participant)", ylab="correlation")

# remember, this is with the outliers removed (from EDA above)
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following object is masked from 'package:tidyr':
##
## expand
##
## Loading required package: Rcpp
library(lmerTest)
##
## Attaching package: 'lmerTest'
##
## The following object is masked from 'package:lme4':
##
## lmer
##
## The following object is masked from 'package:stats':
##
## step
model.null <- lmer(dwell.time ~ 1 + (1|SubjID), data=dwells)
summary(model.null)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [merModLmerTest]
## Formula: dwell.time ~ 1 + (1 | SubjID)
## Data: dwells
##
## REML criterion at convergence: 21858.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5664 -0.2930 -0.0614 0.1081 22.2230
##
## Random effects:
## Groups Name Variance Std.Dev.
## SubjID (Intercept) 0.1869 0.4324
## Residual 0.1239 0.3519
## Number of obs: 28797, groups: SubjID, 40
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 0.49934 0.06839 39.00000 7.301 8.3e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ICC
L2var<-as.data.frame(VarCorr(model.null))[1,4]
L1var<-as.data.frame(VarCorr(model.null))[2,4]
icc<-L2var/(L2var+L1var)
icc # lots and lots of variance due to participant
## [1] 0.6014814
# center dwells *for each participant* so between-participant differences are minimized
dwells$dwell.time.c <- ave(dwells$dwell.time, dwells$SubjID, FUN=function(x) scale(x, scale=F))
boxplot(dwell.time ~ SubjID, data=dwells, xlab="SubjID", main="Dwell times by participant")

boxplot(dwell.time.c ~ SubjID, data=dwells, xlab="SubjID", main="Centered dwell times by participant")

model1 <- lmer(dwell.time ~ slide + (slide|SubjID), data=dwells) # note that slide is numeric (not a factor) so the effect of slide is linear
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00225274
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
model2 <- lmer(dwell.time ~ slide + position + (slide|SubjID), data=dwells)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.275416
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
summary(model1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00225274
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
## - Rescale variables?
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [merModLmerTest]
## Formula: dwell.time ~ slide + (slide | SubjID)
## Data: dwells
##
## REML criterion at convergence: 14114.6
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2144 -0.2393 -0.0488 0.0984 25.1704
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## SubjID (Intercept) 3.611e-01 0.6008868
## slide 3.875e-07 0.0006225 -0.83
## Residual 9.400e-02 0.3065877
## Number of obs: 28797, groups: SubjID, 40
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 7.007e-01 9.508e-02 3.910e+01 7.370 6.59e-09 ***
## slide -5.586e-04 9.881e-05 3.914e+01 -5.653 1.55e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## slide -0.833
summary(model2)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.275416
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
## - Rescale variables?
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
## to degrees of freedom [merModLmerTest]
## Formula: dwell.time ~ slide + position + (slide | SubjID)
## Data: dwells
##
## REML criterion at convergence: 14147.1
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.1992 -0.2386 -0.0486 0.0980 25.1582
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## SubjID (Intercept) 1.870e-01 0.4324890
## slide 2.572e-07 0.0005071 -0.74
## Residual 9.411e-02 0.3067707
## Number of obs: 28797, groups: SubjID, 40
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 7.072e-01 6.855e-02 9.400e+01 10.316 < 2e-16 ***
## slide -5.585e-04 8.065e-05 6.600e+01 -6.925 2.14e-09 ***
## position2 -1.110e-02 5.113e-03 2.869e+04 -2.170 0.030 *
## position3 -6.990e-03 5.113e-03 2.869e+04 -1.367 0.172
## position4 -7.959e-03 5.113e-03 2.869e+04 -1.556 0.120
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) slide postn2 postn3
## slide -0.734
## position2 -0.037 0.000
## position3 -0.037 0.000 0.500
## position4 -0.037 -0.001 0.500 0.500
anova(model1, model2)
## refitting model(s) with ML (instead of REML)
## Data: dwells
## Models:
## object: dwell.time ~ slide + (slide | SubjID)
## ..1: dwell.time ~ slide + position + (slide | SubjID)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## object 6 14107 14156 -7047.4 14095
## ..1 9 14108 14182 -7045.0 14090 4.8459 3 0.1834