Dance and Music Study

Read in the data, and tidy it

library(tidyr); library(dplyr); library(ggplot2)
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dam <- read.csv("timeseries_DanceRandom_RAW_forRClub_28Apr2015.csv")
dwells.wide <- dam[ , c(1,3:722)]
dim(dwells.wide)
## [1]  40 721
dwells.wide$SubjID <- as.factor(dwells.wide$SubjID)

library(tidyr)
dwells <- dwells.wide %>%
  gather(slide, dwell.time, -SubjID) %>%
  separate(slide, into=c("x", "slide"), sep=1) %>%
  select(-x) # get rid of the extra column

dwells$slide <- as.numeric(dwells$slide) - 17 # make slide number numeric, and subtract 17 so it starts at 1
dwells <- dplyr::arrange(dwells, slide)

# add position (1st, 2nd, 3rd, 4th slide in element)
slides <- data.frame(slide=unique(dwells$slide))
slides$position <- 1:4 # this will repeat for the length of the dataframe
slides$element <- gl(n=nrow(slides)/4, k=4)
head(slides); message("..."); tail(slides)
##   slide position element
## 1     1        1       1
## 2     2        2       1
## 3     3        3       1
## 4     4        4       1
## 5     5        1       2
## 6     6        2       2
## ...
##     slide position element
## 715   715        3     179
## 716   716        4     179
## 717   717        1     180
## 718   718        2     180
## 719   719        3     180
## 720   720        4     180
# merge slide info back into dataframe
dwells <- merge(dwells, slides)
xtabs(~slide+position, data=dwells)[1:10,]; message("...") # check that it worked
##      position
## slide  1  2  3  4
##    1  40  0  0  0
##    2   0 40  0  0
##    3   0  0 40  0
##    4   0  0  0 40
##    5  40  0  0  0
##    6   0 40  0  0
##    7   0  0 40  0
##    8   0  0  0 40
##    9  40  0  0  0
##    10  0 40  0  0
## ...
dwells$position <- factor(dwells$position)

summary(dwells)
##      slide           SubjID        dwell.time      position
##  Min.   :  1.0   10     :  720   Min.   : 0.0160   1:7200  
##  1st Qu.:180.8   21     :  720   1st Qu.: 0.2350   2:7200  
##  Median :360.5   26     :  720   Median : 0.3310   3:7200  
##  Mean   :360.5   27     :  720   Mean   : 0.5016   4:7200  
##  3rd Qu.:540.2   30     :  720   3rd Qu.: 0.5220           
##  Max.   :720.0   32     :  720   Max.   :37.1580           
##                  (Other):24480                             
##     element     
##  1      :  160  
##  2      :  160  
##  3      :  160  
##  4      :  160  
##  5      :  160  
##  6      :  160  
##  (Other):27840
head(dwells); message("..."); tail(dwells)
##   slide SubjID dwell.time position element
## 1     1     10      0.186        1       1
## 2     1     21      0.876        1       1
## 3     1     26      1.636        1       1
## 4     1     27      1.457        1       1
## 5     1     30      1.387        1       1
## 6     1     32      0.868        1       1
## ...
##       slide SubjID dwell.time position element
## 28795   720    171      0.191        4     180
## 28796   720    174      0.358        4     180
## 28797   720    175      0.180        4     180
## 28798   720    177      0.245        4     180
## 28799   720    182      0.481        4     180
## 28800   720    183      0.408        4     180

Exploratory Data Analysis

ggplot(dwells, aes(y=dwell.time, x=slide, group=SubjID)) +
  geom_line(alpha=.2) + 
  labs(title="Raw Dwell Times \n(lines by participant)") # add a title

# holy outliers, batman!
dwells <- dplyr::filter(dwells, dwell.time < 10) # remove cases where they looked longer than 10 seconds to one slide (!)

ggplot(dwells, aes(y=dwell.time, x=slide, group=SubjID)) +
  geom_line(alpha=.2) + 
  labs(title="Raw Dwell Times \n outliers (>10sec) removed") # add a title

# summarize across participants
dwells.sum <- summarize(group_by(dwells, position, element, slide), dwell.mean=mean(dwell.time), se=sd(dwell.time), ci=se*2)

ggplot(dwells.sum, aes(x=slide, y=dwell.mean)) + 
    geom_errorbar(aes(ymin=dwell.mean-ci, ymax=dwell.mean+ci), colour="black", width=.1) +
    geom_line() +
    geom_point() +
    ggtitle("Mean dwell time by slide, \nwith +-2 SE error bars")

dwells.sum.pos <- summarize(group_by(dwells, position, SubjID), dwell.mean=mean(dwell.time), se=sd(dwell.time), ci=se*2)

ggplot(dwells.sum.pos, aes(x=position, y=dwell.mean)) + 
    geom_boxplot() +
    ggtitle("Mean dwell time by position, \nwith +-2 SE error bars")

Basic Time Series

# remember, this is with the outliers removed (from EDA above)

# basic time series is design for data from only one soursce (i.e. one participant)
# it gets used a lot in economics, where your one participant might be your company, or a country
example.participant <- base::sample(dwells$SubjID, size=1)
example <- dplyr::filter(dwells, SubjID==example.participant)

ggplot(example, aes(y=dwell.time, x=slide)) +
  geom_line() + 
  labs(title="Example Participant Dwell Times") # add a title

ggplot(example, aes(y=dwell.time, x=position, group=element)) + 
  geom_line(aes(color=element)) + # color lines by the group factor
  labs(title="Example Participant Dwell Times \nby position within element") +  # add a title
  scale_colour_discrete(name = "Element") # this changes the name of the legend

ggplot(example, aes(y=dwell.time, x=position)) + 
  geom_boxplot() +
  labs(title="Example Participant Dwell Times \nby position within element") 

model1 <- lm(dwell.time ~ slide, data=example)
model2 <- lm(dwell.time ~ slide + element, data=example) 
anova(model1, model2) 
## Analysis of Variance Table
## 
## Model 1: dwell.time ~ slide
## Model 2: dwell.time ~ slide + element
##   Res.Df    RSS  Df Sum of Sq      F    Pr(>F)    
## 1    718 1.9850                                   
## 2    539 1.1399 179   0.84511 2.2325 1.397e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
acf(x = example$dwell.time, main="ACF of example participant dwell times")

# R gives you lag=0 for ACF in addition to the rest. lag=0 is observations with themselves (will always =1)
pacf(x = example$dwell.time, main="Partial ACF of example participant dwell times")

pacf(x = example$dwell.time, plot=F)
## 
## Partial autocorrelations of series 'example$dwell.time', by lag
## 
##      1      2      3      4      5      6      7      8      9     10 
##  0.439  0.416  0.224  0.258  0.197  0.062  0.055  0.151  0.052  0.086 
##     11     12     13     14     15     16     17     18     19     20 
##  0.036  0.069  0.080 -0.056 -0.023  0.041 -0.038  0.036 -0.019  0.012 
##     21     22     23     24     25     26     27     28 
##  0.002  0.000  0.000  0.018  0.045  0.031 -0.044  0.040
which.max(pacf(x = example$dwell.time, plot=F)$acf) # the largest lag 
## [1] 1
max(pacf(x = example$dwell.time, plot=F)$acf)
## [1] 0.4386834
NSubjs <- length(unique(dwells$SubjID))
combined.lags <- as.data.frame(matrix(nrow=NSubjs, ncol=28)) # storage variable
colnames(combined.lags) <- paste("lag", 1:28, sep="")

for(i in 1:NSubjs){
  this.participant <- as.numeric(as.character(unique(dwells$SubjID)))[i]
  sample <- dplyr::filter(dwells, SubjID==this.participant)

  combined.lags[i, ]<- as.vector(pacf(x = sample$dwell.time, plot=F)$acf) 
}

combined.lags.long <- gather(combined.lags)
boxplot(value ~ key, data=combined.lags.long, main="Combined PACF lags \n(calculated within each participant)", ylab="correlation")

# remember, this is with the outliers removed (from EDA above)

library(lme4)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## Loading required package: Rcpp
library(lmerTest)
## 
## Attaching package: 'lmerTest'
## 
## The following object is masked from 'package:lme4':
## 
##     lmer
## 
## The following object is masked from 'package:stats':
## 
##     step
model.null <- lmer(dwell.time ~ 1 + (1|SubjID), data=dwells)
summary(model.null)
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [merModLmerTest]
## Formula: dwell.time ~ 1 + (1 | SubjID)
##    Data: dwells
## 
## REML criterion at convergence: 21858.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5664 -0.2930 -0.0614  0.1081 22.2230 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  SubjID   (Intercept) 0.1869   0.4324  
##  Residual             0.1239   0.3519  
## Number of obs: 28797, groups:  SubjID, 40
## 
## Fixed effects:
##             Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)  0.49934    0.06839 39.00000   7.301  8.3e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# ICC
L2var<-as.data.frame(VarCorr(model.null))[1,4]
L1var<-as.data.frame(VarCorr(model.null))[2,4]
icc<-L2var/(L2var+L1var)
icc # lots and lots of variance due to participant
## [1] 0.6014814
# center dwells *for each participant* so between-participant differences are minimized
dwells$dwell.time.c <- ave(dwells$dwell.time, dwells$SubjID, FUN=function(x) scale(x, scale=F))

boxplot(dwell.time ~ SubjID, data=dwells, xlab="SubjID", main="Dwell times by participant")

boxplot(dwell.time.c ~ SubjID, data=dwells, xlab="SubjID",  main="Centered dwell times by participant")

model1 <- lmer(dwell.time ~ slide + (slide|SubjID), data=dwells) # note that slide is numeric (not a factor) so the effect of slide is linear
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00225274
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
model2 <- lmer(dwell.time ~ slide + position + (slide|SubjID), data=dwells)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.275416
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
summary(model1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.00225274
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?;Model is nearly unidentifiable: large eigenvalue ratio
##  - Rescale variables?
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [merModLmerTest]
## Formula: dwell.time ~ slide + (slide | SubjID)
##    Data: dwells
## 
## REML criterion at convergence: 14114.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.2144 -0.2393 -0.0488  0.0984 25.1704 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  SubjID   (Intercept) 3.611e-01 0.6008868      
##           slide       3.875e-07 0.0006225 -0.83
##  Residual             9.400e-02 0.3065877      
## Number of obs: 28797, groups:  SubjID, 40
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  7.007e-01  9.508e-02  3.910e+01   7.370 6.59e-09 ***
## slide       -5.586e-04  9.881e-05  3.914e+01  -5.653 1.55e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr)
## slide -0.833
summary(model2)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl =
## control$checkConv, : Model failed to converge with max|grad| = 0.275416
## (tol = 0.002, component 1)
## Warning in checkConv(attr(opt, "derivs"), opt$par, ctrl = control$checkConv, : Model is nearly unidentifiable: very large eigenvalue
##  - Rescale variables?
## Linear mixed model fit by REML t-tests use Satterthwaite approximations
##   to degrees of freedom [merModLmerTest]
## Formula: dwell.time ~ slide + position + (slide | SubjID)
##    Data: dwells
## 
## REML criterion at convergence: 14147.1
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.1992 -0.2386 -0.0486  0.0980 25.1582 
## 
## Random effects:
##  Groups   Name        Variance  Std.Dev.  Corr 
##  SubjID   (Intercept) 1.870e-01 0.4324890      
##           slide       2.572e-07 0.0005071 -0.74
##  Residual             9.411e-02 0.3067707      
## Number of obs: 28797, groups:  SubjID, 40
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  7.072e-01  6.855e-02  9.400e+01  10.316  < 2e-16 ***
## slide       -5.585e-04  8.065e-05  6.600e+01  -6.925 2.14e-09 ***
## position2   -1.110e-02  5.113e-03  2.869e+04  -2.170    0.030 *  
## position3   -6.990e-03  5.113e-03  2.869e+04  -1.367    0.172    
## position4   -7.959e-03  5.113e-03  2.869e+04  -1.556    0.120    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##           (Intr) slide  postn2 postn3
## slide     -0.734                     
## position2 -0.037  0.000              
## position3 -0.037  0.000  0.500       
## position4 -0.037 -0.001  0.500  0.500
anova(model1, model2) 
## refitting model(s) with ML (instead of REML)
## Data: dwells
## Models:
## object: dwell.time ~ slide + (slide | SubjID)
## ..1: dwell.time ~ slide + position + (slide | SubjID)
##        Df   AIC   BIC  logLik deviance  Chisq Chi Df Pr(>Chisq)
## object  6 14107 14156 -7047.4    14095                         
## ..1     9 14108 14182 -7045.0    14090 4.8459      3     0.1834

Mary Had a Little Lamb