Found an efficiant way to clean some of the hourly data, you can click “hide” to ingore the code components and just focus on the results if you want. This is just a sample with the daily conversation duration data. The graph shows how convo unfolds over the course of the day on average for students which is pretty cool in and of itself. The next bit shows how we can creature additional features from the hourly data to incorporate into a predictive model.
library(readr)
library(lme4)
library(lmerTest)
library(ggplot2)
load("C:/Users/dasil/Dropbox/sensing_hour/hrly.RData")
Remove feats we dont need at the moment
all_act_inds<-grep("act", colnames(data_polit_hourly)) #dont remove
still_act_inds<-grep("act_still", colnames(data_polit_hourly)) #dont remove
not_act_inds<-setdiff(all_act_inds,still_act_inds) #remove this
band_inds<-grep("band", colnames(data_polit_hourly))
call_inds<-grep("call", colnames(data_polit_hourly))
sms_inds<-grep("sms", colnames(data_polit_hourly))
home_meta_inds<-grep("_home_", colnames(data_polit_hourly))
study_meta_inds<-grep("_study_", colnames(data_polit_hourly))
social_meta_inds<-grep("_social_", colnames(data_polit_hourly))
light_inds<-grep("light", colnames(data_polit_hourly))
inds_2_remove<-c(not_act_inds,band_inds,call_inds,sms_inds,home_meta_inds,study_meta_inds,social_meta_inds,light_inds)
hrly_fts_removed<-data_polit_hourly[,-c(inds_2_remove)]
bad_feats<-c("is_ios","loc_coed","loc_out_going","party_times","pam","prod","stress","social_level")
hrly_fts_removed<-hrly_fts_removed[ , !colnames(hrly_fts_removed) %in% bad_feats]
Find people with horrible data
var_check_out<-var_check(hrly_fts_removed, within = "uid")
ids2remove<-names(which(sapply(var_check_out,length) > 30))
Get rid of people
clean_dat<-hrly_fts_removed[!hrly_fts_removed$uid %in% ids2remove, ]
0 Pad and re-order data numerically to look at trends across day
audio_convo_hr_inds<-grep("convo_duration_hr_", colnames(clean_dat))
audio_convo_hr<-colnames(clean_dat)[audio_convo_hr_inds]
audio_convo_hr_splt<-strsplit(audio_convo_hr, "_")
audio_convo_hr_splt_HRs<-sapply(audio_convo_hr_splt, tail, 1)
audio_convo_hr_splt_HRs<-as.numeric(as.character(audio_convo_hr_splt_HRs))
audio_convo_hr_padded<-sprintf("audio_convo_duration_hr_%02d", audio_convo_hr_splt_HRs)
colnames(clean_dat)[audio_convo_hr_inds]<-audio_convo_hr_padded
audio_convo_hr_padded_sorted<-sort(audio_convo_hr_padded)
#clean_dat[,c(91:114)]<-clean_dat[,c(audio_convo_hr_padded_sorted)]
audio_convo4feat<-clean_dat[,c(audio_convo_hr_padded_sorted)]
Helper function to find the real NAs
hr_qa<-function(dat_vec,qa_vec){
if (length(dat_vec) != nchar(qa_vec)){
stop("qa vector does not match the length of data vector")
}
for (i in 1:24){
qa<-substr(qa_vec,i,i)
if (qa == 0){
dat_vec[i]<-NA
}
}
return(dat_vec)
}
Graph data by hourly average over participants
conv_hr_cleaned<-list()
for (i in 1:nrow(audio_convo4feat)){
conv_hr_cleaned[[i]]<-hr_qa(audio_convo4feat[i,],clean_dat$quality_hrs_audio[i])
}
conv_hr_cleaned_dat<-do.call("rbind", conv_hr_cleaned)
audio_means<-colMeans(conv_hr_cleaned_dat, na.rm=TRUE)
dat_audio_hr<-data.frame(audio_means, hr = seq(0,23,1), hr2 =seq(0,23,1)^2, hr3 = seq(0,23,1)^3,hr4 = seq(0,23,1)^4 )
What convosersation looks like over the course of a day
ggplot(dat_audio_hr, aes(x = hr, y = audio_means)) + geom_point(pch=19, size=1.5, color = "black") + stat_smooth(span = .9, color = "red", fill = "red", size = 1.2) + theme_bw()+ggtitle("Conversation over a day")+labs(x="HR in Day (0 = midnight)",y="Mean convo duration")+theme(plot.title =element_text(size = 20, color="black", hjust = .5) ,axis.title.x = element_text(size = 18, color="black"),axis.title.y = element_text(size = 18, color="black"), axis.text.x = element_text(size = 14, color="black"), axis.text.y = element_text(size = 14, color="black"))
Here, with the hourly data, I’ve made some new features that we could use later in a predictice model. I fit linear, quadratic, and cubic models to a person’s daily data and we can use those beta estimates as features. Also grabbed daily mins, maxes, and range of this convo duration feature. As you’ll see from the lmer outputs, all these newly created features are significantly related to stress which should help with a predictive model down the road.
stress_audio_slopes<-cbind(conv_hr_cleaned_dat,uid=clean_dat$uid, stress_sm = clean_dat$stress_sm)
out_list<-list()
for (i in 1:nrow(stress_audio_slopes)){
tempy<-as.numeric(stress_audio_slopes[i,1:24])
tempx<-seq(0,23,1)
if (sum(is.na(tempy)) > 7){
out_list[[i]]<-rep(NA,8)
} else {
tempdf<-data.frame(hr = tempx, conv = tempy)
m<-lm(tempy ~ poly(tempx,3,raw=TRUE), data = tempdf)
if (length(unique(tempy)) <=2 ){
sans_0<-tempy
} else {
sans_0<-tempy[tempy!=0]
}
out_list[[i]]<-c(as.numeric(m$coefficients[1:4]),max(tempy, na.rm=TRUE),range(sans_0,na.rm=TRUE)[2]-range(sans_0,na.rm=TRUE)[1],var(tempy,na.rm=TRUE),min(sans_0,na.rm=TRUE))
}
}
out_list_bind<-do.call("rbind", out_list)
colnames(out_list_bind)<-c("b0","b1","b2","b3","max","range","var","min")
audio_binded<-cbind(stress_audio_slopes,out_list_bind)
How the new features relate to daily stress
summary(lmer(stress_sm ~ b0+b1+b2+b3 + (1|uid), data = audio_binded ))
Linear mixed model fit by REML
t-tests use Satterthwaite approximations to degrees of freedom ['lmerMod']
Formula: stress_sm ~ b0 + b1 + b2 + b3 + (1 | uid)
Data: audio_binded
REML criterion at convergence: 21429.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.87192 -0.80044 0.07159 0.75889 2.63602
Random effects:
Groups Name Variance Std.Dev.
uid (Intercept) 2.496 1.580
Residual 15.066 3.882
Number of obs: 3817, groups: uid, 132
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 9.158e+00 1.846e-01 2.237e+02 49.623 < 2e-16 ***
b0 -8.871e-04 2.969e-04 3.756e+03 -2.988 0.00282 **
b1 -1.262e-02 2.606e-03 3.598e+03 -4.843 1.33e-06 ***
b2 -2.089e-01 3.948e-02 3.675e+03 -5.289 1.30e-07 ***
b3 -3.673e+00 7.424e-01 3.780e+03 -4.947 7.86e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr) b0 b1 b2
b0 -0.327
b1 -0.446 0.822
b2 -0.444 0.564 0.912
b3 -0.384 0.406 0.791 0.967
summary(lmer(stress_sm ~ scale(max)+ (1|uid), data = audio_binded ))
Linear mixed model fit by REML
t-tests use Satterthwaite approximations to degrees of freedom ['lmerMod']
Formula: stress_sm ~ scale(max) + (1 | uid)
Data: audio_binded
REML criterion at convergence: 21407.4
Scaled residuals:
Min 1Q Median 3Q Max
-2.84831 -0.79216 0.07359 0.74953 2.63996
Random effects:
Groups Name Variance Std.Dev.
uid (Intercept) 2.474 1.573
Residual 15.102 3.886
Number of obs: 3817, groups: uid, 132
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 8.63860 0.15790 127.38868 54.709 < 2e-16 ***
scale(max) -0.34376 0.07796 3415.78152 -4.409 1.07e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr)
scale(max) 0.021
summary(lmer(stress_sm ~ scale(var)+ (1|uid), data = audio_binded ))
Linear mixed model fit by REML
t-tests use Satterthwaite approximations to degrees of freedom ['lmerMod']
Formula: stress_sm ~ scale(var) + (1 | uid)
Data: audio_binded
REML criterion at convergence: 21393.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.86450 -0.80254 0.07218 0.75657 2.62857
Random effects:
Groups Name Variance Std.Dev.
uid (Intercept) 2.512 1.585
Residual 15.040 3.878
Number of obs: 3817, groups: uid, 132
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 8.62408 0.15883 127.66528 54.297 < 2e-16 ***
scale(var) -0.44581 0.07711 3525.32789 -5.781 8.05e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr)
scale(var) 0.032
summary(lmer(stress_sm ~ scale(range)+ (1|uid), data = audio_binded ))
Linear mixed model fit by REML
t-tests use Satterthwaite approximations to degrees of freedom ['lmerMod']
Formula: stress_sm ~ scale(range) + (1 | uid)
Data: audio_binded
REML criterion at convergence: 21410.7
Scaled residuals:
Min 1Q Median 3Q Max
-2.86017 -0.79064 0.07384 0.75206 2.64501
Random effects:
Groups Name Variance Std.Dev.
uid (Intercept) 2.472 1.572
Residual 15.116 3.888
Number of obs: 3817, groups: uid, 132
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 8.6403 0.1578 127.2661 54.739 < 2e-16 ***
scale(range) -0.3135 0.0781 3406.7973 -4.014 6.11e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr)
scale(rang) 0.020
summary(lmer(stress_sm ~ scale(min)+ (1|uid), data = audio_binded ))
Linear mixed model fit by REML
t-tests use Satterthwaite approximations to degrees of freedom ['lmerMod']
Formula: stress_sm ~ scale(min) + (1 | uid)
Data: audio_binded
REML criterion at convergence: 21422.5
Scaled residuals:
Min 1Q Median 3Q Max
-2.77372 -0.77819 0.06168 0.77599 2.61727
Random effects:
Groups Name Variance Std.Dev.
uid (Intercept) 2.447 1.564
Residual 15.167 3.894
Number of obs: 3817, groups: uid, 132
Fixed effects:
Estimate Std. Error df t value Pr(>|t|)
(Intercept) 8.65213 0.15725 126.44327 55.022 <2e-16 ***
scale(min) -0.13850 0.06409 3752.84680 -2.161 0.0308 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Correlation of Fixed Effects:
(Intr)
scale(min) 0.004