1 Data notebook



1.1 Data cleaning script

1.1.1 Load + prep mobile dataframe

# Read the round-2 EMA export; text columns stay character (not factor).
EMA2 <- read.csv(
  "~/Downloads/SHINE_Round2EMA_24May2021.csv",
  stringsAsFactors = FALSE
)

# Day-level timestamp: only the month/day/year part of Notification.Time is
# named in the format string, so this column identifies the calendar day.
EMA2[["date"]] <- as.POSIXct(EMA2[["Notification.Time"]], format = "%m/%d/%y")

#test  =EMA2 %>% select(SHINEID,Notification.Time, GPS.Latitude.Start, #GPS.Longitude.Start, GPS.Latitude.Finish, GPS.Longitude.Finish)
##

1.1.2 Create geolocation measure (DV)

# Exploratory per-day location summary. Within each (date, SHINEID) group:
#   sd_lats / sd_lngs   - all start coordinates of the group, comma-separated
#   init_lat / init_lng - the start coordinate of the group's first row
# NOTE(review): `lat` is assigned again further down in this script, so this
# version appears to be superseded before it is used - confirm.
lat <- EMA2 %>%
  group_by(date, SHINEID) %>%
  arrange(date) %>% # so that the initial location comes first
  mutate(
    sd_lats = paste(GPS.Latitude.Start, collapse = ","),
    sd_lngs = paste(GPS.Longitude.Start, collapse = ","),
    init_lat = first(GPS.Latitude.Start),
    init_lng = first(GPS.Longitude.Start)
  )

# Drop prompts without a usable start location: a missing latitude, or a
# latitude of exactly 0 (presumably a "no GPS fix" placeholder - TODO confirm).
# is.na() replaces the comparison against the string 'NA', which only worked
# because filter() discards rows whose condition evaluates to NA.
EMA2 <- EMA2 %>%
  filter(!is.na(GPS.Latitude.Start), GPS.Latitude.Start != 0)
length(unique(EMA2$SHINEID)) # 274; 8 people removed
## [1] 274
# Attach each participant-day's initial location to every row of that day.
# The reference point is the first row of the (date, SHINEID) group that has a
# non-missing latitude; latitude and longitude are both taken from that same
# row, so the pair always describes a single observation. If no row of the
# group has a latitude, both values are NA.
# (The previous `!= "NA"` test evaluated to NA for a missing value, so the
# intended fallback to a later row could never trigger.)
# NOTE(review): arrange(date) orders rows by day only; the within-day order is
# whatever order the rows already had - confirm that it is chronological.
df1 <- EMA2 %>%
  group_by(date, SHINEID) %>%
  arrange(date) %>%
  mutate(
    init_lng = GPS.Longitude.Start[which(!is.na(GPS.Latitude.Start))[1]],
    init_lat = GPS.Latitude.Start[which(!is.na(GPS.Latitude.Start))[1]],
    abc = first(GPS.Latitude.Start) # raw first latitude, kept as a check column
  )

#df1 %<>% select(SHINEID,Notification.Time, GPS.Latitude.Start, GPS.Longitude.Start, init_lng, init_lat)

#lat= lat %>% group_by(init_lat, SHINEID) %>% filter(n() >3)

###if initial NA move to next 
# Haversine distances within each participant-day (date x SHINEID).
lat1 <- df1 %>%
  group_by(date, SHINEID) %>%
  mutate(
    # consecutive distance: each start location vs. the previous row's start
    # location in the same group (not required downstream, per original note)
    dist_consec = distHaversine(
      cbind(GPS.Longitude.Start, GPS.Latitude.Start),
      cbind(lag(GPS.Longitude.Start), lag(GPS.Latitude.Start))
    ),
    # distance of each start location from the group's initial location
    dist_from_init = distHaversine(
      cbind(GPS.Longitude.Start, GPS.Latitude.Start),
      cbind(init_lng, init_lat)
    )
  )

# Daily mobility measure: the largest distance from the initial location
# within each participant-day, repeated on every row of that day.
lat <- mutate(
  group_by(lat1, date, SHINEID),
  sd_dist = max(dist_from_init)
)

# Number of participants in lat1 (the rendered output below shows 274; an
# earlier note here said 279).
length(unique(lat1[["SHINEID"]]))
## [1] 274
#hist(lat$sd_dist)
#hist(log(lat$sd_dist))
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)

# Analysis subset: rows that have both a start latitude and a daily distance.
# is.na() replaces the comparison against the string 'NA', which only worked
# because filter() discards rows whose condition evaluates to NA.
df <- lat %>%
  filter(!is.na(GPS.Latitude.Start), !is.na(sd_dist))

#l = lat1 %>%
#  arrange(sd_dist, date,SHINEID)

# Winsorized copy of the daily distance (psych::winsor, trim = 0.20), stored
# next to the raw value so the two distributions can be compared below.
# NOTE(review): sd_dist_w is added to `df` only; later sections work on `lat`.
df[["sd_dist_w"]] <- psych::winsor(df[["sd_dist"]], trim = 0.20, na.rm = TRUE)

# Raw daily distance
psych::describe(df[["sd_dist"]])
##    vars     n     mean       sd median trimmed   mad min      max    range
## X1    1 25124 24730.56 247571.2  33.73  1550.6 44.29   0 12791036 12791036
##     skew kurtosis      se
## X1 28.52  1103.69 1561.91
# Winsorized daily distance
psych::describe(df[["sd_dist_w"]])
##    vars     n    mean      sd median trimmed   mad  min     max   range skew
## X1    1 25124 1221.11 1891.87  33.73  942.94 35.83 9.57 4658.45 4648.89 1.13
##    kurtosis    se
## X1    -0.57 11.94

1.1.3 Create news reading variable (IV)

# Full prompt timestamp (date plus hour:minute) parsed from Notification.Time.
lat[["datetime"]] <- as.POSIXct(lat[["Notification.Time"]], format = "%m/%d/%y %H:%M")
# Sort rows by participant and, within participant, by prompt timestamp.
lat <- lat[with(lat, order(as.factor(SHINEID), datetime)), ]

# Create the day-order variable: one row per (SHINEID, date), numbered within
# participant in the order the days appear in `lat`.
# select_() has been deprecated since dplyr 0.7.0; distinct() on the two key
# columns replaces the select_() + unique() pair and emits no warning.
day <- lat %>%
  ungroup() %>%
  distinct(SHINEID, date) %>%
  group_by(SHINEID) %>%
  dplyr::mutate(order_day = row_number()) %>%
  ungroup()
# Number each participant's prompts in their current row order (order_ema),
# then attach the day-order variable so that surveys from the same day can be
# grouped together. The join keys are spelled out so the join cannot silently
# change if `lat` and `day` ever come to share another column name.
lat <- lat %>%
  group_by(SHINEID) %>%
  dplyr::mutate(order_ema = row_number()) %>% # ema order variable
  ungroup() %>%
  left_join(day, by = c("SHINEID", "date")) %>% # day order variable
  group_by(SHINEID)
# Daily COVID-news variable: start from a copy of CovidNews, then within each
# participant-day (SHINEID x order_day) fill missing values downward from the
# most recent earlier non-missing row of that day.
# NOTE(review): the original note (written about a different variable) says
# the item was measured once in the morning and copied to the evening survey;
# confirm that this also holds for CovidNews.
lat <- lat %>%
  dplyr::mutate(CovidNews_daily = CovidNews) %>%
  group_by(SHINEID, order_day) %>%
  tidyr::fill(CovidNews_daily, .direction = "down")

# Create a lagged variable: within each participant, the CovidNews_daily value
# of the previous row.
# lag() is namespaced like the other verbs in this chain; an unqualified lag()
# can resolve to stats::lag(), which does not shift the values.
# NOTE(review): the lag is one row (one prompt), not one day. CovidNews_daily
# is filled downward within a day earlier in this script, so later prompts of
# a day receive a same-day value - confirm that this is intended.
lat <- lat %>%
  dplyr::group_by(SHINEID) %>%
  dplyr::mutate(CovidNews_daily_lag = dplyr::lag(CovidNews_daily)) %>%
  dplyr::ungroup()

#check =lat%>%
#  dplyr::select(SHINEID,Notification.Time, Session.Name, order_ema, CovidNews_daily_lag, CovidNews_daily)
## not sure whether this will work

1.2 Descriptives

1.2.1 Stats

psych::describe(lat$CovidNews_daily)
##    vars     n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 23748 0.79 0.97      1    0.64 1.48   0   7     7 1.33     1.82 0.01
# Per-participant means of the mobility measure and of COVID-news reading,
# repeated on every row of that participant.
# (Previously the mean() call was missing: the columns were plain copies of
# sd_dist / CovidNews and `na.rm = T` created a column literally named na.rm.)
plot_d <- lat %>%
  group_by(SHINEID) %>%
  mutate(
    mean_mob = mean(sd_dist, na.rm = TRUE),
    mean_covid = mean(CovidNews, na.rm = TRUE)
  )

#plot_d =plot_d %>%
#  ungroup() %>%
 # summarise(mean_mob = mean(mean_mob, na.rm =T),
  #          mean_covid = mean(mean_covid, na.rm =T),
   #         sd_mob = mean(mean_mob, na.rm =T),
    #        sd_covid = mean(mean_covid, na.rm =T)
     #       )

#plot = lat  %>%
#select(CovidNews,sd_dist, SHINEID)
  
#table_one <- tableby(SHINEID ~ ., data = plot) 
#summary(table_one, title = "Gapminder Data")

1.2.2 One person news reading over 28 days

#get one person's mean news reading to use in plot (190)


# psych::describe(lat$CovidNews_daily)
#    vars     n mean   sd median trimmed  mad min max range skew kurtosis   se
# X1    1 23748 0.79 0.97      1    0.64 1.48   0   7     7 1.33     1.82 0.01

# Intensive repeated-measures data simultaneously contain between-person and
# within-person information. Plot one example participant's daily COVID-news
# reading by study day, with that participant's own mean as a dashed line.
example_id <- "muri053"
example_dat <- lat[which(lat$SHINEID == example_id), ]

# NOTE(review): `ID` in aes(group = ID) is not created anywhere in the visible
# script - confirm it exists in the data (or was meant to be SHINEID).
p <- ggplot(data = example_dat, aes(x = order_day, group = ID)) +
  guides(color = "none") + # to suppress guide
  geom_line(aes(y = CovidNews_daily), color = "green") +
  scale_x_continuous(breaks = 1:28, name = "Day") + # one tick per study day
  ylim(0, 3) +
  geom_hline(
    yintercept = mean(example_dat$CovidNews_daily, na.rm = TRUE),
    linetype = "dashed",
    color = "blue",
    size = 1
  ) +
  geom_point(aes(y = CovidNews_daily), color = "green") +
  ylab("Intensity") +
  annotate("text", x = 28, y = 3, label = "example ppt") +
  theme_bw()
p
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_point).

1.2.3 Data collection time period

# Data-collection period: for every participant keep one prompt from their
# first study day (order_day == 1), then count participants per start date.
check_day1 <- lat %>%
  filter(order_day == 1) %>%
  arrange(Notification.Time) %>%
  group_by(SHINEID) %>%
  filter(row_number() == 1) %>%
  ungroup() %>%
  select(SHINEID, date, Notification.Time) %>%
  count(date)

# Number of participants starting on each date, drawn as a single line.
ggplot(check_day1, aes(x = date, y = n)) +
  geom_line(size = 1, group = 1) +
  ylab("# of individuals") +
  theme_bw()