1 Data notebook

1.1 Data cleaning script

1.1.1 Load + prep mobile dataframe

# Read the raw Round-2 EMA export; text columns stay character, not factor.
EMA2 <- read.csv(
  file = "~/Downloads/SHINE_Round2EMA_24May2021.csv",
  stringsAsFactors = FALSE
)

# Day of each prompt. The format string covers only month/day/year, so any
# time-of-day text that follows in Notification.Time is not parsed here.
EMA2[["date"]] <- as.POSIXct(EMA2$Notification.Time, format = "%m/%d/%y")

#test  =EMA2 %>% select(SHINEID,Notification.Time, GPS.Latitude.Start, #GPS.Longitude.Start, GPS.Latitude.Finish, GPS.Longitude.Finish)
##

1.1.2 Create geolocation measure (DV)

# Per participant-day location summary, computed on the unfiltered prompts.
# NOTE(review): `lat` is reassigned from `lat1` further down, so this first
# version appears to be exploratory only.
lat <- EMA2 %>%
  # one group per calendar day and participant
  group_by(date, SHINEID) %>%
  # sorts on the day only; `date` carries no time of day, so this key does not
  # by itself decide the order of prompts within a day
  arrange(date) %>%
  mutate(
    # every start coordinate recorded that day, comma-separated
    sd_lats = paste(GPS.Latitude.Start, collapse = ","),
    sd_lngs = paste(GPS.Longitude.Start, collapse = ","),
    # coordinates of the first row of the participant-day
    init_lat = first(GPS.Latitude.Start),
    init_lng = first(GPS.Longitude.Start)
  )

# Keep only prompts with a usable latitude: not missing and not exactly 0.
# `!is.na()` replaces the comparison against the string 'NA', which removed
# missing rows only because filter() discards rows whose condition is NA.
EMA2 <- EMA2 %>%
  filter(!is.na(GPS.Latitude.Start), GPS.Latitude.Start != 0)

# Participants remaining after the GPS filter
# (274 in the recorded run; 8 people removed).
length(unique(EMA2$SHINEID))

## [1] 274

# Value of `x` at the first row where both coordinates are present.
# Returns NA (of x's type) when no row has a complete fix.
first_complete <- function(x, lat, lng) {
  x[which(!is.na(lat) & !is.na(lng))[1]]
}

# Attach each participant-day's initial location to every prompt of that day.
#
# The previous version tested `first(GPS.Latitude.Start) != "NA"` inside
# ifelse(); when the first latitude is missing that test is NA, so ifelse()
# returned NA and the intended "move to the next fix" fallback never ran.
# Longitude had no fallback at all. Both coordinates are now taken from the
# same row: the first one of the day with a complete latitude/longitude pair.
df1 <- EMA2 %>%
  group_by(date, SHINEID) %>%
  # sorts on the day only; order within a day is not decided by this key
  arrange(date) %>%
  mutate(
    init_lng = first_complete(
      GPS.Longitude.Start, GPS.Latitude.Start, GPS.Longitude.Start
    ),
    init_lat = first_complete(
      GPS.Latitude.Start, GPS.Latitude.Start, GPS.Longitude.Start
    ),
    # raw first latitude of the day; kept so existing columns are unchanged
    abc = first(GPS.Latitude.Start)
  )

#df1 %<>% select(SHINEID,Notification.Time, GPS.Latitude.Start, GPS.Longitude.Start, init_lng, init_lat)

#lat= lat %>% group_by(init_lat, SHINEID) %>% filter(n() >3)

###if initial NA move to next 
# Haversine distances within each participant-day.
# Coordinates are passed to distHaversine() as (longitude, latitude) pairs.
# NOTE(review): distances are presumably in metres (distHaversine's default
# radius) - confirm against the geosphere documentation.
lat1 <- df1 %>%
  group_by(date, SHINEID) %>%
  mutate(
    # distance from the previous prompt of the same participant-day;
    # NA for the first prompt because lag() has no predecessor
    dist_consec = distHaversine(
      cbind(GPS.Longitude.Start, GPS.Latitude.Start),
      cbind(lag(GPS.Longitude.Start), lag(GPS.Latitude.Start))
    ),
    # distance from the day's initial location
    dist_from_init = distHaversine(
      cbind(GPS.Longitude.Start, GPS.Latitude.Start),
      cbind(init_lng, init_lat)
    )
  )

# Daily mobility score: the largest distance from the initial location reached
# on that participant-day, repeated on every prompt of the day.
# max() is called without na.rm, so one NA distance makes the whole day NA.
lat <- mutate(
  group_by(lat1, date, SHINEID),
  sd_dist = max(dist_from_init)
)

# Number of distinct participants in lat1 (the recorded run printed 274)
length(unique(lat1[["SHINEID"]]))

## [1] 274

#hist(lat$sd_dist)
#hist(log(lat$sd_dist))
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)

# Analysis sample: prompts that have both a latitude and a daily distance score.
# `!is.na()` replaces comparisons against the string 'NA', which dropped
# missing rows only as a side effect of filter() discarding NA conditions.
# Unlike the string comparison, is.na() also excludes NaN scores.
df <- lat %>%
  filter(!is.na(GPS.Latitude.Start), !is.na(sd_dist))

#l = lat1 %>%
#  arrange(sd_dist, date,SHINEID)

# Winsorized copy of the daily distance score (psych::winsor, trim = 0.20),
# stored next to the raw score.
df[["sd_dist_w"]] <- psych::winsor(df[["sd_dist"]], trim = 0.20, na.rm = TRUE)

# Descriptives of the raw score
psych::describe(df[["sd_dist"]])

##    vars     n     mean       sd median trimmed   mad min      max    range
## X1    1 25124 24730.56 247571.2  33.73  1550.6 44.29   0 12791036 12791036
##     skew kurtosis      se
## X1 28.52  1103.69 1561.91

# Descriptives of the winsorized score, for comparison with the raw one
psych::describe(df[["sd_dist_w"]])

##    vars     n    mean      sd median trimmed   mad  min     max   range skew
## X1    1 25124 1221.11 1891.87  33.73  942.94 35.83 9.57 4658.45 4648.89 1.13
##    kurtosis    se
## X1    -0.57 11.94

1.1.3 Create news reading variable (IV)

# Full prompt timestamp (date plus hour:minute) parsed from Notification.Time.
lat[["datetime"]] <- as.POSIXct(
  lat$Notification.Time,
  format = "%m/%d/%y %H:%M"
)

# Sort rows by participant, then by timestamp within participant.
row_order <- order(as.factor(lat$SHINEID), lat$datetime)
lat <- lat[row_order, ]

# Day-order lookup: one row per participant and calendar day, numbered
# 1, 2, ... within participant in the order the days appear in `lat`.
# This relies on `lat` already being sorted by participant and timestamp.
# select() + distinct() replace select_(), deprecated since dplyr 0.7.0.
day <- lat %>%
  ungroup() %>%
  dplyr::select(SHINEID, date) %>%
  dplyr::distinct() %>%
  group_by(SHINEID) %>%
  dplyr::mutate(order_day = row_number()) %>%
  ungroup()

## Warning: `select_()` was deprecated in dplyr 0.7.0.
## Please use `select()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

# Number each participant's prompts in row order (order_ema), then attach the
# day index so prompts from the same calendar day can be grouped together.
# The join keys are spelled out instead of being inferred from shared column
# names, so an extra shared column cannot silently change the join.
lat <- lat %>%
  group_by(SHINEID) %>%
  dplyr::mutate(order_ema = row_number()) %>%
  ungroup() %>%
  left_join(day, by = c("SHINEID", "date")) %>%
  # the result stays grouped by participant, as before
  group_by(SHINEID)

## Joining, by = c("SHINEID", "date")

# Daily news-reading variable: start from the raw CovidNews responses, then
# carry the last non-missing response forward within each participant-day so
# later prompts of the same day inherit it.
# NOTE(review): presumably CovidNews is asked once per day - confirm against
# the survey design.
lat[["CovidNews_daily"]] <- lat[["CovidNews"]]

lat <- lat %>%
  group_by(SHINEID, order_day) %>%
  tidyr::fill(CovidNews_daily, .direction = "down")

# Lagged news reading: the previous prompt's CovidNews_daily for the same
# participant, so that earlier news reading can predict the current outcome.
# lag() is namespaced so stats::lag() can never be picked up by mistake.
# NOTE(review): the lag is one row (one prompt), not one day. Because
# CovidNews_daily is filled down within a day, a later prompt's lag can be the
# same day's value - confirm that this is the intended predictor.
lat <- lat %>%
  dplyr::group_by(SHINEID) %>%
  dplyr::mutate(CovidNews_daily_lag = dplyr::lag(CovidNews_daily)) %>%
  ungroup()

#check =lat%>%
#  dplyr::select(SHINEID,Notification.Time, Session.Name, order_ema, CovidNews_daily_lag, CovidNews_daily)
## NOTE: not sure whether this check will work

1.2 Descriptives

1.2.1 Stats

# Descriptives of the daily news-reading variable across all prompts
psych::describe(lat[["CovidNews_daily"]])

##    vars     n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 23748 0.79 0.97      1    0.64 1.48   0   7     7 1.33     1.82 0.01

# Per-participant means of mobility and news reading, repeated on every row of
# that participant.
# The previous version passed `na.rm = T` as an argument to mutate() rather
# than to mean(): it copied sd_dist / CovidNews unchanged and added a stray
# column called `na.rm`. The means are now actually computed.
# The result stays grouped by SHINEID, as before.
plot_d <- lat %>%
  group_by(SHINEID) %>%
  mutate(
    mean_mob = mean(sd_dist, na.rm = TRUE),
    mean_covid = mean(CovidNews, na.rm = TRUE)
  )

#plot_d =plot_d %>%
#  ungroup() %>%
 # summarise(mean_mob = mean(mean_mob, na.rm =T),
  #          mean_covid = mean(mean_covid, na.rm =T),
   #         sd_mob = mean(mean_mob, na.rm =T),
    #        sd_covid = mean(mean_covid, na.rm =T)
     #       )

#plot = lat  %>%
#select(CovidNews,sd_dist, SHINEID)
  
#table_one <- tableby(SHINEID ~ ., data = plot) 
#summary(table_one, title = "Gapminder Data")

1.2.2 One person news reading over 28 days

#get one person's news-reading mean to use in plot (190)


# psych::describe(lat$CovidNews_daily)
#    vars     n mean   sd median trimmed  mad min max range skew kurtosis   se
# X1    1 23748 0.79 0.97      1    0.64 1.48   0   7     7 1.33     1.82 0.01

# Intensive repeated-measures data contain both between-person and
# within-person information; this plot shows one participant's daily news
# reading over the study days together with that participant's own mean.
example_id <- "muri053"
example_data <- lat[which(lat$SHINEID == example_id), ]

p <- ggplot(
  data = example_data,
  # NOTE(review): `ID` is not created anywhere in this script; presumably it is
  # a column of the raw export - confirm.
  aes(x = order_day, group = ID)
) +
  guides(color = "none") + # suppress the legend
  geom_line(aes(y = CovidNews_daily), color = "green") +
  # one tick per study day (the old vector listed 28 twice)
  scale_x_continuous(breaks = 1:28, name = "Day") +
  # responses outside 0..3 are dropped from the plot by these limits
  ylim(0, 3) +
  # dashed line marks this participant's mean
  geom_hline(
    yintercept = mean(example_data$CovidNews_daily, na.rm = TRUE),
    linetype = "dashed",
    color = "blue",
    size = 1
  ) +
  geom_point(aes(y = CovidNews_daily), color = "green") +
  ylab("Intensity") +
  annotate("text", x = 28, y = 3, label = "example ppt") +
  theme_bw()
p

## Warning: Removed 1 row(s) containing missing values (geom_path).

## Warning: Removed 5 rows containing missing values (geom_point).

1.2.3 Data collection time period

# Number of participants whose first study day falls on each calendar date.
# Rows are ordered by the parsed `datetime`. Notification.Time is a character
# string, so sorting on it was lexicographic rather than chronological.
check_day1 <- lat %>%
  group_by(SHINEID) %>%
  filter(order_day == 1) %>%
  arrange(datetime) %>%
  # keep each participant's earliest prompt of day 1
  filter(row_number() == 1) %>%
  select(SHINEID, date, Notification.Time) %>%
  ungroup() %>%
  count(date)

# Line chart of how many participants started on each calendar date.
ggplot(data = check_day1, mapping = aes(x = date, y = n)) +
  geom_line(size = 1, group = 1) +
  ylab("# of individuals") +
  theme_bw()

Smartphones & the Twitterverse

Mia Jovanova, Jingze Wang, Prashant Garg

2022-06-21

1 Data notebook

1.1 Data cleaning script

1.1.1 Load + prep mobile dataframe

1.1.2 Create geolocation measure (DV)

1.1.3 Create news reading variable (IV)

1.2 Descriptives

1.2.1 Stats

1.2.2 One person news reading over 28 days

1.2.3 Data collection time period