We look at the diurnal patterns emerging from the number of faces in Instagram images:

# Run from the processed-data folder so that "./"-relative paths resolve against it.
setwd("~/Dropbox/Broadway_processed_data/")

# Table with one `link_author` value per row (presumably one row per picture -- confirm).
broadway.df = read.csv(
  file = "~/Dropbox/Broadway_processed_data/processedData/insta_pix_users_2.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Number of rows per author, largest counts first.
num.pics = sort(table(broadway.df$link_author), decreasing = TRUE)

# Count of the first author (in that ordering) at which the running share of
# all rows exceeds 20%; this count serves as the activity threshold.
num.pics.active.user = num.pics[match(TRUE, cumsum(num.pics) / sum(num.pics) > .2)]

# Everyone starts as "non-active user"; authors whose count is strictly above
# the threshold are relabelled "active user".
broadway.df$user.type = "non-active user"
broadway.df$user.type[
  broadway.df$link_author %in% names(num.pics)[which(num.pics > num.pics.active.user)]
] = "active user"

# Feature table read from a tab-delimited text file with a header row.
qtip.features = read.delim("~/Dropbox/Broadway_paper/data/qtip_meta_v2.no_video.txt", header = TRUE, stringsAsFactors = FALSE)
# Keep only rows that have no missing value in any column.
# NOTE(review): this runs before the columns below are removed, so a row whose
# only NA sits in one of those columns is discarded as well -- confirm intended.
qtip.features = na.omit(qtip.features)
# Drop columns that are not wanted in the merged table (presumably so they
# neither act as join keys nor duplicate columns of broadway.df -- TODO confirm).
qtip.features$date = NULL
qtip.features$id = NULL
qtip.features$updated = NULL
qtip.features$instagram_id = NULL
qtip.features$lat = NULL
qtip.features$lon = NULL

# No `by` is given, so merge() joins on every column name the two data frames
# share and keeps only rows that match in both (its default inner join).
broadway.qtip = merge(broadway.df, qtip.features)

# Face-count table; the relative path resolves against the working directory
# set by setwd() earlier in the script.
num_faces = read.csv("./processedData/num_faces.csv", header = TRUE, stringsAsFactors = FALSE)


# Return the last "/"-separated component of a path string.
#
# Args:
#   x: character path; only its first element is examined.
# Returns:
#   The final path component as a length-one character vector
#   (zero-length if splitting yields no components).
get.filename = function(x) {
  path.parts = strsplit(x, split = "/")[[1]]
  tail(path.parts, n = 1)
}

# Reduce each stored path to its bare filename, then discard the full path.
num_faces$filename = sapply(X = num_faces$file_path, FUN = get.filename)
num_faces$file_path = NULL

# Attach the face counts to the picture/feature rows; with no `by`, merge()
# joins on the column names the two tables share (presumably `filename` -- confirm).
broadway.faces = merge(x = broadway.qtip, y = num_faces)

library(plyr)

# The two join inputs are not referenced again in this section.
rm(broadway.df, broadway.qtip)
# Convert GMT timestamps to the hour of day (0-23) on the New York wall clock.
#
# Args:
#   x: character vector of timestamps in "%Y-%m-%d %H:%M:%S" form, taken to be GMT.
# Returns:
#   Integer vector of hours, one per element of x; NA where a timestamp
#   does not parse.
get.correct.hour = function(x){
  # Parse explicitly as GMT so the result never depends on the session time zone.
  gmt.time = as.POSIXct(strptime(x, format = "%Y-%m-%d %H:%M:%S", tz = "GMT"))
  tz.str = "America/New_York"
  # Read the hour directly from the broken-down local time rather than
  # formatting to a string and re-parsing it; this needs base R only.
  return(as.POSIXlt(gmt.time, tz = tz.str)$hour)
}
library(lubridate)
## 
## Attaching package: 'lubridate'
## 
## The following object is masked from 'package:plyr':
## 
##     here
# Local (New York) hour of day for every row. get.correct.hour is built from
# vectorised date-time functions, so it is applied to the whole column at once
# instead of once per row through sapply(), which was slower, returned a list
# for empty input, and attached each timestamp string as an element name.
broadway.faces$hour = get.correct.hour(broadway.faces$updated)

# Summarise the face counts of one group of rows (used per hour of day).
#
# Args:
#   df: data frame with columns `hour` and `num_faces`.
# Returns:
#   One-row data frame with:
#     hour         - `hour` of the first row of df
#     num.rows     - number of rows in df
#     num.faces    - total of `num_faces`
#     face.present - rows with at least one face
#     solo.face    - rows with exactly one face
#     social.face  - rows with more than one face
#   Rows with a missing `num_faces` are not counted in the last three.
normalized.num.faces = function(df){
  faces = df$num_faces
  data.frame(
    hour = df$hour[1],
    num.rows = nrow(df),
    num.faces = sum(faces),
    face.present = sum(faces > 0, na.rm = TRUE),
    solo.face = sum(faces == 1, na.rm = TRUE),
    social.face = sum(faces > 1, na.rm = TRUE)
  )
}

# Collapse the rows into one summary row per hour of day.
faces.hours = ddply(
  .data = broadway.faces,
  .variables = .(hour),
  .fun = normalized.num.faces
)

library(ggplot2)

# Average number of faces per row for each hour, drawn as one point per hour.
ggplot(
  data = faces.hours,
  mapping = aes(x = as.factor(hour), y = num.faces/num.rows)
) +
  geom_point(size = 4)