Overview

Finding active users

Finding frequent users

Overview

We will be analyzing social media data from Broadway Ave. First we use Instagram images that we collected along Broadway Ave.

# Load the processed Instagram picture/user table. Text columns are kept as
# character vectors rather than being converted to factors.
broadway.df <- read.csv(
  file = "~/Dropbox/Broadway_processed_data/processedData/insta_pix_users_2.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

Here is a quick overview of what the data set looks like:

# Show the first six rows (the default for head()) to illustrate the columns.
head(broadway.df, n = 6L)
##                                 filename
## 1 00004eeac48b11e39a2b24be059d8430_8.jpg
## 2 0000cf3eb76911e3aea30e0a45d706d2_8.jpg
## 3 00018ed2c4ee11e39bed0002c9dc919e_8.jpg
## 4 0001bc6ea72e11e3ab690ea951e5ff49_8.jpg
## 5 00021dd2ac8611e3907c1296586a1d24_8.jpg
## 6 000287a4b4a611e387960ee8fbc8973c_8.jpg
##                    instagram_id      lon     lat
## 1 699095580607820497_1007762859 -74.0061 40.7143
## 2    686982674762715590_7956299 -73.9863 40.7565
## 3    699454333821007139_5755140 -73.9778 40.7868
## 4   672012947605800121_22995112 -73.9961 40.7261
## 5 676941581603487803_1003307735 -73.9887 40.7451
## 6   684437040995492191_41252551 -73.9863 40.7565
##     id    user_id
## 1  p42 1007762859
## 2 p212    7956299
## 3 p330    5755140
## 4  p94   22995112
## 5 p169 1003307735
## 6 p212   41252551
##                              link_author
## 1  http://www.instagram.com/diamondlynn7
## 2 http://www.instagram.com/pauloaugustot
## 3      http://www.instagram.com/osclimon
## 4       http://www.instagram.com/abel257
## 5    http://www.instagram.com/ihaveadica
## 6     http://www.instagram.com/sadyweezy
##               updated       date
## 1 2014-04-15 10:45:09 2014-04-15
## 2 2014-03-29 17:38:58 2014-03-29
## 3 2014-04-15 22:37:55 2014-04-15
## 4 2014-03-09 01:56:47 2014-03-09
## 5 2014-03-15 21:09:06 2014-03-15
## 6 2014-03-26 05:21:15 2014-03-26

A simple scatter plot of the latitude and longitude coordinates reveals where the data was collected from.

library(ggplot2)
# Longitude is mapped to the horizontal axis and latitude to the vertical axis
# so the scatter has the orientation of a map (north up, east to the right).
ggplot(broadway.df, aes(x = lon, y = lat)) +
  geom_point()

We look at the range of dates over which the data was collected:

# Earliest and latest posting dates present in the data.
range(as.Date(broadway.df[["date"]]))
## [1] "2014-02-26" "2014-08-03"

Finding active users

We compute the number of unique users in our data.

# Count distinct authors; the profile link is used as the user identifier.
print(length(unique(broadway.df[["link_author"]])))
## [1] 271310
# Number of pics per author, sorted from the most to the least prolific author.
num.pics <- table(broadway.df[["link_author"]])
num.pics <- num.pics[order(num.pics, decreasing = TRUE)]

# Cumulative share of all pics as authors are added in that order; the red
# line marks a 20% share.
plot(
  x = cumsum(num.pics) / sum(num.pics),
  xlab = "User",
  ylab = "Cumulative Distribution of Pics"
)
abline(h = 0.2, col = "red")

The above figure shows the cumulative distribution of the number of pics users have posted, with users ordered from most to least prolific. The horizontal line marks the point at which the most prolific users together account for 20% of all pics posted. We define “active users” as users who are part of this 20%. To be part of this group, users must have posted more than:

# Pic count of the first author (in decreasing order of pics) at which the
# cumulative share of pics exceeds 20%. match(TRUE, ...) returns the position
# of the first TRUE, or NA when there is none.
num.pics.active.user <- num.pics[
  match(TRUE, cumsum(num.pics) / sum(num.pics) > .2)
]
print(num.pics.active.user)
## http://www.instagram.com/djrobdinero 
##                                   13

We subdivided the users into two groups: “active users” and “non-active users.”

# Label every pic by its author's group. An author is "active" when their pic
# count is strictly greater than the threshold count; everyone else is
# "non-active".
broadway.df$user.type <- ifelse(
  broadway.df$link_author %in%
    names(num.pics)[num.pics > num.pics.active.user],
  "active user",
  "non-active user"
)
table(broadway.df$user.type)
## 
##     active user non-active user 
##          130707          533041

Finding frequent users

Now compute the distribution of the maximum day difference between each user's posts:

library(data.table)

# Span, in days, between the earliest and latest value in `dates`.
#
# dates: a vector of Date (or date-time) values.
# Returns a single number: latest minus earliest, expressed in days.
#
# The unit is pinned to "days" because difftime() defaults to units = "auto",
# which picks secs/mins/hours/days depending on the size of the difference.
# With the default, the returned number is only guaranteed to be in days for
# Date input.
max.day.diff <- function(dates) {
  date.range <- range(dates)
  as.numeric(difftime(date.range[2], date.range[1], units = "days"))
}

# Work on a data.table copy keyed by author so the per-author grouping is fast.
dt <- as.data.table(broadway.df)
setkeyv(dt, "link_author")

# Dates are read in as text; convert them so they can be subtracted.
dt$date <- as.Date(dt[["date"]])

# One row per author: days between that author's first and last post.
users.max.dates <- dt[
  ,
  .(max.days = max.day.diff(date)),
  by = "link_author"
]

A quick summary of the distribution of the maximum day difference between users' posts:

# Selected quantiles of the per-author span (in days) between first and last post.
quantile(
  users.max.dates[["max.days"]],
  probs = c(0, 0.05, 0.2, 0.25, 0.5, 0.75, 0.8, 0.95, 1)
)
##   0%   5%  20%  25%  50%  75%  80%  95% 100% 
##    0    0    0    0    0    4   12   96  158

Note that since many users have only posted one image, the maximum time difference between their posts will be “zero.” We select as frequent users those whose maximum day difference is greater than the 75% quantile.

# Authors whose first-to-last-post span is strictly greater than the 75%
# quantile of all authors' spans.
frequent.users <- users.max.dates[
  max.days > quantile(max.days, .75),
  link_author
]

# Label every pic by whether its author is in that group.
broadway.df$post.frequency <- ifelse(
  broadway.df$link_author %in% frequent.users,
  "frequent",
  "infrequent"
)
table(broadway.df$post.frequency)
## 
##   frequent infrequent 
##     373774     289974