Broadway Social Media Analysis

Overview
User and Image Distributions vs Location
User and Image Distributions vs Hours
User and Image Distributions vs Days
Understanding Image Features

Overview

We will be analyzing social media data from Broadway Ave. First we use Instagram images that we collected along Broadway Ave.

broadway.df = read.csv("~/Dropbox/Broadway_processed_data/processedData/insta_pix_users_2.csv", header = TRUE, stringsAsFactors = FALSE)

Here is a quick overview of what the data set looks like:

head(broadway.df)

##                                 filename
## 1 00004eeac48b11e39a2b24be059d8430_8.jpg
## 2 0000cf3eb76911e3aea30e0a45d706d2_8.jpg
## 3 00018ed2c4ee11e39bed0002c9dc919e_8.jpg
## 4 0001bc6ea72e11e3ab690ea951e5ff49_8.jpg
## 5 00021dd2ac8611e3907c1296586a1d24_8.jpg
## 6 000287a4b4a611e387960ee8fbc8973c_8.jpg
##                    instagram_id      lon     lat
## 1 699095580607820497_1007762859 -74.0061 40.7143
## 2    686982674762715590_7956299 -73.9863 40.7565
## 3    699454333821007139_5755140 -73.9778 40.7868
## 4   672012947605800121_22995112 -73.9961 40.7261
## 5 676941581603487803_1003307735 -73.9887 40.7451
## 6   684437040995492191_41252551 -73.9863 40.7565
##     id    user_id
## 1  p42 1007762859
## 2 p212    7956299
## 3 p330    5755140
## 4  p94   22995112
## 5 p169 1003307735
## 6 p212   41252551
##                              link_author
## 1  http://www.instagram.com/diamondlynn7
## 2 http://www.instagram.com/pauloaugustot
## 3      http://www.instagram.com/osclimon
## 4       http://www.instagram.com/abel257
## 5    http://www.instagram.com/ihaveadica
## 6     http://www.instagram.com/sadyweezy
##               updated       date
## 1 2014-04-15 10:45:09 2014-04-15
## 2 2014-03-29 17:38:58 2014-03-29
## 3 2014-04-15 22:37:55 2014-04-15
## 4 2014-03-09 01:56:47 2014-03-09
## 5 2014-03-15 21:09:06 2014-03-15
## 6 2014-03-26 05:21:15 2014-03-26

A simple scatter plot of the latitude and longitude coordinates reveals where the data was collected from.

library(ggplot2)
ggplot(broadway.df, aes(x = lat, y = lon)) + geom_point()

We look at the range of dates that the data was collected from

range(as.Date(broadway.df$date))

## [1] "2014-02-26" "2014-08-03"

We compute the number of unique users in our data.

print(length(unique(broadway.df$link_author)))

## [1] 271310

num.pics = table(broadway.df$link_author)
num.pics = num.pics[order(num.pics, decreasing=TRUE)]
plot(cumsum(num.pics)/sum(num.pics), xlab = "User", ylab = "Cumulative Distribution of Pics")
abline(h = .2, col = "red")

The above figure shows the cumaltive distribution of the number of pics users have posted. The horizontal line indicates the top 20% of pics posted by a single user. We define “active users”" as users who are part of this 20%. To be part of this group, users must have posted at least:

num.pics.active.user = num.pics[which(cumsum(num.pics)/sum(num.pics) > .2)[1]]
print(num.pics.active.user)

## http://www.instagram.com/djrobdinero 
##                                   13

We subdivded the users into two groups: “active users” and “non-active users.”

broadway.df$user.type = "non-active user"
broadway.df$user.type[which(broadway.df$link_author %in% names(num.pics[which(num.pics > num.pics.active.user)]))] = "active user"

Let’s merge with census (estimate) data from 2013 to get some neighborhood defintions and economic data:

census = read.delim("~/Dropbox/Broadway data analysis/broadway-crosstab.LM.txt", header = TRUE, stringsAsFactors = FALSE)
census = census[,c("Id","Neighborhood", "Neighborhood_num","dropoffs", "pickups", "checkins", "income", "rent","twitter_messages")]
names(census)[1] = "id"
broadway.census = merge(census, broadway.df)

User and Image Distributions vs Location

Let’s take a look a the number of users and number of images for each neighborhood.

library(ggplot2)
bb = by(broadway.census, broadway.census[,c("Neighborhood", "user.type")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.neighborhood.user.type = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))

res.neighborhood.user.type$Neighborhood = factor(res.neighborhood.user.type$Neighborhood, level = unique(census$Neighborhood))

ggplot(res.neighborhood.user.type, aes(x = num.pics, y = Neighborhood, size = num.users)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ user.type, nrow = 1, scale = "free_x") + ggtitle("Number of Pics")

ggplot(res.neighborhood.user.type, aes(x = num.users, y = Neighborhood, size = num.pics)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ user.type, nrow = 1, scale = "free_x") + ggtitle("Number of Users")

ggplot(res.neighborhood.user.type, aes(x = num.pics/num.users, y = Neighborhood)) + geom_point() +  ylab("") + facet_wrap(~ user.type, nrow = 1, scale = "free_x") + ggtitle("Number of Pics Shared Per User")

We lose a lot of detail looking at the number of users and images at the neighborhood level. Digging a bit deeper at the “box” level, we see details emerging that we didn’t see before. Below is the distribution of images and users for just active users.

bb = by(broadway.census, broadway.census[,c("id", "user.type")], FUN = function(x) data.frame(id = x$id[1], user.type = x$user.type[1], num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.box.user.type = do.call(rbind, bb)
res.box.user.type = merge(res.box.user.type, broadway.census[,c("id", "Neighborhood")])

res.box.user.type$box = sapply(res.box.user.type$id, FUN = function(x) strsplit(as.character(x), split = "p")[[1]][2])
res.box.user.type$box = as.factor(res.box.user.type$box)
res.box.user.type$box = factor(res.box.user.type$box, level = unique(as.numeric(res.box.user.type$box))[order(as.numeric(unique(res.box.user.type$box)))])

slice<-function(x,n) {
    N<-length(x);
    lapply(seq(1,N,n),function(i) x[i:min(i+n-1,N)])
}
res.box.user.type$box.section = "boxes 0 - 177"
res.box.user.type$box.section[which(res.box.user.type$box %in% slice(levels(res.box.user.type$box), 178)[[2]])] = "boxes 178 - 355"
res.box.user.type$box.section[which(res.box.user.type$box %in% slice(levels(res.box.user.type$box), 178)[[3]])] = "boxes 356 - 535"
res.box.user.type$box.section[which(res.box.user.type$box %in% slice(levels(res.box.user.type$box), 178)[[4]])] = "boxes 536 - 712"
ggplot(subset(res.box.user.type, user.type == "active user"), aes(x = num.pics, y = box, size = num.users, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Active Image Distributions")

ggplot(subset(res.box.user.type, user.type == "non-active user"), aes(x = num.pics, y = box, size = num.users, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Non-Active Image Distributions")

ggplot(subset(res.box.user.type, user.type == "active user"), aes(x = num.users, y = box, size = num.pics, colour = Neighborhood)) + geom_point() + xlab("Number of Users") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Active User Distributions")

ggplot(subset(res.box.user.type, user.type == "non-active user"), aes(x = num.users, y = box, size = num.pics, colour = Neighborhood)) + geom_point() + xlab("Number of Users") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Non-Active User Distributions")

ggplot(subset(res.box.user.type, user.type == "active user"), aes(x = num.pics/num.users, y = box, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Active Normalized (by User) Image Distributions")

ggplot(subset(res.box.user.type, user.type == "non-active user"), aes(x = num.pics/num.users, y = box, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Non-Active Normalized (by User) Image Distributions")

Since each point covers the same area, we get a detailed snapshot of the image sharing popularity of the neighborhoods at a finer resolution. Roughly speaking, for active users (compared to non-active users) there are more neighborhood’s that have high image activity dominated by one or two points. For non-active users the overwhelming dominant point is Times Square followed by the Civic Center bordering China Town (point 42), so we see less structure in the overall distribution of images.

User and Image Distributions vs Hours

Below, we look at the distribution of images during time.

broadway.census$date = as.Date(broadway.census$updated)
broadway.census$day = weekdays(broadway.census$date)
## sloppy way to get hour of day (text manipulation)
broadway.census$hour = sapply(broadway.census$updated, FUN = function(x) substr(x, 12,13))
ggplot(broadway.census, aes(x = as.factor(hour), fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + xlab("Hour") + ylab("Volume")

ggplot(subset(broadway.census, user.type == "active user"), aes(x = as.factor(hour))) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + xlab("Hour") + ylab("Volume") + ggtitle("Active Users")

ggplot(subset(broadway.census, user.type != "active user"), aes(x = as.factor(hour))) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + xlab("Hour") + ylab("Volume") + ggtitle("Non-active Users")

For non-active users, we see diurnal patterns that match daily circadian rhythms as reported by others (for example: http://www.sciencemag.org/content/333/6051/1878.short). However, we see that in Times Square, the peak of activity happens much later in the day. For active users, especially for more North East neighborhoods, we see diurnal patterns that diverge than the patterns for non-activity users. For exmaple, we see the Dinancial District follow typical diurnal pattern corresponding to circadian rhythms, but in Hamilton Heights there is a strong deviation into social media activity late into the night.

The following figure shows the hourly activity divied into 4 equal parts based on our equi-sized boxes.

User distributions over the hours:

bb = by(broadway.census, broadway.census$hour, FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.hour = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
names(res.hour)[1] = "hour"

ggplot(res.hour, aes(x = hour, y = num.pics/num.users)) + geom_point() + ylab("Pics Normalized by User")

bb = by(broadway.census, broadway.census[,c("user.type", "hour")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.hour.user.type = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))

ggplot(res.hour.user.type, aes(x = hour, y = num.pics/num.users)) + geom_point()  + ylab("Pics Normalized by User") + facet_wrap(~ user.type, nrow = 1, scale = "free_y")

Users distributions over the hours for each neighborhood:

bb = by(broadway.census, broadway.census[,c("Neighborhood", "hour")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.hour.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
ggplot(res.hour.neighborhood, aes(x = hour, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y")

bb = by(broadway.census, broadway.census[,c("user.type" ,"Neighborhood", "hour")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.user.type.hour.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
ggplot(subset(res.user.type.hour.neighborhood, user.type == "active user"), aes(x = hour, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + ggtitle("Active User Normalized Image Distribution")

ggplot(subset(res.user.type.hour.neighborhood, user.type != "active user"), aes(x = hour, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + ggtitle("Non-active User Normalized Image Distribution")

User and Image Distributions vs Days

Daily image volumes by neighborhood

broadway.census$day = as.factor(broadway.census$day)
broadway.census$day = factor(broadway.census$day, level = levels(broadway.census$day)[c(4,2,6,7,5,1,3)])
ggplot(broadway.census, aes(x = day, fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90))

ggplot(subset(broadway.census, user.type == "active user"), aes(x = day)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users") + theme(axis.text.x = element_text(angle = 90))

ggplot(subset(broadway.census, user.type != "active user"), aes(x = day)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-active Users") + theme(axis.text.x = element_text(angle = 90))

The above figures show the image activity broken down by day of the week. With the exception of the Northeast neighborhoods, the active-users follow a working week activity level.

User distributions over the days:

bb = by(broadway.census, broadway.census$day, FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.day = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
names(res.day)[1] = "day"

res.day$day = factor(res.day$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(res.day, aes(x = day, y = num.users)) + geom_point() + theme(axis.text.x = element_text(angle = 90))

bb = by(broadway.census, broadway.census[,c("user.type", "day")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.user.type.day = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
res.user.type.day$day = factor(res.user.type.day$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(res.user.type.day, aes(x = day, y = num.users)) + geom_point() + theme(axis.text.x = element_text(angle = 90)) + facet_wrap(~ user.type, nrow = 1, scale = "free_y")

User distributions over the days for each neighborhood:

bb = by(broadway.census, broadway.census[,c("Neighborhood", "day")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.day.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))

res.day.neighborhood$day = factor(res.day.neighborhood$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))

ggplot(res.day.neighborhood, aes(x = day, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + theme(axis.text.x = element_text(angle = 90))

bb = by(broadway.census, broadway.census[,c("user.type", "Neighborhood", "day")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.user.type.day.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))

res.user.type.day.neighborhood$day = factor(res.user.type.day.neighborhood$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(subset(res.user.type.day.neighborhood, user.type == "active user"), aes(x = day, y = num.pics/(1+num.users))) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users Daily Normalized Image Dist")

ggplot(subset(res.user.type.day.neighborhood, user.type != "active user"), aes(x = day, y = num.pics/(1+num.users))) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-Active Users Daily Normalized Image Dist")

Monthly image volumes by neighborhood:

#broadway.census$month = sapply(broadway.census$date, FUN = function(x) format(as.Date(x), "%b"))
#broadway.census$month = factor(broadway.census$month, level = c("Feb","Mar","Apr", "May", "Jun", "Jul", "Aug"))
#ggplot(broadway.census, aes(x = month, fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y")  + ylab("Volume") + theme(axis.text.x = element_text(angle = 90))

#ggplot(subset(broadway.census, user.type == "active user"), aes(x = month,)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y")  + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users")
#ggplot(subset(broadway.census, user.type != "active user"), aes(x = month,)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y")  + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-active Users")

Monthly image volumes by box section:

#ggplot(broadway.census, aes(x = month, fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~box.section, scale = "free_y")  + ylab("Volume") + theme(axis.text.x = element_text(angle = 90))

#ggplot(subset(broadway.census, user.type == "active user"), aes(x = month)) + geom_bar(position = "dodge") + facet_wrap(~box.section, scale = "free_y")  + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users")

#ggplot(subset(broadway.census, user.type != "active user"), aes(x = month)) + geom_bar(position = "dodge") + facet_wrap(~box.section, scale = "free_y")  + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-active Users")

Understanding Image Features

Merge with QTIP:

qtip = read.csv("~/Dropbox/Broadway data analysis/qtip-features/qtip_meta_v2.csv", header = TRUE, stringsAsFactors = FALSE)
broadway.census = merge(broadway.census, qtip[,c("filename", "entropy", "Hue", "Value", "Saturaion", "Chroma", "contrastM", "contrastRMS")])

Some scatter plots with QTIP features

ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = factor(day), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Active users")

ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = factor(hour), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Active users")

#ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = month, y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Active users")

ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = factor(day), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Non-active users")

ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = factor(hour), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Non-active users")

#ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = factor(month), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Non-active users")

Compare with Economic data

#ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = log(income), y = log(num.pics.per.box))) + geom_point() + stat_smooth(method = lm) + ggtitle("Active users")
ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = log(income), y = Value)) + geom_point() + stat_smooth(method = lm) + ggtitle("Active users")

#ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = log(income), y = log(num.pics.per.box))) + geom_point() + stat_smooth(method = lm) + ggtitle("Non-active users")
ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = log(income), y = Value)) + geom_point() + stat_smooth(method = lm) + ggtitle("Non-active users")