User and Image Distributions vs Location
User and Image Distributions vs Hours
User and Image Distributions vs Days
We will be analyzing social media data from Broadway Ave. First we use Instagram images that we collected along Broadway Ave.
broadway.df = read.csv("~/Dropbox/Broadway_processed_data/processedData/insta_pix_users_2.csv", header = TRUE, stringsAsFactors = FALSE)
Here is a quick overview of what the data set looks like:
head(broadway.df)
## filename
## 1 00004eeac48b11e39a2b24be059d8430_8.jpg
## 2 0000cf3eb76911e3aea30e0a45d706d2_8.jpg
## 3 00018ed2c4ee11e39bed0002c9dc919e_8.jpg
## 4 0001bc6ea72e11e3ab690ea951e5ff49_8.jpg
## 5 00021dd2ac8611e3907c1296586a1d24_8.jpg
## 6 000287a4b4a611e387960ee8fbc8973c_8.jpg
## instagram_id lon lat
## 1 699095580607820497_1007762859 -74.0061 40.7143
## 2 686982674762715590_7956299 -73.9863 40.7565
## 3 699454333821007139_5755140 -73.9778 40.7868
## 4 672012947605800121_22995112 -73.9961 40.7261
## 5 676941581603487803_1003307735 -73.9887 40.7451
## 6 684437040995492191_41252551 -73.9863 40.7565
## id user_id
## 1 p42 1007762859
## 2 p212 7956299
## 3 p330 5755140
## 4 p94 22995112
## 5 p169 1003307735
## 6 p212 41252551
## link_author
## 1 http://www.instagram.com/diamondlynn7
## 2 http://www.instagram.com/pauloaugustot
## 3 http://www.instagram.com/osclimon
## 4 http://www.instagram.com/abel257
## 5 http://www.instagram.com/ihaveadica
## 6 http://www.instagram.com/sadyweezy
## updated date
## 1 2014-04-15 10:45:09 2014-04-15
## 2 2014-03-29 17:38:58 2014-03-29
## 3 2014-04-15 22:37:55 2014-04-15
## 4 2014-03-09 01:56:47 2014-03-09
## 5 2014-03-15 21:09:06 2014-03-15
## 6 2014-03-26 05:21:15 2014-03-26
A simple scatter plot of the latitude and longitude coordinates reveals where the data was collected from.
library(ggplot2)
ggplot(broadway.df, aes(x = lat, y = lon)) + geom_point()
We look at the range of dates that the data was collected from
range(as.Date(broadway.df$date))
## [1] "2014-02-26" "2014-08-03"
We compute the number of unique users in our data.
print(length(unique(broadway.df$link_author)))
## [1] 271310
num.pics = table(broadway.df$link_author)
num.pics = num.pics[order(num.pics, decreasing=TRUE)]
plot(cumsum(num.pics)/sum(num.pics), xlab = "User", ylab = "Cumulative Distribution of Pics")
abline(h = .2, col = "red")
The above figure shows the cumaltive distribution of the number of pics users have posted. The horizontal line indicates the top 20% of pics posted by a single user. We define “active users”" as users who are part of this 20%. To be part of this group, users must have posted at least:
num.pics.active.user = num.pics[which(cumsum(num.pics)/sum(num.pics) > .2)[1]]
print(num.pics.active.user)
## http://www.instagram.com/djrobdinero
## 13
We subdivded the users into two groups: “active users” and “non-active users.”
broadway.df$user.type = "non-active user"
broadway.df$user.type[which(broadway.df$link_author %in% names(num.pics[which(num.pics > num.pics.active.user)]))] = "active user"
Let’s merge with census (estimate) data from 2013 to get some neighborhood defintions and economic data:
census = read.delim("~/Dropbox/Broadway data analysis/broadway-crosstab.LM.txt", header = TRUE, stringsAsFactors = FALSE)
census = census[,c("Id","Neighborhood", "Neighborhood_num","dropoffs", "pickups", "checkins", "income", "rent","twitter_messages")]
names(census)[1] = "id"
broadway.census = merge(census, broadway.df)
Let’s take a look a the number of users and number of images for each neighborhood.
library(ggplot2)
bb = by(broadway.census, broadway.census[,c("Neighborhood", "user.type")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.neighborhood.user.type = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
res.neighborhood.user.type$Neighborhood = factor(res.neighborhood.user.type$Neighborhood, level = unique(census$Neighborhood))
ggplot(res.neighborhood.user.type, aes(x = num.pics, y = Neighborhood, size = num.users)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ user.type, nrow = 1, scale = "free_x") + ggtitle("Number of Pics")
ggplot(res.neighborhood.user.type, aes(x = num.users, y = Neighborhood, size = num.pics)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ user.type, nrow = 1, scale = "free_x") + ggtitle("Number of Users")
ggplot(res.neighborhood.user.type, aes(x = num.pics/num.users, y = Neighborhood)) + geom_point() + ylab("") + facet_wrap(~ user.type, nrow = 1, scale = "free_x") + ggtitle("Number of Pics Shared Per User")
We lose a lot of detail looking at the number of users and images at the neighborhood level. Digging a bit deeper at the “box” level, we see details emerging that we didn’t see before. Below is the distribution of images and users for just active users.
bb = by(broadway.census, broadway.census[,c("id", "user.type")], FUN = function(x) data.frame(id = x$id[1], user.type = x$user.type[1], num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.box.user.type = do.call(rbind, bb)
res.box.user.type = merge(res.box.user.type, broadway.census[,c("id", "Neighborhood")])
res.box.user.type$box = sapply(res.box.user.type$id, FUN = function(x) strsplit(as.character(x), split = "p")[[1]][2])
res.box.user.type$box = as.factor(res.box.user.type$box)
res.box.user.type$box = factor(res.box.user.type$box, level = unique(as.numeric(res.box.user.type$box))[order(as.numeric(unique(res.box.user.type$box)))])
slice<-function(x,n) {
N<-length(x);
lapply(seq(1,N,n),function(i) x[i:min(i+n-1,N)])
}
res.box.user.type$box.section = "boxes 0 - 177"
res.box.user.type$box.section[which(res.box.user.type$box %in% slice(levels(res.box.user.type$box), 178)[[2]])] = "boxes 178 - 355"
res.box.user.type$box.section[which(res.box.user.type$box %in% slice(levels(res.box.user.type$box), 178)[[3]])] = "boxes 356 - 535"
res.box.user.type$box.section[which(res.box.user.type$box %in% slice(levels(res.box.user.type$box), 178)[[4]])] = "boxes 536 - 712"
ggplot(subset(res.box.user.type, user.type == "active user"), aes(x = num.pics, y = box, size = num.users, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Active Image Distributions")
ggplot(subset(res.box.user.type, user.type == "non-active user"), aes(x = num.pics, y = box, size = num.users, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Non-Active Image Distributions")
ggplot(subset(res.box.user.type, user.type == "active user"), aes(x = num.users, y = box, size = num.pics, colour = Neighborhood)) + geom_point() + xlab("Number of Users") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Active User Distributions")
ggplot(subset(res.box.user.type, user.type == "non-active user"), aes(x = num.users, y = box, size = num.pics, colour = Neighborhood)) + geom_point() + xlab("Number of Users") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Non-Active User Distributions")
ggplot(subset(res.box.user.type, user.type == "active user"), aes(x = num.pics/num.users, y = box, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Active Normalized (by User) Image Distributions")
ggplot(subset(res.box.user.type, user.type == "non-active user"), aes(x = num.pics/num.users, y = box, colour = Neighborhood)) + geom_point() + xlab("Number of Pics") + ylab("") + facet_wrap(~ box.section, nrow = 1, scale = "free_y") + ggtitle("Non-Active Normalized (by User) Image Distributions")
Since each point covers the same area, we get a detailed snapshot of the image sharing popularity of the neighborhoods at a finer resolution. Roughly speaking, for active users (compared to non-active users) there are more neighborhood’s that have high image activity dominated by one or two points. For non-active users the overwhelming dominant point is Times Square followed by the Civic Center bordering China Town (point 42), so we see less structure in the overall distribution of images.
Below, we look at the distribution of images during time.
broadway.census$date = as.Date(broadway.census$updated)
broadway.census$day = weekdays(broadway.census$date)
## sloppy way to get hour of day (text manipulation)
broadway.census$hour = sapply(broadway.census$updated, FUN = function(x) substr(x, 12,13))
ggplot(broadway.census, aes(x = as.factor(hour), fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + xlab("Hour") + ylab("Volume")
ggplot(subset(broadway.census, user.type == "active user"), aes(x = as.factor(hour))) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + xlab("Hour") + ylab("Volume") + ggtitle("Active Users")
ggplot(subset(broadway.census, user.type != "active user"), aes(x = as.factor(hour))) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + xlab("Hour") + ylab("Volume") + ggtitle("Non-active Users")
For non-active users, we see diurnal patterns that match daily circadian rhythms as reported by others (for example: http://www.sciencemag.org/content/333/6051/1878.short). However, we see that in Times Square, the peak of activity happens much later in the day. For active users, especially for more North East neighborhoods, we see diurnal patterns that diverge than the patterns for non-activity users. For exmaple, we see the Dinancial District follow typical diurnal pattern corresponding to circadian rhythms, but in Hamilton Heights there is a strong deviation into social media activity late into the night.
The following figure shows the hourly activity divied into 4 equal parts based on our equi-sized boxes.
User distributions over the hours:
bb = by(broadway.census, broadway.census$hour, FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.hour = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
names(res.hour)[1] = "hour"
ggplot(res.hour, aes(x = hour, y = num.pics/num.users)) + geom_point() + ylab("Pics Normalized by User")
bb = by(broadway.census, broadway.census[,c("user.type", "hour")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.hour.user.type = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
ggplot(res.hour.user.type, aes(x = hour, y = num.pics/num.users)) + geom_point() + ylab("Pics Normalized by User") + facet_wrap(~ user.type, nrow = 1, scale = "free_y")
Users distributions over the hours for each neighborhood:
bb = by(broadway.census, broadway.census[,c("Neighborhood", "hour")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.hour.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
ggplot(res.hour.neighborhood, aes(x = hour, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y")
bb = by(broadway.census, broadway.census[,c("user.type" ,"Neighborhood", "hour")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.user.type.hour.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
ggplot(subset(res.user.type.hour.neighborhood, user.type == "active user"), aes(x = hour, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + ggtitle("Active User Normalized Image Distribution")
ggplot(subset(res.user.type.hour.neighborhood, user.type != "active user"), aes(x = hour, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + ggtitle("Non-active User Normalized Image Distribution")
Daily image volumes by neighborhood
broadway.census$day = as.factor(broadway.census$day)
broadway.census$day = factor(broadway.census$day, level = levels(broadway.census$day)[c(4,2,6,7,5,1,3)])
ggplot(broadway.census, aes(x = day, fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90))
ggplot(subset(broadway.census, user.type == "active user"), aes(x = day)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users") + theme(axis.text.x = element_text(angle = 90))
ggplot(subset(broadway.census, user.type != "active user"), aes(x = day)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-active Users") + theme(axis.text.x = element_text(angle = 90))
The above figures show the image activity broken down by day of the week. With the exception of the Northeast neighborhoods, the active-users follow a working week activity level.
User distributions over the days:
bb = by(broadway.census, broadway.census$day, FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.day = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
names(res.day)[1] = "day"
res.day$day = factor(res.day$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(res.day, aes(x = day, y = num.users)) + geom_point() + theme(axis.text.x = element_text(angle = 90))
bb = by(broadway.census, broadway.census[,c("user.type", "day")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.user.type.day = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
res.user.type.day$day = factor(res.user.type.day$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(res.user.type.day, aes(x = day, y = num.users)) + geom_point() + theme(axis.text.x = element_text(angle = 90)) + facet_wrap(~ user.type, nrow = 1, scale = "free_y")
User distributions over the days for each neighborhood:
bb = by(broadway.census, broadway.census[,c("Neighborhood", "day")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.day.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
res.day.neighborhood$day = factor(res.day.neighborhood$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(res.day.neighborhood, aes(x = day, y = num.pics/num.users)) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + theme(axis.text.x = element_text(angle = 90))
bb = by(broadway.census, broadway.census[,c("user.type", "Neighborhood", "day")], FUN = function(x) data.frame(num.users = length(unique(x$link_author)), num.pics = nrow(x)))
res.user.type.day.neighborhood = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
res.user.type.day.neighborhood$day = factor(res.user.type.day.neighborhood$day, level = c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"))
ggplot(subset(res.user.type.day.neighborhood, user.type == "active user"), aes(x = day, y = num.pics/(1+num.users))) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users Daily Normalized Image Dist")
ggplot(subset(res.user.type.day.neighborhood, user.type != "active user"), aes(x = day, y = num.pics/(1+num.users))) + geom_point() + facet_wrap(~Neighborhood, scale = "free_y") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-Active Users Daily Normalized Image Dist")
Monthly image volumes by neighborhood:
#broadway.census$month = sapply(broadway.census$date, FUN = function(x) format(as.Date(x), "%b"))
#broadway.census$month = factor(broadway.census$month, level = c("Feb","Mar","Apr", "May", "Jun", "Jul", "Aug"))
#ggplot(broadway.census, aes(x = month, fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90))
#ggplot(subset(broadway.census, user.type == "active user"), aes(x = month,)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users")
#ggplot(subset(broadway.census, user.type != "active user"), aes(x = month,)) + geom_bar(position = "dodge") + facet_wrap(~Neighborhood, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-active Users")
Monthly image volumes by box section:
#ggplot(broadway.census, aes(x = month, fill = user.type)) + geom_bar(position = "dodge") + facet_wrap(~box.section, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90))
#ggplot(subset(broadway.census, user.type == "active user"), aes(x = month)) + geom_bar(position = "dodge") + facet_wrap(~box.section, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Active Users")
#ggplot(subset(broadway.census, user.type != "active user"), aes(x = month)) + geom_bar(position = "dodge") + facet_wrap(~box.section, scale = "free_y") + ylab("Volume") + theme(axis.text.x = element_text(angle = 90)) + ggtitle("Non-active Users")
Merge with QTIP:
qtip = read.csv("~/Dropbox/Broadway data analysis/qtip-features/qtip_meta_v2.csv", header = TRUE, stringsAsFactors = FALSE)
broadway.census = merge(broadway.census, qtip[,c("filename", "entropy", "Hue", "Value", "Saturaion", "Chroma", "contrastM", "contrastRMS")])
Some scatter plots with QTIP features
ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = factor(day), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Active users")
ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = factor(hour), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Active users")
#ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = month, y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Active users")
ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = factor(day), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Non-active users")
ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = factor(hour), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Non-active users")
#ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = factor(month), y = Value)) + geom_boxplot(notch = TRUE) + ggtitle("Non-active users")
Compare with Economic data
#ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = log(income), y = log(num.pics.per.box))) + geom_point() + stat_smooth(method = lm) + ggtitle("Active users")
ggplot(na.omit(subset(broadway.census, user.type == "active user")), aes(x = log(income), y = Value)) + geom_point() + stat_smooth(method = lm) + ggtitle("Active users")
#ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = log(income), y = log(num.pics.per.box))) + geom_point() + stat_smooth(method = lm) + ggtitle("Non-active users")
ggplot(na.omit(subset(broadway.census, user.type != "active user")), aes(x = log(income), y = Value)) + geom_point() + stat_smooth(method = lm) + ggtitle("Non-active users")