Processing Social Media Images using (mostly) R

Data science workflow

Mehrdad Yazdani

Data Science Workflow

Overview

  1. Getting and loading data in R
  2. Aggregating and grouping data
  3. Visualization with ggplot2

Data sniffing in BASH

cd into the folder you want to work in

pwd to print the working directory, and ls -lh to list its files with human-readable sizes

Get a feel for a data set with head and less

Count the rows of a CSV file: wc -l yourfile.csv (this counts the header line too)

Count the columns of a CSV file: head -n 1 yourfile.csv | tr ',' '\n' | wc -l (assumes no commas inside quoted fields)

Extracting features from collection of images

Often, images are stored in a collection of directories, and we wish to extract "features" from them. These features let us summarize the kinds of images in a given collection.

For example, given a collection of Instagram images from NYC, we can extract color histograms from all the images and analyze how their colors change over the course of the day.

R is usually not the ideal tool for image feature extraction; Python, MATLAB, and similar tools are more commonly used (though see the small R sketch below).

Python Anaconda Distribution

Python scripts I use

HSV-feature-extraction

pyImagePlot
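For a quick look at a few images, though, a basic color feature can be computed in R itself. A minimal sketch of a 20-bin hue histogram, assuming the jpeg package is installed and a hypothetical file photo.jpg:

library(jpeg)
img = readJPEG("photo.jpg")                     # height x width x 3 array, values in [0, 1]
rgb.mat = t(apply(img, 3, as.vector))           # 3 x n.pixels matrix: R, G, B rows
hsv.mat = rgb2hsv(rgb.mat, maxColorValue = 1)   # rows "h", "s", "v", each in [0, 1]
hue.hist = hist(hsv.mat["h", ], breaks = seq(0, 1, by = 0.05), plot = FALSE)
hue.hist$counts                                 # the 20-bin hue histogram feature vector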

Loading data

options(width = 10)  # very narrow console printing, so the output below wraps to one column per line
broadway.df = read.csv(
  "~/Dropbox/Broadway_processed_data/processedData/insta_pix_users_2.csv", 
  header = TRUE, stringsAsFactors = FALSE)
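Setting stringsAsFactors = FALSE keeps the text columns (file names, links, dates) as plain character vectors rather than factors, which simplifies the string matching and date parsing done below.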

Verify data is what we think it is: what do we have?

Here is a quick overview of what the data set looks like:

names(broadway.df)
## [1] "filename"    
## [2] "instagram_id"
## [3] "lon"         
## [4] "lat"         
## [5] "id"          
## [6] "user_id"     
## [7] "link_author" 
## [8] "updated"     
## [9] "date"
nrow(broadway.df)
## [1] 663748

Verify data is what we think it is: where is this from?

library(ggplot2)
ggplot(broadway.df, aes(x = lat, y = lon)) + geom_point()

[Figure: scatterplot of lat (x) vs. lon (y) for all posts]

Verify data is what we think it is: when is it from?

range(as.Date(broadway.df$date))
## [1] "2014-02-26"
## [2] "2014-08-03"

80/20 rule

print(length(unique(broadway.df$link_author)))
## [1] 271310
num.pics = table(broadway.df$link_author)
num.pics = num.pics[order(num.pics, decreasing=TRUE)]
plot(cumsum(num.pics)/sum(num.pics), xlab = "User", 
     ylab = "Cumulative Distribution of Pics")
abline(h = .2, col = "red")

[Figure: cumulative distribution of pics across users, sorted from most to least prolific, with a red line at 20%]

Minimum number of pics users must have to be active (using 80/20 rule)

num.pics.active.user = num.pics[which(cumsum(num.pics)/sum(num.pics) > .2)[1]]
print(num.pics.active.user)
## http://www.instagram.com/djrobdinero 
##                                   13
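Reading the curve: with users sorted from most to least prolific, those up to this point contribute the first 20% of all pics, so 13 pics is the cutoff for calling a user "active".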

Subdivide users into active and non-active

We subdivide the users into two groups: "active users" and "non-active users."

active.users = names(num.pics[which(num.pics > num.pics.active.user)])
broadway.df$user.type = "non-active user"
broadway.df$user.type[broadway.df$link_author %in% active.users] = "active user"

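A quick sanity check on the split (no output shown, since the counts depend on the threshold above):

table(broadway.df$user.type)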
Merging

Let's merge with census (estimate) data from 2013 to get neighborhood definitions and some economic data. By default, merge() joins on the columns the two data frames share (here, the id column) and drops rows without a match:

census = read.delim("~/Dropbox/Broadway data analysis/broadway-crosstab.LM.txt", 
                    header = TRUE, stringsAsFactors = FALSE)
census = census[,c("Id","Neighborhood", "Neighborhood_num","dropoffs", "pickups", 
                   "checkins", "income", "rent","twitter_messages")]
names(census)[1] = "id"
broadway.census = merge(census, broadway.df)

Getting dates

head(broadway.census$date, 3)
## [1] "2014-06-17"
## [2] "2014-06-24"
## [3] "2014-05-09"
class(broadway.census$date)
## [1] "character"
broadway.census$date = as.Date(broadway.census$date)
class(broadway.census$date)
## [1] "Date"

Getting weeks

broadway.census$week = cut(broadway.census$date, breaks = "week")
head(broadway.census$week, 3)
## [1] 2014-06-16
## [2] 2014-06-23
## [3] 2014-05-05
## 23 Levels: 2014-02-24 ...
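Note that cut() with breaks = "week" bins each date into the Monday that starts its week (start.on.monday = TRUE is the default for cut.Date()) and returns a factor, which is why levels are reported.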

Grouping by neighborhood and week

bb = by(broadway.census, broadway.census[,c("Neighborhood", "week")], 
        FUN = function(x) data.frame(num.users = length(unique(x$link_author)), 
                                     num.pics = nrow(x)))
head(bb)
## [[1]]
##   num.users
## 1      2008
##   num.pics
## 1     2580
## 
## [[2]]
##   num.users
## 1       256
##   num.pics
## 1      327
## 
## [[3]]
##   num.users
## 1       105
##   num.pics
## 1      140
## 
## [[4]]
##   num.users
## 1       584
##   num.pics
## 1      772
## 
## [[5]]
##   num.users
## 1       393
##   num.pics
## 1      590
## 
## [[6]]
##   num.users
## 1       973
##   num.pics
## 1     1317

Grouping by neighborhood and week continued
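by() arranges its per-group results along the Neighborhood and week dimensions, and expand.grid(dimnames(bb)) rebuilds that grid of labels in the same order (first dimension varying fastest), so cbind() can attach the labels to the results stacked by do.call(rbind, bb). One caveat: by() returns NULL for empty Neighborhood-week cells, which rbind() silently drops, so this alignment assumes every cell is non-empty.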

res.users.week = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
head(res.users.week)
##          Neighborhood
## 1 Chelsea (14th-34th)
## 2           Chinatown
## 3        Civic Center
## 4     Columbus Circle
## 5  Financial District
## 6  Greenwhich Village
##         week
## 1 2014-02-24
## 2 2014-02-24
## 3 2014-02-24
## 4 2014-02-24
## 5 2014-02-24
## 6 2014-02-24
##   num.users
## 1      2008
## 2       256
## 3       105
## 4       584
## 5       393
## 6       973
##   num.pics
## 1     2580
## 2      327
## 3      140
## 4      772
## 5      590
## 6     1317

Visualize pics per users by neighborhood and week

ggplot(res.users.week, aes(x = week, y = num.pics/num.users)) + geom_point() + 
  ylab("Number of Pics Normalized by Users") + facet_wrap(~ Neighborhood) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

[Figure: weekly pics-per-user, one panel per Neighborhood]

Re-do analysis for user type

bb = by(broadway.census, broadway.census[,c("Neighborhood", "week", "user.type")], 
        FUN = function(x) data.frame(num.users = length(unique(x$link_author)), 
                                     num.pics = nrow(x)))
res.users.week = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
names(res.users.week)
## [1] "Neighborhood"
## [2] "week"        
## [3] "user.type"   
## [4] "num.users"   
## [5] "num.pics"

Visualize pics per users by neighborhood and week (re-do)

ggplot(res.users.week, aes(x = week, y = num.pics/num.users, colour = user.type)) + 
  geom_point() + ylab("Number of Pics Normalized by Users") + facet_wrap(~ Neighborhood) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

[Figure: weekly pics-per-user split by user type, one panel per Neighborhood]

Dealing with time (hours) correctly

head(broadway.df$updated)
## [1] "2014-04-15 10:45:09"
## [2] "2014-03-29 17:38:58"
## [3] "2014-04-15 22:37:55"
## [4] "2014-03-09 01:56:47"
## [5] "2014-03-15 21:09:06"
## [6] "2014-03-26 05:21:15"
class(broadway.df$updated)
## [1] "character"

Dealing with time (hours) correctly continued

time.format = "%Y-%m-%d %H:%M:%S"
x = as.POSIXct(strptime(broadway.df$updated[1], format = time.format), tz = "GMT")
print(x)
## [1] "2014-04-15 10:45:09 GMT"
library(lubridate)
hour(x)
## [1] 10
hour(format(x, tz = "America/New_York"))
## [1] 6
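To convert the whole column at once, one option is lubridate's with_tz(). This is a sketch; it assumes every timestamp parses with the format above, and hour.ny is just a column name chosen here:

broadway.df$updated = as.POSIXct(strptime(broadway.df$updated, format = time.format), tz = "GMT")
broadway.df$hour.ny = hour(with_tz(broadway.df$updated, "America/New_York"))  # local NYC hour per pic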

More about dates

class(broadway.df$date)
## [1] "character"
broadway.df$date = as.Date(broadway.df$date)
class(broadway.df$date)
## [1] "Date"

More about dates continued

head(broadway.df$date, 4)
## [1] "2014-04-15"
## [2] "2014-03-29"
## [3] "2014-04-15"
## [4] "2014-03-09"
difftime(broadway.df$date[2], broadway.df$date[4])
## Time difference of 20 days
as.numeric(difftime(broadway.df$date[2], broadway.df$date[4]))
## [1] 20

Find users that have posted more than one image

non.solo.users = names(num.pics[which(num.pics>1)])
broadway.no.solo = broadway.df[which(broadway.df$link_author %in% non.solo.users), ]
nrow(broadway.df)
## [1] 663748
nrow(broadway.no.solo)
## [1] 504341
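A consistency check: 663,748 - 504,341 = 159,407 rows were dropped, one per single-pic user, which leaves 271,310 - 159,407 = 111,903 distinct authors.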

Find maximum time (in days) between images posted by a user

max.day.diff = function(dates){
  # number of days between a user's first and last post
  min.date = min(dates)
  max.date = max(dates)
  return(as.numeric(difftime(max.date, min.date)))
}
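For instance, two posts 20 days apart:

max.day.diff(as.Date(c("2014-03-09", "2014-03-29")))
## [1] 20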

length(unique(broadway.no.solo$link_author))
## [1] 111903

Using data.table library for fast querying and grouping

library(data.table, quietly = TRUE)
dt = as.data.table(broadway.no.solo)
setkey(dt, link_author)

users.max.dates = dt[,list(max.days = max.day.diff(date)), by = list(link_author)]
## Error in min(dates): invalid 'type' (closure) of argument
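The error is a scoping gotcha: when data.table fails to find date among dt's columns, the name falls through to the calling scope, where date is base R's date() function, a closure, hence the complaint from min(). A defensive rewrite pins the lookup to the column through .SD (a sketch, assuming the date column is present in dt):

users.max.dates = dt[, list(max.days = max.day.diff(.SD$date)),
                     by = link_author, .SDcols = "date"]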

More resources