
Mehrdad Yazdani
Often, images are stored in a collection of directories and we wish to extract some "features" from these images. We can use these features to extract summarizations of the type of images in the given collection.
For example, given a collection of Instagram images from NYC, we extract color histograms from all images and analyze how colors of images change during the day.
R is usually not the ideal image feature extraction tool. Python, MATLAB, and other tools are more often used.
Batch feature extraction scripts: https://github.com/myazdani/batch-feature-extraction
pyImagePlot: https://github.com/myazdani/pyImagePlot
Caution!! These are all in development!!
options(width = 10)
broadway.df = read.csv(
"~/Dropbox/Broadway_processed_data/processedData/insta_pix_users_2.csv",
header = TRUE, stringsAsFactors = FALSE)
Here is a quick overview of what the data set looks like:
names(broadway.df)
## [1] "filename"
## [2] "instagram_id"
## [3] "lon"
## [4] "lat"
## [5] "id"
## [6] "user_id"
## [7] "link_author"
## [8] "updated"
## [9] "date"
nrow(broadway.df)
## [1] 663748
library(ggplot2)
ggplot(broadway.df, aes(x = lat, y = lon)) + geom_point()
range(as.Date(broadway.df$date))
## [1] "2014-02-26"
## [2] "2014-08-03"
print(length(unique(broadway.df$link_author)))
## [1] 271310
num.pics = table(broadway.df$link_author)
num.pics = num.pics[order(num.pics, decreasing=TRUE)]
plot(cumsum(num.pics)/sum(num.pics), xlab = "User",
ylab = "Cumulative Distribution of Pics")
abline(h = .2, col = "red")
num.pics.active.user = num.pics[which(cumsum(num.pics)/sum(num.pics) > .2)[1]]
print(num.pics.active.user)
## http://www.instagram.com/djrobdinero
## 13
We subdivded the users into two groups: "active users" and "non-active users."
broadway.df$user.type = "non-active user"
broadway.df$user.type[which(broadway.df$link_author %in%
names(num.pics[which(num.pics>num.pics.active.user)]))] =
"active user"
Let's merge with census (estimate) data from 2013 to get some neighborhood defintions and economic data:
census = read.delim("~/Dropbox/Broadway data analysis/broadway-crosstab.LM.txt",
header = TRUE, stringsAsFactors = FALSE)
census = census[,c("Id","Neighborhood", "Neighborhood_num","dropoffs", "pickups",
"checkins", "income", "rent","twitter_messages")]
names(census)[1] = "id"
broadway.census = merge(census, broadway.df)
head(broadway.census$date, 3)
## [1] "2014-06-17"
## [2] "2014-06-24"
## [3] "2014-05-09"
class(broadway.census$date)
## [1] "character"
broadway.census$date = as.Date(broadway.census$date)
class(broadway.census$date)
## [1] "Date"
broadway.census$week = cut(broadway.census$date, breaks = "week")
head(broadway.census$week, 3)
## [1] 2014-06-16
## [2] 2014-06-23
## [3] 2014-05-05
## 23 Levels: 2014-02-24 ...
bb = by(broadway.census, broadway.census[,c("Neighborhood", "week")],
FUN = function(x) data.frame(num.users = length(unique(x$link_author)),
num.pics = nrow(x)))
head(bb)
## [[1]]
## num.users
## 1 2008
## num.pics
## 1 2580
##
## [[2]]
## num.users
## 1 256
## num.pics
## 1 327
##
## [[3]]
## num.users
## 1 105
## num.pics
## 1 140
##
## [[4]]
## num.users
## 1 584
## num.pics
## 1 772
##
## [[5]]
## num.users
## 1 393
## num.pics
## 1 590
##
## [[6]]
## num.users
## 1 973
## num.pics
## 1 1317
res.users.week = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
head(res.users.week)
## Neighborhood
## 1 Chelsea (14th-34th)
## 2 Chinatown
## 3 Civic Center
## 4 Columbus Circle
## 5 Financial District
## 6 Greenwhich Village
## week
## 1 2014-02-24
## 2 2014-02-24
## 3 2014-02-24
## 4 2014-02-24
## 5 2014-02-24
## 6 2014-02-24
## num.users
## 1 2008
## 2 256
## 3 105
## 4 584
## 5 393
## 6 973
## num.pics
## 1 2580
## 2 327
## 3 140
## 4 772
## 5 590
## 6 1317
ggplot(res.users.week, aes(x = week, y = num.pics/num.users)) + geom_point() +
ylab("Number of Pics Normalized by Users") + facet_wrap(~ Neighborhood) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
bb = by(broadway.census, broadway.census[,c("Neighborhood", "week", "user.type")],
FUN = function(x) data.frame(num.users = length(unique(x$link_author)),
num.pics = nrow(x)))
res.users.week = cbind(expand.grid(dimnames(bb)), do.call(rbind, bb))
names(res.users.week)
## [1] "Neighborhood"
## [2] "week"
## [3] "user.type"
## [4] "num.users"
## [5] "num.pics"
ggplot(res.users.week, aes(x = week, y = num.pics/num.users, colour = user.type)) +
geom_point()+ylab("Number of Pics Normalized by Users")+facet_wrap(~ Neighborhood) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
head(broadway.df$updated)
## [1] "2014-04-15 10:45:09"
## [2] "2014-03-29 17:38:58"
## [3] "2014-04-15 22:37:55"
## [4] "2014-03-09 01:56:47"
## [5] "2014-03-15 21:09:06"
## [6] "2014-03-26 05:21:15"
class(broadway.df$updated)
## [1] "character"
time.format = "%Y-%m-%d %H:%M:%S"
x = as.POSIXct(strptime(broadway.df$updated[1], format = time.format), tz = "GMT")
print(x)
## [1] "2014-04-15 10:45:09 GMT"
library(lubridate)
hour(x)
## [1] 10
hour(format(x, tz = "America/New_York"))
## [1] 6
class(broadway.df$date)
## [1] "character"
broadway.df$date = as.Date(broadway.df$date)
class(broadway.df$date)
## [1] "Date"
head(broadway.df$date, 4)
## [1] "2014-04-15"
## [2] "2014-03-29"
## [3] "2014-04-15"
## [4] "2014-03-09"
difftime(broadway.df$date[2], broadway.df$date[4])
## Time difference of 20 days
as.numeric(difftime(broadway.df$date[2], broadway.df$date[4]))
## [1] 20
non.solo.users = names(num.pics[which(num.pics>1)])
broadway.no.solo = broadway.df[which(broadway.df$link_author %in% non.solo.users), ]
nrow(broadway.df)
## [1] 663748
nrow(broadway.no.solo)
## [1] 504341
max.day.diff = function(dates){
min.date = min(dates)
max.date= max(dates)
return(as.numeric(difftime(max.date, min.date)))
}
length(unique(broadway.no.solo$link_author))
## [1] 111903
library(data.table, quietly = TRUE)
dt = as.data.table(broadway.no.solo)
setkey(dt, link_author)
users.max.dates = dt[,list(max.days = max.day.diff(date)), by = list(link_author)]
## Error in min(dates): invalid 'type' (closure) of argument