Author: @BulletproofLS (Twitter)
This post provides a foolproof approach to capturing visitors' data from an Okcupid.com account. Specifically, it walks through the R scripts I used to scrape my visitors' profiles and compare them across accounts with different profile pictures.
To make it into a controlled experiment:
I used three accounts with the same profile but different pictures (a picture of myself, Hanato Yui, and Sis Feng; please google them to find out what they look like…)
I used a photo of an unknown pretty woman for a fourth account on Day 2, because Hanato's account was reported as spam on Day 1. I call her Beauty, and let's assume she is at the same level of attractiveness as Hanato…
I set all the accounts' locations to San Francisco, CA (94123).
getlist.r gets the visitors' usernames and outputs a list of profile URLs. The script scrapes no more than the five pages of visitors shown on the webpage.
##this is a program to get the list of visitors
##input: cookie file (txt), exported from a firefox add-on
##output: a list of visitors username
##output2: a list of url made by username
##load library
library(XML)
library(bitops)
library(RCurl)
##log in to the website
## myHttpheader is copied from Firefox developer tools: Network / GET / type (html) / file (global)
myHttpheader<- c(
"User-Agent"="Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0",
"Accept"="text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
"Accept-Language"="en-US,en;q=0.5",
"Connection"="keep-alive"
)
d2 =debugGatherer()
##make handle by getCurlHandle
cHandle2<- getCurlHandle(
httpheader=myHttpheader,followlocation=1,
debugfunction=d2$update,verbose=TRUE,cookiefile="cookie1.txt")
##scrape up to five pages of visitors; pages after the first are offset with ?low=
page.urls <- c("http://okcupid.com/visitors",
paste("http://okcupid.com/visitors?low=", c(26, 51, 76, 101), sep=""))
visitor.all <- NULL
for (page.url in page.urls) {
temp <- getURL(page.url, curl=cHandle2, .encoding="gbk")
temp.html <- htmlTreeParse(temp, useInternalNodes=T)
##the page title is a quick sanity check that the login worked
temp.title.pro <- xpathSApply(temp.html, '//title', xmlValue)
##each visitor's profile link has class 'top'; collect the hrefs from every page
visitor.all <- c(visitor.all,
xpathSApply(temp.html, "//a[@class='top']", xmlGetAttr, "href"))
}
##now build the full profile urls (paste is vectorized, so no loop is needed)
url.list <- paste('http://www.okcupid.com', visitor.all, sep="")
##save the file
save(url.list,file="urllist.rda")
grabfile.r takes the output of getlist.r and grabs the information from each visitor's profile page.
##this is a program that takes the output of getlist.r and grabs the info from each visitor's webpage
##input: urllist.rda
##output: depends on what I want to know
##(the bracketed numbers below are each field's position in the scraped <dl>
## details; [title] means the field is parsed from the page title)
##age [title]
##location [title] ##hard to analyze
##religion [10]
##income [14]
##education [12]
##sign [11]
##smoke [7]
##drinks [8]
##drugs [9]
##pets [18]
##load the url list saved by getlist.r
attach("urllist.rda")
##get title.pro and detail.pro for each profile, and grow the result vectors
age=NULL
smoke=NULL
drink=NULL
drug=NULL
religion=NULL
horo=NULL
educate=NULL
income=NULL
pets=NULL
for (i in 1:length(url.list)) {
cur.url=url.list[i]
judge=try(html <- htmlTreeParse(cur.url, useInternalNodes=T),silent=T)
##parsing may fail here, e.g. if the profile has since been removed
if (class(judge)[1]=='try-error') {
##mark the bad url (NA, not NaN, since url.list is a character vector) and skip it
url.list[i]=NA
next
}
##the page title is slash-delimited (the second field is the age);
##the <dl> blocks hold the detail fields mapped at the top of this file
title.pro=xpathSApply(html,'//title',xmlValue)
detail.pro=xpathSApply(html,'//dl',xmlValue)
new.title=strsplit(title.pro,'/')
age.cur=as.numeric(new.title[[1]][2])
smoke.cur=strsplit(detail.pro[7],' ')[[1]][2]
drink.cur=strsplit(detail.pro[8],' ')[[1]][2]
drug.cur=strsplit(detail.pro[9],' ')[[1]][2]
religion.cur=strsplit(detail.pro[10],' ')[[1]][2]
horo.cur=strsplit(detail.pro[11],' ')[[1]][2]
educate.cur=strsplit(detail.pro[12],'Education ')[[1]][2]
income.cur=strsplit(detail.pro[14],' ')[[1]][2]
pets.cur=strsplit(detail.pro[18],'Pets ')[[1]][2]
age=c(age,age.cur)
smoke=c(smoke,smoke.cur)
drink=c(drink,drink.cur)
drug=c(drug,drug.cur)
religion=c(religion,religion.cur)
horo=c(horo,horo.cur)
educate=c(educate,educate.cur)
income=c(income,income.cur)
pets=c(pets,pets.cur)
}
##put all variables into one dataframe
profile=data.frame(age,smoke,drink,drug,religion,horo,educate,income,pets)
About the cookie file: I used a Firefox plugin to export the cookie file after logging in to the account. It looks like this:
.acuityplatform.com TRUE / FALSE 3557530171 auid 65669919244
.admaym.com TRUE / FALSE 1415223675 UM1 RgAAAB-LCAAAAAAAAAvjcuHQFXLgUkm1tDQxNTFI0jVPTU3UNbE0MNC1ME g01LVMTjG3TDZIMko1MRLi5thxZ-ve2RN7jARYpUCc-Ru7V_QCOQAVA0sbRgAAAA2
.admaym.com TRUE / FALSE 1415223675 vi 7e6868102fd74acd8bf267d6e0eee22e
.admaym.com TRUE / FALSE 1415223675 fid 39bc8c2d55b1d91900392f350212244b
.bluekai.com TRUE / FALSE 1425616360 bkdc dal
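This is the standard Netscape cookie export format: seven tab-separated fields (domain, subdomain flag, path, secure flag, expiry as unix time, name, value). As a minimal sketch, not part of the original scripts, you can sanity-check the export in R before handing it to getCurlHandle:
##read the Netscape-format cookie file; comment.char skips any "# ..." header lines
cookies <- read.delim("cookie1.txt", header=FALSE, comment.char="#",
col.names=c("domain","subdomains","path","secure","expiry","name","value"))
##keep only the okcupid.com rows, which carry the login session
subset(cookies, grepl("okcupid", domain))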
The following code takes grabfile.r, getlist.r, and a cookie file in txt format, and generates the list of visitors' information.
##This is the main scraping script
##input
##cookie file in txt format
##getlist.r
##grabfile.r
##Output
##put all the following objects in the folder, saved under the date they were scraped
##urllist
##sex
##age, etc
##source the file
source('getlist.r')
source('grabfile.r')
##name the output files with the scrape timestamp
file.name=paste('profile',Sys.time(),sep='')
list.name=paste('userlist',Sys.time(),sep='')
##write
write.table(profile,file=file.name,sep='\t')
write(url.list,file=list.name)
##use read.table(file='',sep='\t') to read
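The pro.me, pro.hanata, pro.feng, and pro.beauty data frames in the next block are these saved tables read back in, along the lines of the following sketch (the timestamped file names here are hypothetical stand-ins for whatever the scraping runs actually produced):
##read the saved profile tables back in; the file names below are hypothetical
##stand-ins for the real timestamped names
pro.me <- read.table(file='profile2014-10-01 09:00:00', sep='\t')
pro.hanata <- read.table(file='profile2014-10-02 09:00:00', sep='\t')
pro.feng <- read.table(file='profile2014-10-03 09:00:00', sep='\t')
pro.beauty <- read.table(file='profile2014-10-04 09:00:00', sep='\t')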
The following code does some clean-up. I added a Tag variable to indicate which account each row came from.
###add tag column to the data, and combine into one data frame
Tag=rep('Me',nrow(pro.me))
pro.me=cbind(pro.me,Tag)
Tag=rep('Hatano',nrow(pro.hanata))
pro.hanata=cbind(pro.hanata,Tag)
Tag=rep('Feng',nrow(pro.feng))
pro.feng=cbind(pro.feng,Tag)
Tag=rep('Beauty',nrow(pro.beauty))
pro.beauty=cbind(pro.beauty,Tag)
profile=rbind(pro.me,pro.hanata,pro.feng,pro.beauty)
The final data frame looks like this:
head(profile)
## age smoke drink drug religion
## 1 29 — — Never —
## 2 34 No Socially Never Catholicism, but not too serious about it
## 3 40 — — — —
## 4 25 Trying Socially Never —
## 5 28 No Socially Sometimes —
## 6 42 No Rarely Never Christianity, but not too serious about it
## horo educate
## 1 — <NA>
## 2 Aries, and it’s fun to think about Graduated from two-year college
## 3 — University
## 4 Pisces, and it’s fun to think about Graduated from university
## 5 Aries, and it’s fun to think about Working on university
## 6 Gemini, and it’s fun to think about Graduated from masters program
## income pets Tag
## 1 — — Me
## 2 $50,000–$60,000 Likes dogs Me
## 3 — — Me
## 4 — Likes dogs Me
## 5 $20,000–$30,000 Likes dogs Me
## 6 $150,000–$250,000 — Me
I only did univariate comparisons and quick-and-dirty plots; no statistical significance testing was done…
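To give a flavor of what those looked like, a typical quick comparison was something like this (a minimal sketch in base R, assuming profile is the combined data frame from above):
##cross-tabulate one habit against the account Tag and plot the counts
tab <- table(profile$drink, profile$Tag)
barplot(tab, beside=TRUE, legend.text=rownames(tab),
main="Visitors' drinking habits, by account")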
And here are some other general features of OkCupid's male users…
One thing I've learned is that the profile photo does make a big difference to your popularity. Plus, car selfies, fish selfies, and low-resolution pictures are highly obnoxious.