Author: @BulletproofLS *twitter

This post provides a foolproof approach to capturing visitors’ data from an OkCupid.com account. Specifically, this post:

  1. Shows example code for automatically logging in to an OkCupid account using RCurl;
  2. Captures the profile data and formats it into a data frame;
  3. Makes quick and dirty plots from my data;
  4. Discusses the results

‘The bait’

To make it into a controlled experiment:

‘Fish hook’

getlist.r is a script that collects the visitors’ usernames and outputs a list of profile URLs. The program scrapes no more than five pages of visitors, as shown on the web page.

## getlist.r -- collect the list of visitors to the logged-in OkCupid account
## input:   cookie file "cookie1.txt" (txt, exported from a Firefox add-on)
## output:  visitor.all -- the visitors' profile links (href values)
## output2: url.list -- full profile URLs built from those links,
##          also saved to "urllist.rda"

## load libraries
library(XML)
library(bitops)
library(RCurl)

## log in to the website
## myHttpheader is copied from Firefox developer tools/network/get/type(html)/file(global)
myHttpheader <- c(
  "User-Agent" = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:32.0) Gecko/20100101 Firefox/32.0",
  "Accept" = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
  "Accept-Language" = "en-US,en;q=0.5",
  "Connection" = "keep-alive"
)
d2 <- debugGatherer()

## make the handle with getCurlHandle; the cookie file carries the login session
cHandle2 <- getCurlHandle(
  httpheader = myHttpheader, followlocation = 1,
  debugfunction = d2$update, verbose = TRUE, cookiefile = "cookie1.txt"
)

## the visitor pages to scrape: the first page plus the four follow-up pages,
## which are addressed by their "low" offset
page.urls <- c(
  "http://okcupid.com/visitors",
  paste0("http://okcupid.com/visitors?low=", c(26, 51, 76, 101))
)

## Download one visitors page and return the href of every <a class="top">
## link on it as a character vector (character(0) when the page has none).
## NOTE(review): the pages are decoded as "gbk", as in the original script --
## confirm this matches what the site actually serves.
get_visitor_links <- function(page.url, curl) {
  page.text <- getURL(page.url, curl = curl, .encoding = "gbk")
  page.html <- htmlTreeParse(page.text, useInternalNodes = TRUE)
  links <- xpathSApply(page.html, "//a[@class='top']", xmlGetAttr, "href")
  ## xpathSApply returns list() when nothing matches; flatten to character
  as.character(unlist(links))
}

## now bind all the names together
visitor.all <- unlist(lapply(page.urls, get_visitor_links, curl = cHandle2))

## now make the urls
## paste0() would turn an empty input into a single bogus URL, so guard it
if (length(visitor.all) > 0) {
  url.list <- paste0("http://www.okcupid.com", visitor.all)
} else {
  visitor.all <- character(0)
  url.list <- character(0)
  warning("no visitor links found on any page; is the cookie file still valid?",
          call. = FALSE)
}

## save the file
save(url.list, file = "urllist.rda")

grabfile.r takes the output of getlist.r and grabs the info from each visitor’s web page.

## grabfile.r -- take the output of getlist.r and grab the info from each
## visitor's web page

## input:  urllist.rda (provides url.list)
## output: profile -- data frame with one row per page that could be parsed
##         url.list -- entries whose page could not be parsed are set to NaN
##
## where each field is read from (index = position among the page's <dl> nodes):
##   age       [title]
##   location  [title]  -- not extracted, hard to analyse
##   smoke     [7]
##   drinks    [8]
##   drugs     [9]
##   religion  [10]
##   sign      [11]
##   education [12]
##   income    [14]
##   pets      [18]

## htmlTreeParse / xpathSApply come from XML; load it so this file also runs
## on its own
library(XML)

## load url.list into the workspace; load() is used instead of attach() so
## nothing is left behind on the search path
load("urllist.rda")

## Return the text that follows `sep` in the idx-th detail entry, or NA when
## the page has no such entry or the separator is absent.
field_after <- function(details, idx, sep) {
  if (length(details) < idx) {
    return(NA_character_)
  }
  strsplit(as.character(details[[idx]]), sep)[[1]][2]
}

## one slot per url, filled in as each page is parsed
n.url <- length(url.list)
age <- rep(NA_real_, n.url)
smoke <- rep(NA_character_, n.url)
drink <- rep(NA_character_, n.url)
drug <- rep(NA_character_, n.url)
religion <- rep(NA_character_, n.url)
horo <- rep(NA_character_, n.url)
educate <- rep(NA_character_, n.url)
income <- rep(NA_character_, n.url)
pets <- rep(NA_character_, n.url)
parsed <- logical(n.url)

for (i in seq_along(url.list)) {
  ## fetching/parsing may fail (e.g. the page is unavailable)
  html <- tryCatch(
    htmlTreeParse(url.list[i], useInternalNodes = TRUE),
    error = function(e) NULL
  )
  if (is.null(html)) {
    ## mark the url as unusable (the entry is kept so positions do not shift)
    url.list[i] <- NaN
    next
  }

  title.pro <- xpathSApply(html, '//title', xmlValue)
  detail.pro <- xpathSApply(html, '//dl', xmlValue)

  ## the age is the second "/"-separated part of the page title
  if (length(title.pro) > 0) {
    age[i] <- as.numeric(strsplit(as.character(title.pro[[1]]), '/')[[1]][2])
  }

  smoke[i] <- field_after(detail.pro, 7, ' ')
  drink[i] <- field_after(detail.pro, 8, ' ')
  drug[i] <- field_after(detail.pro, 9, ' ')
  religion[i] <- field_after(detail.pro, 10, '  ')
  horo[i] <- field_after(detail.pro, 11, '  ')
  educate[i] <- field_after(detail.pro, 12, 'Education  ')
  income[i] <- field_after(detail.pro, 14, ' ')
  pets[i] <- field_after(detail.pro, 18, 'Pets  ')

  parsed[i] <- TRUE
}

## keep only the pages that were parsed, so every vector has one entry per
## row of the final data frame
age <- age[parsed]
smoke <- smoke[parsed]
drink <- drink[parsed]
drug <- drug[parsed]
religion <- religion[parsed]
horo <- horo[parsed]
educate <- educate[parsed]
income <- income[parsed]
pets <- pets[parsed]

## put all variables into one dataframe
profile <- data.frame(age, smoke, drink, drug, religion, horo, educate, income, pets)

‘The fishing’

About the cookie file: I used a Firefox browser plugin to export the cookie file after logging in to the account. It looks like this:

.acuityplatform.com    TRUE /   FALSE   3557530171  auid    65669919244
    .admaym.com TRUE    /   FALSE   1415223675  UM1 RgAAAB-LCAAAAAAAAAvjcuHQFXLgUkm1tDQxNTFI0jVPTU3UNbE0MNC1ME            g01LVMTjG3TDZIMko1MRLi5thxZ-ve2RN7jARYpUCc-Ru7V_QCOQAVA0sbRgAAAA2
.admaym.com TRUE    /   FALSE   1415223675  vi  7e6868102fd74acd8bf267d6e0eee22e
.admaym.com TRUE    /   FALSE   1415223675  fid 39bc8c2d55b1d91900392f350212244b
.bluekai.com    TRUE    /   FALSE   1425616360  bkdc    dal

The following code takes grabfile.r, getlist.r and a cookie file in txt format. It generates the list of visitors’ information.

## This is the script that does the real scraping
## input (all expected in the working directory)
##   cookie file in txt format
##   getlist.r
##   grabfile.r
##
## output (written to the working directory, named by the time of the scrape)
##   profile<timestamp>   -- the profile data frame as a tab-separated table
##   userlist<timestamp>  -- url.list, as written by write()

## source the files; getlist.r creates url.list, grabfile.r creates profile
source('getlist.r')
source('grabfile.r')

## name
## take the time once so both files carry the same stamp, and format it
## without spaces or colons so the name is valid on every file system
stamp <- format(Sys.time(), "%Y-%m-%d_%H%M%S")
file.name <- paste0('profile', stamp)
list.name <- paste0('userlist', stamp)

## write
write.table(profile, file = file.name, sep = '\t')
write(url.list, file = list.name)

## use read.table(file = file.name, sep = '\t') to read the profile back

Clean-up

The following code does some clean-up. I added a Tag variable to indicate which account each row comes from.

### label every account's visitors with a Tag column, then stack the four
### data frames into one
Tag <- rep("Me", times = nrow(pro.me))
pro.me <- cbind(pro.me, Tag = Tag)

Tag <- rep("Hatano", times = nrow(pro.hanata))
pro.hanata <- cbind(pro.hanata, Tag = Tag)

Tag <- rep("Feng", times = nrow(pro.feng))
pro.feng <- cbind(pro.feng, Tag = Tag)

Tag <- rep("Beauty", times = nrow(pro.beauty))
pro.beauty <- cbind(pro.beauty, Tag = Tag)

### row-bind left to right: me, hanata, feng, beauty
profile <- Reduce(rbind, list(pro.me, pro.hanata, pro.feng, pro.beauty))

The final data frame looks like this:

## preview the first rows of the combined data frame (head() shows six by default)
head(profile)
##   age  smoke    drink      drug                                   religion
## 1  29      —        —     Never                                          —
## 2  34     No Socially     Never  Catholicism, but not too serious about it
## 3  40      —        —         —                                          —
## 4  25 Trying Socially     Never                                          —
## 5  28     No Socially Sometimes                                          —
## 6  42     No   Rarely     Never Christianity, but not too serious about it
##                                  horo                           educate
## 1                                   —                              <NA>
## 2  Aries, and it’s fun to think about Graduated from two-year college  
## 3                                   —                      University  
## 4 Pisces, and it’s fun to think about       Graduated from university  
## 5  Aries, and it’s fun to think about           Working on university  
## 6 Gemini, and it’s fun to think about  Graduated from masters program  
##              income         pets Tag
## 1                 —          —    Me
## 2   $50,000–$60,000 Likes dogs    Me
## 3                 —          —    Me
## 4                 — Likes dogs    Me
## 5   $20,000–$30,000 Likes dogs    Me
## 6 $150,000–$250,000          —    Me

Results

I only did univariate comparisons and quick and dirty plots. No check of statistical significance was attempted…

  1. The number of visitors is almost proportional to the cleavage scale in the photos?
## The following object is masked _by_ .GlobalEnv:
## 
##     Tag

plot of chunk unnamed-chunk-8

  2. Age distribution: does the unknown pretty girl attract more older guys?

plot of chunk unnamed-chunk-9 plot of chunk unnamed-chunk-9

And other general features of male OkCupid users…

plot of chunk unnamed-chunk-10 plot of chunk unnamed-chunk-10 plot of chunk unnamed-chunk-10 plot of chunk unnamed-chunk-10 plot of chunk unnamed-chunk-10 plot of chunk unnamed-chunk-10 plot of chunk unnamed-chunk-10 plot of chunk unnamed-chunk-10

Discussion

One thing I've learned is that your profile photo makes a big difference to your popularity. Plus, car-selfies, fish-selfies, and low-resolution pictures are highly obnoxious.