Setup

# Show the current working directory, then move to the folder holding the
# data file so the relative file name below resolves.
getwd()
setwd("/Users/Taylor/Downloads")
# list.files()
# Load the tab-separated data set (read.delim() defaults to sep = "\t").
pf <- read.delim("pseudo_facebook.tsv")
# Print the column names of the loaded data frame.
names(pf)
library(gridExtra)
library(ggplot2)
library(xkcd)

Date of Birth

Let’s look at every day of the year and see if there are any trends. Note that January 1st has a huge number of birthdays. This is likely an artifact rather than a real pattern: it may be due to the way Facebook masks private birthdays, or end users may simply be selecting the first option in each dropdown list. We don’t know for sure.

# Histogram of birth day-of-month, drawn once per birth month.
# facet_wrap() takes ~variable and repeats the same type of plot for each
# value of it (facet_grid() takes vertical ~ horizontal instead).
# binwidth = 1 gives one bar per day; the default of 30 bins cannot do that
# across the 31 possible day values.
qplot(x = dob_day, data = pf, binwidth = 1,
      color = I("black"), fill = I("#359bed")) +
  theme_xkcd() +
  # dob_day holds day numbers (breaks 1:31), so it needs a continuous scale;
  # a discrete scale on numeric data does not label the axis reliably.
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3) +
  # facet_grid(~dob_month, labeller = month_labeller, as.table = TRUE) +
  xlab('Date of Birth') +
  ylab('Number of Users')

Friend Count

Let’s look at friend count. What I expect (and, looking back, found to be true) is that this is going to be very long-tailed data. In other words, the vast majority of the data are going to be clustered at relatively low values, but there is going to be a very strong skew to the right due to outliers whose friend counts are many times larger than the median user’s.

# Friend-count histogram over the full range, default binning.
qplot(x = friend_count, data = pf,
      color = I("black"), fill = I("#dd3333")) +
  theme_xkcd() +
  labs(x = 'Number of Friends', y = 'Number of Users')

# Same histogram limited to 0-1000 friends through qplot()'s xlim argument.
qplot(x = friend_count, data = pf, xlim = c(0, 1000),
      color = I("black"), fill = I("#dd3333")) +
  theme_xkcd() +
  labs(x = 'Number of Friends', y = 'Number of Users')

# Same limit again, this time set on an explicit scale layer.
qplot(x = friend_count, data = pf,
      color = I("black"), fill = I("#dd3333")) +
  theme_xkcd() +
  scale_x_continuous(limits = c(0, 1000)) +
  labs(x = 'Number of Friends', y = 'Number of Users')

# Bins of 25 friends, with an axis tick every 50.
qplot(x = friend_count, data = pf, binwidth = 25,
      color = I("black"), fill = I("#dd3333")) +
  theme_xkcd() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  labs(x = 'Number of Friends', y = 'Number of Users')

# xlim limits the axis values

Friend Count by Gender

Let’s look at who has more friends, men or women? Hint: It’s women.

# gender
# One friend-count histogram per gender value, using every row of pf
# (presumably including a panel for missing gender, hence ncol = 3).
qplot(x = friend_count, data = pf, binwidth = 25,
      color = I("black"), fill = I("#fcae3a")) +
  theme_xkcd() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 100)) +
  facet_wrap(~gender, ncol = 3) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(x = 'Number of Friends', y = 'Number of Users')

# remove genderless: same plot after dropping rows whose gender is NA

qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 25,
      color = I("black"), fill = I("#fcae3a")) +
  theme_xkcd() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 100)) +
  facet_wrap(~gender) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(x = 'Number of Friends', y = 'Number of Users')

# What's our breakdown, guys to girls? Count the rows for each gender value.
table(pf$gender)

## 
## female   male 
##  40254  58574

# Mean friend count over all rows of pf.
mean(pf$friend_count)

## [1] 196.3508

# Apply summary() to friend_count separately for each gender group.
by(pf$friend_count, pf$gender ,summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

# Same grouping, reporting only the median of each group.
by(pf$friend_count, pf$gender ,median)

## pf$gender: female
## [1] 96
## -------------------------------------------------------- 
## pf$gender: male
## [1] 74

Tenure

Interesting data here: how long have most Facebook users been on the site? Not as long as you may think.

## tenure
# in months
# tenure is stored in days (the yearly plot in this file divides it by 365),
# so it is converted to approximate 30-day months before plotting. This keeps
# the same one-month bins as binning the raw days by 30, but puts the axis in
# the unit the label promises.
qplot(x = tenure / 30, data = pf, binwidth = 1,
      color = I("black"), fill = I("#8224e3")) +
  theme_xkcd() +
  ggtitle("Histogram of Users by Tenure") +
  ylab("Number of Users") +
  xlab("Tenure in Months")

# in years: tenure divided by 365, half-year bins, axis limited to 0-7
qplot(x = tenure / 365, data = pf, binwidth = .5,
      color = I("black"), fill = I("#8224e3")) +
  theme_xkcd() +
  labs(title = "Histogram of Users by Tenure",
       x = "Tenure in Years",
       y = "Number of Users") +
  scale_x_continuous(limits = c(0, 7), breaks = seq(from = 1, to = 7, by = 1))

## facebook age of users
# in years, five-year bins
qplot(x = age, data = pf, binwidth = 5,
      color = I("black"), fill = I("#81d742")) +
  labs(x = 'Age', y = 'Number of Users') +
  theme_xkcd() +
  ggtitle("Histogram of Facebook users by age") +
  scale_x_continuous(limits = c(10, 115), breaks = seq(10, 115, 5))

# same data with one bin per year of age
qplot(x = age, data = pf, binwidth = 1,
      color = I("black"), fill = I("#81d742")) +
  labs(x = 'Age', y = 'Number of Users') +
  theme_xkcd() +
  ggtitle("Histogram of Facebook users by age") +
  scale_x_continuous(limits = c(12, 114), breaks = seq(12, 114, 4))

Transforming Data

How do we explore data that are not normally distributed? We can do this by changing the scale on which we view the data.

Why do we need to do this? We need to do this to get an understanding of the data and we need to do this so that the data are normally distributed if we are going to model off of it.

# Baseline: friend_count on its original scale (0-1000, bins of 25).
normalPlot <- qplot(x = friend_count, data = pf, binwidth = 25,
                    color = I("black"), fill = I("#dd3333")) +
  theme_xkcd() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  labs(title = "No Transformation",
       x = "Friend Count",
       y = "Number of Users")
summary(pf$friend_count)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    31.0    82.0   196.4   206.0  4923.0

# add 1 because if there are users with 0 friends log10(0) is undefined
summary(log10(pf$friend_count + 1))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.505   1.919   1.868   2.316   3.692

summary(sqrt(pf$friend_count))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.568   9.055  11.090  14.350  70.160

# log10: histogram of the transformed values themselves
log10Plot1 <- qplot(x = log10(friend_count + 1), data = pf,
                    color = I("black"), fill = I("#dd3333")) +
  theme_xkcd() +
  labs(title = "Log Base 10 Transformation",
       x = "Log Base 10 of Friend Count",
       y = "Number of Users")
# sqrt: histogram of the square-rooted values
sqrtPlot <- qplot(x = sqrt(friend_count), data = pf,
                  color = I("black"), fill = I("#dd3333")) +
  theme_xkcd() +
  # scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  labs(title = "Post-transformation--Square Root",
       x = "Square Root of Friend Count",
       y = "Number of Users")

# Stack the three histograms in a single column for comparison.
grid.arrange(normalPlot, log10Plot1, sqrtPlot, ncol = 1)

## using scales in the lesson
# Build one untransformed histogram, then derive the other two by swapping
# the x scale instead of transforming the variable.
normalPlot <- ggplot(pf, aes(x = friend_count)) +
  geom_histogram(color = "black", fill = "#dd3333") +
  theme_xkcd() +
  labs(title = "Pre-transformation--long tailed",
       x = "Friend Count",
       y = "Number of Users")
log10Plot <- normalPlot +
  scale_x_log10() +
  ggtitle('Log Base 10 Transformation')
sqrtPlot <- normalPlot +
  scale_x_sqrt() +
  ggtitle('Square Root Transformation')

grid.arrange(normalPlot, log10Plot, sqrtPlot, ncol = 1)

# why are the scales different?
# log10Plot1 plots log10(friend_count + 1) as the variable, while log10Plot
# keeps friend_count as the variable and applies scale_x_log10() to the axis.
grid.arrange(log10Plot1, log10Plot)

Frequency Polygon

This allows us to compare distributions. They are similar to histograms but they draw a curve matching the counts in the histogram, so we can better see peaks. We can also stack them.

Remember this?

# Friend-count histogram per gender, rows with a missing gender dropped.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 25,
      color = I("black"), fill = I("#fcae3a")) +
  theme_xkcd() +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 100)) +
  facet_wrap(~gender) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  labs(x = 'Number of Friends', y = 'Number of Users')

Now let’s use the frequency polygon:

# Frequency polygons of friend_count with one line per gender. The y value is
# each bin's count divided by the summed counts (times 100 where the axis is
# labelled as a percentage).
# Changes from the earlier version:
#   * the scale_x_continuous() calls that were immediately replaced by
#     scale_x_log10() are removed (only the last x scale added takes effect);
#   * `limits` is spelled out instead of relying on partial matching of `lim`;
#   * the `fill` constant is dropped, since a frequency polygon draws lines only.
# NOTE(review): the four plot objects are assigned but not printed in this
# part of the file - confirm they are displayed elsewhere.

# Percent of users, 0-1000 friends, bins of 10.
normalFreq <- qplot(x = friend_count, data = subset(pf, !is.na(gender)),
                    binwidth = 10, color = gender, geom = 'freqpoly',
                    y = (..count.. / sum(..count..)) * 100) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle('Frequency Polygon: Number Of Friends By Gender') +
  xlab('Number of Friends') +
  ylab('Percent of Users') +
  theme_xkcd()

# Proportion of users, zoomed in on the tail (200-4000 friends).
limitedFreq <- qplot(x = friend_count, data = subset(pf, !is.na(gender)),
                     binwidth = 10, color = gender, geom = 'freqpoly',
                     y = ..count.. / sum(..count..)) +
  scale_x_continuous(limits = c(200, 4000), breaks = seq(200, 4000, 200)) +
  scale_y_continuous(limits = c(0, 0.0005)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle('Frequency Polygon: Number Of Friends By Gender') +
  xlab('Number of Friends') +
  ylab('Proportion of Users') +
  theme_xkcd()

# Percent of users on a log10 x axis, 10-5000 friends.
logFreq <- qplot(x = friend_count, data = subset(pf, !is.na(gender)),
                 color = gender, geom = 'freqpoly',
                 y = (..count.. / sum(..count..)) * 100) +
  scale_x_log10(limits = c(10, 5000), breaks = seq(10, 5000, 500)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle('Frequency Polygon: Number Of Friends By Gender - Logarithmic') +
  xlab('Number of Friends') +
  ylab('Percent of Users') +
  theme_xkcd()

# Percent of users on a log10 x axis, 1000-5000 friends only.
logFreq2 <- qplot(x = friend_count, data = subset(pf, !is.na(gender)),
                  color = gender, geom = 'freqpoly',
                  y = (..count.. / sum(..count..)) * 100) +
  scale_x_log10(limits = c(1000, 5000), breaks = seq(1000, 5000, 500)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle('Frequency Polygon: Number Of Friends By Gender - Logarithmic') +
  xlab('Number of Friends') +
  ylab('Percent of Users') +
  theme_xkcd()

# likes by gender: raw counts of www_likes on a log10 x axis
qplot(x = www_likes, data = subset(pf, !is.na(gender)),
      color = gender, geom = 'freqpoly') +
  scale_x_log10() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) +
  ggtitle('Likes by Gender') +
  xlab('Number of Likes') +
  ylab('Number of Users with X Number of Likes') +
  theme_xkcd()

Boxplots

Boxplots are a useful visualization for examining the spread of the data. The box represents the middle 50% of the data, so the top of the box is the 75th percentile and the bottom is the 25th percentile. In this case the top of the box marks the number of friends a user must exceed to be in the top 25% of users. Breaking the problem down by gender shows us that women have a wider range, but the two medians are relatively close together, meaning the bulk of men and women have similar counts.

# how many likes per gender?
# Split pf by gender, then total www_likes within each subset.
males <- subset(pf, gender == 'male')
females <- subset(pf, gender == 'female')
sum(males$www_likes)

## [1] 1430175

sum(females$www_likes)

## [1] 3507665

# should use by() for this: same totals in one grouped call
by(pf$www_likes, pf$gender, sum)

## pf$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## pf$gender: male
## [1] 1430175

# Boxplot of friend_count per gender over the full range, outliers included.
qplot(x = gender, y = friend_count, data = subset(pf, !is.na(gender)),
      geom = 'boxplot', color = gender, fill = I('#fcae3a')) +
  theme_xkcd() +
  labs(title = 'Boxplot: Number Of Friends By Gender With Outliers',
       y = "Number of Friends")

# bring it down to more closely look at the output
# coord_cartesian() narrows the visible y range to 0-250.
qplot(x = gender, y = friend_count, data = subset(pf, !is.na(gender)),
      geom = 'boxplot', color = gender, fill = I('#fcae3a')) +
  theme_xkcd() +
  coord_cartesian(ylim = c(0, 250)) +
  labs(title = 'Boxplot: Number Of Friends By Gender',
       y = "Number of Friends")

# Lets look at the friend count over gender to get a summary
by(pf$friend_count, pf$gender, summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Who Initiated More Friendships?

First let’s again check out which columns we have to deal with:

# List the column names of pf to see which variables are available.
names(pf)

##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

Looks like it’s the “friendships_initiated” column we want. Let’s use the by command to get a summary of this:

# Open the documentation for by(), then apply summary() to
# friendships_initiated separately for each gender group.
help(by)
by(pf$friendships_initiated,pf$gender,summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0    49.0   113.9   124.8  3654.0 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    15.0    44.0   103.1   111.0  4144.0

Looks like we got our answer. Let’s plot it for the sake of visual appeal:

# Boxplot of friendships_initiated per gender, missing gender dropped;
# coord_cartesian() narrows the visible y range to 0-150.
qplot(x = gender, y = friendships_initiated,
      data = subset(pf, !is.na(gender)),
      geom = 'boxplot', color = gender, fill = I('#fcae3a')) +
  theme_xkcd() +
  coord_cartesian(ylim = c(0, 150)) +
  labs(title = 'Boxplot: Number Of Friendships Initiated By Gender',
       y = "Number of Friendships Initiated")

Creating Binary Values from Integer Variables

Often times it makes sense to take a variable / metric that represents the “number” of times a user has done a certain action, and make it binary, (i.e. does this person use this feature?).

As an example, we can look at mobile likes. This gives us info about how many mobile likes on average a user has performed:

# Numeric summary (min, quartiles, mean, max) of the mobile_likes column.
summary(pf$mobile_likes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0     4.0   106.1    46.0 25110.0

This in turn gives us the number of users that have or haven’t ever used mobile liking:

# Summarise the logical vector mobile_likes > 0: counts of FALSE, TRUE and NA.
summary(pf$mobile_likes>0)

##    Mode   FALSE    TRUE    NA's 
## logical   35056   63947       0

Now let’s create a new variable / metric / column to denote whether the user has used mobile liking, and then populate it:

library('scales')  # loaded for percent() below - presumably scales::percent
# Flag each user with 1 when mobile_likes is above zero and 0 otherwise, and
# store the flag as a factor column on pf.
pf$mobile_check_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
summary(pf$mobile_check_in)

##     0     1 
## 35056 63947

help(summary)
summary(object = pf$mobile_check_in)

##     0     1 
## 35056 63947

# Share of flagged users among all rows that carry either flag value.
checkedIn <- subset(pf, mobile_check_in == 1)
notCheckedIn <- subset(pf, mobile_check_in == 0)
percent(nrow(checkedIn) / (nrow(checkedIn) + nrow(notCheckedIn)))

## [1] "64.6%"

Note: We converted this to a factor as it is a binary “category”, yes or no. Let’s just plot the ratio for posterity:

# Axis labels for the two mobile_check_in levels, in the order 0 then 1.
xMobileAxisLabels <- c("Not Yet Adopted Mobile", "Adopted Mobile")
# Count of users in each mobile_check_in level.
qplot(x = mobile_check_in, data = pf,
      color = I("black"), fill = I('#dd3333')) +
  theme_xkcd() +
  labs(title = "User Adoption of Facebook's Mobile Platform",
       x = "Mobile Check In",
       y = "Number of users using feature") +
  scale_x_discrete(breaks = 0:1, labels = xMobileAxisLabels) # this is how to do custom labels

Exploratory Data Analysis: Facebook Data (Lesson 3)

Taylor White

December 6, 2015