Get the file

Available at: https://s3.amazonaws.com/udacity-hosted-downloads/ud651/pseudo_facebook.tsv

# Load the tab-separated pseudo-Facebook sample and print its structure.
# stringsAsFactors = TRUE keeps `gender` a factor on R >= 4.0 as well, where
# the default became FALSE; the str() output recorded below shows a Factor.
pf <- read.delim("pseudo_facebook.tsv", stringsAsFactors = TRUE)
str(pf)

## 'data.frame':    99003 obs. of  15 variables:
##  $ userid               : int  2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
##  $ age                  : int  14 14 14 14 14 14 13 13 13 13 ...
##  $ dob_day              : int  19 2 16 25 4 1 14 4 1 2 ...
##  $ dob_year             : int  1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
##  $ dob_month            : int  11 11 11 12 12 12 1 1 1 2 ...
##  $ gender               : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
##  $ tenure               : int  266 6 13 93 82 15 12 0 81 171 ...
##  $ friend_count         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ friendships_initiated: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes_received       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes_received: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes_received   : int  0 0 0 0 0 0 0 0 0 0 ...

get variable names

# List the column names of the data frame.
colnames(pf)

##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

Let's plot something

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.1.2

# Quick histogram of birth day-of-month; the bin width is left at its default.
qplot(dob_day, data = pf)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

using ggplot

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 3.1.2

# Use a minimal theme with a 24 pt base font for every later plot.
theme_set(theme_minimal(base_size = 24))

# Histogram of birth day-of-month with one bar and one axis tick per day.
# dob_day is an integer column, so the axis needs a continuous scale:
# scale_x_discrete() is meant for factor/character data, and newer ggplot2
# releases raise an error when it is combined with a histogram of numbers.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31)

scale of x axis using qplot()

# Same histogram through qplot(), with a tick for each day of the month.
# The x variable is numeric, so a continuous scale is used; a discrete scale
# on numeric data is rejected by newer ggplot2 releases.
qplot(x = dob_day, data = pf, binwidth = 1) +
  scale_x_continuous(breaks = 1:31)

Note the problem with the first day of the month.

Faceting

see if we can break down for each month

# One birth-day histogram per birth month, laid out three panels per row.
# The x variable is numeric, so a continuous scale is used; a discrete scale
# on numeric data is rejected by newer ggplot2 releases.
qplot(x = dob_day, data = pf, binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)

friend_count

# Histogram of friend counts over the full range, default bin width.
qplot(friend_count, data = pf)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Lots of zeros... can we eliminate them?

xlim

yes! we can

# Restrict the x axis to 0-1000 friends through qplot()'s xlim argument.
qplot(friend_count, data = pf, xlim = c(0, 1000))

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

layer of scale

# Same restriction, expressed as a separate scale layer added to the plot.
qplot(friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 1000))

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

set bin width

# Friend counts from 0 to 1000 in bins of 25, with an axis tick every 50.
qplot(friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 50)
  )

Let's see who has more friends, men or women.

# Same binned histogram, split into one panel per gender value.
qplot(friend_count, data = pf, binwidth = 25) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 50)
  ) +
  facet_wrap(~gender)

Let's remove the rows with a missing (NA) gender from our data.

# Drop rows whose gender is NA before plotting, so no NA panel is drawn.
qplot(friend_count, data = pf[!is.na(pf$gender), ], binwidth = 25) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 50)
  ) +
  facet_wrap(~gender)

Who has more friends?

# Count the users in each gender level.
table(pf[["gender"]])

## 
## female   male 
##  40254  58574

Now let's use the by() function.

# Five-number summary (plus mean) of friend_count within each gender.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Let's color our graphs.

# Histogram of tenure in 30-unit bins, black outlines with a blue fill.
# I() marks the colours as literal values rather than variables to map.
qplot(
  tenure,
  data = pf,
  binwidth = 30,
  color = I("black"),
  fill = I("#099DD9")
)

label plots

# Histogram of tenure divided by 365 (labelled as years), with axis labels.
# `limits` is spelled out in full: the original `lim` only worked through
# partial argument matching.
qplot(
  x = tenure / 365, data = pf,
  xlab = "Number of years using Facebook",
  ylab = "Number of users in sample",
  color = I("black"), fill = I("#F79420")
) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

histogram of ages

# Histogram of user age, one bin per unit of age and an axis tick every 5.
qplot(
  age,
  data = pf,
  binwidth = 1,
  xlab = "age",
  ylab = "Number of users in sample",
  color = I("black"),
  fill = I("#F79420")
) +
  scale_x_continuous(breaks = seq(from = 0, to = 113, by = 5))

Let's transform some long-tailed variables.

# Distribution of the raw friend counts.
summary(pf[["friend_count"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    31.0    82.0   196.4   206.0  4923.0

# Same summary on a log10 scale; 1 is added so that zero counts stay finite.
summary(log10(pf[["friend_count"]] + 1))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.505   1.919   1.868   2.316   3.692

# Same summary on a square-root scale.
summary(sqrt(pf[["friend_count"]]))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.568   9.055  11.090  14.350  70.160

Let's put the three plots in a grid.

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.1.2

## Loading required package: grid

# Three stacked histograms of friend_count: raw, log10(x + 1) and sqrt(x).
# The +1 is only needed for the log, because log10(0) is -Inf. The square
# root is taken without an offset so that it matches the
# summary(sqrt(pf$friend_count)) shown earlier.
p1 <- qplot(x = friend_count, data = pf)
p2 <- qplot(x = log10(friend_count + 1), data = pf)
p3 <- qplot(x = sqrt(friend_count), data = pf)
grid.arrange(p1, p2, p3, ncol = 1)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

another way using ggplot()

# Build the base histogram once, then derive the log10 and square-root
# versions by adding a transformed x scale to it.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()

# Stack the three plots in a single column.
grid.arrange(p1, p2, p3, ncol = 1)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

frequency polygon

# Frequency polygons of friend_count, one coloured line per gender, for
# rows with a known gender; bins of 10 over the 0-1000 range.
qplot(
  friend_count,
  data = pf[!is.na(pf$gender), ],
  binwidth = 10,
  geom = "freqpoly",
  color = gender
) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(from = 0, to = 1000, by = 50)
  )

## Warning: Removed 2 rows containing missing values (geom_path).

## Warning: Removed 2 rows containing missing values (geom_path).

likes on the web

# Frequency polygons of www_likes per gender, rows with NA gender excluded.
qplot(
  www_likes,
  data = pf[!is.na(pf$gender), ],
  geom = "freqpoly",
  color = gender
)

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

Now we need to add a scale to the x axis.

# Frequency polygons of www_likes by gender on a log10 x axis.
# Only one x scale is added: a scale_x_continuous() placed before
# scale_x_log10() is simply replaced by it and only triggers a
# "Scale for 'x' is already present" message.
qplot(x = www_likes, data = subset(pf, !is.na(gender)), geom = "freqpoly",
      color = gender) +
  scale_x_log10()

## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

How many www_likes are there by gender?

# Total www_likes within each gender.
by(data = pf$www_likes, INDICES = pf$gender, FUN = sum)

## pf$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## pf$gender: male
## [1] 1430175

box plots

# Box plot of friend_count for each gender, rows with NA gender excluded.
qplot(
  gender, friend_count,
  data = pf[!is.na(pf$gender), ],
  geom = "boxplot"
)

Let's limit the counts to the range 0 to 1000.

# Same box plot with the y axis limited to 0-1000 through qplot()'s ylim.
# Rows outside the limits are removed before the boxes are computed, as the
# warning recorded below reports.
qplot(
  gender, friend_count,
  data = pf[!is.na(pf$gender), ],
  geom = "boxplot",
  ylim = c(0, 1000)
)

## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).

see the coord_cartesian layer

# Same box plot, restricting the visible y range to 0-1000 through a
# coord_cartesian() layer instead of qplot()'s ylim argument.
qplot(
  gender, friend_count,
  data = pf[!is.na(pf$gender), ],
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 1000))

Let's zoom in further.

# Restrict the visible y range further, to 0-250 friends.
qplot(
  gender, friend_count,
  data = pf[!is.na(pf$gender), ],
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))

# Per-gender summary of friend_count, to compare against the box plots.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)

## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

check logical

# Flag users who have at least one mobile like.
# The column is initialised on the data frame itself; the original assigned
# NA (twice) to an unrelated global variable named `mobile`.
pf$mobile <- NA
pf$mobile <- ifelse(pf$mobile_likes > 0, 1, 0)

# Store the 0/1 flag as a factor so summary() reports a count per level.
pf$mobile <- factor(pf$mobile)
summary(pf$mobile)

##     0     1 
## 35056 63947

and to know the percentage

# Fraction of all users whose mobile flag is 1.
sum(pf[["mobile"]] == 1) / nrow(pf)

## [1] 0.6459097

Pseudo Facebook