library(ggplot2)

Load the tab-separated pseudo-Facebook data set

# Read the tab-delimited file into a data frame (read.delim defaults to
# sep = "\t") and print the structure of the result.
pf <- read.delim("pseudo_facebook.tsv")
str(pf)
## 'data.frame':    99003 obs. of  15 variables:
##  $ userid               : int  2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
##  $ age                  : int  14 14 14 14 14 14 13 13 13 13 ...
##  $ dob_day              : int  19 2 16 25 4 1 14 4 1 2 ...
##  $ dob_year             : int  1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
##  $ dob_month            : int  11 11 11 12 12 12 1 1 1 2 ...
##  $ gender               : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
##  $ tenure               : int  266 6 13 93 82 15 12 0 81 171 ...
##  $ friend_count         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ friendships_initiated: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes                : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ likes_received       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ mobile_likes_received: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ www_likes_received   : int  0 0 0 0 0 0 0 0 0 0 ...

Scaling

# List the column names of the data frame.
colnames(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"
# Histogram of dob_day with the default binning.
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram, with one x-axis tick per day of the month (1 to 31).
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram() +
  scale_x_continuous(breaks = seq(1, 31))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Faceting

# One dob_day histogram per dob_month, laid out in three columns.
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram() +
  scale_x_continuous(breaks = seq(1, 31)) +
  facet_wrap(~ dob_month, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Limiting the x-axis. Here I just limited the x-axis to the range 0 to 500 (i.e., only friend counts of at most 500 are shown).

# Histogram of friend_count restricted to 0-500, with a tick every 100.
# `limits` is spelled out in full rather than relying on partial argument
# matching of `lim`; rows outside the limits are dropped (see the warning).
qplot(x = friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 500), breaks = seq(0, 500, 100))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8327 rows containing non-finite values (stat_bin).

This can also be written as below

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8327 rows containing non-finite values (stat_bin).

Setting the bin width

# friend_count histogram with 30-wide bins, x-axis limited to 0-500 and a
# tick every 50.
ggplot(aes(x = friend_count), data = pf) +
  geom_histogram(binwidth = 30) +
  scale_x_continuous(limits = c(0, 500), breaks = seq(0, 500, by = 50))
## Warning: Removed 8327 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).

Who has more friends, men or women? (Use facet_wrap)

# Same 30-wide-bin histogram, split into one panel per gender value.
ggplot(aes(x = friend_count), data = pf) +
  geom_histogram(binwidth = 30) +
  scale_x_continuous(limits = c(0, 500), breaks = seq(0, 500, by = 50)) +
  facet_wrap(~ gender)
## Warning: Removed 8327 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).

Omitting NA observations

## Warning: Removed 8311 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

To omit all NA observations

## Warning: Removed 8311 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

Statistics by gender

# Count the users in each gender level.
table(pf[["gender"]])
## 
## female   male 
##  40254  58574
# Five-number summary (plus mean) of friend_count within each gender.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Tenure (how many days someone has been using Facebook)

# Tenure histogram in 30-wide bins, black outlines with a green fill.
ggplot(aes(x = tenure), data = pf) +
  geom_histogram(binwidth = 30, color = "black", fill = "#099009")
## Warning: Removed 2 rows containing non-finite values (stat_bin).

# Tenure divided by 365, binned with a width of 1.
ggplot(aes(x = tenure / 365), data = pf) +
  geom_histogram(binwidth = 1, color = "black", fill = "#099009")
## Warning: Removed 2 rows containing non-finite values (stat_bin).

A yearly view with quarterly bins

# Tenure in years with 0.25-wide bins, x-axis limited to 0-7 with a tick per
# year. `limits` is written in full instead of the partially matched `limit`.
qplot(x = tenure / 365, data = pf, binwidth = 0.25,
      color = I('black'), fill = I('#F79420')) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).

Labeling plots (by default, R uses the data frame's column names as the axis labels of the plots)

# Quarterly-binned tenure histogram with explicit axis labels.
# `limits` is written in full instead of the partially matched `limit`, and
# the x label's missing space ('numberof') is fixed.
qplot(x = tenure / 365, data = pf, binwidth = 0.25,
      color = I('black'), fill = I('#F79420')) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  xlab('number of years using facebook') +
  ylab('number of users in sample')
## Warning: Removed 26 rows containing non-finite values (stat_bin).

User ages facet wrap by gender.

# Age histogram (one bin per year), one panel per gender.
# na.omit(pf) drops every row that has an NA in any column.
ggplot(aes(x = age), data = na.omit(pf)) +
  geom_histogram(binwidth = 1, color = "black", fill = "#5760AB") +
  facet_wrap(~ gender)

Transforming data (over-dispersed data can be visualized better after a transformation)

#install.packages('gridExtra') 
library(gridExtra) 
# Summary of the log10-transformed friend counts. The +1 offset keeps users
# with zero friends finite, since log10(0) is -Inf.
# (The identical call was previously issued twice; it is run only once now.)
summary(log10(pf$friend_count + 1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.505   1.919   1.868   2.316   3.692
# Summary of the square-root-transformed (offset) friend counts.
summary(sqrt(pf$friend_count + 1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   5.657   9.110  11.180  14.390  70.170
# Three views of the friend_count distribution, arranged side by side.

# Untransformed counts.
noscale <- qplot(x = (friend_count), data = pf)

# log10 view. The +1 offset keeps users with zero friends in the plot:
# log10(0) is -Inf, so without it those rows were dropped as non-finite.
logScale <- qplot(x = log10(friend_count + 1), data = pf)

# Square-root view, applied through the x scale rather than the data.
sqrtplot <- ggplot(aes(x = friend_count), data = pf) +
  geom_histogram() +
  scale_x_sqrt()

grid.arrange(noscale, logScale, sqrtplot, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Transforming data: an alternate solution

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Difference between wrapping the variable in a transformation and adding a scale layer

# Left: the variable itself is log10-transformed inside the mapping.
log_wraper <- ggplot(data = pf, mapping = aes(x = log10(friend_count))) +
  geom_histogram()

# Right: the raw variable is mapped and the log10 transform is applied by
# the x scale.
log_layer <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  scale_x_log10()

grid.arrange(log_wraper, log_layer, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).

To compare distributions: frequency polygons (just before that, a histogram)

# friend_count histogram per gender (10-wide bins, x limited to 0-1000).
# The filter must KEEP rows whose gender is known: the previous
# `is.na(gender)` condition was inverted and kept only the rows with a
# missing gender, leaving nothing meaningful to facet on.
qplot(x = friend_count, data = subset(pf, !is.na(gender)),
      binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).

# Frequency polygon of friend_count, one coloured line per gender, for users
# whose gender is known. The y axis here is the raw bin count, so it is
# labelled as a number of users rather than a percentage.
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab('Friend Count') +
  ylab('Number of users with that friend count')
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

But what we really want to know is who has more friends, males or females. For this we have to compare proportions rather than counts: the added mapping y = ..count../sum(..count..) in the code below plots proportions.

# Frequency polygon of friend_count by gender, with each bin's count divided
# by the total count so the lines are comparable between groups of different
# size. The values are fractions (not multiplied by 100), so the y axis is
# labelled as a proportion.
ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab('Friend Count') +
  ylab('Proportion of users with that friend count')
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

Frequency polygon for www_likes (the advantage of the log transformation)

# Frequency polygon of www_likes, one coloured line per gender, on a plain
# continuous x axis. Rows with a missing gender are excluded.
ggplot(aes(x = www_likes, color = gender),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly() +
  scale_x_continuous()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygon of www_likes by gender on a log10 x axis.
# Only one x scale is added: a preceding scale_x_continuous() would be
# replaced by scale_x_log10() anyway and only produced a
# "Scale for 'x' is already present" message.
# NOTE: log10 maps zero likes to -Inf, so those rows are dropped (see the
# warnings below).
qplot(x = www_likes,
      data = subset(pf, !is.na(gender)),
      geom = 'freqpoly', color = gender) +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).

Summary for www-Likes

# Summary statistics of www_likes within each gender.
by(data = pf$www_likes, INDICES = pf$gender, FUN = summary)
## pf$gender: female
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     0.00    87.14    25.00 14860.00 
## -------------------------------------------------------- 
## pf$gender: male
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00     0.00     0.00    24.42     2.00 12900.00
# Total www_likes within each gender.
by(data = pf$www_likes, INDICES = pf$gender, FUN = sum)
## pf$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## pf$gender: male
## [1] 1430175

Box plots

# Box plot of friend_count for each known gender.
ggplot(aes(x = gender, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot()

Outliers: points that lie more than 1.5 times the IQR beyond the first or third quartile (the edges of the box). Set the limits to remove the outliers.

# Box plot of friend_count by gender with the y scale limited to 0-1000.
# Rows outside the limits are removed before the box statistics are computed
# (see the warning below).
ggplot(aes(x = gender, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  ylim(c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).

Set the limits to remove the outliers

# Same plot, with the 0-1000 limits set through an explicit y scale layer.
ggplot(aes(x = gender, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).

to summarize the friend-counts by gender

# Summary statistics of friend_count within each gender, for comparison with
# the box plots.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

To make the graph match the statistics above, use the coord_cartesian function. With it, the limit we set on the y-axis only affects what is displayed; no data points are removed from the underlying calculation.

# Box plot of friend_count by gender, with the y range restricted to 0-250
# through the coordinate system instead of the scale.
ggplot(aes(x = gender, y = friend_count),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))

On average, who initiated more friend requests? (The answer is female.)

# Box plot of friendships_initiated by gender, y range restricted to 0-250
# through the coordinate system.
ggplot(aes(x = gender, y = friendships_initiated),
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))

# Summary statistics of friendships_initiated within each gender.
by(data = pf$friendships_initiated, INDICES = pf$gender, FUN = summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0    49.0   113.9   124.8  3654.0 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    15.0    44.0   103.1   111.0  4144.0