library(ggplot2)
Load the tab-separated pseudo-Facebook data set
# Read the tab-separated pseudo-Facebook file into `pf` and show its structure.
# read.delim() uses sep = "\t" by default. stringsAsFactors = TRUE is spelled
# out because R >= 4.0 no longer converts strings to factors on its own, while
# the str() output recorded below shows `gender` as a factor.
pf <- read.delim("pseudo_facebook.tsv", stringsAsFactors = TRUE)
str(pf)
## 'data.frame': 99003 obs. of 15 variables:
## $ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
## $ age : int 14 14 14 14 14 14 13 13 13 13 ...
## $ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
## $ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
## $ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
## $ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
Scaling
# List the column names of the data frame.
colnames(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
# Histogram of users' day of birth within the month, using the default bins.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same histogram, with an x-axis tick for every value from 1 to 31.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram() +
  scale_x_continuous(breaks = seq_len(31))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Faceting
# Day-of-birth histogram drawn once per birth month, laid out in 3 columns.
ggplot(data = pf, mapping = aes(x = dob_day)) +
  geom_histogram() +
  scale_x_continuous(breaks = seq_len(31)) +
  facet_wrap(facets = ~dob_month, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Limiting the x axis. Here I just limited the x-axis data to the range from 0 to 500 (i.e. friend counts of at most 500).
# Histogram of friend counts with the x scale limited to 0-500 and a tick
# every 100. Values outside the scale limits are dropped before binning, which
# is what produces the "non-finite values" warning recorded below.
# `limits` is written out in full instead of relying on partial matching of
# the abbreviated `lim`.
qplot(x = friend_count, data = pf) +
  scale_x_continuous(limits = c(0, 500), breaks = seq(0, 500, 100))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8327 rows containing non-finite values (stat_bin).
This can also be written as below
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8327 rows containing non-finite values (stat_bin).
Setting the bin width
# Friend-count histogram with 30-wide bins, x scale limited to 0-500 and a
# tick every 50.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 30) +
  scale_x_continuous(limits = c(0, 500), breaks = seq(0, 500, 50))
## Warning: Removed 8327 rows containing non-finite values (stat_bin).
## Warning: Removed 1 rows containing missing values (geom_bar).
Who has more friends, men or women? (Use facet_wrap.)
# The same limited friend-count histogram, split into one panel per value of
# `gender`.
ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram(binwidth = 30) +
  scale_x_continuous(limits = c(0, 500), breaks = seq(0, 500, 50)) +
  facet_wrap(facets = ~gender)
## Warning: Removed 8327 rows containing non-finite values (stat_bin).
## Warning: Removed 3 rows containing missing values (geom_bar).
Omitting NA observations
## Warning: Removed 8311 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
To omit all NA observations
## Warning: Removed 8311 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
Statistics by gender
# Number of users per gender level.
table(pf[["gender"]])
##
## female male
## 40254 58574
# Five-number summary (plus mean) of friend_count, computed per gender.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Tenure (how many days someone has been using Facebook)
# Histogram of tenure in 30-unit bins, with black outlines and green fill.
ggplot(data = pf, mapping = aes(x = tenure)) +
  geom_histogram(binwidth = 30, colour = "black", fill = "#099009")
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Tenure divided by 365 (i.e. expressed in years), one bin per year.
ggplot(data = pf, mapping = aes(x = tenure / 365)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#099009")
## Warning: Removed 2 rows containing non-finite values (stat_bin).
A yearly view with quarterly bins
# Tenure in years with quarter-year bins (binwidth 0.25). The x scale is
# limited to 0-7 with a tick at every whole year from 1 to 7.
# `limits` is written out in full rather than abbreviated to `limit`, so the
# call does not depend on partial argument matching.
qplot(
  x = tenure / 365, data = pf, binwidth = 0.25,
  color = I("black"), fill = I("#F79420")
) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Labeling plots (by default, R uses the data frame's column names as the axis labels of the plots)
# Quarter-year tenure histogram with explicit axis labels.
# Changes from the earlier version: `limits` is spelled out in full (no
# partial argument matching), and the x-axis label typo "numberof" is fixed.
qplot(
  x = tenure / 365, data = pf, binwidth = 0.25,
  color = I("black"), fill = I("#F79420")
) +
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7)) +
  xlab("number of years using facebook") +
  ylab("number of users in sample")
## Warning: Removed 26 rows containing non-finite values (stat_bin).
User ages, faceted by gender.
# Age histogram (one bin per year of age), one panel per gender. na.omit()
# drops every row of `pf` that contains an NA in any column before plotting.
ggplot(data = na.omit(pf), mapping = aes(x = age)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#5760AB") +
  facet_wrap(facets = ~gender)
Transforming data (over-dispersed data can be visualized better after a transformation)
#install.packages('gridExtra')
library(gridExtra)
# Summary of friend_count on a log10 scale; 1 is added first so that a count
# of zero maps to 0 instead of -Inf.
summary(log10(pf[["friend_count"]] + 1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
# Same log10(friend_count + 1) summary as computed just above.
summary(log10(1 + pf$friend_count))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
# Summary of the square root of (friend_count + 1).
summary(sqrt(pf[["friend_count"]] + 1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 5.657 9.110 11.180 14.390 70.170
# Three histograms of friend_count, shown side by side:
#   noscale  - untransformed values
#   logScale - log10 of the values (log10(0) is -Inf, so users with zero
#              friends are dropped, as the stat_bin warning below reports)
#   sqrtplot - untransformed values drawn on a square-root x scale
noscale <- ggplot(data = pf, mapping = aes(x = (friend_count))) +
  geom_histogram()
logScale <- ggplot(data = pf, mapping = aes(x = log10(friend_count))) +
  geom_histogram()
sqrtplot <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  scale_x_sqrt()
grid.arrange(noscale, logScale, sqrtplot, ncol = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Transforming data: alternate solution
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Difference between wrapping the variable in a transformation and adding a scale layer
# Two ways of log-transforming the x axis, shown side by side:
#   log_wraper - the variable itself is wrapped in log10() inside aes()
#   log_layer  - the raw variable is plotted and scale_x_log10() is added
log_wraper <- ggplot(data = pf, mapping = aes(x = log10(friend_count))) +
  geom_histogram()
log_layer <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram() +
  scale_x_log10()
grid.arrange(log_wraper, log_layer, ncol = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1962 rows containing non-finite values (stat_bin).
# Friend-count histograms (bin width 10, x scale 0-1000, tick every 50), one
# panel per gender. Rows with a missing gender are excluded. The filter used
# to be is.na(gender), which kept ONLY the rows with missing gender and so
# left nothing meaningful to facet by. The warning recorded right after this
# call was captured with that earlier filter.
# `limits` is also written out in full instead of the abbreviated `lim`.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Frequency polygons of friend_count (bin width 10, x scale 0-1000), one line
# per gender, for users whose gender is not NA.
# The y axis of this plot is the raw number of users per bin, so it is
# labelled as a count; it was previously mislabelled as a percentage.
ggplot(aes(x = friend_count), data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab("Friend Count") +
  ylab("Number of users with that friend count")
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
But what we really want to know is who has more friends, males or females. For this we have to look at proportions: the added code y = ..count../sum(..count..) plots proportions instead of raw counts.
# Frequency polygons of friend_count per gender, with each bin's count divided
# by sum(..count..) so that the y axis shows a fraction rather than a raw
# count. The values are fractions between 0 and 1, so the axis is labelled
# "Proportion" (it previously said "Percentage").
# NOTE(review): sum(..count..) may be taken over both genders combined rather
# than within each gender - confirm before reading the curves as
# within-gender proportions.
ggplot(
  aes(x = friend_count, y = ..count.. / sum(..count..)),
  data = subset(pf, !is.na(gender))
) +
  geom_freqpoly(aes(color = gender), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab("Friend Count") +
  ylab("Proportion of users with that friend count")
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Frequency polygon for www_likes (the advantage of a log transformation)
# Frequency polygon of www_likes, one coloured line per gender, for users
# whose gender is not NA. The x axis is left on its default linear scale.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = www_likes, colour = gender)
) +
  geom_freqpoly() +
  scale_x_continuous()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Frequency polygon of www_likes per gender on a log10 x scale.
# Only scale_x_log10() is added: the earlier version first added
# scale_x_continuous(), which was replaced straight away and only triggered
# ggplot2's "Scale for 'x' is already present" message.
# log10(0) is -Inf, so users with zero www_likes are dropped (see the
# warnings recorded below).
qplot(
  x = www_likes,
  data = subset(pf, !is.na(gender)),
  geom = "freqpoly", color = gender
) +
  scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
Summary for www_likes
# Per-gender summary statistics of www_likes.
by(data = pf$www_likes, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 87.14 25.00 14860.00
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 0.00 24.42 2.00 12900.00
# Total number of www_likes per gender.
by(data = pf$www_likes, INDICES = pf$gender, FUN = sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
Box plots
# Box plot of friend_count for each gender (rows with NA gender excluded).
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friend_count)
) +
  geom_boxplot()
Outliers: points that lie more than 1.5 times the IQR beyond the edges of the box (the first and third quartiles). Set the limits to remove the outliers.
# Box plot of friend_count per gender with the y scale limited to 0-1000.
# Rows outside the limits are removed before the box statistics are computed
# (see the stat_boxplot warning recorded below).
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friend_count)
) +
  geom_boxplot() +
  ylim(c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
Set the limits to remove the outliers
# The same limited box plot, with the 0-1000 limits set through an explicit
# y scale layer.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friend_count)
) +
  geom_boxplot() +
  scale_y_continuous(limits = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
To summarize the friend counts by gender
# Per-gender summary statistics of friend_count.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
To match the graph details with the above statistics, use the coord_cartesian function. This means the limit we set on the y-axis only affects the visualization, not the actual calculation.
# Box plot of friend_count per gender, zoomed to y = 0-250 with
# coord_cartesian(), which changes only the visible window and keeps every row
# in the box statistics.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friend_count)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))
On average, who initiated more friend requests? (The answer is female.)
# Box plot of friendships_initiated per gender, zoomed to y = 0-250, followed
# by the matching per-gender summary statistics.
ggplot(
  data = subset(pf, !is.na(gender)),
  mapping = aes(x = gender, y = friendships_initiated)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))
by(data = pf$friendships_initiated, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0