abdul Baqi
Sunday, November 02, 2014
The data set is available at: https://s3.amazonaws.com/udacity-hosted-downloads/ud651/pseudo_facebook.tsv
# Load the tab-separated pseudo-Facebook data set from the working directory.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0.0 read.delim()
# leaves text columns as character, whereas the structure recorded below shows
# gender as a factor.
pf <- read.delim("pseudo_facebook.tsv", stringsAsFactors = TRUE)
# Inspect column types and the first few values of each variable.
str(pf)
## 'data.frame': 99003 obs. of 15 variables:
## $ userid : int 2094382 1192601 2083884 1203168 1733186 1524765 1136133 1680361 1365174 1712567 ...
## $ age : int 14 14 14 14 14 14 13 13 13 13 ...
## $ dob_day : int 19 2 16 25 4 1 14 4 1 2 ...
## $ dob_year : int 1999 1999 1999 1999 1999 1999 2000 2000 2000 2000 ...
## $ dob_month : int 11 11 11 12 12 12 1 1 1 2 ...
## $ gender : Factor w/ 2 levels "female","male": 2 1 2 1 2 2 2 1 2 2 ...
## $ tenure : int 266 6 13 93 82 15 12 0 81 171 ...
## $ friend_count : int 0 0 0 0 0 0 0 0 0 0 ...
## $ friendships_initiated: int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ mobile_likes_received: int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ www_likes_received : int 0 0 0 0 0 0 0 0 0 0 ...
# List the column names of the data frame.
colnames(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.1.2
# Histogram of day-of-birth using the default number of bins.
ggplot(pf, aes(x = dob_day)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 3.1.2
# Use the minimal theme with a 24pt base font for all following plots.
theme_set(theme_minimal(base_size = 24))
# Histogram of day-of-birth with one bar and one axis tick per day.
# dob_day is an integer column, so it needs a continuous x scale; a discrete
# scale on continuous data loses the axis breaks in current ggplot2 releases.
ggplot(aes(x = dob_day), data = pf) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = 1:31)
# qplot() form of the day-of-birth histogram: one bar and one tick per day.
# dob_day is an integer column, so the x scale must be continuous.
qplot(x = dob_day, data = pf, binwidth = 1) +
  scale_x_continuous(breaks = 1:31)
Note the problem with the first day of the month.
Let's see if we can break this down by month.
# Day-of-birth histogram split into one panel per birth month (3 columns).
# dob_day is an integer column, so the x scale must be continuous.
qplot(x = dob_day, data = pf, binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
# Histogram of friend counts over the full range, default bins.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Lots of zeros... can we eliminate them?
Yes, we can!
# Restrict the x axis to 0-1000 friends; the two forms below set the same limits.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  xlim(0, 1000)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram() +
  scale_x_continuous(limits = c(0, 1000))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Bins of 25 friends, with an axis tick every 50.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
# Same histogram, one panel per gender value.
ggplot(pf, aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
# Same again, after dropping rows whose gender is missing.
ggplot(subset(pf, !is.na(gender)), aes(x = friend_count)) +
  geom_histogram(binwidth = 25) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
# Count the users in each gender level.
table(pf[["gender"]])
##
## female male
## 40254 58574
Now let's use the by() command.
# Summary statistics of friend_count computed separately for each gender.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
# Histogram of tenure in bins of width 30, blue bars with black outlines.
ggplot(pf, aes(x = tenure)) +
  geom_histogram(binwidth = 30, colour = "black", fill = "#099DD9")
# Histogram of tenure / 365, labelled in years (assumes tenure is recorded in
# days -- TODO confirm against the data dictionary).
qplot(x = tenure / 365, data = pf,
      xlab = "Number of years using Facebook",
      ylab = "Number of users in sample",
      color = I("black"), fill = I("#F79420")) +
  # `limits` is spelled out in full instead of relying on partial matching of `lim`.
  scale_x_continuous(breaks = seq(1, 7, 1), limits = c(0, 7))
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Histogram of age with one bin per year and an axis tick every 5 years.
ggplot(pf, aes(x = age)) +
  geom_histogram(binwidth = 1, colour = "black", fill = "#F79420") +
  xlab("age") +
  ylab("Number of users in sample") +
  scale_x_continuous(breaks = seq(0, 113, 5))
## Let's treat some long-tailed variables
# Five-number summary of friend_count on the raw scale.
summary(pf[["friend_count"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 31.0 82.0 196.4 206.0 4923.0
# Log10 scale; 1 is added first so that a count of zero stays finite.
summary(log10(pf[["friend_count"]] + 1))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
# Square-root scale.
summary(sqrt(pf[["friend_count"]]))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.568 9.055 11.090 14.350 70.160
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.1.2
## Loading required package: grid
# Compare the raw, log10 and square-root distributions of friend_count,
# stacked in a single column.
p1 <- qplot(x = friend_count, data = pf)
# The +1 offset keeps zero counts finite under the log transform.
p2 <- qplot(x = log10(friend_count + 1), data = pf)
# sqrt(0) is already defined, so no offset is applied here; this matches the
# sqrt(pf$friend_count) summary and the scale_x_sqrt() version of the plot.
p3 <- qplot(x = sqrt(friend_count), data = pf)
grid.arrange(p1, p2, p3, ncol = 1)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Same comparison, but transforming the x scale rather than the data, so the
# axis is labelled in the original friend_count units.
p1 <- ggplot(data = pf, mapping = aes(x = friend_count)) +
  geom_histogram()
p2 <- p1 + scale_x_log10()
p3 <- p1 + scale_x_sqrt()
grid.arrange(p1, p2, p3, ncol = 1)
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Frequency polygons of friend_count, one coloured line per gender, using
# rows with a known gender only.
ggplot(subset(pf, !is.na(gender)), aes(x = friend_count, colour = gender)) +
  geom_freqpoly(binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50))
## Warning: Removed 2 rows containing missing values (geom_path).
## Warning: Removed 2 rows containing missing values (geom_path).
# Frequency polygons of www_likes by gender on the raw scale, default bins.
ggplot(subset(pf, !is.na(gender)), aes(x = www_likes, colour = gender)) +
  geom_freqpoly()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Now we need to add a scale to the x-axis.
# Frequency polygons of www_likes by gender on a log10 x axis.
# Only one x scale is added: ggplot2 keeps just the last x scale, so adding
# scale_x_continuous() before scale_x_log10() is discarded with a message.
qplot(x = www_likes, data = subset(pf, !is.na(gender)), geom = "freqpoly",
      color = gender) +
  scale_x_log10()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
# Total number of www_likes for each gender.
by(data = pf$www_likes, INDICES = pf$gender, FUN = sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
# Boxplots of friend_count by gender, using rows with a known gender only.
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot()
# Setting limits on the y scale removes out-of-range rows before the boxplot
# statistics are computed (see the warning below).
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  ylim(0, 1000)
## Warning: Removed 2949 rows containing non-finite values (stat_boxplot).
# coord_cartesian() sets the visible range on the coordinate system instead.
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 1000))
# Tighter view of the 0-250 range.
ggplot(subset(pf, !is.na(gender)), aes(x = gender, y = friend_count)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 250))
# Per-gender summary statistics of friend_count, for comparison with the boxplots.
by(data = pf$friend_count, INDICES = pf$gender, FUN = summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
# Flag each user as 1 if they have any mobile likes and 0 otherwise, stored
# as a factor column on pf. The column is created directly by the assignment,
# so no separate placeholder variable is needed.
pf$mobile <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
# Number of users in each group.
summary(pf$mobile)
## 0 1
## 35056 63947
And to get this as a proportion of all users:
# Share of users flagged as mobile: the mean of a logical vector is the
# proportion of TRUE values.
mean(pf$mobile == 1)
## [1] 0.6459097