Pseudo-Facebook User Data

Notes:

# Show the working directory; the relative data path below is resolved
# against it.
getwd()
## [1] "C:/Users/lowye/Desktop"
# Load the tab-separated data set (read.delim() uses sep = "\t" by default).
pf <- read.delim("pseudo_facebook.tsv")
# List the columns of the loaded data frame.
colnames(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

Histogram of Users’ Birthdays

Notes:

library(ggplot2)
# Histogram of birth day-of-month, one panel per birth month, three panels
# per row, with an x-axis tick for every day 1-31.
qplot(x = dob_day, data = pf) +
  facet_wrap(~ dob_month, ncol = 3) +
  scale_x_continuous(breaks = seq_len(31))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of friend counts with the x axis limited to 0-1000; rows outside
# that range are dropped, as the warning below reports.
qplot(x = friend_count, data = pf, xlim = c(0, 1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).

# Exercise: what code would you add to facet this histogram by gender?
# Add it to the code below.
# binwidth is the width of each histogram bin (the class interval).
qplot(
  x = friend_count,
  data = pf,
  binwidth = 10
) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(0, 1000, by = 50)
  )
## Warning: Removed 2951 rows containing non-finite values (stat_bin).


Statistics ‘by’ Gender

Notes:

# Friend-count histogram (bins of 10, x limited to 0-1000) split into one
# panel per value of gender.
qplot(
  x = friend_count,
  data = pf,
  binwidth = 10
) +
  facet_wrap(~ gender) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(0, 1000, by = 50)
  )
## Warning: Removed 2949 rows containing non-finite values (stat_bin).

# Remove rows whose gender is missing, then draw the friend-count histogram
# with one panel per gender.
# binwidth has to be an argument of qplot(): if it is placed inside subset()
# it is silently ignored and the histogram falls back to the default 30 bins.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000),
                     breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).

# Remove all missing values: na.omit() drops every row that has an NA in any
# column, not only in gender, before the faceted histogram is drawn.
qplot(
  x = friend_count,
  data = na.omit(pf),
  binwidth = 10
) +
  facet_wrap(~ gender) +
  scale_x_continuous(
    limits = c(0, 1000),
    breaks = seq(0, 1000, by = 50)
  )
## Warning: Removed 2949 rows containing non-finite values (stat_bin).

Who on average has more friends: men or women?

Response:

# Number of users recorded for each gender.
table(pf$gender)
## 
## female   male 
##  40254  58574
# Five-number summary plus mean of friend_count, computed per gender.
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Tenure

Notes:

# Histogram of tenure in bins of 30 (presumably days, given the tenure / 365
# conversion used for the yearly plot), black outlines and blue fill.
qplot(
  x = tenure,
  data = pf,
  binwidth = 30,
  fill = I("#099DD9"),
  color = I("black")
)
## Warning: Removed 2 rows containing non-finite values (stat_bin).


How would you create a histogram of tenure by year?

# Histogram of tenure expressed in years (tenure / 365), with the x axis
# limited to 0-7 and a tick at every whole year from 1 to 7.
qplot(
  x = tenure / 365,
  data = pf,
  fill = I("#F79420"),
  color = I("black"),
  xlab = "Number of years using Facebook",
  ylab = "Number of users in sample"
) +
  scale_x_continuous(limits = c(0, 7), breaks = seq_len(7))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 26 rows containing non-finite values (stat_bin).


User Ages

Notes:

# Histogram of user age in one-year bins; x axis limited to 10-115 with a
# tick every 5 years, y axis ticked every 1000 users up to 10000.
qplot(
  x = age,
  data = pf,
  binwidth = 1,
  fill = I("#F79420"),
  color = I("black"),
  xlab = "Number of user ages",
  ylab = "Number of user samples"
) +
  scale_y_continuous(breaks = seq(0, 10000, by = 1000)) +
  scale_x_continuous(limits = c(10, 115), breaks = seq(10, 115, by = 5))


Transforming Data

Notes:

library(ggplot2)
# Untransformed friend-count histogram with the default 30 bins.
qplot(x = friend_count, data = pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Summaries of two transformations of friend_count. Adding 1 before log10()
# keeps users with zero friends finite.
summary(log10(pf$friend_count + 1))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.505   1.919   1.868   2.316   3.692
summary(sqrt(pf$friend_count))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   5.568   9.055  11.090  14.350  70.160
# Three histograms: raw, log10(x + 1) and square-root friend counts, stacked
# in a single column for comparison.
library(gridExtra)
p1 <- qplot(
  x = friend_count, data = pf,
  fill = I("#FF3333"), color = I("black"),
  xlab = "Number of friend", ylab = ""
)
p2 <- qplot(
  x = log10(friend_count + 1), data = pf,
  fill = I("#FFFF00"), color = I("black"),
  xlab = "Number of friend", ylab = "Number of users sample"
)
p3 <- qplot(
  x = sqrt(friend_count), data = pf,
  fill = I("#33FF33"), color = I("black"),
  xlab = "Number of friend", ylab = ""
)
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.


# Frequency polygon of friend_count, coloured by gender. The y value is each
# bin's count divided by sum(count), so the genders are compared as
# proportions rather than raw counts. Rows with missing gender are excluded.
qplot(
  x = friend_count,
  y = ..count../sum(..count..),
  data = subset(pf, !is.na(gender)),
  geom = "freqpoly",
  binwidth = 10,
  color = gender,
  xlab = "Number of user friend",
  ylab = "frequency of gender"
) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, by = 50))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

Likes on the Web

Notes:

# Web likes by gender (rows with missing gender excluded).
#
# p1: histogram of www_likes in bins of 50, one panel per gender.
# The y range is zoomed with coord_cartesian() instead of scale limits:
# limits on scale_y_continuous() turn every bar taller than 4000 into a
# missing value and drop it from the plot, whereas coord_cartesian() only
# crops the view and keeps all bars.
p1 <- qplot(data = subset(pf, !is.na(gender)), x = www_likes, binwidth = 50) +
  scale_x_continuous(limits = c(0, 1500), breaks = seq(0, 1500, 200)) +
  scale_y_continuous(breaks = seq(0, 4000, 500)) +
  coord_cartesian(ylim = c(0, 4000)) +
  facet_wrap(~gender, ncol = 2)
# p2: frequency polygon of www_likes on a log10 x axis, coloured by gender,
# with y as count / sum(count). Users with zero www_likes become -Inf on the
# log scale and are dropped (see the warnings below).
p2 <- qplot(data = subset(pf, !is.na(gender)), x = www_likes,
            y = ..count../sum(..count..),
            geom = 'freqpoly', color = gender,
            xlab = 'Number of user www_likes',
            ylab = 'frequency of user sample by gender') +
  scale_x_log10()
# Stack the two plots in a single column.
grid.arrange(p1, p2, ncol = 1)
## Warning: Removed 537 rows containing non-finite values (stat_bin).
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).

# Compare how much men and women use a given feature. This can show the
# provider which platform is used most, so that little-used parts can be
# dropped.
# Total www_likes per gender:
by(pf$www_likes, pf$gender, sum)
## pf$gender: female
## [1] 3507665
## -------------------------------------------------------- 
## pf$gender: male
## [1] 1430175

Box Plots

箱线图的绘制:通过 coord_cartesian 限定 Y 轴的显示范围,可以把异常值排除在视图之外;数据本身并未被删除,箱体的统计量保持不变。

# Box plot of friend_count per gender (missing gender excluded), with the
# visible y range zoomed to 0-250 via coord_cartesian().
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))

# Same layout for friendships_initiated, zoomed to 0-150.
qplot(
  x = gender,
  y = friendships_initiated,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 150))

# Numeric summary of friendships_initiated per gender.
by(pf$friendships_initiated, pf$gender, summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    19.0    49.0   113.9   124.8  3654.0 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    15.0    44.0   103.1   111.0  4144.0

# Turn mobile usage into a binary indicator, then use the indicator's share
# of all rows to get the proportion of users with at least one mobile like.
mobile_chick_in <- NA  # standalone placeholder; not read again in this chunk
# 1 when the user has any mobile likes, 0 otherwise, stored as a factor.
pf$mobile_chick_in <- factor(as.integer(pf$mobile_likes > 0))
summary(pf$mobile_chick_in)
##     0     1 
## 35056 63947
# Proportion of users whose indicator equals 1.
mean(pf$mobile_chick_in == 1)
## [1] 0.6459097