Notes:
# Print the working directory; the data file below is read relative to it.
getwd()
## [1] "C:/Users/lowye/Desktop"
# Read the tab-separated data set into a data frame and list its columns.
pf <- read.delim("pseudo_facebook.tsv")
names(pf)
## [1] "userid" "age"
## [3] "dob_day" "dob_year"
## [5] "dob_month" "gender"
## [7] "tenure" "friend_count"
## [9] "friendships_initiated" "likes"
## [11] "likes_received" "mobile_likes"
## [13] "mobile_likes_received" "www_likes"
## [15] "www_likes_received"
Notes:
library(ggplot2)
# Histogram of birth day-of-month, drawn as one panel per birth month
# (three panels per row).
# binwidth = 1 gives every day its own bar; the default of 30 bins cannot
# give each of the 31 possible day values a separate bar.
qplot(x = dob_day, data = pf, binwidth = 1) +
  scale_x_continuous(breaks = 1:31) +
  facet_wrap(~dob_month, ncol = 3)
# Friend-count histogram with the x-axis limited to 0-1000 via qplot's xlim.
qplot(x = friend_count, data = pf, xlim = c(0, 1000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
# Exercise prompt: what code would you add to facet the histogram by gender?
# Add it to the code below.
# binwidth sets the width of each histogram bin (the class interval).
qplot(data = pf, x = friend_count, binwidth = 10) +
  scale_x_continuous(
    breaks = seq(0, 1000, 50),
    limits = c(0, 1000)
  )
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
Notes:
# Same friend-count histogram (bins of width 10, x-axis 0-1000 with a tick
# every 50), now split into one panel per gender value.
qplot(data = pf, x = friend_count, binwidth = 10) +
  scale_x_continuous(
    breaks = seq(0, 1000, 50),
    limits = c(0, 1000)
  ) +
  facet_wrap(~gender)
## Warning: Removed 2951 rows containing non-finite values (stat_bin).
# Remove rows with a missing gender before faceting.
# binwidth must be passed to qplot() itself: inside the subset() call it is
# silently ignored and the histogram falls back to the default 30 bins.
qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
# Remove every row that has a missing value in any column (na.omit), then
# draw the friend-count histogram per gender.
qplot(data = na.omit(pf), x = friend_count, binwidth = 10) +
  scale_x_continuous(
    breaks = seq(0, 1000, 50),
    limits = c(0, 1000)
  ) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
Response:
# Count how many users fall into each gender category.
table(pf$gender)
##
## female male
## 40254 58574
# Five-number summary (plus mean) of friend_count, computed separately for
# each gender.
by(pf$friend_count,pf$gender,summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 37 96 242 244 4923
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 27 74 165 182 4917
Notes:
# Histogram of tenure in bins 30 units wide, with black bar outlines and a
# blue fill. I() passes the colours through as literal values instead of
# mapping them to a scale.
qplot(
  x = tenure,
  data = pf,
  binwidth = 30,
  fill = I("#099DD9"),
  color = I("black")
)
## Warning: Removed 2 rows containing non-finite values (stat_bin).
# Histogram of tenure rescaled by 1/365 (i.e. in years, assuming tenure is
# recorded in days), shown for 0-7 with a tick at every whole year.
qplot(
  x = tenure / 365,
  data = pf,
  fill = I("#F79420"),
  color = I("black"),
  xlab = "Number of years using Facebook",
  ylab = "Number of users in sample"
) +
  scale_x_continuous(limits = c(0, 7), breaks = 1:7)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 26 rows containing non-finite values (stat_bin).
Notes:
# Histogram of user age with one bar per year of age.
# x-axis: 10-115 with a tick every 5; y-axis: a tick every 1000 up to 10000.
qplot(
  x = age,
  data = pf,
  binwidth = 1,
  fill = I("#F79420"),
  color = I("black"),
  xlab = "Number of user ages",
  ylab = "Number of user samples"
) +
  scale_x_continuous(breaks = seq(10, 115, 5), limits = c(10, 115)) +
  scale_y_continuous(breaks = seq(0, 10000, 1000))
Notes:
# Attach ggplot2 (harmless if it is already attached) and draw the plain
# friend-count histogram with default binning.
library(ggplot2)
qplot(x = friend_count, data = pf)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Summary of friend_count on a log10 scale; 1 is added first so that users
# with zero friends map to 0 instead of -Inf.
with(pf, summary(log10(friend_count + 1)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.505 1.919 1.868 2.316 3.692
# Summary of friend_count after a square-root transform.
with(pf, summary(sqrt(friend_count)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 5.568 9.055 11.090 14.350 70.160
# Three histograms of friend count stacked in a single column:
# raw values (p1), log10(value + 1) (p2) and sqrt(value) (p3).
# NOTE(review): p2 and p3 keep the x label of the untransformed variable;
# consider labelling the transformed scales explicitly.
library(gridExtra)
p1 <- qplot(
  x = friend_count, data = pf,
  fill = I("#FF3333"), color = I("black"),
  xlab = "Number of friend", ylab = ""
)
p2 <- qplot(
  x = log10(friend_count + 1), data = pf,
  fill = I("#FFFF00"), color = I("black"),
  xlab = "Number of friend", ylab = "Number of users sample"
)
p3 <- qplot(
  x = sqrt(friend_count), data = pf,
  fill = I("#33FF33"), color = I("black"),
  xlab = "Number of friend", ylab = ""
)
grid.arrange(p1, p2, p3, ncol = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Frequency polygon of friend counts with one coloured line per gender (rows
# with a missing gender are excluded). The y value is each bin's count divided
# by sum(..count..), so the curves show proportions instead of raw counts.
qplot(
  data = subset(pf, !is.na(gender)),
  x = friend_count,
  y = ..count.. / sum(..count..),
  geom = "freqpoly",
  color = gender,
  binwidth = 10,
  xlab = "Number of user friend",
  ylab = "frequency of gender"
) +
  scale_x_continuous(breaks = seq(0, 1000, 50), limits = c(0, 1000))
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).
Notes:
# p1: histogram of www_likes per gender (bins of width 50), with the x-axis
# limited to 0-1500 and the y-axis to 0-4000. Rows with a missing gender are
# excluded.
p1 <- qplot(
  x = www_likes,
  data = subset(pf, !is.na(gender)),
  binwidth = 50
) +
  scale_x_continuous(limits = c(0, 1500), breaks = seq(0, 1500, 200)) +
  scale_y_continuous(limits = c(0, 4000), breaks = seq(0, 4000, 500)) +
  facet_wrap(~gender, ncol = 2)
# p2: frequency polygon of www_likes on a log10 x-axis, one coloured line per
# gender, with y given as count / sum(count).
p2 <- qplot(
  x = www_likes,
  y = ..count.. / sum(..count..),
  data = subset(pf, !is.na(gender)),
  geom = "freqpoly",
  color = gender,
  xlab = "Number of user www_likes",
  ylab = "fequency of user sample by gender"
) +
  scale_x_log10()
# Stack both plots in one column.
grid.arrange(p1, p2, ncol = 1)
## Warning: Removed 537 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 60935 rows containing non-finite values (stat_bin).
# Total www_likes per gender, to see how heavily each group uses this
# feature. Comparing such totals shows the provider which platform is used
# most, so that rarely used parts could be dropped.
by(pf$www_likes,pf$gender,sum)
## pf$gender: female
## [1] 3507665
## --------------------------------------------------------
## pf$gender: male
## [1] 1430175
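Box plots: restricting the visible y-axis range (done below with coord_cartesian) keeps the extreme outliers out of the picture.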
# Box plot of friend_count per gender (rows with a missing gender excluded).
# coord_cartesian() only zooms the view to y in 0-250; the box statistics are
# still computed from all rows.
qplot(
  x = gender,
  y = friend_count,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 250))
# Same layout for friendships_initiated, zoomed to y in 0-150.
qplot(
  x = gender,
  y = friendships_initiated,
  data = subset(pf, !is.na(gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 150))
# Numeric summary of friendships_initiated for each gender.
by(pf$friendships_initiated, pf$gender, summary)
## pf$gender: female
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 19.0 49.0 113.9 124.8 3654.0
## --------------------------------------------------------
## pf$gender: male
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 15.0 44.0 103.1 111.0 4144.0
# Recode mobile likes into a binary check-in flag: 1 when the user has at
# least one mobile like, 0 otherwise. The flag is stored as a factor column on
# pf so that summary() reports the number of users in each group.
pf$mobile_chick_in <- factor(ifelse(pf$mobile_likes > 0, 1, 0))
summary(pf$mobile_chick_in)
## 0 1
## 35056 63947
# Share of users with at least one mobile check-in: the mean of a logical
# vector is the proportion of TRUE values.
mean(pf$mobile_chick_in == 1)
## [1] 0.6459097