library(ggplot2)
# Load the tab-separated pseudo-Facebook data set into `pf`.
# read.delim() is read.csv() with a tab separator as its default.
pf <- read.delim("pseudo_facebook.tsv")
# Scatter plot of friend count against age. ggplot() + geom_point() replaces
# qplot(), which is deprecated in current ggplot2 releases, and matches the
# ggplot() calls used for every other plot in this file.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()
Notes:
# Friend count against age, keeping only ages 13-90 on the x axis.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 4906 rows containing missing values (geom_point).
Notes:
# Same scatter, but jittered and drawn at 5% opacity to reduce overplotting.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 0.05) +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 5181 rows containing missing values (geom_point).
***
# Semi-transparent scatter with the y axis drawn on a square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 0.05) +
  scale_x_continuous(limits = c(13, 90)) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).
# Square-root y scale again, now with horizontal-only jitter: height = 0 keeps
# friend_count unchanged, so no point is pushed below zero before the sqrt.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  scale_x_continuous(limits = c(13, 90)) +
  coord_trans(y = "sqrt")
## Warning: Removed 5211 rows containing missing values (geom_point).
# Friendships initiated against age: horizontal jitter, sqrt-scaled y axis,
# x axis limited to 13-113 with a tick every 5 years.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(
    alpha = 0.05,
    position = position_jitter(height = 0)
  ) +
  scale_x_continuous(
    limits = c(13, 113),
    breaks = seq(from = 13, to = 113, by = 5)
  ) +
  coord_trans(y = "sqrt")
## Warning: Removed 352 rows containing missing values (geom_point).
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Choose the grouping variable: group the users by age.
age_groups <- pf %>% group_by(age)
# Build a new data frame holding, for each age, the mean and median
# friend count and the number of users.
pf.fc_by_age <- age_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  )
# Sort the summary by age.
pf.fc_by_age <- pf.fc_by_age %>% arrange(age)
pf.fc_by_age %>% head()
## # A tibble: 6 × 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
# Recompute the per-age summary in a single expression: group by age,
# summarise friend_count (mean, median, row count), then order by age.
pf.fc_by_age <- arrange(
  summarise(
    group_by(pf, age),
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age
)
# Show the first 20 ages.
head(pf.fc_by_age, n = 20)
## # A tibble: 20 × 4
## age friend_count_mean friend_count_median n
## <int> <dbl> <dbl> <int>
## 1 13 164.7500 74.0 484
## 2 14 251.3901 132.0 1925
## 3 15 347.6921 161.0 2618
## 4 16 351.9371 171.5 3086
## 5 17 350.3006 156.0 3283
## 6 18 331.1663 162.0 5196
## 7 19 333.6921 157.0 4391
## 8 20 283.4991 135.0 3769
## 9 21 235.9412 121.0 3671
## 10 22 211.3948 106.0 3032
## 11 23 202.8426 93.0 4404
## 12 24 185.7121 92.0 2827
## 13 25 131.0211 62.0 3641
## 14 26 144.0082 75.0 2815
## 15 27 134.1473 72.0 2240
## 16 28 125.8354 66.0 2364
## 17 29 120.8182 66.0 1936
## 18 30 115.2080 67.5 1716
## 19 31 118.4599 63.0 1694
## 20 32 114.2800 63.0 1443
Notes:
# Mean friend count per age as a line, using the pre-computed summary table.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() +
  scale_x_continuous(
    limits = c(13, 113),
    breaks = seq(from = 13, to = 113, by = 5)
  )
# Jittered scatter of friend count by age, overlaid with per-age summary lines
# (mean, 10% / 50% / 90% quantiles), drawn on a sqrt y scale and zoomed to
# ages 13-70.
#
# Notes on the argument choices (requires ggplot2 >= 3.3.0):
#  * The summary function is passed as `fun`. The older `fun.y` spelling is not
#    recognised when forwarded through geom_line(stat = "summary"): it is
#    ignored with a warning and the layer falls back to the default summary,
#    so the quantile lines would not show quantiles.
#  * The x zoom is given to coord_trans() itself. A plot has exactly one
#    coordinate system, so adding coord_trans() after a separate
#    coord_cartesian(xlim = ...) replaces it and silently drops the zoom.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  # Mean friend count at each age.
  geom_line(stat = "summary", fun = mean) +
  # Add the 10% quantile as a layer.
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  # 90% quantile.
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  # Median (50% quantile), drawn solid.
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    linetype = 1, color = "blue"
  ) +
  # One coordinate system carrying both the sqrt y transform and the x zoom.
  coord_trans(y = "sqrt", xlim = c(13, 70))
# Pearson correlation between age and friend count over all users.
with(
  data = pf,
  expr = cor.test(x = age, y = friend_count, method = "pearson")
)
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03363072 -0.02118189
## sample estimates:
## cor
## -0.02740737
# Open the documentation for subset().
help("subset")
## starting httpd help server ...
## done
# Correlation between age and friend count, restricted to users younger
# than 70 (which() drops rows where the comparison is NA, as subset() does).
with(pf[which(pf$age < 70), ], cor.test(x = age, y = friend_count))
##
## Pearson's product-moment correlation
##
## data: age and friend_count
## t = -52.326, df = 90664, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1775257 -0.1648889
## sample estimates:
## cor
## -0.1712144
Notes:
# Likes received overall against likes received via the web, with a red
# linear-model fit.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  # Keep the lower 95% of the values on each axis.
  xlim(0, quantile(pf$www_likes_received, probs = 0.95)) +
  ylim(0, quantile(pf$likes_received, probs = 0.95)) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).
Notes:
# Correlation test between web likes received and total likes received.
cor.test(
  x = pf$www_likes_received,
  y = pf$likes_received
)
##
## Pearson's product-moment correlation
##
## data: pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9473553 0.9486176
## sample estimates:
## cor
## 0.9479902
Notes:
library(alr3)
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Print the full Mitchell data set (columns Month and Temp).
print(Mitchell)
## Month Temp
## 1 0 -5.183330
## 2 1 -1.650000
## 3 2 2.494440
## 4 3 10.400000
## 5 4 14.994400
## 6 5 21.716700
## 7 6 24.744400
## 8 7 24.072200
## 9 8 18.861100
## 10 9 9.166670
## 11 10 1.544440
## 12 11 -4.005560
## 13 12 -7.477780
## 14 13 -4.005560
## 15 14 3.000000
## 16 15 10.961100
## 17 16 18.022200
## 18 17 22.727800
## 19 18 26.261100
## 20 19 21.605600
## 21 20 18.022200
## 22 21 9.783330
## 23 22 3.727780
## 24 23 -2.772220
## 25 24 -6.188890
## 26 25 -5.688890
## 27 26 -1.933330
## 28 27 6.477780
## 29 28 13.538900
## 30 29 20.988900
## 31 30 24.183300
## 32 31 22.727800
## 33 32 19.755600
## 34 33 11.127800
## 35 34 3.838890
## 36 35 -2.883330
## 37 36 -7.422220
## 38 37 -6.305560
## 39 38 -0.250000
## 40 39 7.594440
## 41 40 15.216700
## 42 41 21.383300
## 43 42 22.277800
## 44 43 22.561100
## 45 44 20.766700
## 46 45 12.694400
## 47 46 3.166670
## 48 47 -1.650000
## 49 48 -2.377780
## 50 49 -2.827780
## 51 50 -0.644444
## 52 51 9.500000
## 53 52 17.233300
## 54 53 22.394400
## 55 54 26.200000
## 56 55 24.633300
## 57 56 20.600000
## 58 57 11.461100
## 59 58 4.400000
## 60 59 -1.144440
## 61 60 -2.994440
## 62 61 -2.772220
## 63 62 3.727780
## 64 63 12.694400
## 65 64 15.722200
## 66 65 22.000000
## 67 66 24.238900
## 68 67 22.727800
## 69 68 19.194400
## 70 69 10.961100
## 71 70 5.183330
## 72 71 -0.866667
## 73 72 -4.227780
## 74 73 -4.788890
## 75 74 -0.250000
## 76 75 7.650000
## 77 76 14.938900
## 78 77 19.588900
## 79 78 24.466700
## 80 79 23.850000
## 81 80 18.466700
## 82 81 11.405600
## 83 82 2.944440
## 84 83 -0.644444
## 85 84 -1.594440
## 86 85 -1.261110
## 87 86 2.105560
## 88 87 4.288890
## 89 88 13.816700
## 90 89 20.150000
## 91 90 26.816700
## 92 91 26.877800
## 93 92 20.427800
## 94 93 11.294400
## 95 94 5.072220
## 96 95 -1.594440
## 97 96 -2.494440
## 98 97 -1.372220
## 99 98 -0.194444
## 100 99 5.355560
## 101 100 13.983300
## 102 101 21.272200
## 103 102 26.650000
## 104 103 27.605600
## 105 104 19.644400
## 106 105 10.455600
## 107 106 3.838890
## 108 107 -2.155560
## 109 108 -4.733330
## 110 109 -5.461110
## 111 110 2.416670
## 112 111 8.827780
## 113 112 17.394400
## 114 113 19.755600
## 115 114 24.466700
## 116 115 21.216700
## 117 116 19.138900
## 118 117 9.555560
## 119 118 8.183330
## 120 119 3.522220
## 121 120 -4.005560
## 122 121 -2.661110
## 123 122 1.133330
## 124 123 9.555560
## 125 124 16.900000
## 126 125 22.894400
## 127 126 24.972200
## 128 127 21.605600
## 129 128 18.583300
## 130 129 11.072200
## 131 130 0.955556
## 132 131 -2.716670
## 133 132 -2.550000
## 134 133 -0.866667
## 135 134 3.338890
## 136 135 10.444400
## 137 136 18.477800
## 138 137 23.272200
## 139 138 26.161100
## 140 139 22.577800
## 141 140 18.416700
## 142 141 9.233330
## 143 142 4.838890
## 144 143 -0.877778
## 145 144 -5.850000
## 146 145 -3.477780
## 147 146 2.994440
## 148 147 9.983330
## 149 148 19.227800
## 150 149 26.505600
## 151 150 26.161100
## 152 151 26.505600
## 153 152 19.344400
## 154 153 10.677800
## 155 154 3.166670
## 156 155 -2.033330
## 157 156 -2.611110
## 158 157 -4.172220
## 159 158 1.950000
## 160 159 13.161100
## 161 160 18.188900
## 162 161 22.288900
## 163 162 26.683300
## 164 163 24.716700
## 165 164 17.783300
## 166 165 11.888900
## 167 166 3.105560
## 168 167 -2.950000
## 169 168 -1.916670
## 170 169 -1.227780
## 171 170 4.438890
## 172 171 10.216700
## 173 172 15.066700
## 174 173 23.616700
## 175 174 24.372200
## 176 175 24.427800
## 177 176 20.844400
## 178 177 11.372200
## 179 178 5.072220
## 180 179 -2.900000
## 181 180 -4.227780
## 182 181 -1.338890
## 183 182 3.972220
## 184 183 10.966700
## 185 184 18.244400
## 186 185 23.850000
## 187 186 25.872200
## 188 187 23.733300
## 189 188 19.400000
## 190 189 10.794400
## 191 190 0.972222
## 192 191 -1.050000
## 193 192 -1.050000
## 194 193 1.200000
## 195 194 5.305560
## 196 195 8.827780
## 197 196 16.916700
## 198 197 20.438900
## 199 198 22.694400
## 200 199 21.772200
## 201 200 18.127800
## 202 201 11.255600
## 203 202 2.416670
## 204 203 -1.627780
Create your plot!
# Scatter plot of temperature against month index for the Mitchell data.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()
# Correlation test between month index and temperature.
cor.test(
  x = Mitchell$Month,
  y = Mitchell$Temp
)
##
## Pearson's product-moment correlation
##
## data: Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.08053637 0.19331562
## sample estimates:
## cor
## 0.05747063
噪声数值
# Same scatter with an x-axis tick every 12 months, from 0 up to 203.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(from = 0, to = 203, by = 12))
Notes:
# Fractional age: age plus (12 - dob_month) / 12, stored as a new column.
pf[["age_with_months"]] <- with(pf, age + (12 - dob_month) / 12)
# Per fractional-age summary of friend_count (mean, median, row count),
# ordered by age_with_months.
pf.fc_by_age_months <- arrange(
  summarise(
    group_by(pf, age_with_months),
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ),
  age_with_months
)
# q1: mean friend count by fractional age (below 71), with a smoother on top.
q1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# q2: mean friend count by whole-year age (below 71), with a smoother on top.
q2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()
# q3: smooth the curve by binning. Ages are rounded to the nearest multiple of
# 5, so each x value merges the points within about 2.5 years of it, and the
# mean friend count is drawn per bin.
# The summary function is passed as `fun` (ggplot2 >= 3.3.0): the older `fun.y`
# spelling is ignored with a warning when forwarded through geom_line(), which
# leaves the layer on its default summary instead of the requested function.
q3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Stack the three plots vertically in a single column for comparison.
grid.arrange(grobs = list(q1, q2, q3), ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'
***