library(ggplot2)
# Load the tab-separated pseudo-Facebook data set into `pf`.
pf <- read.delim("pseudo_facebook.tsv")
# Scatterplot of friend_count against age. Built with ggplot() + geom_point()
# because qplot() is deprecated in current ggplot2 releases.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point()

ggplot Syntax

Notes:

# Scatterplot of friend_count against age, with the x scale limited to 13-90.
# Rows whose age falls outside the limits are dropped by the scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point() +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 4906 rows containing missing values (geom_point).


过度绘制(Overplotting)

Notes:

# Same scatterplot, but with jittered points drawn at 1/20 opacity so that
# dense regions remain readable.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_jitter(alpha = 1 / 20) +
  scale_x_continuous(limits = c(13, 90))
## Warning: Removed 5181 rows containing missing values (geom_point).

***

Coord_trans()

Notes:转换X,Y轴数值

# Semi-transparent scatterplot with the y axis drawn on a square-root scale.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20) +
  scale_x_continuous(limits = c(13, 90)) +
  coord_trans(y = "sqrt")
## Warning: Removed 4906 rows containing missing values (geom_point).

添加抖动(jitter)时,friend_count 为 0 的点可能被抖动成负值,而负数无法取平方根(sqrt),因此把垂直方向的抖动设为 0(height = 0),只在水平方向(年龄)上抖动。

# Square-root-scaled scatterplot with horizontal-only jitter.
# height = 0 keeps friend_count unchanged, so the jitter cannot produce
# negative y values that the sqrt transform would be unable to draw.
# The argument is spelled out in full instead of relying on partial
# matching of `h`.
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  xlim(13, 90) +
  coord_trans(y = "sqrt")
## Warning: Removed 5211 rows containing missing values (geom_point).


好友邀请与年龄的关系

# friendships_initiated against age on a square-root y scale.
# Jitter is horizontal only (height = 0, spelled out rather than the
# partially matched `h`), so y values are never pushed below zero.
# The x axis spans 13-113 with a break every 5 units.
ggplot(data = pf, mapping = aes(x = age, y = friendships_initiated)) +
  geom_point(alpha = 1 / 20, position = position_jitter(height = 0)) +
  scale_x_continuous(limits = c(13, 113), breaks = seq(13, 113, 5)) +
  coord_trans(y = "sqrt")
## Warning: Removed 352 rows containing missing values (geom_point).

Notes:dplyr 可以按某个变量把数据框拆分成若干组,并对每一组分别应用汇总函数

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Choose the grouping variable: one group per distinct age.
age_groups <- group_by(pf, age)
# Build a new data frame with one row per age group (mean and median of
# friend_count plus the number of rows in the group), then sort it by age.
pf.fc_by_age <- age_groups %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
head(pf.fc_by_age)
## # A tibble: 6 × 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196
# Same per-age summary written as a single pipeline: group by age, compute
# the mean/median friend_count and the group size, then sort by age.
pf.fc_by_age <- pf %>%
  group_by(age) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age)
# Show the first 20 rows of the summary.
head(pf.fc_by_age, n = 20)
## # A tibble: 20 × 4
##      age friend_count_mean friend_count_median     n
##    <int>             <dbl>               <dbl> <int>
## 1     13          164.7500                74.0   484
## 2     14          251.3901               132.0  1925
## 3     15          347.6921               161.0  2618
## 4     16          351.9371               171.5  3086
## 5     17          350.3006               156.0  3283
## 6     18          331.1663               162.0  5196
## 7     19          333.6921               157.0  4391
## 8     20          283.4991               135.0  3769
## 9     21          235.9412               121.0  3671
## 10    22          211.3948               106.0  3032
## 11    23          202.8426                93.0  4404
## 12    24          185.7121                92.0  2827
## 13    25          131.0211                62.0  3641
## 14    26          144.0082                75.0  2815
## 15    27          134.1473                72.0  2240
## 16    28          125.8354                66.0  2364
## 17    29          120.8182                66.0  1936
## 18    30          115.2080                67.5  1716
## 19    31          118.4599                63.0  1694
## 20    32          114.2800                63.0  1443

Overlaying Summaries with Raw Data

Notes:

# Line plot of the mean friend count per age, taken from the summary table.
# The x axis spans 13-113 with a break every 5 units.
ggplot(data = pf.fc_by_age, mapping = aes(x = age, y = friend_count_mean)) +
  geom_line() +
  scale_x_continuous(limits = c(13, 113), breaks = seq(13, 113, by = 5))

在散点图中添加平均线

# Raw scatterplot overlaid with per-age summary lines (mean, 10th, 50th and
# 90th percentiles of friend_count).
ggplot(data = pf, mapping = aes(x = age, y = friend_count)) +
  # Raw data: horizontal-only jitter so friend_count is never moved below 0.
  geom_point(
    alpha = 1 / 20,
    position = position_jitter(height = 0),
    color = "orange"
  ) +
  # Mean friend_count at each age. `fun` replaces the deprecated `fun.y`.
  geom_line(stat = "summary", fun = mean) +
  # 10th percentile (dashed).
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.1),
    linetype = 2, color = "blue"
  ) +
  # 90th percentile (dashed).
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.9),
    linetype = 2, color = "blue"
  ) +
  # Median (solid).
  geom_line(
    stat = "summary", fun = quantile, fun.args = list(probs = 0.5),
    linetype = 1, color = "blue"
  ) +
  # A plot holds a single coordinate system: adding coord_trans() after
  # coord_cartesian() would replace it and discard the x zoom. The 13-70
  # zoom is therefore passed to coord_trans() together with the sqrt y scale.
  coord_trans(y = "sqrt", xlim = c(13, 70))


Correlation

Notes:计算皮尔逊相关系数 r

# Pearson correlation test between age and friend_count over all rows of pf.
with(
  pf,
  cor.test(age, friend_count, method = "pearson")
)
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737
# Open the help page for subset().
help("subset")
## starting httpd help server ...
##  done

Correlation on Subsets

Notes:选择数据计算R值

# Correlation test between age and friend_count, restricted to rows with
# age below 70.
pf %>%
  dplyr::filter(age < 70) %>%
  with(cor.test(age, friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.326, df = 90664, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1775257 -0.1648889
## sample estimates:
##        cor 
## -0.1712144

Create Scatterplots

Notes:

# likes_received against www_likes_received with a red linear-model fit.
# Both axes run from 0 to the 95th percentile of their variable; rows
# outside those limits are removed before the points and the fit are drawn.
ggplot(
  data = pf,
  mapping = aes(x = www_likes_received, y = likes_received)
) +
  geom_point() +
  scale_x_continuous(limits = c(0, quantile(pf$www_likes_received, 0.95))) +
  scale_y_continuous(limits = c(0, quantile(pf$likes_received, 0.95))) +
  geom_smooth(method = "lm", color = "red")
## Warning: Removed 6075 rows containing non-finite values (stat_smooth).
## Warning: Removed 6075 rows containing missing values (geom_point).


Strong Correlations

Notes:

# Correlation test between the two likes variables, using every row of pf.
cor.test(
  x = pf$www_likes_received,
  y = pf$likes_received
)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$www_likes_received and pf$likes_received
## t = 937.1, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9473553 0.9486176
## sample estimates:
##       cor 
## 0.9479902

More Caution with Correlation

Notes:

library(alr3)
## Loading required package: car
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
# Print the whole Mitchell data set (columns Month and Temp).
print(Mitchell)
##     Month      Temp
## 1       0 -5.183330
## 2       1 -1.650000
## 3       2  2.494440
## 4       3 10.400000
## 5       4 14.994400
## 6       5 21.716700
## 7       6 24.744400
## 8       7 24.072200
## 9       8 18.861100
## 10      9  9.166670
## 11     10  1.544440
## 12     11 -4.005560
## 13     12 -7.477780
## 14     13 -4.005560
## 15     14  3.000000
## 16     15 10.961100
## 17     16 18.022200
## 18     17 22.727800
## 19     18 26.261100
## 20     19 21.605600
## 21     20 18.022200
## 22     21  9.783330
## 23     22  3.727780
## 24     23 -2.772220
## 25     24 -6.188890
## 26     25 -5.688890
## 27     26 -1.933330
## 28     27  6.477780
## 29     28 13.538900
## 30     29 20.988900
## 31     30 24.183300
## 32     31 22.727800
## 33     32 19.755600
## 34     33 11.127800
## 35     34  3.838890
## 36     35 -2.883330
## 37     36 -7.422220
## 38     37 -6.305560
## 39     38 -0.250000
## 40     39  7.594440
## 41     40 15.216700
## 42     41 21.383300
## 43     42 22.277800
## 44     43 22.561100
## 45     44 20.766700
## 46     45 12.694400
## 47     46  3.166670
## 48     47 -1.650000
## 49     48 -2.377780
## 50     49 -2.827780
## 51     50 -0.644444
## 52     51  9.500000
## 53     52 17.233300
## 54     53 22.394400
## 55     54 26.200000
## 56     55 24.633300
## 57     56 20.600000
## 58     57 11.461100
## 59     58  4.400000
## 60     59 -1.144440
## 61     60 -2.994440
## 62     61 -2.772220
## 63     62  3.727780
## 64     63 12.694400
## 65     64 15.722200
## 66     65 22.000000
## 67     66 24.238900
## 68     67 22.727800
## 69     68 19.194400
## 70     69 10.961100
## 71     70  5.183330
## 72     71 -0.866667
## 73     72 -4.227780
## 74     73 -4.788890
## 75     74 -0.250000
## 76     75  7.650000
## 77     76 14.938900
## 78     77 19.588900
## 79     78 24.466700
## 80     79 23.850000
## 81     80 18.466700
## 82     81 11.405600
## 83     82  2.944440
## 84     83 -0.644444
## 85     84 -1.594440
## 86     85 -1.261110
## 87     86  2.105560
## 88     87  4.288890
## 89     88 13.816700
## 90     89 20.150000
## 91     90 26.816700
## 92     91 26.877800
## 93     92 20.427800
## 94     93 11.294400
## 95     94  5.072220
## 96     95 -1.594440
## 97     96 -2.494440
## 98     97 -1.372220
## 99     98 -0.194444
## 100    99  5.355560
## 101   100 13.983300
## 102   101 21.272200
## 103   102 26.650000
## 104   103 27.605600
## 105   104 19.644400
## 106   105 10.455600
## 107   106  3.838890
## 108   107 -2.155560
## 109   108 -4.733330
## 110   109 -5.461110
## 111   110  2.416670
## 112   111  8.827780
## 113   112 17.394400
## 114   113 19.755600
## 115   114 24.466700
## 116   115 21.216700
## 117   116 19.138900
## 118   117  9.555560
## 119   118  8.183330
## 120   119  3.522220
## 121   120 -4.005560
## 122   121 -2.661110
## 123   122  1.133330
## 124   123  9.555560
## 125   124 16.900000
## 126   125 22.894400
## 127   126 24.972200
## 128   127 21.605600
## 129   128 18.583300
## 130   129 11.072200
## 131   130  0.955556
## 132   131 -2.716670
## 133   132 -2.550000
## 134   133 -0.866667
## 135   134  3.338890
## 136   135 10.444400
## 137   136 18.477800
## 138   137 23.272200
## 139   138 26.161100
## 140   139 22.577800
## 141   140 18.416700
## 142   141  9.233330
## 143   142  4.838890
## 144   143 -0.877778
## 145   144 -5.850000
## 146   145 -3.477780
## 147   146  2.994440
## 148   147  9.983330
## 149   148 19.227800
## 150   149 26.505600
## 151   150 26.161100
## 152   151 26.505600
## 153   152 19.344400
## 154   153 10.677800
## 155   154  3.166670
## 156   155 -2.033330
## 157   156 -2.611110
## 158   157 -4.172220
## 159   158  1.950000
## 160   159 13.161100
## 161   160 18.188900
## 162   161 22.288900
## 163   162 26.683300
## 164   163 24.716700
## 165   164 17.783300
## 166   165 11.888900
## 167   166  3.105560
## 168   167 -2.950000
## 169   168 -1.916670
## 170   169 -1.227780
## 171   170  4.438890
## 172   171 10.216700
## 173   172 15.066700
## 174   173 23.616700
## 175   174 24.372200
## 176   175 24.427800
## 177   176 20.844400
## 178   177 11.372200
## 179   178  5.072220
## 180   179 -2.900000
## 181   180 -4.227780
## 182   181 -1.338890
## 183   182  3.972220
## 184   183 10.966700
## 185   184 18.244400
## 186   185 23.850000
## 187   186 25.872200
## 188   187 23.733300
## 189   188 19.400000
## 190   189 10.794400
## 191   190  0.972222
## 192   191 -1.050000
## 193   192 -1.050000
## 194   193  1.200000
## 195   194  5.305560
## 196   195  8.827780
## 197   196 16.916700
## 198   197 20.438900
## 199   198 22.694400
## 200   199 21.772200
## 201   200 18.127800
## 202   201 11.255600
## 203   202  2.416670
## 204   203 -1.627780

Create your plot!

# Scatterplot of Temp against Month for the Mitchell data.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point()

# Correlation test between Month and Temp.
cor.test(
  x = Mitchell$Month,
  y = Mitchell$Temp
)
## 
##  Pearson's product-moment correlation
## 
## data:  Mitchell$Month and Mitchell$Temp
## t = 0.81816, df = 202, p-value = 0.4142
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.08053637  0.19331562
## sample estimates:
##        cor 
## 0.05747063

Noisy Scatterplots

噪声数值

# Same Mitchell scatterplot, with an x-axis break every 12 months (0 to 203)
# so that each gridline interval covers one year.
ggplot(data = Mitchell, mapping = aes(x = Month, y = Temp)) +
  geom_point() +
  scale_x_continuous(breaks = seq(0, 203, by = 12))


Making Sense of Data

Notes:

# Age in fractional years: whole years plus (12 - dob_month) / 12.
# NOTE(review): assumes dob_month is an integer 1-12 - confirm against the data.
pf$age_with_months <- with(pf, age + (12 - dob_month) / 12)

# Mean/median friend_count and group size for each fractional age, sorted
# by that age.
pf.fc_by_age_months <- pf %>%
  group_by(age_with_months) %>%
  summarise(
    friend_count_mean = mean(friend_count),
    friend_count_median = median(friend_count),
    n = n()
  ) %>%
  arrange(age_with_months)

# q1: mean friend count by age measured in months (ages below 71), with a
# smoother drawn over the line.
q1 <- ggplot(
  data = subset(pf.fc_by_age_months, age_with_months < 71),
  mapping = aes(x = age_with_months, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# q2: mean friend count by age in whole years (ages below 71), with a
# smoother drawn over the line.
q2 <- ggplot(
  data = subset(pf.fc_by_age, age < 71),
  mapping = aes(x = age, y = friend_count_mean)
) +
  geom_line() +
  geom_smooth()

# q3: coarser view of the same relationship. Ages are rounded to the nearest
# multiple of 5 (so each x value pools the ages within about 2.5 years of it)
# and the mean friend_count of each bin is drawn as a line.
# `fun` replaces the deprecated `fun.y` argument of the summary stat.
q3 <- ggplot(
  data = subset(pf, age < 71),
  mapping = aes(x = round(age / 5) * 5, y = friend_count)
) +
  geom_line(stat = "summary", fun = mean)

library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the three plots vertically in a single column for comparison.
grid.arrange(grobs = list(q1, q2, q3), ncol = 1)
## `geom_smooth()` using method = 'loess'
## `geom_smooth()` using method = 'loess'

***