第十二课 基本统计分析

lidong

26/05/2021

描述性统计

在描述性统计量的计算方面,R有很多方法,这里仅展示基础包的函数。

> summary(mtcars[,1:3])
##       mpg             cyl             disp      
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8  
##  Median :19.20   Median :6.000   Median :196.3  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0

\(~\)

分组计算描述性统计量

\(~\) 使用aggregate()分组获取描述性统计量

> aggregate(mtcars[,1:3], by=list(am=mtcars$am), mean)
##   am      mpg      cyl     disp
## 1  0 17.14737 6.947368 290.3789
## 2  1 24.39231 5.076923 143.5308

\(~\) aggregate()通常搭配平均数、标准差这样的单返回值函数使用。若传入的函数一次返回若干个统计量,结果会以矩阵列的形式存放在数据框中,不便于阅读和后续处理;想一次得到多个统计量时,可以改用下面的by()。

by(data, INDICES, FUN, …, simplify =TRUE)函数用于将data中的数据,按照INDICES里面的内容拆分成若干个小的data frame,并且在每一小块data frame上应用FUN函数。

\(~\)

> by(mtcars, mtcars$am,summary)
## mtcars$am: 0
##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   :120.1   Min.   : 62.0  
##  1st Qu.:14.95   1st Qu.:6.000   1st Qu.:196.3   1st Qu.:116.5  
##  Median :17.30   Median :8.000   Median :275.8   Median :175.0  
##  Mean   :17.15   Mean   :6.947   Mean   :290.4   Mean   :160.3  
##  3rd Qu.:19.20   3rd Qu.:8.000   3rd Qu.:360.0   3rd Qu.:192.5  
##  Max.   :24.40   Max.   :8.000   Max.   :472.0   Max.   :245.0  
##       drat             wt             qsec             vs               am   
##  Min.   :2.760   Min.   :2.465   Min.   :15.41   Min.   :0.0000   Min.   :0  
##  1st Qu.:3.070   1st Qu.:3.438   1st Qu.:17.18   1st Qu.:0.0000   1st Qu.:0  
##  Median :3.150   Median :3.520   Median :17.82   Median :0.0000   Median :0  
##  Mean   :3.286   Mean   :3.769   Mean   :18.18   Mean   :0.3684   Mean   :0  
##  3rd Qu.:3.695   3rd Qu.:3.842   3rd Qu.:19.17   3rd Qu.:1.0000   3rd Qu.:0  
##  Max.   :3.920   Max.   :5.424   Max.   :22.90   Max.   :1.0000   Max.   :0  
##       gear            carb      
##  Min.   :3.000   Min.   :1.000  
##  1st Qu.:3.000   1st Qu.:2.000  
##  Median :3.000   Median :3.000  
##  Mean   :3.211   Mean   :2.737  
##  3rd Qu.:3.000   3rd Qu.:4.000  
##  Max.   :4.000   Max.   :4.000  
## ------------------------------------------------------------ 
## mtcars$am: 1
##       mpg             cyl             disp             hp             drat     
##  Min.   :15.00   Min.   :4.000   Min.   : 71.1   Min.   : 52.0   Min.   :3.54  
##  1st Qu.:21.00   1st Qu.:4.000   1st Qu.: 79.0   1st Qu.: 66.0   1st Qu.:3.85  
##  Median :22.80   Median :4.000   Median :120.3   Median :109.0   Median :4.08  
##  Mean   :24.39   Mean   :5.077   Mean   :143.5   Mean   :126.8   Mean   :4.05  
##  3rd Qu.:30.40   3rd Qu.:6.000   3rd Qu.:160.0   3rd Qu.:113.0   3rd Qu.:4.22  
##  Max.   :33.90   Max.   :8.000   Max.   :351.0   Max.   :335.0   Max.   :4.93  
##        wt             qsec             vs               am         gear      
##  Min.   :1.513   Min.   :14.50   Min.   :0.0000   Min.   :1   Min.   :4.000  
##  1st Qu.:1.935   1st Qu.:16.46   1st Qu.:0.0000   1st Qu.:1   1st Qu.:4.000  
##  Median :2.320   Median :17.02   Median :1.0000   Median :1   Median :4.000  
##  Mean   :2.411   Mean   :17.36   Mean   :0.5385   Mean   :1   Mean   :4.385  
##  3rd Qu.:2.780   3rd Qu.:18.61   3rd Qu.:1.0000   3rd Qu.:1   3rd Qu.:5.000  
##  Max.   :3.570   Max.   :19.90   Max.   :1.0000   Max.   :1   Max.   :5.000  
##       carb      
##  Min.   :1.000  
##  1st Qu.:1.000  
##  Median :2.000  
##  Mean   :2.923  
##  3rd Qu.:4.000  
##  Max.   :8.000

\(~\)

使用psych包中的describeBy()分组计算概述统计量

> library(psych)
> describeBy(mtcars[,1:3], list(am=mtcars$am))
## 
##  Descriptive statistics by group 
## am: 0
##      vars  n   mean     sd median trimmed    mad   min   max range  skew
## mpg     1 19  17.15   3.83   17.3   17.12   3.11  10.4  24.4  14.0  0.01
## cyl     2 19   6.95   1.54    8.0    7.06   0.00   4.0   8.0   4.0 -0.95
## disp    3 19 290.38 110.17  275.8  289.71 124.83 120.1 472.0 351.9  0.05
##      kurtosis    se
## mpg     -0.80  0.88
## cyl     -0.74  0.35
## disp    -1.26 25.28
## ------------------------------------------------------------ 
## am: 1
##      vars  n   mean    sd median trimmed   mad  min   max range skew kurtosis
## mpg     1 13  24.39  6.17   22.8   24.38  6.67 15.0  33.9  18.9 0.05    -1.46
## cyl     2 13   5.08  1.55    4.0    4.91  0.00  4.0   8.0   4.0 0.87    -0.90
## disp    3 13 143.53 87.20  120.3  131.25 58.86 71.1 351.0 279.9 1.33     0.40
##         se
## mpg   1.71
## cyl   0.43
## disp 24.19

频数表和列联表

\(~\)

> table(am=mtcars$am)
## am
##  0  1 
## 19 13
> xtabs(~vs+am,data=mtcars)
##    am
## vs   0  1
##   0 12  6
##   1  7  7

\(~\) –独立性检验 chisq.test()

\(~\) – Fisher精确检验 fisher.test()

\(~\)

相关

\(~\)

R可以计算多种相关系数,包括Pearson相关系数、Spearman相关系数、Kendall相关系数、偏相关系数。cor()默认计算Pearson相关系数,可通过method参数指定"spearman"或"kendall"。

\(~\)

> x <- rnorm(10)
> y <- rnorm(10)
> cor(x,y)
## [1] -0.4982695
> cor(mtcars$mpg,mtcars$disp)
## [1] -0.8475514

\(~\)

t检验

\(~\)
在研究中最常见的行为就是对两个组进行比较。t检验用于检验两组数据的均值是否存在显著差异。

\(~\)

> x <- rnorm(10)
> y <- rnorm(10)
> t.test(x,y,var.equal=T)#var.equal=T表示假定两组方差相等(方差齐性)。你能得到和我一样的结果么?为什么?
## 
##  Two Sample t-test
## 
## data:  x and y
## t = 2.4758, df = 18, p-value = 0.02346
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  0.1194265 1.4579712
## sample estimates:
##  mean of x  mean of y 
##  0.6183135 -0.1703853
> t.statistic <- replicate(1000,t.test(rnorm(10),rnorm(10),var.equal=T)$statistic)#replicate是重复执行某一命令的函数,1000 代表执行次数
>  opar <- par(no.readonly=TRUE)
> par(fig=c(0, 0.8, 0, 1))#画图区域,fig的前两个值指定横向范围,后两个值指定纵向范围
> plot(t.statistic)#每次执行上述命令后t值的散点图
> par(fig=c(0.65, 1, 0, 1), new=TRUE)##在上幅图的右侧画分布图
> plot(density(t.statistic)$y,density(t.statistic)$x,axes=F,frame.plot=F, xlab="", ylab="",main="",type = "l")

> par(opar)