本文档只描述笔者学习R语言过程中的描述统计分析函数的过程。
本次训练关注分析连续型变量的中心趋势、变化性和分布形状的方法。最常见的描述统计量有最小值、最大值、四分位数、均值和中位数,除此之外还有标准差、方差、值域(全距)、平均值的标准误、峰度、偏度、截尾均值等。
R中提供的计算方法非常之多,自带的summary函数,以及扩展包Hmisc包的describe函数、pastecs包的stat.desc函数、psych包的describe函数等。
本次训练采用自带的是mtcars数据集,关注焦点是每加仑汽油行驶的英里数(mpg)、马力(hp)和车重(wt)
vars <- c("mpg","hp","wt")
head(mtcars[vars])
## mpg hp wt
## Mazda RX4 21.0 110 2.620
## Mazda RX4 Wag 21.0 110 2.875
## Datsun 710 22.8 93 2.320
## Hornet 4 Drive 21.4 110 3.215
## Hornet Sportabout 18.7 175 3.440
## Valiant 18.1 105 3.460
summary(mtcars[vars])
## mpg hp wt
## Min. :10.40 Min. : 52.0 Min. :1.513
## 1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581
## Median :19.20 Median :123.0 Median :3.325
## Mean :20.09 Mean :146.7 Mean :3.217
## 3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610
## Max. :33.90 Max. :335.0 Max. :5.424
mystats <- function(x,na.omit=FALSE){
if (na.omit)
x <- x[!is.na(x)]
m <- mean(x)
n <- length(x)
s <- sd(x)
skew <- sum((x-m)^3/s^3)/n
kurt <- sum((x-m)^4/s^4)/n - 3
return(c(n=n,mean=m,stdev=s,skew=skew,kurtosis=kurt))
}
sapply(mtcars[vars],mystats)
## mpg hp wt
## n 32.000000 32.0000000 32.00000000
## mean 20.090625 146.6875000 3.21725000
## stdev 6.026948 68.5628685 0.97845744
## skew 0.610655 0.7260237 0.42314646
## kurtosis -0.372766 -0.1355511 -0.02271075
首次使用需要用install.packages命令安装Hmisc包
library(Hmisc)
describe(mtcars[vars])
## vars n mean sd median trimmed mad min max range skew
## mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61
## hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73
## wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42
## kurtosis se
## mpg -0.37 1.07
## hp -0.14 12.12
## wt -0.02 0.17
首次使用需要用install.packages命令安装pastecs包
stat.desc(x, basic=TRUE, desc=TRUE, norm=FALSE, p=0.95)
library(pastecs)
stat.desc(mtcars[vars],norm = TRUE)
## mpg hp wt
## nbr.val 32.0000000 32.00000000 32.00000000
## nbr.null 0.0000000 0.00000000 0.00000000
## nbr.na 0.0000000 0.00000000 0.00000000
## min 10.4000000 52.00000000 1.51300000
## max 33.9000000 335.00000000 5.42400000
## range 23.5000000 283.00000000 3.91100000
## sum 642.9000000 4694.00000000 102.95200000
## median 19.2000000 123.00000000 3.32500000
## mean 20.0906250 146.68750000 3.21725000
## SE.mean 1.0654240 12.12031731 0.17296847
## CI.mean.0.95 2.1729465 24.71955013 0.35277153
## var 36.3241028 4700.86693548 0.95737897
## std.dev 6.0269481 68.56286849 0.97845744
## coef.var 0.2999881 0.46740771 0.30412851
## skewness 0.6106550 0.72602366 0.42314646
## skew.2SE 0.7366922 0.87587259 0.51048252
## kurtosis -0.3727660 -0.13555112 -0.02271075
## kurt.2SE -0.2302812 -0.08373853 -0.01402987
## normtest.W 0.9475647 0.93341934 0.94325772
## normtest.p 0.1228814 0.04880824 0.09265499
首次使用需要用install.packages命令安装psych包,相对于Hmisc包的describe函数,后加载的psych包中的describe函数的优先级高
describe(x, na.rm = TRUE, interp=FALSE,skew = TRUE, ranges = TRUE,trim=.1,type=3,check=TRUE,fast=NULL,quant=NULL,IQR=FALSE)
library(psych)
describe(mtcars[vars])
## vars n mean sd median trimmed mad min max range skew
## mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61
## hp 2 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73
## wt 3 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42
## kurtosis se
## mpg -0.37 1.07
## hp -0.14 12.12
## wt -0.02 0.17
对于描述统计量,我们还可以通过图形来观察数据的分布情况,以便于我们观察和了解单单是统计量无法看到的信息
#箱型图
par(mfrow=c(1,3))
boxplot(mtcars$mpg,xlab="mpg")
boxplot(mtcars$hp,xlab="hp")
boxplot(mtcars$wt,xlab="wt")
#直方图
par(mfrow=c(1,3))
hist(mtcars$mpg,freq = FALSE,breaks = 12,xlab="mpg")
rug(jitter(mtcars$mpg))
lines(density(mtcars$mpg),col="blue",lwd=1)
hist(mtcars$hp,freq = FALSE,breaks = 12,xlab="hp")
rug(jitter(mtcars$hp))
lines(density(mtcars$hp),col="blue",lwd=1)
hist(mtcars$wt,freq = FALSE,breaks = 12,xlab="wt")
rug(jitter(mtcars$wt))
lines(density(mtcars$wt),col="blue",lwd=1)
上述的方法多用于对整体的描述统计量的计算,在实际应用中,我们也需要对各组的数据进行描述统计量的计算和观察,例如性别分组或者在mtcars数据集中用汽车的变速箱类型(am)进行分组统计
官方文档解释 S3 method for class ‘data.frame’ aggregate(x, by, FUN, …, simplify = TRUE, drop = TRUE)
aggregate(mtcars[vars],by=list(am=mtcars$am),mean)
## am mpg hp wt
## 1 0 17.14737 160.2632 3.768895
## 2 1 24.39231 126.8462 2.411000
aggregate(mtcars[vars],by=list(am=mtcars$am),sd)
## am mpg hp wt
## 1 0 3.833966 53.90820 0.7774001
## 2 1 6.166504 84.06232 0.6169816
遗憾的是aggregate函数每次只能调用mean,sd此类的函数,但aggregate允许定义多种分组条件。
summaryBy(formula, data = parent.frame(), id = NULL, FUN = mean, keep.names=FALSE, p2d=FALSE, order=TRUE, full.dimension=FALSE, var.names=NULL, fun.names=NULL, …)
在R中,formula 接受以下格式: var1 + var2 + … + varN ~ groupvar1 + groupvar2 + … + groupvarN
library(doBy)
summaryBy(mpg+hp+wt~am,data = mtcars,FUN = mystats)
## am mpg.n mpg.mean mpg.stdev mpg.skew mpg.kurtosis hp.n hp.mean
## 1 0 19 17.14737 3.833966 0.01395038 -0.8031783 19 160.2632
## 2 1 13 24.39231 6.166504 0.05256118 -1.4553520 13 126.8462
## hp.stdev hp.skew hp.kurtosis wt.n wt.mean wt.stdev wt.skew
## 1 53.90820 -0.01422519 -1.2096973 19 3.768895 0.7774001 0.9759294
## 2 84.06232 1.35988586 0.5634635 13 2.411000 0.6169816 0.2103128
## wt.kurtosis
## 1 0.1415676
## 2 -1.1737358
describeBy(x, group=NULL,mat=FALSE,type=3,digits=15,…)
library(psych)
describeBy(mtcars[vars],list(am=mtcars$am))
##
## Descriptive statistics by group
## am: 0
## vars n mean sd median trimmed mad min max range skew
## mpg 1 19 17.15 3.83 17.30 17.12 3.11 10.40 24.40 14.00 0.01
## hp 2 19 160.26 53.91 175.00 161.06 77.10 62.00 245.00 183.00 -0.01
## wt 3 19 3.77 0.78 3.52 3.75 0.45 2.46 5.42 2.96 0.98
## kurtosis se
## mpg -0.80 0.88
## hp -1.21 12.37
## wt 0.14 0.18
## --------------------------------------------------------
## am: 1
## vars n mean sd median trimmed mad min max range skew
## mpg 1 13 24.39 6.17 22.80 24.38 6.67 15.00 33.90 18.90 0.05
## hp 2 13 126.85 84.06 109.00 114.73 63.75 52.00 335.00 283.00 1.36
## wt 3 13 2.41 0.62 2.32 2.39 0.68 1.51 3.57 2.06 0.21
## kurtosis se
## mpg -1.46 1.71
## hp 0.56 23.31
## wt -1.17 0.17
特点:灵活 用melt函数融合数据框 melt(data, id.vars, measure.vars, variable_name = “variable”, na.rm = !preserve.na, preserve.na = TRUE, …)
measure.vars为需要进行描述的变量,默认是所有变量 id.vars 为标识或者分组变量,一个或多个
用cast函数重铸数据 cast(data, formula = … ~ variable, fun.aggregate=NULL, …, margins=FALSE, subset=TRUE, df=FALSE, fill=NULL, add.missing=FALSE, value = guess_value(data))
library(reshape)
dfm <- melt(mtcars,measure.vars=c("mpg","hp","wt"),id.vars=c("am"))
#mystats 自定义函数,前面已经实现,在这里重复调用
(cdf <- cast(dfm,am+variable ~ .,mystats))
## am variable n mean stdev skew kurtosis
## 1 0 mpg 19 17.147368 3.8339664 0.01395038 -0.8031783
## 2 0 hp 19 160.263158 53.9081957 -0.01422519 -1.2096973
## 3 0 wt 19 3.768895 0.7774001 0.97592938 0.1415676
## 4 1 mpg 13 24.392308 6.1665038 0.05256118 -1.4553520
## 5 1 hp 13 126.846154 84.0623243 1.35988586 0.5634635
## 6 1 wt 13 2.411000 0.6169816 0.21031283 -1.1737358
比较直观的观察数据
par(mfrow=c(1,3))
barplot(cdf$mean[cdf$variable=="mpg"],names.arg=cdf$am[cdf$variable=="mpg"],xlab = "am",ylab = "mpg's mean")
barplot(cdf$stdev[cdf$variable=="hp"],names.arg=cdf$am[cdf$variable=="hp"],xlab = "am",ylab = "hp's stdev")
barplot(cdf$mean[cdf$variable=="wt"],names.arg=cdf$am[cdf$variable=="wt"],xlab = "am",ylab = "wt's mean")
以上就是本次练习的全部内容,以后有新的发现便在后面追加。