第四章探索性数据分析

4.1 数据集

library("MASS")
data(package="MASS") # show data sets in MASS
data(Insurance)
dim(Insurance) # 显示数据集维度 nrow,ncol

## [1] 64  5

head(Insurance,n=10L) # 显示前10行

##    District  Group   Age Holders Claims
## 1         1    <1l   <25     197     38
## 2         1    <1l 25-29     264     35
## 3         1    <1l 30-35     246     20
## 4         1    <1l   >35    1680    156
## 5         1 1-1.5l   <25     284     63
## 6         1 1-1.5l 25-29     536     84
## 7         1 1-1.5l 30-35     696     89
## 8         1 1-1.5l   >35    3582    400
## 9         1 1.5-2l   <25     133     19
## 10        1 1.5-2l 25-29     286     52

4.2 数字化探索

补充资料

第一四分位数 (Q1)，又称“较小四分位数”，等于该样本中所有数值由小到大排列后第25%的数字
第二四分位数 (Q2)，又称“中位数”，等于该样本中所有数值由小到大排列后第50%的数字
第三四分位数 (Q3)，又称“较大四分位数”，等于该样本中所有数值由小到大排列后第75%的数字
第三四分位数与第一四分位数的差距又称四分位距（InterQuartile Range,IQR）
首先确定四分位数的位置：[n表示项数]
Q1的位置= (n+1) × 0.25
Q2的位置= (n+1) × 0.5
Q3的位置= (n+1) × 0.75
对于四分位数的确定，有不同的方法，另外一种方法基于N-1 基础。即
Q1的位置=1+（n-1）x 0.25
Q2的位置=1+（n-1）x 0.5
Q3的位置=1+（n-1）x 0.75

##### 4.2.1 变量概况

attributes( Insurance ) #显示属性、数据集格式、行名

## $names
## [1] "District" "Group"    "Age"      "Holders"  "Claims"  
## 
## $class
## [1] "data.frame"
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64

str(Insurance )

## 'data.frame':    64 obs. of  5 variables:
##  $ District: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Group   : Ord.factor w/ 4 levels "<1l"<"1-1.5l"<..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Age     : Ord.factor w/ 4 levels "<25"<"25-29"<..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Holders : int  197 264 246 1680 284 536 696 3582 133 286 ...
##  $ Claims  : int  38 35 20 156 63 84 89 400 19 52 ...

summary(Insurance)

##  District    Group       Age        Holders            Claims      
##  1:16     <1l   :16   <25  :16   Min.   :   3.00   Min.   :  0.00  
##  2:16     1-1.5l:16   25-29:16   1st Qu.:  46.75   1st Qu.:  9.50  
##  3:16     1.5-2l:16   30-35:16   Median : 136.00   Median : 22.00  
##  4:16     >2l   :16   >35  :16   Mean   : 364.98   Mean   : 49.23  
##                                  3rd Qu.: 327.50   3rd Qu.: 55.50  
##                                  Max.   :3582.00   Max.   :400.00

4.2.2 变量详情

library("Hmisc")
describe(Insurance) # 取值水平<=10默认为离散变量

## Insurance 
## 
##  5  Variables      64  Observations
## ---------------------------------------------------------------------------
## District 
##       n missing  unique 
##      64       0       4 
## 
## 1 (16, 25%), 2 (16, 25%), 3 (16, 25%), 4 (16, 25%) 
## ---------------------------------------------------------------------------
## Group 
##       n missing  unique 
##      64       0       4 
## 
## <1l (16, 25%), 1-1.5l (16, 25%), 1.5-2l (16, 25%) 
## >2l (16, 25%) 
## ---------------------------------------------------------------------------
## Age 
##       n missing  unique 
##      64       0       4 
## 
## <25 (16, 25%), 25-29 (16, 25%), 30-35 (16, 25%) 
## >35 (16, 25%) 
## ---------------------------------------------------------------------------
## Holders 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      64       0      63       1     365   16.30   24.00   46.75  136.00 
##     .75     .90     .95 
##  327.50  868.90 1639.25 
## 
## lowest :    3    7    9   16   18, highest: 1635 1640 1680 2443 3582 
## ---------------------------------------------------------------------------
## Claims 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      64       0      46       1   49.23    3.15    4.30    9.50   22.00 
##     .75     .90     .95 
##   55.50  101.70  182.35 
## 
## lowest :   0   2   3   4   5, highest: 156 187 233 290 400 
## ---------------------------------------------------------------------------

library("fBasics")
basicStats(Insurance$Holders) # nobs-对象数; SEMean-标准误差均值; LCL/UCL Mean 95%置信水平置信限; Variance 方差; Stdev 标准误; Skewness 偏度; Kurtisis 峰度

##             X..Insurance.Holders
## nobs                6.400000e+01
## NAs                 0.000000e+00
## Minimum             3.000000e+00
## Maximum             3.582000e+03
## 1. Quartile         4.675000e+01
## 3. Quartile         3.275000e+02
## Mean                3.649844e+02
## Median              1.360000e+02
## Sum                 2.335900e+04
## SE Mean             7.784632e+01
## LCL Mean            2.094209e+02
## UCL Mean            5.205478e+02
## Variance            3.878432e+05
## Stdev               6.227706e+02
## Skewness            3.127833e+00
## Kurtosis            1.099961e+01

4.2.3 分布指标（某变量各水平取值情况）

library("timeDate") 
skewness(Insurance$Holder) # 偏度衡量数据偏倚成都，abs(偏度)>1左偏（-）或右偏（+）

## [1] 3.127833
## attr(,"method")
## [1] "moment"

kurtosis( Insurance$Holders ) # 峰度衡量数据分布形态的陡缓程度，峰度>0:尖顶峰度；峰度=0：标准峰度；峰度<0：平顶峰度

## [1] 10.99961
## attr(,"method")
## [1] "excess"

4.2.4 稀疏性

library("Matrix")
i<-sample(1:10,10,replace=TRUE)
j<-sample(1:10,10,replace=TRUE)
A<-sparseMatrix(i,j,x=1) # 维数的变化是由于随机取样造成的
local<-which(A==1,arr.ind=TRUE) # arr.ind:当为数组时，是否返回数组（否的时候返回向量）
plot(local,pch=22)

##### 4.2.5 缺失值

library("mice")

## Loading required package: Rcpp

## mice 2.25 2015-11-09

A<-diag(2,5,5)
A

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    2    0    0    0    0
## [2,]    0    2    0    0    0
## [3,]    0    0    2    0    0
## [4,]    0    0    0    2    0
## [5,]    0    0    0    0    2

A[A==2]<-NA # 创建缺失值
A[4,]<-2
A[5,3:5]<-4
A

##      [,1] [,2] [,3] [,4] [,5]
## [1,]   NA    0    0    0    0
## [2,]    0   NA    0    0    0
## [3,]    0    0   NA    0    0
## [4,]    2    2    2    2    2
## [5,]    0    0    4    4    4

md.pattern(A)

##   [,1] [,2] [,3] [,4] [,5] [,6]
## 2    1    1    1    1    1    0
## 1    1    1    0    1    1    1
## 1    1    1    1    0    1    1
## 1    1    1    1    1    0    1
##      0    0    1    1    1    3

4.2.6 相关性

cor(Insurance$Holder,Insurance$Claims) # 投保人数量~要求索赔数量

## [1] 0.9857701

x<-matrix( sample(1:100) ,ncol=10 ) # 实验数据
cor_x<-cor( x )
library("ellipse")
plotcorr(cor_x,col=rep(c("white","black"),5)) # 生成“相关图”

plotcorr(cor_x,diag=T,type="lower",col=rep(c("white","black"),5)) # diag:对角线 lower：下三角 upper：上三角

#### 4.3 可视化探索      
##### 4.3.1 直方图

library("MASS")
data(Insurance)
hist(Insurance$Claims,main="Histogram of freq of Insurance$Claims") # 要求索赔的人数

hist(Insurance$Claims,main="Histogram of freq of Insurance$Claims whit 20 bars" ,breaks=20,labels=TRUE,col="black",border="white" ) # breaks-分组（0-20,20-40,40-60...）； col填充颜色设置； border-边框颜色设置

hist(Insurance$Claims,freq=FALSE,density=20,main="Histogram of density of Insurance$Claims") # freq-频率直方图(T)/密度直方图(F)，
lines(density(Insurance$Claims)) # 绘制密度曲线

4.3.2 积累分布图

library("Hmisc") # 书上有错误
par(mfrow=c(1,2)) # 子图模式，两个图在同一列（1行2列）
Ecdf(Insurance$Claims,xlab="Claims",main="Cumulative Distribution of Claims") # 可以认为积累分布图各点斜率为概率密度曲线在该点的取值，即积累密度曲线在某一点的斜率越大，概率密度越大
plot(density(Insurance$Claims),main="密度曲线图")

par(mfrow=c(1,1)) # 退出子图模式（这里并没有什么子图模式，只是便于记忆）
data_plot<-with( Insurance,
                 rbind( 
                   data.frame( var1=Claims[Age=="<25"],var2="<25" ),
                   data.frame( var1=Claims[Age=="25-29"],var2="25-29" ),
                   data.frame( var1=Claims[Age=="30-35"],var2="30-35" ),
                   data.frame( var1=Claims[Age==">35"],var2=">35" ) # 小心这里不要多了逗号
                 )
) # 构建数据
data_plot[c(1:4,16+1:4,32+1:4,48+1:4),] # 查看数据

##    var1  var2
## 1    38   <25
## 2    63   <25
## 3    19   <25
## 4     4   <25
## 17   35 25-29
## 18   84 25-29
## 19   52 25-29
## 20   18 25-29
## 33   20 30-35
## 34   89 30-35
## 35   74 30-35
## 36   19 30-35
## 49  156   >35
## 50  400   >35
## 51  233   >35
## 52   77   >35

Ecdf( data_plot$var1,group=data_plot$var2,lty=1:4,label.curves=1:4,col=c("blue","green","red","black"),
      xlab="Claims",main="Cumulative Distribution of Claims by Age"
      ) # lty-线型设置； label.curves-标出分组曲线组名； group-分组变量
Ecdf(Insurance$Claim,add=TRUE,col="red")

结论：一定程度上说明了年龄较大的投保人（>35）要求索赔量较高

4.3.3 箱线图

Claims_bp <- boxplot( Insurance$Claims,main="Distribution of Claims" )
Claims_bp$stats

##      [,1]
## [1,]    0
## [2,]    9
## [3,]   22
## [4,]   58
## [5,]  102
## attr(,"class")
##           
## "integer"

summary(Insurance$Claims) #为什么这里和之前的分位点的值不一样

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.50   22.00   49.23   55.50  400.00

points(x=1,y=mean(Insurance$Claims),pch=8) # 标记平均值
Claims_points<-as.matrix(Insurance$Claims[which(Insurance$Claims>102)]) #异常值
Claims_text<-rbind(Claims_bp$stats,mean(Insurance$Claims),Claims_points)
Claims_text

##            [,1]
##  [1,]   0.00000
##  [2,]   9.00000
##  [3,]  22.00000
##  [4,]  58.00000
##  [5,] 102.00000
##  [6,]  49.23438
##  [7,] 156.00000
##  [8,] 400.00000
##  [9,] 233.00000
## [10,] 290.00000
## [11,] 143.00000
## [12,] 187.00000

for(i in 1:length(Claims_text)){
  print(i)
  text(x=1.1,y=Claims_text[i,],labels=Claims_text[i,]) # 标记各点
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12

boxplot(var1~var2,data=data_plot,horizontal=TRUE,
        main="Distribution of Claims by Age",xlab="Claims",ylab="Age")

结论：
1. Claims分布右偏趋势
2. 前三组（<35）的Claims量小于最后一组（>35）,“或许”可以认年龄大于35的更易发生事故

with( Insurance,{
  boxplot(Holders~Age,boxwex=0.25,at=1:4+0.2,subset = Age == ">35") #绘制第一个箱子（>35）：
  boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "30-35")
  boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "25-29")
  boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "<25")
} ) # 索赔人数~年龄

boxplot(var1~var2,data=data_plot,add=TRUE,boxwex=0.25,at=1:4-0.2,
        col="lightgrey",
        xlab="Age",ylab="Holder")  # 索赔人数~年龄
legend(x="topleft",c( "Claims","Holders" ),fill=c("lightgrey","white"))

#-分开绘图看细节-#
par(mfrow=c( 1,2 ))
boxplot(var1~var2,data=data_plot,boxwex=0.25,at=1:4-0.2,
        col="lightgrey",main="Claims~Age",
        xlab="Age",ylab="Holder")  # 索赔人数~年龄

with( Insurance,{
  boxplot(Holders~Age,boxwex=0.25,at=1:4+0.2,subset = Age == ">35",main="Holders~Age") #绘制第一个箱子（>35）：
  boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "30-35")
  boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "25-29")
  boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "<25")
} ) # 索赔人数~年龄

par(mfrow=c(1,1))

结论：
1. 可以看出>35投保人数也多，所以很可能是由于投保人数量众多才造成了要求索赔人数增多 2. Holders和Claims的差异还可以得出<=35的要求索赔人数更多一些，出事故的可能更大一些，因为差值最大

library("Hmisc")
attach(data_plot);
data_plot<-list(
  var1[ which(var2=="<25") ],
  var1[ which(var2=="25-29") ],
  var1[ which(var2=="30-35") ],
  var1[ which(var2==">35") ]
  ) # 构建数据
detach(data_plot)
par(mfrow=c(1,1))
data_plot

## [[1]]
##  [1] 38 63 19  4 22 25 14  4  5 10  8  3  2  7  5  0
## 
## [[2]]
##  [1] 35 84 52 18 19 51 46 15 11 24 19  2  5 10  7  6
## 
## [[3]]
##  [1] 20 89 74 19 22 49 39 12 10 37 24  8  4 22 16  8
## 
## [[4]]
##  [1] 156 400 233  77  87 290 143  53  67 187 101  37  36 102  63  33

bpplot( data_plot,name=c("<25","25-29","30-35",">35"),xlab="Age",ylab="Claims" )#用箱子的宽窄表示所有观测值中高于/低于相应位置的值得比例

##### 4.3.4 条形图

Claims_Age<-with(Insurance,
    c( sum( Claims[which( Age=="<25" ) ] ),
       sum( Claims[which( Age=="25-29" ) ] ),
       sum( Claims[which( Age=="30-35" ) ] ),
       sum( Claims[which( Age==">35" ) ]  ) )
)
x<-barplot( Claims_Age,names.arg=c("<25","25-29","30-35",">35"),density=20,main="Ditribution of Age by Claims",xlab="Age",ylab="Claims"); # 返回条形图中间值
text(t(x),Claims_Age,labels=Claims_Age,cex=1,pos=1) # 标注数字，cex-标识大小； pos-位置（下、左、上、右）【由于没有labels标签，使用其他方式】

Holders_Age<-with(Insurance,
    c( sum( Holders[which( Age=="<25" ) ] ),
       sum( Holders[which( Age=="25-29" ) ] ),
       sum( Holders[which( Age=="30-35" ) ] ),
       sum( Holders[which( Age==">35" ) ]  ) )
)
data_bar <- rbind(Claims_Age,Holders_Age)
x<-barplot( data_bar,names.arg=c("<25","25-29","30-35",">35"),col=c("grey","black"),
            beside=TRUE,main="Ditribution of Age by Claims and Holders",
            xlab="Age",ylab="Claims&Holder",
            legend.text=rownames(data_bar),args.legend=list(x="topleft") # 设置图例
            );# beside=TRUE构建分组条形图

x<-barplot( data_bar,names.arg=c("<25","25-29","30-35",">35"),col=c("grey","black"),
            beside=FALSE,main="Ditribution of Age by Claims and Holders",
            xlab="Age",ylab="Claims&Holder",
            legend.text=rownames(data_bar),args.legend=list(x="topleft") #设置图例
            ) # beside=FALSE构建堆叠条形图

##### 4.3.5 点阵图

dotchart( data_bar,xlab="Claims&Holders",main="点阵图", pch=1:2 )
legend( x=14000,y=15,"<25",bty="n" )
legend( x=14000,y=11,"25-29",bty="n" )
legend( x=14000,y=7,"30-25",bty="n" )
legend( x=14000,y=3,">35",bty="n" ) #bty:the type of box to be drawn around the legend. The allowed values are "o" (the default) and "n"

##### 4.3.6 饼图

pie(Claims_Age,labels=c("<25","25-29","30-35",">35"),
    main="pie cahrt of Claims by Age",
    col = c("deepskyblue", "yellow", "yellow3","skyblue")
    )

percent<-round(Claims_Age/sum(Claims_Age)*100,2)
labels<-paste(c("<25","25-29","30-35",">35"),":",percent,"%",sep="")
pie(Claims_Age,labels=labels,
    main="pie cahrt of Claims by Age",
    col = c("deepskyblue", "yellow", "yellow3","skyblue")
    )# 加入百分数

library("plotrix")
pie3D(Claims_Age,labels=labels,
    main="pie cahrt of Claims by Age",
    col = c("deepskyblue", "yellow", "yellow3","skyblue"),
    explode=0.05,labelcex=1# explode-缝隙大小；labelcex-标签大小（书上错误）
    )

第四章 探索性数据分析