library("MASS")
data(package="MASS") # show data sets in MASS
data(Insurance)
dim(Insurance) # 显示数据集维度 nrow,ncol
## [1] 64 5
head(Insurance,n=10L) # 显示前10行
## District Group Age Holders Claims
## 1 1 <1l <25 197 38
## 2 1 <1l 25-29 264 35
## 3 1 <1l 30-35 246 20
## 4 1 <1l >35 1680 156
## 5 1 1-1.5l <25 284 63
## 6 1 1-1.5l 25-29 536 84
## 7 1 1-1.5l 30-35 696 89
## 8 1 1-1.5l >35 3582 400
## 9 1 1.5-2l <25 133 19
## 10 1 1.5-2l 25-29 286 52
第一四分位数 (Q1),又称“较小四分位数”,等于该样本中所有数值由小到大排列后第25%的数字
第二四分位数 (Q2),又称“中位数”,等于该样本中所有数值由小到大排列后第50%的数字
第三四分位数 (Q3),又称“较大四分位数”,等于该样本中所有数值由小到大排列后第75%的数字
第三四分位数与第一四分位数的差距又称四分位距(InterQuartile Range,IQR)
首先确定四分位数的位置:[n表示项数]
Q1的位置= (n+1) × 0.25
Q2的位置= (n+1) × 0.5
Q3的位置= (n+1) × 0.75
对于四分位数的确定,有不同的方法,另外一种方法基于N-1 基础。即
Q1的位置=1+(n-1)x 0.25
Q2的位置=1+(n-1)x 0.5
Q3的位置=1+(n-1)x 0.75
##### 4.2.1 变量概况
attributes( Insurance ) #显示属性、数据集格式、行名
## $names
## [1] "District" "Group" "Age" "Holders" "Claims"
##
## $class
## [1] "data.frame"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
str(Insurance )
## 'data.frame': 64 obs. of 5 variables:
## $ District: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ Group : Ord.factor w/ 4 levels "<1l"<"1-1.5l"<..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Age : Ord.factor w/ 4 levels "<25"<"25-29"<..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Holders : int 197 264 246 1680 284 536 696 3582 133 286 ...
## $ Claims : int 38 35 20 156 63 84 89 400 19 52 ...
summary(Insurance)
## District Group Age Holders Claims
## 1:16 <1l :16 <25 :16 Min. : 3.00 Min. : 0.00
## 2:16 1-1.5l:16 25-29:16 1st Qu.: 46.75 1st Qu.: 9.50
## 3:16 1.5-2l:16 30-35:16 Median : 136.00 Median : 22.00
## 4:16 >2l :16 >35 :16 Mean : 364.98 Mean : 49.23
## 3rd Qu.: 327.50 3rd Qu.: 55.50
## Max. :3582.00 Max. :400.00
library("Hmisc")
describe(Insurance) # 取值水平<=10默认为离散变量
## Insurance
##
## 5 Variables 64 Observations
## ---------------------------------------------------------------------------
## District
## n missing unique
## 64 0 4
##
## 1 (16, 25%), 2 (16, 25%), 3 (16, 25%), 4 (16, 25%)
## ---------------------------------------------------------------------------
## Group
## n missing unique
## 64 0 4
##
## <1l (16, 25%), 1-1.5l (16, 25%), 1.5-2l (16, 25%)
## >2l (16, 25%)
## ---------------------------------------------------------------------------
## Age
## n missing unique
## 64 0 4
##
## <25 (16, 25%), 25-29 (16, 25%), 30-35 (16, 25%)
## >35 (16, 25%)
## ---------------------------------------------------------------------------
## Holders
## n missing unique Info Mean .05 .10 .25 .50
## 64 0 63 1 365 16.30 24.00 46.75 136.00
## .75 .90 .95
## 327.50 868.90 1639.25
##
## lowest : 3 7 9 16 18, highest: 1635 1640 1680 2443 3582
## ---------------------------------------------------------------------------
## Claims
## n missing unique Info Mean .05 .10 .25 .50
## 64 0 46 1 49.23 3.15 4.30 9.50 22.00
## .75 .90 .95
## 55.50 101.70 182.35
##
## lowest : 0 2 3 4 5, highest: 156 187 233 290 400
## ---------------------------------------------------------------------------
library("fBasics")
basicStats(Insurance$Holders) # nobs-对象数; SEMean-标准误差均值; LCL/UCL Mean 95%置信水平置信限; Variance 方差; Stdev 标准误; Skewness 偏度; Kurtisis 峰度
## X..Insurance.Holders
## nobs 6.400000e+01
## NAs 0.000000e+00
## Minimum 3.000000e+00
## Maximum 3.582000e+03
## 1. Quartile 4.675000e+01
## 3. Quartile 3.275000e+02
## Mean 3.649844e+02
## Median 1.360000e+02
## Sum 2.335900e+04
## SE Mean 7.784632e+01
## LCL Mean 2.094209e+02
## UCL Mean 5.205478e+02
## Variance 3.878432e+05
## Stdev 6.227706e+02
## Skewness 3.127833e+00
## Kurtosis 1.099961e+01
library("timeDate")
skewness(Insurance$Holder) # 偏度衡量数据偏倚成都,abs(偏度)>1左偏(-)或右偏(+)
## [1] 3.127833
## attr(,"method")
## [1] "moment"
kurtosis( Insurance$Holders ) # 峰度衡量数据分布形态的陡缓程度,峰度>0:尖顶峰度;峰度=0:标准峰度;峰度<0:平顶峰度
## [1] 10.99961
## attr(,"method")
## [1] "excess"
library("Matrix")
i<-sample(1:10,10,replace=TRUE)
j<-sample(1:10,10,replace=TRUE)
A<-sparseMatrix(i,j,x=1) # 维数的变化是由于随机取样造成的
local<-which(A==1,arr.ind=TRUE) # arr.ind:当为数组时,是否返回数组(否的时候返回向量)
plot(local,pch=22)
##### 4.2.5 缺失值
library("mice")
## Loading required package: Rcpp
## mice 2.25 2015-11-09
A<-diag(2,5,5)
A
## [,1] [,2] [,3] [,4] [,5]
## [1,] 2 0 0 0 0
## [2,] 0 2 0 0 0
## [3,] 0 0 2 0 0
## [4,] 0 0 0 2 0
## [5,] 0 0 0 0 2
A[A==2]<-NA # 创建缺失值
A[4,]<-2
A[5,3:5]<-4
A
## [,1] [,2] [,3] [,4] [,5]
## [1,] NA 0 0 0 0
## [2,] 0 NA 0 0 0
## [3,] 0 0 NA 0 0
## [4,] 2 2 2 2 2
## [5,] 0 0 4 4 4
md.pattern(A)
## [,1] [,2] [,3] [,4] [,5] [,6]
## 2 1 1 1 1 1 0
## 1 1 1 0 1 1 1
## 1 1 1 1 0 1 1
## 1 1 1 1 1 0 1
## 0 0 1 1 1 3
cor(Insurance$Holder,Insurance$Claims) # 投保人数量~要求索赔数量
## [1] 0.9857701
x<-matrix( sample(1:100) ,ncol=10 ) # 实验数据
cor_x<-cor( x )
library("ellipse")
plotcorr(cor_x,col=rep(c("white","black"),5)) # 生成“相关图”
plotcorr(cor_x,diag=T,type="lower",col=rep(c("white","black"),5)) # diag:对角线 lower:下三角 upper:上三角
#### 4.3 可视化探索
##### 4.3.1 直方图
library("MASS")
data(Insurance)
hist(Insurance$Claims,main="Histogram of freq of Insurance$Claims") # 要求索赔的人数
hist(Insurance$Claims,main="Histogram of freq of Insurance$Claims whit 20 bars" ,breaks=20,labels=TRUE,col="black",border="white" ) # breaks-分组(0-20,20-40,40-60...); col填充颜色设置; border-边框颜色设置
hist(Insurance$Claims,freq=FALSE,density=20,main="Histogram of density of Insurance$Claims") # freq-频率直方图(T)/密度直方图(F),
lines(density(Insurance$Claims)) # 绘制密度曲线
library("Hmisc") # 书上有错误
par(mfrow=c(1,2)) # 子图模式,两个图在同一列(1行2列)
Ecdf(Insurance$Claims,xlab="Claims",main="Cumulative Distribution of Claims") # 可以认为积累分布图各点斜率为概率密度曲线在该点的取值,即积累密度曲线在某一点的斜率越大,概率密度越大
plot(density(Insurance$Claims),main="密度曲线图")
par(mfrow=c(1,1)) # 退出子图模式(这里并没有什么子图模式,只是便于记忆)
data_plot<-with( Insurance,
rbind(
data.frame( var1=Claims[Age=="<25"],var2="<25" ),
data.frame( var1=Claims[Age=="25-29"],var2="25-29" ),
data.frame( var1=Claims[Age=="30-35"],var2="30-35" ),
data.frame( var1=Claims[Age==">35"],var2=">35" ) # 小心这里不要多了逗号
)
) # 构建数据
data_plot[c(1:4,16+1:4,32+1:4,48+1:4),] # 查看数据
## var1 var2
## 1 38 <25
## 2 63 <25
## 3 19 <25
## 4 4 <25
## 17 35 25-29
## 18 84 25-29
## 19 52 25-29
## 20 18 25-29
## 33 20 30-35
## 34 89 30-35
## 35 74 30-35
## 36 19 30-35
## 49 156 >35
## 50 400 >35
## 51 233 >35
## 52 77 >35
Ecdf( data_plot$var1,group=data_plot$var2,lty=1:4,label.curves=1:4,col=c("blue","green","red","black"),
xlab="Claims",main="Cumulative Distribution of Claims by Age"
) # lty-线型设置; label.curves-标出分组曲线组名; group-分组变量
Ecdf(Insurance$Claim,add=TRUE,col="red")
结论:一定程度上说明了年龄较大的投保人(>35)要求索赔量较高
Claims_bp <- boxplot( Insurance$Claims,main="Distribution of Claims" )
Claims_bp$stats
## [,1]
## [1,] 0
## [2,] 9
## [3,] 22
## [4,] 58
## [5,] 102
## attr(,"class")
##
## "integer"
summary(Insurance$Claims) #为什么这里和之前的分位点的值不一样
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.50 22.00 49.23 55.50 400.00
points(x=1,y=mean(Insurance$Claims),pch=8) # 标记平均值
Claims_points<-as.matrix(Insurance$Claims[which(Insurance$Claims>102)]) #异常值
Claims_text<-rbind(Claims_bp$stats,mean(Insurance$Claims),Claims_points)
Claims_text
## [,1]
## [1,] 0.00000
## [2,] 9.00000
## [3,] 22.00000
## [4,] 58.00000
## [5,] 102.00000
## [6,] 49.23438
## [7,] 156.00000
## [8,] 400.00000
## [9,] 233.00000
## [10,] 290.00000
## [11,] 143.00000
## [12,] 187.00000
for(i in 1:length(Claims_text)){
print(i)
text(x=1.1,y=Claims_text[i,],labels=Claims_text[i,]) # 标记各点
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10
## [1] 11
## [1] 12
boxplot(var1~var2,data=data_plot,horizontal=TRUE,
main="Distribution of Claims by Age",xlab="Claims",ylab="Age")
结论:
1. Claims分布右偏趋势
2. 前三组(<35)的Claims量小于最后一组(>35),“或许”可以认年龄大于35的更易发生事故
with( Insurance,{
boxplot(Holders~Age,boxwex=0.25,at=1:4+0.2,subset = Age == ">35") #绘制第一个箱子(>35):
boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "30-35")
boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "25-29")
boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "<25")
} ) # 索赔人数~年龄
boxplot(var1~var2,data=data_plot,add=TRUE,boxwex=0.25,at=1:4-0.2,
col="lightgrey",
xlab="Age",ylab="Holder") # 索赔人数~年龄
legend(x="topleft",c( "Claims","Holders" ),fill=c("lightgrey","white"))
#-分开绘图看细节-#
par(mfrow=c( 1,2 ))
boxplot(var1~var2,data=data_plot,boxwex=0.25,at=1:4-0.2,
col="lightgrey",main="Claims~Age",
xlab="Age",ylab="Holder") # 索赔人数~年龄
with( Insurance,{
boxplot(Holders~Age,boxwex=0.25,at=1:4+0.2,subset = Age == ">35",main="Holders~Age") #绘制第一个箱子(>35):
boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "30-35")
boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "25-29")
boxplot(Holders~Age,add=TRUE,boxwex=0.25,at=1:4+0.2,subset = Age == "<25")
} ) # 索赔人数~年龄
par(mfrow=c(1,1))
结论:
1. 可以看出>35投保人数也多,所以很可能是由于投保人数量众多才造成了要求索赔人数增多 2. Holders和Claims的差异还可以得出<=35的要求索赔人数更多一些,出事故的可能更大一些,因为差值最大
library("Hmisc")
attach(data_plot);
data_plot<-list(
var1[ which(var2=="<25") ],
var1[ which(var2=="25-29") ],
var1[ which(var2=="30-35") ],
var1[ which(var2==">35") ]
) # 构建数据
detach(data_plot)
par(mfrow=c(1,1))
data_plot
## [[1]]
## [1] 38 63 19 4 22 25 14 4 5 10 8 3 2 7 5 0
##
## [[2]]
## [1] 35 84 52 18 19 51 46 15 11 24 19 2 5 10 7 6
##
## [[3]]
## [1] 20 89 74 19 22 49 39 12 10 37 24 8 4 22 16 8
##
## [[4]]
## [1] 156 400 233 77 87 290 143 53 67 187 101 37 36 102 63 33
bpplot( data_plot,name=c("<25","25-29","30-35",">35"),xlab="Age",ylab="Claims" )#用箱子的宽窄表示所有观测值中高于/低于相应位置的值得比例
##### 4.3.4 条形图
Claims_Age<-with(Insurance,
c( sum( Claims[which( Age=="<25" ) ] ),
sum( Claims[which( Age=="25-29" ) ] ),
sum( Claims[which( Age=="30-35" ) ] ),
sum( Claims[which( Age==">35" ) ] ) )
)
x<-barplot( Claims_Age,names.arg=c("<25","25-29","30-35",">35"),density=20,main="Ditribution of Age by Claims",xlab="Age",ylab="Claims"); # 返回条形图中间值
text(t(x),Claims_Age,labels=Claims_Age,cex=1,pos=1) # 标注数字,cex-标识大小; pos-位置(下、左、上、右)【由于没有labels标签,使用其他方式】
Holders_Age<-with(Insurance,
c( sum( Holders[which( Age=="<25" ) ] ),
sum( Holders[which( Age=="25-29" ) ] ),
sum( Holders[which( Age=="30-35" ) ] ),
sum( Holders[which( Age==">35" ) ] ) )
)
data_bar <- rbind(Claims_Age,Holders_Age)
x<-barplot( data_bar,names.arg=c("<25","25-29","30-35",">35"),col=c("grey","black"),
beside=TRUE,main="Ditribution of Age by Claims and Holders",
xlab="Age",ylab="Claims&Holder",
legend.text=rownames(data_bar),args.legend=list(x="topleft") # 设置图例
);# beside=TRUE构建分组条形图
x<-barplot( data_bar,names.arg=c("<25","25-29","30-35",">35"),col=c("grey","black"),
beside=FALSE,main="Ditribution of Age by Claims and Holders",
xlab="Age",ylab="Claims&Holder",
legend.text=rownames(data_bar),args.legend=list(x="topleft") #设置图例
) # beside=FALSE构建堆叠条形图
##### 4.3.5 点阵图
dotchart( data_bar,xlab="Claims&Holders",main="点阵图", pch=1:2 )
legend( x=14000,y=15,"<25",bty="n" )
legend( x=14000,y=11,"25-29",bty="n" )
legend( x=14000,y=7,"30-25",bty="n" )
legend( x=14000,y=3,">35",bty="n" ) #bty:the type of box to be drawn around the legend. The allowed values are "o" (the default) and "n"
##### 4.3.6 饼图
pie(Claims_Age,labels=c("<25","25-29","30-35",">35"),
main="pie cahrt of Claims by Age",
col = c("deepskyblue", "yellow", "yellow3","skyblue")
)
percent<-round(Claims_Age/sum(Claims_Age)*100,2)
labels<-paste(c("<25","25-29","30-35",">35"),":",percent,"%",sep="")
pie(Claims_Age,labels=labels,
main="pie cahrt of Claims by Age",
col = c("deepskyblue", "yellow", "yellow3","skyblue")
)# 加入百分数
library("plotrix")
pie3D(Claims_Age,labels=labels,
main="pie cahrt of Claims by Age",
col = c("deepskyblue", "yellow", "yellow3","skyblue"),
explode=0.05,labelcex=1# explode-缝隙大小;labelcex-标签大小(书上错误)
)