第二章 数据概览

2.2 数据的分类

2.2.2 R的数据分类
# Integer vector: the colon operator produces integers.
x <- 1:4
x
## [1] 1 2 3 4
class(x)
## [1] "integer"
# Numeric (double) vector: seq() with a fractional step.
x <- seq(from = 1, to = 2, by = 0.2)
x
## [1] 1.0 1.2 1.4 1.6 1.8 2.0
class(x)
## [1] "numeric"
# Logical vector: element-wise comparison against 1.5.
y <- x > 1.5
y
## [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE
class(y)
## [1] "logical"
# Character vector: paste() joins "x" and 1..4 with "_".
y <- paste("x", 1:4, sep = "_")
y
## [1] "x_1" "x_2" "x_3" "x_4"
class(y)
## [1] "character"
# Factor: levels "1" and "0" are mapped, in that order, to "man" and "woman".
x <- factor(c(1, 1, 0, 0, 1), levels = c("1", "0"), labels = c("man", "woman"))
x
## [1] man   man   woman woman man  
## Levels: man woman
class(x)
## [1] "factor"
# Note the result: with the levels given as "0", "1" the same labels are
# assigned the other way round.
x <- factor(c(1, 1, 0, 0, 1), levels = c("0", "1"), labels = c("man", "woman"))
x
## [1] woman woman man   man   woman
## Levels: man woman
# as.numeric() on a factor returns the integer codes of the levels.
x <- factor(c("a", "b", "c", "a"))
as.numeric(x)
## [1] 1 2 3 1
x
## [1] a b c a
## Levels: a b c
# When labels/levels are omitted, the levels are ordered alphabetically and
# coded starting from 1.
x <- factor(c("abc", "bcd", "adc"))
as.numeric(x)
## [1] 1 3 2
2.2.3 简单的数据处理
# Load MASS; the Insurance data frame inspected below comes from it.
library(MASS)
# Put the columns of Insurance on the search path so that they can be
# referenced by bare name (e.g. Age below).
attach(Insurance)
# Number of rows and columns.
dim(Insurance)
## [1] 64  5
class(Insurance)
## [1] "data.frame"
# Column names.
names(Insurance)
## [1] "District" "Group"    "Age"      "Holders"  "Claims"
levels(Age) # list the levels of the factor
## [1] "<25"   "25-29" "30-35" ">35"
is.numeric(Age)
## [1] FALSE
is.character(Age) # see also is.logical(), is.integer()
## [1] FALSE
# Age is an ordered factor, hence neither numeric nor character.
class(Age)
## [1] "ordered" "factor"

2.3 数据抽样及R实现

2.3.1 简单随机抽样

sample(x,size,replace=F/T,prob=NULL)
x-待抽取对象;
size-抽取个数;
replace-是否有放回抽样(TRUE为有放回,FALSE为无放回,默认FALSE);
prob-各样本抽样概率,默认NULL,等概率抽样

# Draw 5 row indices with replacement, setting each row's selection
# probability explicitly: every row gets probability 0 except the last one,
# which gets probability 1, so the last row is drawn every time.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
local <- sample(
  nrow(Insurance),
  size = 5,
  replace = TRUE,
  prob = c(rep(0, nrow(Insurance) - 1), 1)
)
Insurance[local, ]
##      District Group Age Holders Claims
## 64          4   >2l >35     114     33
## 64.1        4   >2l >35     114     33
## 64.2        4   >2l >35     114     33
## 64.3        4   >2l >35     114     33
## 64.4        4   >2l >35     114     33
2.3.2 分层抽样

strata(data, stratanames = NULL, size, method = c("srswor", "srswr", "poisson", "systematic"), pik, description = FALSE)
data-待抽样数据;
stratanames-进行分层依赖的变量名;
size-各层中要抽取的样本数;
method-抽样方法:无放回(srswor,默认)、有放回(srswr)、泊松(poisson)、系统(systematic);
pik-各层中各样本概率;
description-是否输出含有各层基本信息的结果

# strata() and getdata() come from the sampling package.
library(sampling)
# Stratify by District and draw 2 rows without replacement from each of the
# 4 strata.
local2 <- strata(
  Insurance,
  stratanames = "District",
  size = c(2, 2, 2, 2),
  method = "srswor"
)
local2
##    District ID_unit  Prob Stratum
## 10        1      10 0.125       1
## 14        1      14 0.125       1
## 29        2      29 0.125       2
## 32        2      32 0.125       2
## 42        3      42 0.125       3
## 47        3      47 0.125       3
## 53        4      53 0.125       4
## 60        4      60 0.125       4
# Pull the full records of the selected rows out of Insurance.
getdata(Insurance, local2)
##     Group   Age Holders Claims District ID_unit  Prob Stratum
## 10 1.5-2l 25-29     286     52        1      10 0.125       1
## 14    >2l 25-29      71     18        1      14 0.125       1
## 29    >2l   <25       9      4        2      29 0.125       2
## 32    >2l   >35     322     53        2      32 0.125       2
## 42 1.5-2l 25-29      78     19        3      42 0.125       3
## 47    >2l 30-35      43      8        3      47 0.125       3
## 53 1-1.5l   <25      31      7        4      53 0.125       4
## 60 1.5-2l   >35     344     63        4      60 0.125       4
# Same stratified draw, with description = TRUE so that the sampling
# information for each stratum is printed as well.
strata(
  Insurance,
  stratanames = "District",
  size = rep(2, 4),
  method = "srswor",
  description = TRUE
)
## Stratum 1 
## 
## Population total and number of selected units: 16 2 
## Stratum 2 
## 
## Population total and number of selected units: 16 2 
## Stratum 3 
## 
## Population total and number of selected units: 16 2 
## Stratum 4 
## 
## Population total and number of selected units: 16 2 
## Number of strata  4 
## Total number of selected units 8
##    District ID_unit  Prob Stratum
## 11        1      11 0.125       1
## 13        1      13 0.125       1
## 23        2      23 0.125       2
## 27        2      27 0.125       2
## 39        3      39 0.125       3
## 40        3      40 0.125       3
## 52        4      52 0.125       4
## 57        4      57 0.125       4
# Systematic sampling with a different sample size per District stratum
# (1, 2, 3 and 4 rows), using the Claims column as the auxiliary vector from
# which inclusion probabilities are computed.
# Claims is referenced as Insurance$Claims so that the call does not depend on
# attach(Insurance) having been run earlier.
local3 <- strata(
  Insurance,
  stratanames = "District",
  size = c(1, 2, 3, 4),
  method = "systematic",
  pik = Insurance$Claims
)
## Warning in inclusionprobabilities(pik[y], size[i]): there are zero values in the initial vector a
getdata(Insurance, local3)
##     Group   Age Holders Claims District ID_unit       Prob Stratum
## 9  1.5-2l   <25     133     19        1       9 0.01375815       1
## 24 1-1.5l   >35    2443    290        2      24 0.65095398       2
## 28 1.5-2l   >35    1110    143        2      28 0.32098765       2
## 36    <1l   >35     648     67        3      36 0.36612022       3
## 40 1-1.5l   >35    1635    187        3      40 1.00000000       3
## 44 1.5-2l   >35     692    101        3      44 0.55191257       3
## 52    <1l   >35     316     36        4      52 0.48214286       4
## 56 1-1.5l   >35     724    102        4      56 1.00000000       4
## 58 1.5-2l 25-29      39      7        4      58 0.09375000       4
## 60 1.5-2l   >35     344     63        4      60 0.84375000       4
2.3.3 整群抽样

cluster(data, clustername, size, method = c("srswor", "srswr", "poisson", "systematic"), pik, description = FALSE)
clustername-用于划分群的变量名
size-要抽取的群数

# Cluster sampling: treat each District as a cluster and select 2 whole
# clusters without replacement, printing the sampling summary.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
local4 <- cluster(
  Insurance,
  clustername = "District",
  size = 2,
  method = "srswor",
  description = TRUE
)
## Number of selected clusters: 2 
## Population total and number of selected units 64 32
# First and last rows of the selection.
head(local4)
tail(local4)
##   District ID_unit Prob
## 1        3      33  0.5
## 2        3      34  0.5
## 3        3      35  0.5
## 4        3      36  0.5
## 5        3      37  0.5
## 6        3      38  0.5
##    District ID_unit Prob
## 27        4      59  0.5
## 28        4      60  0.5
## 29        4      61  0.5
## 30        4      62  0.5
## 31        4      63  0.5
## 32        4      64  0.5
# Pull the full records of the selected clusters out of Insurance.
getdata(Insurance, local4)
##     Group   Age Holders Claims District ID_unit Prob
## 33    <1l   <25      35      5        3      33  0.5
## 34    <1l 25-29      73     11        3      34  0.5
## 35    <1l 30-35      89     10        3      35  0.5
## 36    <1l   >35     648     67        3      36  0.5
## 37 1-1.5l   <25      53     10        3      37  0.5
## 38 1-1.5l 25-29     155     24        3      38  0.5
## 39 1-1.5l 30-35     240     37        3      39  0.5
## 40 1-1.5l   >35    1635    187        3      40  0.5
## 41 1.5-2l   <25      24      8        3      41  0.5
## 42 1.5-2l 25-29      78     19        3      42  0.5
## 43 1.5-2l 30-35     121     24        3      43  0.5
## 44 1.5-2l   >35     692    101        3      44  0.5
## 45    >2l   <25       7      3        3      45  0.5
## 46    >2l 25-29      29      2        3      46  0.5
## 47    >2l 30-35      43      8        3      47  0.5
## 48    >2l   >35     245     37        3      48  0.5
## 49    <1l   <25      20      2        4      49  0.5
## 50    <1l 25-29      33      5        4      50  0.5
## 51    <1l 30-35      40      4        4      51  0.5
## 52    <1l   >35     316     36        4      52  0.5
## 53 1-1.5l   <25      31      7        4      53  0.5
## 54 1-1.5l 25-29      81     10        4      54  0.5
## 55 1-1.5l 30-35     122     22        4      55  0.5
## 56 1-1.5l   >35     724    102        4      56  0.5
## 57 1.5-2l   <25      18      5        4      57  0.5
## 58 1.5-2l 25-29      39      7        4      58  0.5
## 59 1.5-2l 30-35      68     16        4      59  0.5
## 60 1.5-2l   >35     344     63        4      60  0.5
## 61    >2l   <25       3      0        4      61  0.5
## 62    >2l 25-29      16      6        4      62  0.5
## 63    >2l 30-35      25      8        4      63  0.5
## 64    >2l   >35     114     33        4      64  0.5

整群抽样要求:各群对数据总体有较好的代表性,即群内差异大,群间差异小。群间差距大时会造成样本分布面不广,样本对总体的代表性相对较差

2.4 训练集和测试集

# Randomly pick the row numbers of three quarters of the rows as the
# training set.
train_sub <- sample(nrow(Insurance), size = 0.75 * nrow(Insurance))
train_data <- Insurance[train_sub, ]
# Negative indices drop the training rows; what remains is the test set.
test_data <- Insurance[-train_sub, ]
dim(train_data)
dim(test_data)
## [1] 48  5
## [1] 16  5