第二章 数据概览

2.2 数据的分类

2.2.2 R的数据分类

# Integer vector created with the colon operator.
x <- 1:4
x

## [1] 1 2 3 4

class(x)  # integer type

## [1] "integer"

# A sequence with a fractional step is stored as double ("numeric").
x <- seq(from = 1, to = 2, by = 0.2)
x

## [1] 1.0 1.2 1.4 1.6 1.8 2.0

class(x)  # numeric type

## [1] "numeric"

# An element-wise comparison yields a logical vector.
y <- x > 1.5
y

## [1] FALSE FALSE FALSE  TRUE  TRUE  TRUE

class(y)  # logical type

## [1] "logical"

# paste() builds a character vector, joining the pieces with "_".
y <- paste("x", 1:4, sep = "_")
y

## [1] "x_1" "x_2" "x_3" "x_4"

class(y)  # character type

## [1] "character"

# Factor type: each data value is matched against `levels`, and every level
# is displayed with the label found at the same position.
x <- factor(
  c(1, 1, 0, 0, 1),
  levels = c("1", "0"),
  labels = c("man", "woman")
)
x

## [1] man   man   woman woman man  
## Levels: man woman

class(x)

## [1] "factor"

# Note the result: with the level order swapped to "0", "1" the same labels
# are now attached to the opposite data values.
x <- factor(
  c(1, 1, 0, 0, 1),
  levels = c("0", "1"),
  labels = c("man", "woman")
)
x

## [1] woman woman man   man   woman
## Levels: man woman

# as.numeric() on a factor returns the integer code of each element's level,
# not the original value.
x <- factor(c("a", "b", "c", "a"))
as.numeric(x)

## [1] 1 2 3 1

# Print the factor itself so that the values and levels shown below are
# actually produced by the code.
x

## [1] a b c a
## Levels: a b c

# When labels/levels are omitted, the levels are the sorted unique values and
# the codes start at 1 ("abc" -> 1, "adc" -> 2, "bcd" -> 3).
x <- factor(c("abc", "bcd", "adc"))
as.numeric(x)

## [1] 1 3 2

2.2.3 简单的数据处理

# Load the Insurance data set that ships with MASS.
library("MASS")
# attach() is kept only because a later snippet in this file still refers to
# a column by its bare name (`pik=Claims`); the calls below use explicit
# Insurance$Age so they cannot be masked by another object called `Age`.
attach(Insurance)
dim(Insurance)

## [1] 64  5

class(Insurance)

## [1] "data.frame"

names(Insurance)

## [1] "District" "Group"    "Age"      "Holders"  "Claims"

levels(Insurance$Age)  # list the levels of the factor

## [1] "<25"   "25-29" "30-35" ">35"

is.numeric(Insurance$Age)

## [1] FALSE

is.character(Insurance$Age)  # related predicates: is.logical(), is.integer()

## [1] FALSE

class(Insurance$Age)

## [1] "ordered" "factor"

2.3 数据抽样及R实现

2.3.1 简单随机抽样

sample(x,size,replace=F/T,prob=NULL)
x-待抽取对象；
size-抽取个数；
replace-是（T）否（F）有放回抽样；
prob-各样本抽样概率，默认NULL，等概率抽样

# Set the probability of each row being drawn: every row gets weight 0
# except the last one, so all 5 draws (with replacement) return the last row.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
local <- sample(
  nrow(Insurance),
  5,
  replace = TRUE,
  prob = c(rep(0, nrow(Insurance) - 1), 1)
)
Insurance[local, ]

##      District Group Age Holders Claims
## 64          4   >2l >35     114     33
## 64.1        4   >2l >35     114     33
## 64.2        4   >2l >35     114     33
## 64.3        4   >2l >35     114     33
## 64.4        4   >2l >35     114     33

2.3.2 分层抽样

strata( data,stratanames=NULL,size,method=c("srswor","srswr","poisson","systematic"),pik,description=FALSE )
data-待抽样数据；
stratanames-进行分层依赖的变量名；
size-各层中要抽取的样本数；
method-无放回（默认）、有放回、泊松、系统；
pik-各层中各样本概率；
description-是否输出含有各层基本信息的结果

library("sampling")
# Stratified sampling by District: draw 2 rows from each of the 4 strata,
# without replacement ("srswor").
local2 <- strata(
  Insurance,
  stratanames = "District",
  size = rep(2, 4),
  method = "srswor"
)
local2

##    District ID_unit  Prob Stratum
## 10        1      10 0.125       1
## 14        1      14 0.125       1
## 29        2      29 0.125       2
## 32        2      32 0.125       2
## 42        3      42 0.125       3
## 47        3      47 0.125       3
## 53        4      53 0.125       4
## 60        4      60 0.125       4

# Look up the full Insurance rows for the units selected in local2.
getdata(Insurance, local2)

##     Group   Age Holders Claims District ID_unit  Prob Stratum
## 10 1.5-2l 25-29     286     52        1      10 0.125       1
## 14    >2l 25-29      71     18        1      14 0.125       1
## 29    >2l   <25       9      4        2      29 0.125       2
## 32    >2l   >35     322     53        2      32 0.125       2
## 42 1.5-2l 25-29      78     19        3      42 0.125       3
## 47    >2l 30-35      43      8        3      47 0.125       3
## 53 1-1.5l   <25      31      7        4      53 0.125       4
## 60 1.5-2l   >35     344     63        4      60 0.125       4

# Same stratified design, with description = TRUE so that the sampling
# information for every stratum is printed as well.
strata(
  Insurance,
  stratanames = "District",
  size = rep(2, 4),
  method = "srswor",
  description = TRUE
)

## Stratum 1 
## 
## Population total and number of selected units: 16 2 
## Stratum 2 
## 
## Population total and number of selected units: 16 2 
## Stratum 3 
## 
## Population total and number of selected units: 16 2 
## Stratum 4 
## 
## Population total and number of selected units: 16 2 
## Number of strata  4 
## Total number of selected units 8

##    District ID_unit  Prob Stratum
## 11        1      11 0.125       1
## 13        1      13 0.125       1
## 23        2      23 0.125       2
## 27        2      27 0.125       2
## 39        3      39 0.125       3
## 40        3      40 0.125       3
## 52        4      52 0.125       4
## 57        4      57 0.125       4

# Systematic sampling with unequal probabilities: take 1, 2, 3 and 4 rows
# from the four District strata, with `pik` supplied from the Claims column.
# The column is referenced as Insurance$Claims so this call no longer depends
# on an earlier attach(Insurance) having been run.
local3 <- strata(
  Insurance,
  stratanames = "District",
  size = c(1, 2, 3, 4),
  method = "systematic",
  pik = Insurance$Claims
)

## Warning in inclusionprobabilities(pik[y], size[i]): there are zero values in the initial vector a

getdata(Insurance, local3)

##     Group   Age Holders Claims District ID_unit       Prob Stratum
## 9  1.5-2l   <25     133     19        1       9 0.01375815       1
## 24 1-1.5l   >35    2443    290        2      24 0.65095398       2
## 28 1.5-2l   >35    1110    143        2      28 0.32098765       2
## 36    <1l   >35     648     67        3      36 0.36612022       3
## 40 1-1.5l   >35    1635    187        3      40 1.00000000       3
## 44 1.5-2l   >35     692    101        3      44 0.55191257       3
## 52    <1l   >35     316     36        4      52 0.48214286       4
## 56 1-1.5l   >35     724    102        4      56 1.00000000       4
## 58 1.5-2l 25-29      39      7        4      58 0.09375000       4
## 60 1.5-2l   >35     344     63        4      60 0.84375000       4

2.3.3 整群抽样

cluster(data,clustername,size,method=c("srswor","srswr","poisson","systematic"),pik,description=FALSE )
clustername-划分群的变量
size-要抽取的群数

# Cluster sampling: treat each District as a cluster and draw 2 whole
# clusters without replacement. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
local4 <- cluster(
  Insurance,
  clustername = "District",
  size = 2,
  method = "srswor",
  description = TRUE
)

## Number of selected clusters: 2 
## Population total and number of selected units 64 32

# Show the first and the last rows of the cluster sample.
head(local4)
tail(local4)

##   District ID_unit Prob
## 1        3      33  0.5
## 2        3      34  0.5
## 3        3      35  0.5
## 4        3      36  0.5
## 5        3      37  0.5
## 6        3      38  0.5

##    District ID_unit Prob
## 27        4      59  0.5
## 28        4      60  0.5
## 29        4      61  0.5
## 30        4      62  0.5
## 31        4      63  0.5
## 32        4      64  0.5

# Look up the full Insurance rows for every unit in the selected clusters.
getdata(Insurance, local4)

##     Group   Age Holders Claims District ID_unit Prob
## 33    <1l   <25      35      5        3      33  0.5
## 34    <1l 25-29      73     11        3      34  0.5
## 35    <1l 30-35      89     10        3      35  0.5
## 36    <1l   >35     648     67        3      36  0.5
## 37 1-1.5l   <25      53     10        3      37  0.5
## 38 1-1.5l 25-29     155     24        3      38  0.5
## 39 1-1.5l 30-35     240     37        3      39  0.5
## 40 1-1.5l   >35    1635    187        3      40  0.5
## 41 1.5-2l   <25      24      8        3      41  0.5
## 42 1.5-2l 25-29      78     19        3      42  0.5
## 43 1.5-2l 30-35     121     24        3      43  0.5
## 44 1.5-2l   >35     692    101        3      44  0.5
## 45    >2l   <25       7      3        3      45  0.5
## 46    >2l 25-29      29      2        3      46  0.5
## 47    >2l 30-35      43      8        3      47  0.5
## 48    >2l   >35     245     37        3      48  0.5
## 49    <1l   <25      20      2        4      49  0.5
## 50    <1l 25-29      33      5        4      50  0.5
## 51    <1l 30-35      40      4        4      51  0.5
## 52    <1l   >35     316     36        4      52  0.5
## 53 1-1.5l   <25      31      7        4      53  0.5
## 54 1-1.5l 25-29      81     10        4      54  0.5
## 55 1-1.5l 30-35     122     22        4      55  0.5
## 56 1-1.5l   >35     724    102        4      56  0.5
## 57 1.5-2l   <25      18      5        4      57  0.5
## 58 1.5-2l 25-29      39      7        4      58  0.5
## 59 1.5-2l 30-35      68     16        4      59  0.5
## 60 1.5-2l   >35     344     63        4      60  0.5
## 61    >2l   <25       3      0        4      61  0.5
## 62    >2l 25-29      16      6        4      62  0.5
## 63    >2l 30-35      25      8        4      63  0.5
## 64    >2l   >35     114     33        4      64  0.5

整群抽样要求：各群对数据总体有较好的代表性，即群内差异大，群间差异小。群间差距大时会造成样本分布面不广，样本对总体的代表性相对较差

2.4 训练集和测试集

# Row numbers of the training set: a random 3/4 of the rows.
train_sub <- sample(nrow(Insurance), 3 / 4 * nrow(Insurance))
train_data <- Insurance[train_sub, ]
# Negative indices drop the training rows, leaving the test set.
test_data <- Insurance[-train_sub, ]
dim(train_data)
dim(test_data)

## [1] 48  5

## [1] 16  5

第二章 数据概览