# Integer vector: the `:` operator produces integers.
x <- 1:4
x
## [1] 1 2 3 4
class(x)
## [1] "integer"

# Numeric (double) vector: a sequence with a fractional step.
x <- seq(1, 2, by = 0.2)
x
## [1] 1.0 1.2 1.4 1.6 1.8 2.0
class(x)
## [1] "numeric"

# Logical vector: the element-wise comparison of a numeric vector.
y <- x > 1.5
y
## [1] FALSE FALSE FALSE TRUE TRUE TRUE
class(y)
## [1] "logical"

# Character vector: paste() recycles "x" over 1:4 and joins with "_".
y <- paste("x", 1:4, sep = "_")
y
## [1] "x_1" "x_2" "x_3" "x_4"
class(y)
## [1] "character"
# Factor: each value is matched against `levels`, and the i-th level is shown
# with the i-th label (here "1" -> man, "0" -> woman).
x <- factor(
  c(1, 1, 0, 0, 1),
  levels = c("1", "0"),
  labels = c("man", "woman")
)
x
## [1] man man woman woman man
## Levels: man woman
class(x)
## [1] "factor"

# Note the result: with the levels reversed, "0" -> man and "1" -> woman.
x <- factor(
  c(1, 1, 0, 0, 1),
  levels = c("0", "1"),
  labels = c("man", "woman")
)
x
## [1] woman woman man man woman
## Levels: man woman

# as.numeric() on a factor returns the integer level codes, not the labels.
x <- factor(c("a", "b", "c", "a"))
as.numeric(x)
## [1] 1 2 3 1
x
## [1] a b c a
## Levels: a b c

# When labels and levels are both omitted, the levels are the sorted unique
# values and the codes start at 1.
x <- factor(c("abc", "bcd", "adc"))
as.numeric(x)
## [1] 1 3 2
library(MASS)
# attach() puts the columns of Insurance on the search path so that they can
# be referenced by bare name (e.g. Age, Claims).
attach(Insurance)

# Basic structure of the data set.
dim(Insurance)
## [1] 64 5
class(Insurance)
## [1] "data.frame"
names(Insurance)
## [1] "District" "Group" "Age" "Holders" "Claims"

# Inspect the Age column: list its levels, then test its type.
levels(Insurance$Age)
## [1] "<25" "25-29" "30-35" ">35"
is.numeric(Insurance$Age)
## [1] FALSE
# Related predicates: is.logical(), is.integer().
is.character(Insurance$Age)
## [1] FALSE
class(Insurance$Age)
## [1] "ordered" "factor"
sample(x, size, replace = FALSE, prob = NULL)
x-待抽取对象;
size-抽取个数;
replace-是否有放回抽样:TRUE 为有放回,FALSE(默认)为无放回;
prob-各元素被抽中的概率;默认为 NULL,即等概率抽样。
# Draw 5 row indices with replacement, setting the selection probability of
# every row. All of the probability mass is on the last row, so every draw
# returns nrow(Insurance).
local <- sample(
  nrow(Insurance),
  size = 5,
  replace = TRUE,
  prob = c(rep(0, nrow(Insurance) - 1), 1)
)
# Show the sampled rows; repeated rows get suffixed row names (64, 64.1, ...).
Insurance[local, ]
## District Group Age Holders Claims
## 64 4 >2l >35 114 33
## 64.1 4 >2l >35 114 33
## 64.2 4 >2l >35 114 33
## 64.3 4 >2l >35 114 33
## 64.4 4 >2l >35 114 33
strata(data, stratanames = NULL, size, method = c("srswor", "srswr", "poisson", "systematic"), pik, description = FALSE)
data-待抽样数据;
stratanames-分层所依据的变量名;
size-各层中要抽取的样本数;
method-抽样方法:无放回(srswor,默认)、有放回(srswr)、泊松(poisson)、系统(systematic);
pik-各层中各样本的入样概率;
description-是否输出各层的基本信息。
library(sampling)

# Stratified sampling by District: 2 units from each of the 4 strata, simple
# random sampling without replacement.
local2 <- strata(
  Insurance,
  stratanames = "District",
  size = rep(2, 4),
  method = "srswor"
)
local2
## District ID_unit Prob Stratum
## 10 1 10 0.125 1
## 14 1 14 0.125 1
## 29 2 29 0.125 2
## 32 2 32 0.125 2
## 42 3 42 0.125 3
## 47 3 47 0.125 3
## 53 4 53 0.125 4
## 60 4 60 0.125 4
# Extract the sampled rows of Insurance that local2 identifies.
getdata(Insurance, local2)
## Group Age Holders Claims District ID_unit Prob Stratum
## 10 1.5-2l 25-29 286 52 1 10 0.125 1
## 14 >2l 25-29 71 18 1 14 0.125 1
## 29 >2l <25 9 4 2 29 0.125 2
## 32 >2l >35 322 53 2 32 0.125 2
## 42 1.5-2l 25-29 78 19 3 42 0.125 3
## 47 >2l 30-35 43 8 3 47 0.125 3
## 53 1-1.5l <25 31 7 4 53 0.125 4
## 60 1.5-2l >35 344 63 4 60 0.125 4
# Same design with description = TRUE, which also prints the sampling
# information for each stratum.
strata(
  Insurance,
  stratanames = "District",
  size = rep(2, 4),
  method = "srswor",
  description = TRUE
)
## Stratum 1
##
## Population total and number of selected units: 16 2
## Stratum 2
##
## Population total and number of selected units: 16 2
## Stratum 3
##
## Population total and number of selected units: 16 2
## Stratum 4
##
## Population total and number of selected units: 16 2
## Number of strata 4
## Total number of selected units 8
## District ID_unit Prob Stratum
## 11 1 11 0.125 1
## 13 1 13 0.125 1
## 23 2 23 0.125 2
## 27 2 27 0.125 2
## 39 3 39 0.125 3
## 40 3 40 0.125 3
## 52 4 52 0.125 4
## 57 4 57 0.125 4
# Systematic sampling with a different sample size per stratum (1, 2, 3, 4).
# `pik` is taken from the Claims column; it is referenced explicitly as
# Insurance$Claims so the call does not rely on attach(Insurance) having put
# a bare `Claims` on the search path.
local3 <- strata(
  Insurance,
  stratanames = "District",
  size = c(1, 2, 3, 4),
  method = "systematic",
  pik = Insurance$Claims
)
## Warning in inclusionprobabilities(pik[y], size[i]): there are zero values in the initial vector a
# Extract the sampled rows of Insurance that local3 identifies.
getdata(Insurance, local3)
## Group Age Holders Claims District ID_unit Prob Stratum
## 9 1.5-2l <25 133 19 1 9 0.01375815 1
## 24 1-1.5l >35 2443 290 2 24 0.65095398 2
## 28 1.5-2l >35 1110 143 2 28 0.32098765 2
## 36 <1l >35 648 67 3 36 0.36612022 3
## 40 1-1.5l >35 1635 187 3 40 1.00000000 3
## 44 1.5-2l >35 692 101 3 44 0.55191257 3
## 52 <1l >35 316 36 4 52 0.48214286 4
## 56 1-1.5l >35 724 102 4 56 1.00000000 4
## 58 1.5-2l 25-29 39 7 4 58 0.09375000 4
## 60 1.5-2l >35 344 63 4 60 0.84375000 4
cluster(data, clustername, size, method = c("srswor", "srswr", "poisson", "systematic"), pik, description = FALSE)
clustername-用于划分群的变量名;
size-要抽取的群数。
# Cluster sampling: select 2 whole District clusters without replacement.
# description = TRUE (spelled out; `T` is an ordinary variable that can be
# reassigned) prints the summary shown below.
local4 <- cluster(
  Insurance,
  clustername = "District",
  size = 2,
  method = "srswor",
  description = TRUE
)
## Number of selected clusters: 2
## Population total and number of selected units 64 32
# First and last rows of the selection.
head(local4)
## District ID_unit Prob
## 1 3 33 0.5
## 2 3 34 0.5
## 3 3 35 0.5
## 4 3 36 0.5
## 5 3 37 0.5
## 6 3 38 0.5
tail(local4)
## District ID_unit Prob
## 27 4 59 0.5
## 28 4 60 0.5
## 29 4 61 0.5
## 30 4 62 0.5
## 31 4 63 0.5
## 32 4 64 0.5
# Extract the sampled rows of Insurance that local4 identifies.
getdata(Insurance, local4)
## Group Age Holders Claims District ID_unit Prob
## 33 <1l <25 35 5 3 33 0.5
## 34 <1l 25-29 73 11 3 34 0.5
## 35 <1l 30-35 89 10 3 35 0.5
## 36 <1l >35 648 67 3 36 0.5
## 37 1-1.5l <25 53 10 3 37 0.5
## 38 1-1.5l 25-29 155 24 3 38 0.5
## 39 1-1.5l 30-35 240 37 3 39 0.5
## 40 1-1.5l >35 1635 187 3 40 0.5
## 41 1.5-2l <25 24 8 3 41 0.5
## 42 1.5-2l 25-29 78 19 3 42 0.5
## 43 1.5-2l 30-35 121 24 3 43 0.5
## 44 1.5-2l >35 692 101 3 44 0.5
## 45 >2l <25 7 3 3 45 0.5
## 46 >2l 25-29 29 2 3 46 0.5
## 47 >2l 30-35 43 8 3 47 0.5
## 48 >2l >35 245 37 3 48 0.5
## 49 <1l <25 20 2 4 49 0.5
## 50 <1l 25-29 33 5 4 50 0.5
## 51 <1l 30-35 40 4 4 51 0.5
## 52 <1l >35 316 36 4 52 0.5
## 53 1-1.5l <25 31 7 4 53 0.5
## 54 1-1.5l 25-29 81 10 4 54 0.5
## 55 1-1.5l 30-35 122 22 4 55 0.5
## 56 1-1.5l >35 724 102 4 56 0.5
## 57 1.5-2l <25 18 5 4 57 0.5
## 58 1.5-2l 25-29 39 7 4 58 0.5
## 59 1.5-2l 30-35 68 16 4 59 0.5
## 60 1.5-2l >35 344 63 4 60 0.5
## 61 >2l <25 3 0 4 61 0.5
## 62 >2l 25-29 16 6 4 62 0.5
## 63 >2l 30-35 25 8 4 63 0.5
## 64 >2l >35 114 33 4 64 0.5
整群抽样要求:各群对数据总体有较好的代表性,即群内差异大、群间差异小。群间差异较大时,样本分布面不广,样本对总体的代表性相对较差。
# Row indices of the training set: three quarters of the rows, drawn without
# replacement.
train_sub <- sample(nrow(Insurance), size = 3 / 4 * nrow(Insurance))

# Training rows are the sampled indices; negative indexing drops them to
# leave the test rows.
train_data <- Insurance[train_sub, ]
test_data <- Insurance[-train_sub, ]

dim(train_data)
dim(test_data)
## [1] 48 5
## [1] 16 5