工作路径获取和设置

getwd() #获取当前工作路径

## [1] "/Users/huanghuilin/Desktop/360安全云盘同步版/2022/2022年数据挖掘课程/上机/第一次上机：R数据类型和抽样技术/演示"

# Load the built-in iris data set into the workspace and preview its first
# six rows (six is head()'s default).
data("iris")
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

dim(iris)

## [1] 150   5

R数据分类

1、数值型向量和整数型向量创建

# Build a vector whose elements are 1, 2, 3, 4 and print it.
x <- c(1, 2, 3, 4)
x

## [1] 1 2 3 4

# Report the class of x.
class(x)

## [1] "numeric"

# Coerce x to integer storage and print the result; the printed values are
# the same, only the class changes.
x1 <- as.integer(x)
x1

## [1] 1 2 3 4

class(x1)

## [1] "integer"

# Rebind x to the values 0 through 4 and print it.
x <- c(0, 1, 2, 3, 4)
x

## [1] 0 1 2 3 4

class(x)

## [1] "numeric"

# Extract the fourth element (R indices start at 1).
x[4]

## [1] 3

# Element-wise test: which elements equal 2?
x == 2

## [1] FALSE FALSE  TRUE FALSE FALSE

# Negating "less than 2" flags the elements that are greater than or equal
# to 2.
!(x < 2)

## [1] FALSE FALSE  TRUE  TRUE  TRUE

# Positions (not values) of the elements that are less than 2.
which(x < 2)

## [1] 1 2

# Use those positions to pull out the matching values.
x[which(x < 2)]

## [1] 0 1

# Is x a logical vector?
is.logical(x)

## [1] FALSE

2、字符串型数据

# Build a character vector whose elements are "I", "love" and "R".
y <- c("I", "love", "R")

y

## [1] "I"    "love" "R"

class(y)

## [1] "character"

# Number of elements in y.
length(y)

## [1] 3

# Number of characters in each element of y.
nchar(y)

## [1] 1 4 1

# Element-wise test: which elements are exactly "R"?
y == "R"

## [1] FALSE FALSE  TRUE

# Position of the element equal to "love" ...
which(y == "love")

## [1] 2

# ... and the element itself, selected through that position.
y[which(y == "love")]

## [1] "love"

# Single elements can also be picked directly by index.
y[3]

## [1] "R"

y[1]

## [1] "I"

y[2]

## [1] "love"

3、因子型数据

# Build the factor sex: the raw codes 0 and 1 are the levels, displayed as
# "male" and "female" respectively.
sex <- factor(
  c(1, 1, 0, 0, 1),
  levels = c(0, 1),
  labels = c("male", "female")
)

sex # print the factor

## [1] female female male   male   female
## Levels: male female

class(sex) # class of sex

## [1] "factor"

# Same codes and levels, but with the labels swapped: 0 is now "female" and
# 1 is "male".
sex1 <- factor(
  c(1, 1, 0, 0, 1),
  levels = c(0, 1),
  labels = c("female", "male")
)

sex1 # print the factor

## [1] male   male   female female male  
## Levels: female male

# Same codes and labels as sex, but with the level order swapped: 1 is now
# the first level ("male") and 0 the second ("female").
sex2 <- factor(
  c(1, 1, 0, 0, 1),
  levels = c(1, 0),
  labels = c("male", "female")
)
sex2 # print the factor

## [1] male   male   female female male  
## Levels: male female

# Build the factor num from four letters.
num <- factor(c("a", "b", "c", "d"))
num

## [1] a b c d
## Levels: a b c d

class(num)

## [1] "factor"

# Convert num to numeric: the result is the level code of each element.
as.numeric(num)

## [1] 1 2 3 4

# Build num1 from the same letters in a different order.
num1 <- factor(c("b", "a", "d", "c"))

# Convert num1 to numeric; the codes follow the level order a, b, c, d, not
# the order of appearance.
as.numeric(num1)

## [1] 2 1 4 3

# Arithmetic is not meaningful for factors: this warns and returns NA.
num + 1

## Warning in Ops.factor(num, 1): '+' not meaningful for factors

## [1] NA NA NA NA

# After conversion to numeric the values can take part in arithmetic.
as.numeric(num) + 1

## [1] 2 3 4 5

获取数据集Insurance及各变量信息

library ( MASS )

## Warning: package 'MASS' was built under R version 4.1.2

# Load the Insurance data set into the workspace and print a per-column
# summary.
data("Insurance")
summary(Insurance)

##  District    Group       Age        Holders            Claims      
##  1:16     <1l   :16   <25  :16   Min.   :   3.00   Min.   :  0.00  
##  2:16     1-1.5l:16   25-29:16   1st Qu.:  46.75   1st Qu.:  9.50  
##  3:16     1.5-2l:16   30-35:16   Median : 136.00   Median : 22.00  
##  4:16     >2l   :16   >35  :16   Mean   : 364.98   Mean   : 49.23  
##                                  3rd Qu.: 327.50   3rd Qu.: 55.50  
##                                  Max.   :3582.00   Max.   :400.00

# Export Insurance to the working directory twice: once in write.table()'s
# default text layout and once as CSV.
write.table(Insurance, file = "Insurance.txt")
write.csv(Insurance, file = "Insurance.csv")

Insurance[1:10, ]

##    District  Group   Age Holders Claims
## 1         1    <1l   <25     197     38
## 2         1    <1l 25-29     264     35
## 3         1    <1l 30-35     246     20
## 4         1    <1l   >35    1680    156
## 5         1 1-1.5l   <25     284     63
## 6         1 1-1.5l 25-29     536     84
## 7         1 1-1.5l 30-35     696     89
## 8         1 1-1.5l   >35    3582    400
## 9         1 1.5-2l   <25     133     19
## 10        1 1.5-2l 25-29     286     52

head(Insurance)

##   District  Group   Age Holders Claims
## 1        1    <1l   <25     197     38
## 2        1    <1l 25-29     264     35
## 3        1    <1l 30-35     246     20
## 4        1    <1l   >35    1680    156
## 5        1 1-1.5l   <25     284     63
## 6        1 1-1.5l 25-29     536     84

tail(Insurance)

##    District  Group   Age Holders Claims
## 59        4 1.5-2l 30-35      68     16
## 60        4 1.5-2l   >35     344     63
## 61        4    >2l   <25       3      0
## 62        4    >2l 25-29      16      6
## 63        4    >2l 30-35      25      8
## 64        4    >2l   >35     114     33

# Dimensions (rows, columns) of the full data set.
dim(Insurance)

## [1] 64  5

# Dimensions of the first 10 rows.
dim(Insurance[1:10, ])

## [1] 10  5

# Dimensions of the part made up of columns 2, 3 and 4 only.
dim(Insurance[, 2:4])

## [1] 64  3

# First element of the dimension vector, i.e. the number of rows.
dim(Insurance)[1]

## [1] 64

# Second element of the dimension vector, i.e. the number of columns.
dim(Insurance)[2]

## [1] 5

# Character vector holding the two column names "District" and "Age".
vars <- c("District", "Age")

# Rows 20 to 25 of the District and Age columns.
Insurance[20:25, vars]

##    District   Age
## 20        2   >35
## 21        2   <25
## 22        2 25-29
## 23        2 30-35
## 24        2   >35
## 25        2   <25

# Column names of Insurance.
names(Insurance)

## [1] "District" "Group"    "Age"      "Holders"  "Claims"

# Only the first two column names.
head(names(Insurance), n = 2)

## [1] "District" "Group"

# Only the last two column names.
tail(names(Insurance), n = 2)

## [1] "Holders" "Claims"

# First few values of the Age column.
head(Insurance$Age)

## [1] <25   25-29 30-35 >35   <25   25-29
## Levels: <25 < 25-29 < 30-35 < >35

Insurance$Age

##  [1] <25   25-29 30-35 >35   <25   25-29 30-35 >35   <25   25-29 30-35 >35  
## [13] <25   25-29 30-35 >35   <25   25-29 30-35 >35   <25   25-29 30-35 >35  
## [25] <25   25-29 30-35 >35   <25   25-29 30-35 >35   <25   25-29 30-35 >35  
## [37] <25   25-29 30-35 >35   <25   25-29 30-35 >35   <25   25-29 30-35 >35  
## [49] <25   25-29 30-35 >35   <25   25-29 30-35 >35   <25   25-29 30-35 >35  
## [61] <25   25-29 30-35 >35  
## Levels: <25 < 25-29 < 30-35 < >35

str(Insurance)

## 'data.frame':    64 obs. of  5 variables:
##  $ District: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Group   : Ord.factor w/ 4 levels "<1l"<"1-1.5l"<..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Age     : Ord.factor w/ 4 levels "<25"<"25-29"<..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Holders : int  197 264 246 1680 284 536 696 3582 133 286 ...
##  $ Claims  : int  38 35 20 156 63 84 89 400 19 52 ...

class(Insurance$District) #显示District的变量类型

## [1] "factor"

class ( Insurance[,1])

## [1] "factor"

Insurance[,c(1,5)]

##    District Claims
## 1         1     38
## 2         1     35
## 3         1     20
## 4         1    156
## 5         1     63
## 6         1     84
## 7         1     89
## 8         1    400
## 9         1     19
## 10        1     52
## 11        1     74
## 12        1    233
## 13        1      4
## 14        1     18
## 15        1     19
## 16        1     77
## 17        2     22
## 18        2     19
## 19        2     22
## 20        2     87
## 21        2     25
## 22        2     51
## 23        2     49
## 24        2    290
## 25        2     14
## 26        2     46
## 27        2     39
## 28        2    143
## 29        2      4
## 30        2     15
## 31        2     12
## 32        2     53
## 33        3      5
## 34        3     11
## 35        3     10
## 36        3     67
## 37        3     10
## 38        3     24
## 39        3     37
## 40        3    187
## 41        3      8
## 42        3     19
## 43        3     24
## 44        3    101
## 45        3      3
## 46        3      2
## 47        3      8
## 48        3     37
## 49        4      2
## 50        4      5
## 51        4      4
## 52        4     36
## 53        4      7
## 54        4     10
## 55        4     22
## 56        4    102
## 57        4      5
## 58        4      7
## 59        4     16
## 60        4     63
## 61        4      0
## 62        4      6
## 63        4      8
## 64        4     33

data.frame(Insurance$District,Insurance$Claims)

##    Insurance.District Insurance.Claims
## 1                   1               38
## 2                   1               35
## 3                   1               20
## 4                   1              156
## 5                   1               63
## 6                   1               84
## 7                   1               89
## 8                   1              400
## 9                   1               19
## 10                  1               52
## 11                  1               74
## 12                  1              233
## 13                  1                4
## 14                  1               18
## 15                  1               19
## 16                  1               77
## 17                  2               22
## 18                  2               19
## 19                  2               22
## 20                  2               87
## 21                  2               25
## 22                  2               51
## 23                  2               49
## 24                  2              290
## 25                  2               14
## 26                  2               46
## 27                  2               39
## 28                  2              143
## 29                  2                4
## 30                  2               15
## 31                  2               12
## 32                  2               53
## 33                  3                5
## 34                  3               11
## 35                  3               10
## 36                  3               67
## 37                  3               10
## 38                  3               24
## 39                  3               37
## 40                  3              187
## 41                  3                8
## 42                  3               19
## 43                  3               24
## 44                  3              101
## 45                  3                3
## 46                  3                2
## 47                  3                8
## 48                  3               37
## 49                  4                2
## 50                  4                5
## 51                  4                4
## 52                  4               36
## 53                  4                7
## 54                  4               10
## 55                  4               22
## 56                  4              102
## 57                  4                5
## 58                  4                7
## 59                  4               16
## 60                  4               63
## 61                  4                0
## 62                  4                6
## 63                  4                8
## 64                  4               33

# Class of the Age column.
class(Insurance$Age)

## [1] "ordered" "factor"

# Class of the Holders column.
class(Insurance$Holders)

## [1] "integer"

# The four levels of Age.
levels(Insurance$Age)

## [1] "<25"   "25-29" "30-35" ">35"

# The first level of Age.
levels(Insurance$Age)[1]

## [1] "<25"

# Rename the first level of Age to "young". This changes Insurance itself,
# so every later use of the data set shows the new label.
levels(Insurance$Age)[1] <- "young"

# Look at the first few Age values again after the rename.
head(Insurance$Age)

## [1] young 25-29 30-35 >35   young 25-29
## Levels: young < 25-29 < 30-35 < >35

# Is Age a character vector?
is.character(Insurance$Age)

## [1] FALSE

# Class of the Claims column.
class(Insurance$Claims)

## [1] "integer"

# as.integer() on a column that is already integer leaves the class as
# "integer"; as.numeric() would be needed to obtain a "numeric" vector.
class(as.integer(Insurance$Claims))

## [1] "integer"

# The coercion above is not assigned back, so Insurance itself is unchanged.
dim(Insurance)

## [1] 64  5

抽样技术

简单抽样：有放回简单抽样

# Fix the RNG seed so the draws below are reproducible.
set.seed(2)

# Draw 10 row indices from Insurance with replacement, so the same row may
# appear more than once. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, which would silently change the sampling
# scheme.
sub1 <- sample(nrow(Insurance), size = 10, replace = TRUE)
sub1 # row indices that were drawn

##  [1] 21 15  6  6 32  8 17 29 17 12

Insurance[sub1,]  #显示所抽取的10条观测样本

##      District  Group   Age Holders Claims
## 21          2 1-1.5l young     149     25
## 15          1    >2l 30-35      99     19
## 6           1 1-1.5l 25-29     536     84
## 6.1         1 1-1.5l 25-29     536     84
## 32          2    >2l   >35     322     53
## 8           1 1-1.5l   >35    3582    400
## 17          2    <1l young      85     22
## 29          2    >2l young       9      4
## 17.1        2    <1l young      85     22
## 12          1 1.5-2l   >35    1640    233

# Sample 10 row indices with replacement under unequal probabilities: every
# row has probability 0 except the last one, which has probability 1, so
# only the last row can be drawn. TRUE is written out because T can be
# reassigned.
sub2 <- sample(
  nrow(Insurance),
  size = 10,
  replace = TRUE,
  prob = c(rep(0, nrow(Insurance) - 1), 1)
)
Insurance[sub2, ] # the 10 sampled observations

##      District Group Age Holders Claims
## 64          4   >2l >35     114     33
## 64.1        4   >2l >35     114     33
## 64.2        4   >2l >35     114     33
## 64.3        4   >2l >35     114     33
## 64.4        4   >2l >35     114     33
## 64.5        4   >2l >35     114     33
## 64.6        4   >2l >35     114     33
## 64.7        4   >2l >35     114     33
## 64.8        4   >2l >35     114     33
## 64.9        4   >2l >35     114     33

无放回地简单随机抽样

# Draw 10 distinct row indices; sample() does not replace unless asked to.
sub3 <- sample(nrow(Insurance), size = 10)
sub3 # row indices that were drawn

##  [1] 51 55  8 39 33 54 41 48 36 43

Insurance[sub3,]             #显示抽样结果

##    District  Group   Age Holders Claims
## 51        4    <1l 30-35      40      4
## 55        4 1-1.5l 30-35     122     22
## 8         1 1-1.5l   >35    3582    400
## 39        3 1-1.5l 30-35     240     37
## 33        3    <1l young      35      5
## 54        4 1-1.5l 25-29      81     10
## 41        3 1.5-2l young      24      8
## 48        3    >2l   >35     245     37
## 36        3    <1l   >35     648     67
## 43        3 1.5-2l 30-35     121     24

分层抽样

加载软件包sampling，该软件包提供strata()和cluster()函数

library (sampling)  

# Stratify by District and draw 4 units from each of the four districts by
# simple random sampling without replacement ("srswor").
sub4 <- strata(
  Insurance,
  stratanames = "District",
  size = rep(4, 4),
  method = "srswor"
)

sub4 # print the stratified sample

##    District ID_unit Prob Stratum
## 6         1       6 0.25       1
## 8         1       8 0.25       1
## 9         1       9 0.25       1
## 14        1      14 0.25       1
## 22        2      22 0.25       2
## 29        2      29 0.25       2
## 30        2      30 0.25       2
## 32        2      32 0.25       2
## 35        3      35 0.25       3
## 39        3      39 0.25       3
## 40        3      40 0.25       3
## 41        3      41 0.25       3
## 50        4      50 0.25       4
## 54        4      54 0.25       4
## 61        4      61 0.25       4
## 64        4      64 0.25       4

getdata(Insurance,sub4)    #获取分层抽样所得的数据集

##     Group   Age Holders Claims District ID_unit Prob Stratum
## 6  1-1.5l 25-29     536     84        1       6 0.25       1
## 8  1-1.5l   >35    3582    400        1       8 0.25       1
## 9  1.5-2l young     133     19        1       9 0.25       1
## 14    >2l 25-29      71     18        1      14 0.25       1
## 22 1-1.5l 25-29     313     51        2      22 0.25       2
## 29    >2l young       9      4        2      29 0.25       2
## 30    >2l 25-29      48     15        2      30 0.25       2
## 32    >2l   >35     322     53        2      32 0.25       2
## 35    <1l 30-35      89     10        3      35 0.25       3
## 39 1-1.5l 30-35     240     37        3      39 0.25       3
## 40 1-1.5l   >35    1635    187        3      40 0.25       3
## 41 1.5-2l young      24      8        3      41 0.25       3
## 50    <1l 25-29      33      5        4      50 0.25       4
## 54 1-1.5l 25-29      81     10        4      54 0.25       4
## 61    >2l young       3      0        4      61 0.25       4
## 64    >2l   >35     114     33        4      64 0.25       4

# Stratify by District again, this time drawing 1, 2, 3 and 4 units from
# districts 1 to 4 respectively. description = TRUE makes strata() print
# basic information about each stratum. No method is passed here, so
# strata() warns and falls back to its default.
sub5 <- strata(
  Insurance,
  stratanames = "District",
  size = c(1, 2, 3, 4),
  description = TRUE
)

## Warning in strata(Insurance, stratanames = "District", size = c(1, 2, 3, : the
## method is not specified; by default, the method is srswor

## Stratum 1 
## 
## Population total and number of selected units: 16 1 
## Stratum 2 
## 
## Population total and number of selected units: 16 2 
## Stratum 3 
## 
## Population total and number of selected units: 16 3 
## Stratum 4 
## 
## Population total and number of selected units: 16 4 
## Number of strata  4 
## Total number of selected units 10

sub5                       #显示分层抽样结果

##    District ID_unit   Prob Stratum
## 7         1       7 0.0625       1
## 18        2      18 0.1250       2
## 19        2      19 0.1250       2
## 33        3      33 0.1875       3
## 35        3      35 0.1875       3
## 36        3      36 0.1875       3
## 49        4      49 0.2500       4
## 55        4      55 0.2500       4
## 57        4      57 0.2500       4
## 61        4      61 0.2500       4

getdata(Insurance,sub5)    #获取分层抽样所得的数据集

##     Group   Age Holders Claims District ID_unit   Prob Stratum
## 7  1-1.5l 30-35     696     89        1       7 0.0625       1
## 18    <1l 25-29     139     19        2      18 0.1250       2
## 19    <1l 30-35     151     22        2      19 0.1250       2
## 33    <1l young      35      5        3      33 0.1875       3
## 35    <1l 30-35      89     10        3      35 0.1875       3
## 36    <1l   >35     648     67        3      36 0.1875       3
## 49    <1l young      20      2        4      49 0.2500       4
## 55 1-1.5l 30-35     122     22        4      55 0.2500       4
## 57 1.5-2l young      18      5        4      57 0.2500       4
## 61    >2l young       3      0        4      61 0.2500       4

# Use systematic sampling within each District stratum, with the Claims
# column passed as pik to control the selection probabilities inside every
# stratum.
sub6 <- strata(
  Insurance,
  stratanames = "District",
  size = c(1, 2, 3, 4),
  method = "systematic",
  pik = Insurance$Claims
)

## Warning in inclusionprobabilities(pik[y], size[i]): there are zero values in the initial vector a

sub6        #显示分层抽样结果

##    District ID_unit      Prob Stratum
## 12        1      12 0.1687183       1
## 24        2      24 0.6509540       2
## 28        2      28 0.3209877       2
## 36        3      36 0.3661202       3
## 40        3      40 1.0000000       3
## 44        3      44 0.5519126       3
## 55        4      55 0.2946429       4
## 56        4      56 1.0000000       4
## 60        4      60 0.8437500       4
## 64        4      64 0.4419643       4

getdata(Insurance,sub6)   #获取分层抽样所得的数据集

##     Group   Age Holders Claims District ID_unit      Prob Stratum
## 12 1.5-2l   >35    1640    233        1      12 0.1687183       1
## 24 1-1.5l   >35    2443    290        2      24 0.6509540       2
## 28 1.5-2l   >35    1110    143        2      28 0.3209877       2
## 36    <1l   >35     648     67        3      36 0.3661202       3
## 40 1-1.5l   >35    1635    187        3      40 1.0000000       3
## 44 1.5-2l   >35     692    101        3      44 0.5519126       3
## 55 1-1.5l 30-35     122     22        4      55 0.2946429       4
## 56 1-1.5l   >35     724    102        4      56 1.0000000       4
## 60 1.5-2l   >35     344     63        4      60 0.8437500       4
## 64    >2l   >35     114     33        4      64 0.4419643       4

# Cluster sampling

# Treat each distinct value of District as one cluster and draw 2 clusters
# without replacement; all observations of a selected cluster are kept.
# description = TRUE prints the counts shown below.
sub7 <- cluster(
  Insurance,
  clustername = "District",
  size = 2,
  method = "srswor",
  description = TRUE
)

## Number of selected clusters: 2 
## Number of units in the population and number of selected units: 64 32

sub7 # print the cluster sample

##    District ID_unit Prob
## 1         2      19  0.5
## 2         2      17  0.5
## 3         2      18  0.5
## 4         2      23  0.5
## 5         2      24  0.5
## 6         2      25  0.5
## 7         2      26  0.5
## 8         2      27  0.5
## 9         2      28  0.5
## 10        2      29  0.5
## 11        2      30  0.5
## 12        2      31  0.5
## 13        2      32  0.5
## 14        2      20  0.5
## 15        2      21  0.5
## 16        2      22  0.5
## 17        4      49  0.5
## 18        4      50  0.5
## 19        4      51  0.5
## 20        4      52  0.5
## 21        4      53  0.5
## 22        4      54  0.5
## 23        4      55  0.5
## 24        4      56  0.5
## 25        4      57  0.5
## 26        4      58  0.5
## 27        4      59  0.5
## 28        4      60  0.5
## 29        4      61  0.5
## 30        4      62  0.5
## 31        4      63  0.5
## 32        4      64  0.5

getdata(Insurance,sub7)

##     Group   Age Holders Claims District ID_unit Prob
## 19    <1l 30-35     151     22        2      19  0.5
## 17    <1l young      85     22        2      17  0.5
## 18    <1l 25-29     139     19        2      18  0.5
## 23 1-1.5l 30-35     419     49        2      23  0.5
## 24 1-1.5l   >35    2443    290        2      24  0.5
## 25 1.5-2l young      66     14        2      25  0.5
## 26 1.5-2l 25-29     175     46        2      26  0.5
## 27 1.5-2l 30-35     221     39        2      27  0.5
## 28 1.5-2l   >35    1110    143        2      28  0.5
## 29    >2l young       9      4        2      29  0.5
## 30    >2l 25-29      48     15        2      30  0.5
## 31    >2l 30-35      72     12        2      31  0.5
## 32    >2l   >35     322     53        2      32  0.5
## 20    <1l   >35     931     87        2      20  0.5
## 21 1-1.5l young     149     25        2      21  0.5
## 22 1-1.5l 25-29     313     51        2      22  0.5
## 49    <1l young      20      2        4      49  0.5
## 50    <1l 25-29      33      5        4      50  0.5
## 51    <1l 30-35      40      4        4      51  0.5
## 52    <1l   >35     316     36        4      52  0.5
## 53 1-1.5l young      31      7        4      53  0.5
## 54 1-1.5l 25-29      81     10        4      54  0.5
## 55 1-1.5l 30-35     122     22        4      55  0.5
## 56 1-1.5l   >35     724    102        4      56  0.5
## 57 1.5-2l young      18      5        4      57  0.5
## 58 1.5-2l 25-29      39      7        4      58  0.5
## 59 1.5-2l 30-35      68     16        4      59  0.5
## 60 1.5-2l   >35     344     63        4      60  0.5
## 61    >2l young       3      0        4      61  0.5
## 62    >2l 25-29      16      6        4      62  0.5
## 63    >2l 30-35      25      8        4      63  0.5
## 64    >2l   >35     114     33        4      64  0.5

划分训练集和测试集(比例3：1)

# Split Insurance into a training set (3/4 of the rows) and a test set (the
# remaining 1/4).

# Draw the training row indices without replacement. floor() makes the
# rounding explicit when 3/4 of the row count is not a whole number.
train_sub <- sample(nrow(Insurance), size = floor(3 / 4 * nrow(Insurance)))

# Rows selected for training.
train_data <- Insurance[train_sub, ]

# Rows not selected for training, in their original order. The complement is
# built explicitly because Insurance[-train_sub, ] returns zero rows rather
# than all rows when train_sub is empty (-integer(0) is still integer(0)).
test_data <- Insurance[setdiff(seq_len(nrow(Insurance)), train_sub), ]

# Dimensions of the two subsets.
dim(train_data)
dim(test_data)

## [1] 48  5

## [1] 16  5

train_data

##    District  Group   Age Holders Claims
## 37        3 1-1.5l young      53     10
## 60        4 1.5-2l   >35     344     63
## 34        3    <1l 25-29      73     11
## 21        2 1-1.5l young     149     25
## 54        4 1-1.5l 25-29      81     10
## 13        1    >2l young      24      4
## 7         1 1-1.5l 30-35     696     89
## 50        4    <1l 25-29      33      5
## 38        3 1-1.5l 25-29     155     24
## 44        3 1.5-2l   >35     692    101
## 20        2    <1l   >35     931     87
## 28        2 1.5-2l   >35    1110    143
## 4         1    <1l   >35    1680    156
## 9         1 1.5-2l young     133     19
## 42        3 1.5-2l 25-29      78     19
## 47        3    >2l 30-35      43      8
## 6         1 1-1.5l 25-29     536     84
## 32        2    >2l   >35     322     53
## 15        1    >2l 30-35      99     19
## 59        4 1.5-2l 30-35      68     16
## 35        3    <1l 30-35      89     10
## 8         1 1-1.5l   >35    3582    400
## 49        4    <1l young      20      2
## 45        3    >2l young       7      3
## 17        2    <1l young      85     22
## 5         1 1-1.5l young     284     63
## 56        4 1-1.5l   >35     724    102
## 51        4    <1l 30-35      40      4
## 46        3    >2l 25-29      29      2
## 40        3 1-1.5l   >35    1635    187
## 16        1    >2l   >35     452     77
## 61        4    >2l young       3      0
## 11        1 1.5-2l 30-35     355     74
## 64        4    >2l   >35     114     33
## 36        3    <1l   >35     648     67
## 10        1 1.5-2l 25-29     286     52
## 14        1    >2l 25-29      71     18
## 39        3 1-1.5l 30-35     240     37
## 57        4 1.5-2l young      18      5
## 55        4 1-1.5l 30-35     122     22
## 12        1 1.5-2l   >35    1640    233
## 18        2    <1l 25-29     139     19
## 63        4    >2l 30-35      25      8
## 1         1    <1l young     197     38
## 52        4    <1l   >35     316     36
## 48        3    >2l   >35     245     37
## 53        4 1-1.5l young      31      7
## 58        4 1.5-2l 25-29      39      7

# Save the training set as CSV in the working directory, then print the test
# set.
write.csv(train_data, file = "train.csv")
test_data

##    District  Group   Age Holders Claims
## 2         1    <1l 25-29     264     35
## 3         1    <1l 30-35     246     20
## 19        2    <1l 30-35     151     22
## 22        2 1-1.5l 25-29     313     51
## 23        2 1-1.5l 30-35     419     49
## 24        2 1-1.5l   >35    2443    290
## 25        2 1.5-2l young      66     14
## 26        2 1.5-2l 25-29     175     46
## 27        2 1.5-2l 30-35     221     39
## 29        2    >2l young       9      4
## 30        2    >2l 25-29      48     15
## 31        2    >2l 30-35      72     12
## 33        3    <1l young      35      5
## 41        3 1.5-2l young      24      8
## 43        3 1.5-2l 30-35     121     24
## 62        4    >2l 25-29      16      6

# Save the test set as CSV in the working directory.
write.csv(test_data, file = "test.csv")

R数据类型和抽样技术

Huang Huilin

3/31/2022

工作路径获取和设置

R数据分类

获取数据集Insurance及各变量信息

抽样技术

简单抽样：有放回简单抽样

无放回地简单随机抽样

分层抽样

划分训练集和测试集(比例3：1)