第五章

5.1 数据集的加载

library( "mice" )
data( nhanes2 ) #书上应该用的nhanes
dim( nhanes2 )

## [1] 25  4

summary( nhanes2 )

##     age          bmi          hyp          chl       
##  20-39:12   Min.   :20.40   no  :13   Min.   :113.0  
##  40-59: 7   1st Qu.:22.65   yes : 4   1st Qu.:185.0  
##  60-99: 6   Median :26.75   NA's: 8   Median :187.0  
##             Mean   :26.56             Mean   :191.4  
##             3rd Qu.:28.93             3rd Qu.:212.0  
##             Max.   :35.30             Max.   :284.0  
##             NA's   :9                 NA's   :10

head( nhanes2 )

##     age  bmi  hyp chl
## 1 20-39   NA <NA>  NA
## 2 40-59 22.7   no 187
## 3 20-39   NA   no 187
## 4 60-99   NA <NA>  NA
## 5 20-39 20.4   no 113
## 6 60-99   NA <NA> 184

library( "Hmisc" )
describe( nhanes2 )

## nhanes2 
## 
##  4  Variables      25  Observations
## ---------------------------------------------------------------------------
## age 
##       n missing  unique 
##      25       0       3 
## 
## 20-39 (12, 48%), 40-59 (7, 28%), 60-99 (6, 24%) 
## ---------------------------------------------------------------------------
## bmi 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      16       9      16       1   26.56   21.38   21.85   22.65   26.75 
##     .75     .90     .95 
##   28.92   31.65   33.73 
## 
##           20.4 21.7 22 22.5 22.7 24.9 25.5 26.3 27.2 27.4 27.5 28.7 29.6
## Frequency    1    1  1    1    1    1    1    1    1    1    1    1    1
## %            6    6  6    6    6    6    6    6    6    6    6    6    6
##           30.1 33.2 35.3
## Frequency    1    1    1
## %            6    6    6
## ---------------------------------------------------------------------------
## hyp 
##       n missing  unique 
##      17       8       2 
## 
## no (13, 76%), yes (4, 24%) 
## ---------------------------------------------------------------------------
## chl 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      15      10      13    0.99   191.4   116.5   123.2   185.0   187.0 
##     .75     .90     .95 
##   212.0   234.4   251.8 
## 
##           113 118 131 184 186 187 199 204 206 218 229 238 284
## Frequency   1   1   1   1   1   3   1   1   1   1   1   1   1
## %           7   7   7   7   7  20   7   7   7   7   7   7   7
## ---------------------------------------------------------------------------

5.2 数据清除

data_plot<-c( 11,19,14,22,14,28,13,81,12,43,11,16,31,16,23,42,22,26,17,22,13,27,180,16,43,82,14,11,51,76,28,66,29,14,14,65,37,16,37,35,39,27,14,17,13,38,28,40,85,32,
25,26,16,12,54,40,18,27,16,14,33,29,77,50,19,34)
par( mfrow=c(2,2) )
hist( data_plot,labels=TRUE )
dotchart( data_plot )
boxplot( data_plot,horizontal=TRUE )
qqnorm( data_plot );qqline( data_plot )

data_plot[data_plot>150] # 异常值

## [1] 180

par(mfrow=c(1,1)  )

5.2.1 缺失值的处理

sum( complete.cases(nhanes2) ) # 完整样本个数

## [1] 13

sum( is.na(nhanes2) ) #缺失样本个数

## [1] 27

md.pattern( nhanes2 )

##    age hyp bmi chl   
## 13   1   1   1   1  0
##  1   1   1   0   1  1
##  3   1   1   1   0  1
##  1   1   0   0   1  2
##  7   1   0   0   0  3
##      0   8   9  10 27

imp<-mice( nhanes2,m=4) # m为插补重数

## 
##  iter imp variable
##   1   1  bmi  hyp  chl
##   1   2  bmi  hyp  chl
##   1   3  bmi  hyp  chl
##   1   4  bmi  hyp  chl
##   2   1  bmi  hyp  chl
##   2   2  bmi  hyp  chl
##   2   3  bmi  hyp  chl
##   2   4  bmi  hyp  chl
##   3   1  bmi  hyp  chl
##   3   2  bmi  hyp  chl
##   3   3  bmi  hyp  chl
##   3   4  bmi  hyp  chl
##   4   1  bmi  hyp  chl
##   4   2  bmi  hyp  chl
##   4   3  bmi  hyp  chl
##   4   4  bmi  hyp  chl
##   5   1  bmi  hyp  chl
##   5   2  bmi  hyp  chl
##   5   3  bmi  hyp  chl
##   5   4  bmi  hyp  chl

imp

## Multiply imputed data set
## Call:
## mice(data = nhanes2, m = 4)
## Number of multiple imputations:  4
## Missing cells per column:
## age bmi hyp chl 
##   0   9   8  10 
## Imputation methods:
##      age      bmi      hyp      chl 
##       ""    "pmm" "logreg"    "pmm" 
## VisitSequence:
## bmi hyp chl 
##   2   3   4 
## PredictorMatrix:
##     age bmi hyp chl
## age   0   0   0   0
## bmi   1   0   1   1
## hyp   1   1   0   1
## chl   1   1   1   0
## Random generator seed value:  NA

fit<-with( imp,lm(chl~age+hyp+bmi) ) # 建立线型回归模型
fit

## call :
## with.mids(data = imp, expr = lm(chl ~ age + hyp + bmi))
## 
## call1 :
## mice(data = nhanes2, m = 4)
## 
## nmis :
## age bmi hyp chl 
##   0   9   8  10 
## 
## analyses :
## [[1]]
## 
## Call:
## lm(formula = chl ~ age + hyp + bmi)
## 
## Coefficients:
## (Intercept)         age2         age3         hyp2          bmi  
##      35.727       37.639       50.020        4.435        4.784  
## 
## 
## [[2]]
## 
## Call:
## lm(formula = chl ~ age + hyp + bmi)
## 
## Coefficients:
## (Intercept)         age2         age3         hyp2          bmi  
##      36.102       44.292       66.383       17.547        4.619  
## 
## 
## [[3]]
## 
## Call:
## lm(formula = chl ~ age + hyp + bmi)
## 
## Coefficients:
## (Intercept)         age2         age3         hyp2          bmi  
##      -29.04        74.61        89.61       -25.00         6.59  
## 
## 
## [[4]]
## 
## Call:
## lm(formula = chl ~ age + hyp + bmi)
## 
## Coefficients:
## (Intercept)         age2         age3         hyp2          bmi  
##     -25.471       54.215       90.379      -16.124        7.075

pooled<-pool( fit ) # 建立四个模型的汇总
summary( pooled )

##                   est        se           t       df   Pr(>|t|)
## (Intercept)  4.330517 69.526090  0.06228621 8.123479 0.95184022
## age2        52.690004 25.912710  2.03336526 5.428066 0.09322676
## age3        74.098791 30.467751  2.43204004 4.981794 0.05941356
## hyp2        -4.786532 28.747125 -0.16650472 4.301760 0.87529642
## bmi          5.767032  2.432781  2.37055138 8.494646 0.04345269
##                    lo 95     hi 95 nmis       fmi    lambda
## (Intercept) -155.5737259 164.23476   NA 0.4623535 0.3444935
## age2         -12.3713314 117.75134   NA 0.6064691 0.4840277
## age3          -4.3072262 152.50481   NA 0.6357599 0.5139770
## hyp2         -82.4410258  72.86796   NA 0.6843294 0.5652480
## bmi            0.2133856  11.32068    9 0.4458190 0.3290836

1. 删除法（P92）

删除样本
删除变量
使用完整原始数据
改变权重：删除缺失数据，通过对完整数据按照不同的权重进行加权，可以降低缺失数据带来的偏差“)
##### 2.插补法

sub<-which( is.na(nhanes2[,4])==TRUE)
data_NO<-nhanes2[-sub,]; # 第四列非NA的样本
data_YES<-nhanes2[sub,]; # 第四列NA的样本
####从非缺失之中随机抽样####
data_YES[,4]<-sample( data_NO[,4],length(data_YES[,4]),replace=TRUE )
data_YES

##      age  bmi  hyp chl
## 1  20-39   NA <NA> 218
## 4  60-99   NA <NA> 187
## 10 40-59   NA <NA> 184
## 11 20-39   NA <NA> 186
## 12 40-59   NA <NA> 186
## 15 20-39 29.6   no 206
## 16 20-39   NA <NA> 218
## 20 60-99 25.5  yes 284
## 21 20-39   NA <NA> 131
## 24 60-99 24.9   no 113

length(data_YES)

## [1] 4

length(data_YES[,4])

## [1] 10

####中位数插补、四分位数插补、平均数插补####
data_YES[,4]<-mean( data_NO[,4] )
data_YES

##      age  bmi  hyp   chl
## 1  20-39   NA <NA> 191.4
## 4  60-99   NA <NA> 191.4
## 10 40-59   NA <NA> 191.4
## 11 20-39   NA <NA> 191.4
## 12 40-59   NA <NA> 191.4
## 15 20-39 29.6   no 191.4
## 16 20-39   NA <NA> 191.4
## 20 60-99 25.5  yes 191.4
## 21 20-39   NA <NA> 191.4
## 24 60-99 24.9   no 191.4

####回归插补####
lm<-lm( chl~age,data=data_NO ) #age自变量，chl因变量简历回归模型
nhanes2[ sub,4 ]=round( predict( lm,data_YES ),2 )
nhanes2

##      age  bmi  hyp    chl
## 1  20-39   NA <NA> 169.00
## 2  40-59 22.7   no 187.00
## 3  20-39   NA   no 187.00
## 4  60-99   NA <NA> 224.67
## 5  20-39 20.4   no 113.00
## 6  60-99   NA <NA> 184.00
## 7  20-39 22.5   no 118.00
## 8  20-39 30.1   no 187.00
## 9  40-59 22.0   no 238.00
## 10 40-59   NA <NA> 202.80
## 11 20-39   NA <NA> 169.00
## 12 40-59   NA <NA> 202.80
## 13 60-99 21.7   no 206.00
## 14 40-59 28.7  yes 204.00
## 15 20-39 29.6   no 169.00
## 16 20-39   NA <NA> 169.00
## 17 60-99 27.2  yes 284.00
## 18 40-59 26.3  yes 199.00
## 19 20-39 35.3   no 218.00
## 20 60-99 25.5  yes 224.67
## 21 20-39   NA <NA> 169.00
## 22 20-39 33.2   no 229.00
## 23 20-39 27.5   no 131.00
## 24 60-99 24.9   no 224.67
## 25 40-59 27.4   no 186.00

####热平台插补：在非缺失值数据集中找到一个与缺失值相似的样本####
accept<-nhanes2[which( apply( is.na(nhanes2),1,sum ) != 0 ),] # 有缺失样本
donate<-nhanes2[which( apply( is.na(nhanes2),1,sum ) == 0 ),] # 无缺失样本
accept

##      age bmi  hyp    chl
## 1  20-39  NA <NA> 169.00
## 3  20-39  NA   no 187.00
## 4  60-99  NA <NA> 224.67
## 6  60-99  NA <NA> 184.00
## 10 40-59  NA <NA> 202.80
## 11 20-39  NA <NA> 169.00
## 12 40-59  NA <NA> 202.80
## 16 20-39  NA <NA> 169.00
## 21 20-39  NA <NA> 169.00

donate

##      age  bmi hyp    chl
## 2  40-59 22.7  no 187.00
## 5  20-39 20.4  no 113.00
## 7  20-39 22.5  no 118.00
## 8  20-39 30.1  no 187.00
## 9  40-59 22.0  no 238.00
## 13 60-99 21.7  no 206.00
## 14 40-59 28.7 yes 204.00
## 15 20-39 29.6  no 169.00
## 17 60-99 27.2 yes 284.00
## 18 40-59 26.3 yes 199.00
## 19 20-39 35.3  no 218.00
## 20 60-99 25.5 yes 224.67
## 22 20-39 33.2  no 229.00
## 23 20-39 27.5  no 131.00
## 24 60-99 24.9  no 224.67
## 25 40-59 27.4  no 186.00

accept[2,]

##     age bmi hyp chl
## 3 20-39  NA  no 187

find<-donate[ which(donate[,1]==accept[2,1] & donate[,3]==accept[2,3] & donate[,4]==accept[2,4] ), ]
find

##     age  bmi hyp chl
## 8 20-39 30.1  no 187

accept[2,2]<-find[1,2]
accept[2,]

##     age  bmi hyp chl
## 3 20-39 30.1  no 187

####冷平台插补：按照某些变量将数据分层，在层中进行均值插补####
rm(nhanes2) # 移除变量，因为之前已经改变
data(nhanes2) # 重新加载
level1<-nhanes2[which(nhanes2[,3]=="yes"),]
level1

##      age  bmi hyp chl
## 14 40-59 28.7 yes 204
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 20 60-99 25.5 yes  NA

level1[4,4]<-mean( level1[,4],na.rm=TRUE )
level1

##      age  bmi hyp chl
## 14 40-59 28.7 yes 204
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 20 60-99 25.5 yes 229

5.2.2 噪声数据处理

library( "outliers" )
set.seed(1)
s1=.Random.seed # 设置随机种子
y<-rnorm(100) # 生成100个标准正态分布随机数
length(y)

## [1] 100

look1<-outlier(y);look1

## [1] -2.2147

look2<-outlier(y,opposite=TRUE);look2

## [1] 2.401618

dotchart( y )
text(look1+0.3,10,paste("离群值",round(look1,2),sep="\n"),cex=0.7)
text(look2,50,paste("离群值","相反值",round(look2,2),sep="\n"),cex=0.7)

dim(y)<-c(20,5) # 变换y为20X5矩阵
outlier(y)

## [1] -2.214700 -1.989352  1.980400  2.401618 -1.523567

outlier(y,opposite=TRUE)

## [1]  1.595281  1.358680 -1.129363 -1.804959  1.586833

set.seed=1;s1=.Random.seed;
y<-rnorm(10)
outlier(y,logical=TRUE)

##  [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE

plot(y)
text( 6.6,outlier(y),"离群点" )

####数据光滑去除噪声####
####等宽箱均值光滑
set.seed(1);s1=.Random.seed;
x<-rnorm(12)
x<-sort(x)
dim(x)<-c(3,4) # 分箱，每行一箱
for( i in 1:3 ){
  x[i,]<-apply(x,1,mean)[i] # 均值分箱
}
x

##              [,1]         [,2]         [,3]         [,4]
## [1,] -0.003212265 -0.003212265 -0.003212265 -0.003212265
## [2,]  0.340596290  0.340596290  0.340596290  0.340596290
## [3,]  0.468529029  0.468529029  0.468529029  0.468529029

5.2.3 数据不一致处理

x<-list(a=1:10,beta=exp(-3:3),logic=c(T,F,F,T))
x

## $a
##  [1]  1  2  3  4  5  6  7  8  9 10
## 
## $beta
## [1]  0.04978707  0.13533528  0.36787944  1.00000000  2.71828183  7.38905610
## [7] 20.08553692
## 
## $logic
## [1]  TRUE FALSE FALSE  TRUE

probs<-1:3/4
rt.value<-rep(0,3)
vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)

##        a      beta logic
## 25% 3.25 0.2516074   0.0
## 50% 5.50 1.0000000   0.5
## 75% 7.75 5.0536690   1.0

##probs<-1:4/4 # 设置返回分位点格式，返回值只有3个，所以会出现错误
##vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)
##错误提示：
##Error in vapply(x, quantile, FUN.VALUE = rt.value, probs = probs) : 
##  values must be length 3,
## but FUN(X[[1]]) result is length 4

probs<-1:4/4;
rt.value=rep(0,4) # 修改分位点个数
vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)

##          a       beta logic
## 25%   3.25  0.2516074   0.0
## 50%   5.50  1.0000000   0.5
## 75%   7.75  5.0536690   1.0
## 100% 10.00 20.0855369   1.0

##rt.value=c(0,0,0,"") # 设置返回值类型：3个数字+1个字符串，与返回格式不符，出现错误
##vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)
##错误提示：
##Error in vapply(x, quantile, FUN.VALUE = rt.value, probs = probs) : 
##  values must be type 'character',
## but FUN(X[[1]]) result is type 'double'

5.3 数据集成(将多个数据源中的数据合并)

消除冗余：1.定性数据：卡方检验（p-value<0.05，在0.05显著水平下拒绝原假设，即变量不相关）
2.定量数据：相关系数和协方差：相关系数为0，不相关，否则正负相关。协方差为正，表示两个属性趋于一起改变；协方差为负，表示两个属性区域相反方向改变

set.seed(1);s1=.Random.seed;
x<-matrix( sample(1:100,50),ncol=2 )
x

##       [,1] [,2]
##  [1,]   27   29
##  [2,]   37    1
##  [3,]   57   28
##  [4,]   89   81
##  [5,]   20   25
##  [6,]   86   87
##  [7,]   97   42
##  [8,]   62   70
##  [9,]   58   13
## [10,]    6   55
## [11,]   19   44
## [12,]   16   78
## [13,]   61    7
## [14,]   34   45
## [15,]   67   26
## [16,]   43   50
## [17,]   88   39
## [18,]   83   46
## [19,]   32   82
## [20,]   63   30
## [21,]   75   65
## [22,]   17    2
## [23,]   51   84
## [24,]   10   59
## [25,]   21   36

chisq.test(x)

## 
##  Pearson's Chi-squared test
## 
## data:  x
## X-squared = 368.25, df = 24, p-value < 2.2e-16

结论:p_value<0.05,在0.05显著水平下拒绝原假设，变量不相关，不可删除

cor(x) # 求相关系数

##           [,1]      [,2]
## [1,] 1.0000000 0.1448407
## [2,] 0.1448407 1.0000000

cov(x) # 求协方差

##          [,1]    [,2]
## [1,] 783.0233 105.615
## [2,] 105.6150 679.040

结论：相关系数较小，不足以认为两列为冗余数据，不可删除，且两列变量趋于相反改变
数据重复检测

x<-cbind( sample(1:10,10,replace=T),rnorm(10),rnorm(10) );
x

##       [,1]        [,2]       [,3]
##  [1,]    5  1.35867955 -0.1645236
##  [2,]    9 -0.10278773 -0.2533617
##  [3,]    5  0.38767161  0.6969634
##  [4,]    3 -0.05380504  0.5566632
##  [5,]    1 -1.37705956 -0.6887557
##  [6,]    1 -0.41499456 -0.7074952
##  [7,]    4 -0.39428995  0.3645820
##  [8,]    6 -0.05931340  0.7685329
##  [9,]    7  1.10002537 -0.1123462
## [10,]    5  0.76317575  0.8811077

y<-unique( x[,1] ) #去除重复样本,返回非重复样本值
y

## [1] 5 9 3 1 4 6 7

sub=rep(0,length(y)) #初始化
for( i in 1:length(sub) ){
  sub[i]<-which( x[,1]==y[i] )[1]  # 找出来不止一个所以取首元素
}
sub

## [1] 1 2 4 5 7 8 9

x_unique<-x[sub,]
x_unique

##      [,1]        [,2]       [,3]
## [1,]    5  1.35867955 -0.1645236
## [2,]    9 -0.10278773 -0.2533617
## [3,]    3 -0.05380504  0.5566632
## [4,]    1 -1.37705956 -0.6887557
## [5,]    4 -0.39428995  0.3645820
## [6,]    6 -0.05931340  0.7685329
## [7,]    7  1.10002537 -0.1123462

数据变换

光滑
属性构造：由给定属性构造出新属性，添加到数据集中
聚集：对数据汇总
规范化

set.seed(1);s1=.Random.seed;
a<-rnorm(5)
a

## [1] -0.6264538  0.1836433 -0.8356286  1.5952808  0.3295078

b<-scale(a) # 标准化
b # attr(,"scaled:center"):原数据均值； attr(,"scaled:scale"):原数据标准差

##             [,1]
## [1,] -0.78636077
## [2,]  0.05657773
## [3,] -1.00401553
## [4,]  1.52544305
## [5,]  0.20835553
## attr(,"scaled:center")
## [1] 0.1292699
## attr(,"scaled:scale")
## [1] 0.9610394

pra<-50:60;
scale(pra)  #

##             [,1]
##  [1,] -1.5075567
##  [2,] -1.2060454
##  [3,] -0.9045340
##  [4,] -0.6030227
##  [5,] -0.3015113
##  [6,]  0.0000000
##  [7,]  0.3015113
##  [8,]  0.6030227
##  [9,]  0.9045340
## [10,]  1.2060454
## [11,]  1.5075567
## attr(,"scaled:center")
## [1] 55
## attr(,"scaled:scale")
## [1] 3.316625

离散化:数值属性的数据用区间标签或概念标签表示，实现定量数据向定性数据转换，将连续型数据离散化

set.seed(1);s1=.Random.seed;
a<-sample( 1:100/100,10 )
a

##  [1] 0.27 0.37 0.57 0.89 0.20 0.86 0.97 0.62 0.58 0.06

a_rt<-rep(0,length(a))
a_rt[which(a>0.5)]<-1
a_rt

##  [1] 0 0 1 1 0 1 1 1 1 0

数据泛化（数据合并）

set.seed(1);s1=.Random.seed;
city<-sample( 1:10,10 )
city

##  [1]  3  4  5  7  2  8  9  6 10  1

city_rt<-rep(0,length(city))
city_rt[which(city>5)]<-1
city_rt

##  [1] 0 0 0 1 0 1 1 1 1 0

5.5 数据规约

library("glmnet") 
x<-matrix(rnorm(100*20),100,20 ) 
y<-rnorm(100) # 生成一行正态随机数作为因变量
fit1<-glmnet(x,y) # 广义线性回归
b<-coef(fit1,s=0.01) # 随着s减小，约束放宽，筛选的变量增多
b

## 21 x 1 sparse Matrix of class "dgCMatrix"
##                        1
## (Intercept) -0.101266710
## V1           0.108237428
## V2           0.231123217
## V3           0.190051493
## V4          -0.052974626
## V5           0.035532599
## V6           0.038137025
## V7          -0.006237134
## V8          -0.121992039
## V9          -0.026866678
## V10          0.087353265
## V11          0.131952133
## V12         -0.003795339
## V13         -0.151361166
## V14         -0.068539027
## V15          0.016609815
## V16          0.011469276
## V17          0.008940900
## V18          0.062310907
## V19          0.106434048
## V20          0.051664503

predict(fit1,newx=x[1:10,],s=c(0.01,0.005))  # 计算s为0.01,0.005情况下预测值

##                1          2
##  [1,]  0.4116414  0.4287138
##  [2,]  0.4768544  0.5079425
##  [3,]  0.3133248  0.3340180
##  [4,] -0.4255843 -0.4467467
##  [5,] -0.1691660 -0.1881846
##  [6,] -0.1703948 -0.2182575
##  [7,] -0.7424685 -0.8172969
##  [8,] -0.2821776 -0.3357268
##  [9,] -0.5316970 -0.5207680
## [10,]  0.1402178  0.1598952