library( "mice" )
data( nhanes2 ) #书上应该用的nhanes
dim( nhanes2 )
## [1] 25 4
summary( nhanes2 )
## age bmi hyp chl
## 20-39:12 Min. :20.40 no :13 Min. :113.0
## 40-59: 7 1st Qu.:22.65 yes : 4 1st Qu.:185.0
## 60-99: 6 Median :26.75 NA's: 8 Median :187.0
## Mean :26.56 Mean :191.4
## 3rd Qu.:28.93 3rd Qu.:212.0
## Max. :35.30 Max. :284.0
## NA's :9 NA's :10
head( nhanes2 )
## age bmi hyp chl
## 1 20-39 NA <NA> NA
## 2 40-59 22.7 no 187
## 3 20-39 NA no 187
## 4 60-99 NA <NA> NA
## 5 20-39 20.4 no 113
## 6 60-99 NA <NA> 184
library( "Hmisc" )
describe( nhanes2 )
## nhanes2
##
## 4 Variables 25 Observations
## ---------------------------------------------------------------------------
## age
## n missing unique
## 25 0 3
##
## 20-39 (12, 48%), 40-59 (7, 28%), 60-99 (6, 24%)
## ---------------------------------------------------------------------------
## bmi
## n missing unique Info Mean .05 .10 .25 .50
## 16 9 16 1 26.56 21.38 21.85 22.65 26.75
## .75 .90 .95
## 28.92 31.65 33.73
##
## 20.4 21.7 22 22.5 22.7 24.9 25.5 26.3 27.2 27.4 27.5 28.7 29.6
## Frequency 1 1 1 1 1 1 1 1 1 1 1 1 1
## % 6 6 6 6 6 6 6 6 6 6 6 6 6
## 30.1 33.2 35.3
## Frequency 1 1 1
## % 6 6 6
## ---------------------------------------------------------------------------
## hyp
## n missing unique
## 17 8 2
##
## no (13, 76%), yes (4, 24%)
## ---------------------------------------------------------------------------
## chl
## n missing unique Info Mean .05 .10 .25 .50
## 15 10 13 0.99 191.4 116.5 123.2 185.0 187.0
## .75 .90 .95
## 212.0 234.4 251.8
##
## 113 118 131 184 186 187 199 204 206 218 229 238 284
## Frequency 1 1 1 1 1 3 1 1 1 1 1 1 1
## % 7 7 7 7 7 20 7 7 7 7 7 7 7
## ---------------------------------------------------------------------------
data_plot<-c( 11,19,14,22,14,28,13,81,12,43,11,16,31,16,23,42,22,26,17,22,13,27,180,16,43,82,14,11,51,76,28,66,29,14,14,65,37,16,37,35,39,27,14,17,13,38,28,40,85,32,
25,26,16,12,54,40,18,27,16,14,33,29,77,50,19,34)
par( mfrow=c(2,2) )
hist( data_plot,labels=TRUE )
dotchart( data_plot )
boxplot( data_plot,horizontal=TRUE )
qqnorm( data_plot );qqline( data_plot )
data_plot[data_plot>150] # 异常值
## [1] 180
par(mfrow=c(1,1) )
sum( complete.cases(nhanes2) ) # 完整样本个数
## [1] 13
sum( is.na(nhanes2) ) #缺失样本个数
## [1] 27
md.pattern( nhanes2 )
## age hyp bmi chl
## 13 1 1 1 1 0
## 1 1 1 0 1 1
## 3 1 1 1 0 1
## 1 1 0 0 1 2
## 7 1 0 0 0 3
## 0 8 9 10 27
imp<-mice( nhanes2,m=4) # m为插补重数
##
## iter imp variable
## 1 1 bmi hyp chl
## 1 2 bmi hyp chl
## 1 3 bmi hyp chl
## 1 4 bmi hyp chl
## 2 1 bmi hyp chl
## 2 2 bmi hyp chl
## 2 3 bmi hyp chl
## 2 4 bmi hyp chl
## 3 1 bmi hyp chl
## 3 2 bmi hyp chl
## 3 3 bmi hyp chl
## 3 4 bmi hyp chl
## 4 1 bmi hyp chl
## 4 2 bmi hyp chl
## 4 3 bmi hyp chl
## 4 4 bmi hyp chl
## 5 1 bmi hyp chl
## 5 2 bmi hyp chl
## 5 3 bmi hyp chl
## 5 4 bmi hyp chl
imp
## Multiply imputed data set
## Call:
## mice(data = nhanes2, m = 4)
## Number of multiple imputations: 4
## Missing cells per column:
## age bmi hyp chl
## 0 9 8 10
## Imputation methods:
## age bmi hyp chl
## "" "pmm" "logreg" "pmm"
## VisitSequence:
## bmi hyp chl
## 2 3 4
## PredictorMatrix:
## age bmi hyp chl
## age 0 0 0 0
## bmi 1 0 1 1
## hyp 1 1 0 1
## chl 1 1 1 0
## Random generator seed value: NA
fit<-with( imp,lm(chl~age+hyp+bmi) ) # 建立线型回归模型
fit
## call :
## with.mids(data = imp, expr = lm(chl ~ age + hyp + bmi))
##
## call1 :
## mice(data = nhanes2, m = 4)
##
## nmis :
## age bmi hyp chl
## 0 9 8 10
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = chl ~ age + hyp + bmi)
##
## Coefficients:
## (Intercept) age2 age3 hyp2 bmi
## 35.727 37.639 50.020 4.435 4.784
##
##
## [[2]]
##
## Call:
## lm(formula = chl ~ age + hyp + bmi)
##
## Coefficients:
## (Intercept) age2 age3 hyp2 bmi
## 36.102 44.292 66.383 17.547 4.619
##
##
## [[3]]
##
## Call:
## lm(formula = chl ~ age + hyp + bmi)
##
## Coefficients:
## (Intercept) age2 age3 hyp2 bmi
## -29.04 74.61 89.61 -25.00 6.59
##
##
## [[4]]
##
## Call:
## lm(formula = chl ~ age + hyp + bmi)
##
## Coefficients:
## (Intercept) age2 age3 hyp2 bmi
## -25.471 54.215 90.379 -16.124 7.075
pooled<-pool( fit ) # 建立四个模型的汇总
summary( pooled )
## est se t df Pr(>|t|)
## (Intercept) 4.330517 69.526090 0.06228621 8.123479 0.95184022
## age2 52.690004 25.912710 2.03336526 5.428066 0.09322676
## age3 74.098791 30.467751 2.43204004 4.981794 0.05941356
## hyp2 -4.786532 28.747125 -0.16650472 4.301760 0.87529642
## bmi 5.767032 2.432781 2.37055138 8.494646 0.04345269
## lo 95 hi 95 nmis fmi lambda
## (Intercept) -155.5737259 164.23476 NA 0.4623535 0.3444935
## age2 -12.3713314 117.75134 NA 0.6064691 0.4840277
## age3 -4.3072262 152.50481 NA 0.6357599 0.5139770
## hyp2 -82.4410258 72.86796 NA 0.6843294 0.5652480
## bmi 0.2133856 11.32068 9 0.4458190 0.3290836
sub<-which( is.na(nhanes2[,4])==TRUE)
data_NO<-nhanes2[-sub,]; # 第四列非NA的样本
data_YES<-nhanes2[sub,]; # 第四列NA的样本
####从非缺失之中随机抽样####
data_YES[,4]<-sample( data_NO[,4],length(data_YES[,4]),replace=TRUE )
data_YES
## age bmi hyp chl
## 1 20-39 NA <NA> 218
## 4 60-99 NA <NA> 187
## 10 40-59 NA <NA> 184
## 11 20-39 NA <NA> 186
## 12 40-59 NA <NA> 186
## 15 20-39 29.6 no 206
## 16 20-39 NA <NA> 218
## 20 60-99 25.5 yes 284
## 21 20-39 NA <NA> 131
## 24 60-99 24.9 no 113
length(data_YES)
## [1] 4
length(data_YES[,4])
## [1] 10
####中位数插补、四分位数插补、平均数插补####
data_YES[,4]<-mean( data_NO[,4] )
data_YES
## age bmi hyp chl
## 1 20-39 NA <NA> 191.4
## 4 60-99 NA <NA> 191.4
## 10 40-59 NA <NA> 191.4
## 11 20-39 NA <NA> 191.4
## 12 40-59 NA <NA> 191.4
## 15 20-39 29.6 no 191.4
## 16 20-39 NA <NA> 191.4
## 20 60-99 25.5 yes 191.4
## 21 20-39 NA <NA> 191.4
## 24 60-99 24.9 no 191.4
####回归插补####
lm<-lm( chl~age,data=data_NO ) #age自变量,chl因变量简历回归模型
nhanes2[ sub,4 ]=round( predict( lm,data_YES ),2 )
nhanes2
## age bmi hyp chl
## 1 20-39 NA <NA> 169.00
## 2 40-59 22.7 no 187.00
## 3 20-39 NA no 187.00
## 4 60-99 NA <NA> 224.67
## 5 20-39 20.4 no 113.00
## 6 60-99 NA <NA> 184.00
## 7 20-39 22.5 no 118.00
## 8 20-39 30.1 no 187.00
## 9 40-59 22.0 no 238.00
## 10 40-59 NA <NA> 202.80
## 11 20-39 NA <NA> 169.00
## 12 40-59 NA <NA> 202.80
## 13 60-99 21.7 no 206.00
## 14 40-59 28.7 yes 204.00
## 15 20-39 29.6 no 169.00
## 16 20-39 NA <NA> 169.00
## 17 60-99 27.2 yes 284.00
## 18 40-59 26.3 yes 199.00
## 19 20-39 35.3 no 218.00
## 20 60-99 25.5 yes 224.67
## 21 20-39 NA <NA> 169.00
## 22 20-39 33.2 no 229.00
## 23 20-39 27.5 no 131.00
## 24 60-99 24.9 no 224.67
## 25 40-59 27.4 no 186.00
####热平台插补:在非缺失值数据集中找到一个与缺失值相似的样本####
accept<-nhanes2[which( apply( is.na(nhanes2),1,sum ) != 0 ),] # 有缺失样本
donate<-nhanes2[which( apply( is.na(nhanes2),1,sum ) == 0 ),] # 无缺失样本
accept
## age bmi hyp chl
## 1 20-39 NA <NA> 169.00
## 3 20-39 NA no 187.00
## 4 60-99 NA <NA> 224.67
## 6 60-99 NA <NA> 184.00
## 10 40-59 NA <NA> 202.80
## 11 20-39 NA <NA> 169.00
## 12 40-59 NA <NA> 202.80
## 16 20-39 NA <NA> 169.00
## 21 20-39 NA <NA> 169.00
donate
## age bmi hyp chl
## 2 40-59 22.7 no 187.00
## 5 20-39 20.4 no 113.00
## 7 20-39 22.5 no 118.00
## 8 20-39 30.1 no 187.00
## 9 40-59 22.0 no 238.00
## 13 60-99 21.7 no 206.00
## 14 40-59 28.7 yes 204.00
## 15 20-39 29.6 no 169.00
## 17 60-99 27.2 yes 284.00
## 18 40-59 26.3 yes 199.00
## 19 20-39 35.3 no 218.00
## 20 60-99 25.5 yes 224.67
## 22 20-39 33.2 no 229.00
## 23 20-39 27.5 no 131.00
## 24 60-99 24.9 no 224.67
## 25 40-59 27.4 no 186.00
accept[2,]
## age bmi hyp chl
## 3 20-39 NA no 187
find<-donate[ which(donate[,1]==accept[2,1] & donate[,3]==accept[2,3] & donate[,4]==accept[2,4] ), ]
find
## age bmi hyp chl
## 8 20-39 30.1 no 187
accept[2,2]<-find[1,2]
accept[2,]
## age bmi hyp chl
## 3 20-39 30.1 no 187
####冷平台插补:按照某些变量将数据分层,在层中进行均值插补####
rm(nhanes2) # 移除变量,因为之前已经改变
data(nhanes2) # 重新加载
level1<-nhanes2[which(nhanes2[,3]=="yes"),]
level1
## age bmi hyp chl
## 14 40-59 28.7 yes 204
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 20 60-99 25.5 yes NA
level1[4,4]<-mean( level1[,4],na.rm=TRUE )
level1
## age bmi hyp chl
## 14 40-59 28.7 yes 204
## 17 60-99 27.2 yes 284
## 18 40-59 26.3 yes 199
## 20 60-99 25.5 yes 229
library( "outliers" )
set.seed(1)
s1=.Random.seed # 设置随机种子
y<-rnorm(100) # 生成100个标准正态分布随机数
length(y)
## [1] 100
look1<-outlier(y);look1
## [1] -2.2147
look2<-outlier(y,opposite=TRUE);look2
## [1] 2.401618
dotchart( y )
text(look1+0.3,10,paste("离群值",round(look1,2),sep="\n"),cex=0.7)
text(look2,50,paste("离群值","相反值",round(look2,2),sep="\n"),cex=0.7)
dim(y)<-c(20,5) # 变换y为20X5矩阵
outlier(y)
## [1] -2.214700 -1.989352 1.980400 2.401618 -1.523567
outlier(y,opposite=TRUE)
## [1] 1.595281 1.358680 -1.129363 -1.804959 1.586833
set.seed=1;s1=.Random.seed;
y<-rnorm(10)
outlier(y,logical=TRUE)
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
plot(y)
text( 6.6,outlier(y),"离群点" )
####数据光滑去除噪声####
####等宽箱均值光滑
set.seed(1);s1=.Random.seed;
x<-rnorm(12)
x<-sort(x)
dim(x)<-c(3,4) # 分箱,每行一箱
for( i in 1:3 ){
x[i,]<-apply(x,1,mean)[i] # 均值分箱
}
x
## [,1] [,2] [,3] [,4]
## [1,] -0.003212265 -0.003212265 -0.003212265 -0.003212265
## [2,] 0.340596290 0.340596290 0.340596290 0.340596290
## [3,] 0.468529029 0.468529029 0.468529029 0.468529029
x<-list(a=1:10,beta=exp(-3:3),logic=c(T,F,F,T))
x
## $a
## [1] 1 2 3 4 5 6 7 8 9 10
##
## $beta
## [1] 0.04978707 0.13533528 0.36787944 1.00000000 2.71828183 7.38905610
## [7] 20.08553692
##
## $logic
## [1] TRUE FALSE FALSE TRUE
probs<-1:3/4
rt.value<-rep(0,3)
vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)
## a beta logic
## 25% 3.25 0.2516074 0.0
## 50% 5.50 1.0000000 0.5
## 75% 7.75 5.0536690 1.0
##probs<-1:4/4 # 设置返回分位点格式,返回值只有3个,所以会出现错误
##vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)
##错误提示:
##Error in vapply(x, quantile, FUN.VALUE = rt.value, probs = probs) :
## values must be length 3,
## but FUN(X[[1]]) result is length 4
probs<-1:4/4;
rt.value=rep(0,4) # 修改分位点个数
vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)
## a beta logic
## 25% 3.25 0.2516074 0.0
## 50% 5.50 1.0000000 0.5
## 75% 7.75 5.0536690 1.0
## 100% 10.00 20.0855369 1.0
##rt.value=c(0,0,0,"") # 设置返回值类型:3个数字+1个字符串,与返回格式不符,出现错误
##vapply(x,quantile,FUN.VALUE=rt.value,probs=probs)
##错误提示:
##Error in vapply(x, quantile, FUN.VALUE = rt.value, probs = probs) :
## values must be type 'character',
## but FUN(X[[1]]) result is type 'double'
消除冗余:1.定性数据:卡方检验(p-value<0.05,在0.05显著水平下拒绝原假设,即变量不相关)
2.定量数据:相关系数和协方差:相关系数为0,不相关,否则正负相关。协方差为正,表示两个属性趋于一起改变;协方差为负,表示两个属性区域相反方向改变
set.seed(1);s1=.Random.seed;
x<-matrix( sample(1:100,50),ncol=2 )
x
## [,1] [,2]
## [1,] 27 29
## [2,] 37 1
## [3,] 57 28
## [4,] 89 81
## [5,] 20 25
## [6,] 86 87
## [7,] 97 42
## [8,] 62 70
## [9,] 58 13
## [10,] 6 55
## [11,] 19 44
## [12,] 16 78
## [13,] 61 7
## [14,] 34 45
## [15,] 67 26
## [16,] 43 50
## [17,] 88 39
## [18,] 83 46
## [19,] 32 82
## [20,] 63 30
## [21,] 75 65
## [22,] 17 2
## [23,] 51 84
## [24,] 10 59
## [25,] 21 36
chisq.test(x)
##
## Pearson's Chi-squared test
##
## data: x
## X-squared = 368.25, df = 24, p-value < 2.2e-16
结论:p_value<0.05,在0.05显著水平下拒绝原假设,变量不相关,不可删除
cor(x) # 求相关系数
## [,1] [,2]
## [1,] 1.0000000 0.1448407
## [2,] 0.1448407 1.0000000
cov(x) # 求协方差
## [,1] [,2]
## [1,] 783.0233 105.615
## [2,] 105.6150 679.040
结论:相关系数较小,不足以认为两列为冗余数据,不可删除,且两列变量趋于相反改变
数据重复检测
x<-cbind( sample(1:10,10,replace=T),rnorm(10),rnorm(10) );
x
## [,1] [,2] [,3]
## [1,] 5 1.35867955 -0.1645236
## [2,] 9 -0.10278773 -0.2533617
## [3,] 5 0.38767161 0.6969634
## [4,] 3 -0.05380504 0.5566632
## [5,] 1 -1.37705956 -0.6887557
## [6,] 1 -0.41499456 -0.7074952
## [7,] 4 -0.39428995 0.3645820
## [8,] 6 -0.05931340 0.7685329
## [9,] 7 1.10002537 -0.1123462
## [10,] 5 0.76317575 0.8811077
y<-unique( x[,1] ) #去除重复样本,返回非重复样本值
y
## [1] 5 9 3 1 4 6 7
sub=rep(0,length(y)) #初始化
for( i in 1:length(sub) ){
sub[i]<-which( x[,1]==y[i] )[1] # 找出来不止一个所以取首元素
}
sub
## [1] 1 2 4 5 7 8 9
x_unique<-x[sub,]
x_unique
## [,1] [,2] [,3]
## [1,] 5 1.35867955 -0.1645236
## [2,] 9 -0.10278773 -0.2533617
## [3,] 3 -0.05380504 0.5566632
## [4,] 1 -1.37705956 -0.6887557
## [5,] 4 -0.39428995 0.3645820
## [6,] 6 -0.05931340 0.7685329
## [7,] 7 1.10002537 -0.1123462
set.seed(1);s1=.Random.seed;
a<-rnorm(5)
a
## [1] -0.6264538 0.1836433 -0.8356286 1.5952808 0.3295078
b<-scale(a) # 标准化
b # attr(,"scaled:center"):原数据均值; attr(,"scaled:scale"):原数据标准差
## [,1]
## [1,] -0.78636077
## [2,] 0.05657773
## [3,] -1.00401553
## [4,] 1.52544305
## [5,] 0.20835553
## attr(,"scaled:center")
## [1] 0.1292699
## attr(,"scaled:scale")
## [1] 0.9610394
pra<-50:60;
scale(pra) #
## [,1]
## [1,] -1.5075567
## [2,] -1.2060454
## [3,] -0.9045340
## [4,] -0.6030227
## [5,] -0.3015113
## [6,] 0.0000000
## [7,] 0.3015113
## [8,] 0.6030227
## [9,] 0.9045340
## [10,] 1.2060454
## [11,] 1.5075567
## attr(,"scaled:center")
## [1] 55
## attr(,"scaled:scale")
## [1] 3.316625
set.seed(1);s1=.Random.seed;
a<-sample( 1:100/100,10 )
a
## [1] 0.27 0.37 0.57 0.89 0.20 0.86 0.97 0.62 0.58 0.06
a_rt<-rep(0,length(a))
a_rt[which(a>0.5)]<-1
a_rt
## [1] 0 0 1 1 0 1 1 1 1 0
set.seed(1);s1=.Random.seed;
city<-sample( 1:10,10 )
city
## [1] 3 4 5 7 2 8 9 6 10 1
city_rt<-rep(0,length(city))
city_rt[which(city>5)]<-1
city_rt
## [1] 0 0 0 1 0 1 1 1 1 0
library("glmnet")
x<-matrix(rnorm(100*20),100,20 )
y<-rnorm(100) # 生成一行正态随机数作为因变量
fit1<-glmnet(x,y) # 广义线性回归
b<-coef(fit1,s=0.01) # 随着s减小,约束放宽,筛选的变量增多
b
## 21 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.101266710
## V1 0.108237428
## V2 0.231123217
## V3 0.190051493
## V4 -0.052974626
## V5 0.035532599
## V6 0.038137025
## V7 -0.006237134
## V8 -0.121992039
## V9 -0.026866678
## V10 0.087353265
## V11 0.131952133
## V12 -0.003795339
## V13 -0.151361166
## V14 -0.068539027
## V15 0.016609815
## V16 0.011469276
## V17 0.008940900
## V18 0.062310907
## V19 0.106434048
## V20 0.051664503
predict(fit1,newx=x[1:10,],s=c(0.01,0.005)) # 计算s为0.01,0.005情况下预测值
## 1 2
## [1,] 0.4116414 0.4287138
## [2,] 0.4768544 0.5079425
## [3,] 0.3133248 0.3340180
## [4,] -0.4255843 -0.4467467
## [5,] -0.1691660 -0.1881846
## [6,] -0.1703948 -0.2182575
## [7,] -0.7424685 -0.8172969
## [8,] -0.2821776 -0.3357268
## [9,] -0.5316970 -0.5207680
## [10,] 0.1402178 0.1598952