setwd("F:/for zuoye")
data <- read.csv("cancer-pred.csv", stringsAsFactors = F)
data[which(data==".",arr.ind = T)] <- NA
data <- apply(data,2,as.numeric)
data <- as.data.frame(data)
which(data < 0, arr.ind = T)
## row col
## [1,] 74 7
## [2,] 77 7
## [3,] 160 7
## [4,] 183 7
data[which(data < 0, arr.ind = T)] <- NA
library("mice")
## Loading required package: Rcpp
## mice 2.25 2015-11-09
mv.pattern <- md.pattern(data)
mv.pattern
## Cancer.development size treatment.type Dose.I stage age Dose.II
## 162 1 1 1 1 1 1 1 0
## 1 0 1 1 1 1 1 1 1
## 4 1 0 1 1 1 1 1 1
## 6 1 1 1 1 0 1 1 1
## 7 1 1 1 1 1 0 1 1
## 5 1 1 0 1 1 1 1 1
## 5 1 1 1 0 1 1 1 1
## 9 1 1 1 1 1 1 0 1
## 1 1 1 1 1 0 1 0 2
## 1 4 5 5 7 7 10 39
library("VIM")
## Warning: package 'VIM' was built under R version 3.3.1
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
aggr(data, prop = F, numbers = T, cex.lab = 1, cex.axis = 1)
###可以看到,所有变量的缺失值的个数都在10一下,而Dose.II的缺失值个数最多,达到了10个。
matrixplot(data, cex.axis = 1)
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
###红色代表缺失值,这幅图可以更为直接的观察缺失值在数据中的分布。 ###下面再看一下每个观测值中缺失值的个数
apply(data, 1, function(x) {sum(is.na(x))})
## [1] 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1
## [36] 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0
## [71] 0 1 0 2 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1
## [106] 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## [141] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## [176] 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1
table(apply(data, 1, function(x) {sum(is.na(x))}))
##
## 0 1 2
## 162 37 1
imp <- mice(data, 5)
##
## iter imp variable
## 1 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 5 Cancer.development size stage age treatment.type Dose.I Dose.II
fit <- with(imp, glm(Cancer.development ~ size + stage + age + treatment.type + Dose.I + Dose.II,
family=binomial(link='logit'), data = data))
pooled <- pool(fit)
summary(pooled)
## est se t df Pr(>|t|)
## (Intercept) -2.29883186 1.92904663 -1.1916933 153.0226 0.23522650
## size -0.27087556 0.30587640 -0.8855720 153.0226 0.37723771
## stage 0.59606727 0.33178623 1.7965401 153.0226 0.07438120
## age 0.10139222 0.04764513 2.1280711 153.0226 0.03493307
## treatment.type 0.11630204 0.36248475 0.3208467 153.0226 0.74876433
## Dose.I -0.34394335 0.29396640 -1.1700091 153.0226 0.24381637
## Dose.II 0.01581585 0.09172577 0.1724253 153.0226 0.86333099
## lo 95 hi 95 nmis fmi lambda
## (Intercept) -6.109833211 1.5121695 NA 0.01281865 0
## size -0.875161304 0.3334102 4 0.01281865 0
## stage -0.059405610 1.2515402 7 0.01281865 0
## age 0.007265078 0.1955194 7 0.01281865 0
## treatment.type -0.599818485 0.8324226 5 0.01281865 0
## Dose.I -0.924699836 0.2368131 5 0.01281865 0
## Dose.II -0.165396491 0.1970282 10 0.01281865 0
data1 <- complete(imp, action = 1)
colnames(data1)
## [1] "Cancer.development" "size" "stage"
## [4] "age" "treatment.type" "Dose.I"
## [7] "Dose.II"
table(data1[,1])
##
## 0 1
## 117 83
hist(data1[,2], xlab = "size", main = "Histogram of size")
shapiro.test(data1[,2])
##
## Shapiro-Wilk normality test
##
## data: data1[, 2]
## W = 0.96148, p-value = 2.869e-05
table(data1[,3])
##
## 0 1
## 106 94
hist(data1[,4],xlab = "age", main = "Histogram of age")
shapiro.test(data1[,4])
##
## Shapiro-Wilk normality test
##
## data: data1[, 4]
## W = 0.99521, p-value = 0.7806
table(data1[,5])
##
## 1 2
## 139 61
hist(data1[,6],xlab = "Dose.I", main = "Histogram of Dose.I")
shapiro.test(data1[,6])
##
## Shapiro-Wilk normality test
##
## data: data1[, 6]
## W = 0.96045, p-value = 2.213e-05
hist(data1[,7],xlab = "Dose.II", main = "Histogram of Dose.II")
shapiro.test(data1[,7])
##
## Shapiro-Wilk normality test
##
## data: data1[, 7]
## W = 0.98853, p-value = 0.1084
log.glm <- glm(Cancer.development~size+stage+age+treatment.type+Dose.I+Dose.II,family = binomial,
data = data1)
summary(log.glm)
##
## Call:
## glm(formula = Cancer.development ~ size + stage + age + treatment.type +
## Dose.I + Dose.II, family = binomial, data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5034 -1.0029 -0.7532 1.1938 1.9642
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.83021 1.66107 -1.102 0.2705
## size -0.28280 0.27349 -1.034 0.3011
## stage 0.56788 0.30031 1.891 0.0586 .
## age 0.10628 0.04150 2.561 0.0104 *
## treatment.type 0.09423 0.32647 0.289 0.7729
## Dose.I -0.47175 0.26954 -1.750 0.0801 .
## Dose.II -0.03680 0.08345 -0.441 0.6592
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 271.45 on 199 degrees of freedom
## Residual deviance: 256.63 on 193 degrees of freedom
## AIC: 270.63
##
## Number of Fisher Scoring iterations: 4
log.step <- step(log.glm)
## Start: AIC=270.63
## Cancer.development ~ size + stage + age + treatment.type + Dose.I +
## Dose.II
##
## Df Deviance AIC
## - treatment.type 1 256.71 268.71
## - Dose.II 1 256.83 268.83
## - size 1 257.71 269.71
## <none> 256.63 270.63
## - Dose.I 1 259.75 271.75
## - stage 1 260.24 272.24
## - age 1 263.53 275.53
##
## Step: AIC=268.71
## Cancer.development ~ size + stage + age + Dose.I + Dose.II
##
## Df Deviance AIC
## - Dose.II 1 256.91 266.91
## - size 1 257.76 267.76
## <none> 256.71 268.71
## - Dose.I 1 259.77 269.77
## - stage 1 260.34 270.34
## - age 1 263.53 273.53
##
## Step: AIC=266.91
## Cancer.development ~ size + stage + age + Dose.I
##
## Df Deviance AIC
## - size 1 257.89 265.89
## <none> 256.91 266.91
## - Dose.I 1 259.97 267.97
## - stage 1 260.52 268.52
## - age 1 263.78 271.78
##
## Step: AIC=265.89
## Cancer.development ~ stage + age + Dose.I
##
## Df Deviance AIC
## <none> 257.89 265.89
## - Dose.I 1 260.94 266.94
## - stage 1 261.15 267.15
## - age 1 264.93 270.93
summary(log.step)
##
## Call:
## glm(formula = Cancer.development ~ stage + age + Dose.I, family = binomial,
## data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5911 -1.0072 -0.7733 1.1936 1.8720
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.44272 1.43382 -1.704 0.08845 .
## stage 0.53445 0.29718 1.798 0.07211 .
## age 0.10645 0.04114 2.588 0.00966 **
## Dose.I -0.46385 0.26791 -1.731 0.08339 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 271.45 on 199 degrees of freedom
## Residual deviance: 257.89 on 196 degrees of freedom
## AIC: 265.89
##
## Number of Fisher Scoring iterations: 4
log.step
##
## Call: glm(formula = Cancer.development ~ stage + age + Dose.I, family = binomial,
## data = data1)
##
## Coefficients:
## (Intercept) stage age Dose.I
## -2.4427 0.5345 0.1064 -0.4638
##
## Degrees of Freedom: 199 Total (i.e. Null); 196 Residual
## Null Deviance: 271.5
## Residual Deviance: 257.9 AIC: 265.9
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 3.013505))
p1 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=1, age = 30.2525, Dose.I = 3.013505))
p2 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 3.013505))
p3 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 31.2525, Dose.I = 3.013505))
p4 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 3.013505))
p5 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 4.013505))
p6 <- exp(log.pre)/(1+exp(log.pre))