setwd("F:/for zuoye")
data <- read.csv("cancer-pred.csv", stringsAsFactors = F)
data[which(data==".",arr.ind = T)] <- NA
data <- apply(data,2,as.numeric)
data <- as.data.frame(data)
which(data < 0, arr.ind = T)
## row col
## [1,] 74 7
## [2,] 77 7
## [3,] 160 7
## [4,] 183 7
data[which(data < 0, arr.ind = T)] <- NA
library("mice")
## Loading required package: Rcpp
## mice 2.25 2015-11-09
mv.pattern <- md.pattern(data)
mv.pattern
## Cancer.development size treatment.type Dose.I stage age Dose.II
## 162 1 1 1 1 1 1 1 0
## 1 0 1 1 1 1 1 1 1
## 4 1 0 1 1 1 1 1 1
## 6 1 1 1 1 0 1 1 1
## 7 1 1 1 1 1 0 1 1
## 5 1 1 0 1 1 1 1 1
## 5 1 1 1 0 1 1 1 1
## 9 1 1 1 1 1 1 0 1
## 1 1 1 1 1 0 1 0 2
## 1 4 5 5 7 7 10 39
library("VIM")
## Warning: package 'VIM' was built under R version 3.3.1
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
aggr(data, prop = F, numbers = T, cex.lab = 1, cex.axis = 1)
###可以看到,所有变量的缺失值的个数都在10一下,而Dose.II的缺失值个数最多,达到了10个。
matrixplot(data, cex.axis = 1)
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
## Warning in hex(RGB(r, g, b), gamma = gamma, fixup = fixup, ...): 'gamma' is
## deprecated and has no effect
###红色代表缺失值,这幅图可以更为直接的观察缺失值在数据中的分布。 ###下面再看一下每个观测值中缺失值的个数
apply(data, 1, function(x) {sum(is.na(x))})
## [1] 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 1 0 0 0 1 0 0 0 0 1
## [36] 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0
## [71] 0 1 0 2 0 0 1 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 1
## [106] 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0
## [141] 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## [176] 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1
table(apply(data, 1, function(x) {sum(is.na(x))}))
##
## 0 1 2
## 162 37 1
imp <- mice(data, 5)
##
## iter imp variable
## 1 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 1 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 2 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 3 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 4 5 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 1 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 2 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 3 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 4 Cancer.development size stage age treatment.type Dose.I Dose.II
## 5 5 Cancer.development size stage age treatment.type Dose.I Dose.II
fit <- with(imp, glm(Cancer.development ~ size + stage + age + treatment.type + Dose.I + Dose.II,
family=binomial(link='logit'), data = data))
pooled <- pool(fit)
summary(pooled)
## est se t df Pr(>|t|)
## (Intercept) -2.29883186 1.92904663 -1.1916933 153.0226 0.23522650
## size -0.27087556 0.30587640 -0.8855720 153.0226 0.37723771
## stage 0.59606727 0.33178623 1.7965401 153.0226 0.07438120
## age 0.10139222 0.04764513 2.1280711 153.0226 0.03493307
## treatment.type 0.11630204 0.36248475 0.3208467 153.0226 0.74876433
## Dose.I -0.34394335 0.29396640 -1.1700091 153.0226 0.24381637
## Dose.II 0.01581585 0.09172577 0.1724253 153.0226 0.86333099
## lo 95 hi 95 nmis fmi lambda
## (Intercept) -6.109833211 1.5121695 NA 0.01281865 0
## size -0.875161304 0.3334102 4 0.01281865 0
## stage -0.059405610 1.2515402 7 0.01281865 0
## age 0.007265078 0.1955194 7 0.01281865 0
## treatment.type -0.599818485 0.8324226 5 0.01281865 0
## Dose.I -0.924699836 0.2368131 5 0.01281865 0
## Dose.II -0.165396491 0.1970282 10 0.01281865 0
data1 <- complete(imp, action = 1)
colnames(data1)
## [1] "Cancer.development" "size" "stage"
## [4] "age" "treatment.type" "Dose.I"
## [7] "Dose.II"
table(data1[,1])
##
## 0 1
## 116 84
hist(data1[,2], xlab = "size", main = "Histogram of size")
shapiro.test(data1[,2])
##
## Shapiro-Wilk normality test
##
## data: data1[, 2]
## W = 0.96344, p-value = 4.744e-05
table(data1[,3])
##
## 0 1
## 104 96
hist(data1[,4],xlab = "age", main = "Histogram of age")
shapiro.test(data1[,4])
##
## Shapiro-Wilk normality test
##
## data: data1[, 4]
## W = 0.99545, p-value = 0.8137
table(data1[,5])
##
## 1 2
## 142 58
hist(data1[,6],xlab = "Dose.I", main = "Histogram of Dose.I")
shapiro.test(data1[,6])
##
## Shapiro-Wilk normality test
##
## data: data1[, 6]
## W = 0.96232, p-value = 3.559e-05
hist(data1[,7],xlab = "Dose.II", main = "Histogram of Dose.II")
shapiro.test(data1[,7])
##
## Shapiro-Wilk normality test
##
## data: data1[, 7]
## W = 0.98699, p-value = 0.06363
log.glm <- glm(Cancer.development~size+stage+age+treatment.type+Dose.I+Dose.II,family = binomial,
data = data1)
summary(log.glm)
##
## Call:
## glm(formula = Cancer.development ~ size + stage + age + treatment.type +
## Dose.I + Dose.II, family = binomial, data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5283 -0.9993 -0.7593 1.1717 1.9605
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.71515 1.66206 -1.032 0.3021
## size -0.30795 0.27723 -1.111 0.2666
## stage 0.66590 0.29971 2.222 0.0263 *
## age 0.09458 0.04096 2.309 0.0210 *
## treatment.type 0.14603 0.33160 0.440 0.6597
## Dose.I -0.43674 0.27046 -1.615 0.1064
## Dose.II -0.01471 0.08237 -0.179 0.8582
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 272.12 on 199 degrees of freedom
## Residual deviance: 256.70 on 193 degrees of freedom
## AIC: 270.7
##
## Number of Fisher Scoring iterations: 4
log.step <- step(log.glm)
## Start: AIC=270.7
## Cancer.development ~ size + stage + age + treatment.type + Dose.I +
## Dose.II
##
## Df Deviance AIC
## - Dose.II 1 256.74 268.74
## - treatment.type 1 256.90 268.90
## - size 1 257.94 269.94
## <none> 256.70 270.70
## - Dose.I 1 259.35 271.35
## - stage 1 261.69 273.69
## - age 1 262.24 274.24
##
## Step: AIC=268.74
## Cancer.development ~ size + stage + age + treatment.type + Dose.I
##
## Df Deviance AIC
## - treatment.type 1 256.92 266.92
## - size 1 257.95 267.95
## <none> 256.74 268.74
## - Dose.I 1 259.38 269.38
## - stage 1 261.72 271.72
## - age 1 262.28 272.28
##
## Step: AIC=266.92
## Cancer.development ~ size + stage + age + Dose.I
##
## Df Deviance AIC
## - size 1 258.05 266.05
## <none> 256.92 266.92
## - Dose.I 1 259.50 267.50
## - stage 1 262.02 270.02
## - age 1 262.36 270.36
##
## Step: AIC=266.05
## Cancer.development ~ stage + age + Dose.I
##
## Df Deviance AIC
## <none> 258.05 266.05
## - Dose.I 1 260.54 266.54
## - stage 1 262.89 268.89
## - age 1 263.72 269.72
summary(log.step)
##
## Call:
## glm(formula = Cancer.development ~ stage + age + Dose.I, family = binomial,
## data = data1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5803 -0.9935 -0.7983 1.1928 1.8456
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.26187 1.43647 -1.575 0.1153
## stage 0.65102 0.29749 2.188 0.0286 *
## age 0.09493 0.04065 2.335 0.0195 *
## Dose.I -0.42047 0.26863 -1.565 0.1175
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 272.12 on 199 degrees of freedom
## Residual deviance: 258.05 on 196 degrees of freedom
## AIC: 266.05
##
## Number of Fisher Scoring iterations: 4
log.step
##
## Call: glm(formula = Cancer.development ~ stage + age + Dose.I, family = binomial,
## data = data1)
##
## Coefficients:
## (Intercept) stage age Dose.I
## -2.26187 0.65102 0.09493 -0.42047
##
## Degrees of Freedom: 199 Total (i.e. Null); 196 Residual
## Null Deviance: 272.1
## Residual Deviance: 258.1 AIC: 266.1
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 3.013505))
p1 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=1, age = 30.2525, Dose.I = 3.013505))
p2 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 3.013505))
p3 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 31.2525, Dose.I = 3.013505))
p4 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 3.013505))
p5 <- exp(log.pre)/(1+exp(log.pre))
log.pre <- predict(log.step, data.frame(stage=0, age = 30.2525, Dose.I = 4.013505))
p6 <- exp(log.pre)/(1+exp(log.pre))