water <- read.csv('https://umich.instructure.com/files/399172/download?download_frd=1', header=T)
water[1:3, ]
## Year..string. WHO.region..string. Country..string.
## 1 1990 Africa Algeria
## 2 1990 Africa Angola
## 3 1990 Africa Benin
## Residence.Area.Type..string.
## 1 Rural
## 2 Rural
## 3 Rural
## Population.using.improved.drinking.water.sources......numeric.
## 1 88
## 2 42
## 3 49
## Population.using.improved.sanitation.facilities......numeric.
## 1 77
## 2 7
## 3 0
colnames(water)<-c("year", "region", "country", "residence_area", "improved_water", "sanitation_facilities")
water[1:3, ]
## year region country residence_area improved_water sanitation_facilities
## 1 1990 Africa Algeria Rural 88 77
## 2 1990 Africa Angola Rural 42 7
## 3 1990 Africa Benin Rural 49 0
which.max(water$year);
## [1] 913
# rowMeans(water[,5:6])
mean(water[,6], trim=0.08, na.rm=T)
## [1] 71.63629
Simulation <- read.csv("https://umich.instructure.com/files/354289/download?download_frd=1", header = FALSE)
Simulation[1:3, ]
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 ID i2 age treat homeless pcs mcs cesd indtot pss_fr drugrisk sexrisk
## 2 1 0 25 0 0 49 7 46 37 0 1 6
## 3 2 18 31 0 0 48 34 17 48 0 0 11
## V13 V14 V15 V16
## 1 satreat female substance racegrp
## 2 0 0 cocaine black
## 3 0 0 alcohol white
str(water)
## 'data.frame': 3331 obs. of 6 variables:
## $ year : int 1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
## $ region : Factor w/ 6 levels "Africa","Americas",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ country : Factor w/ 192 levels "Afghanistan",..: 3 5 19 23 26 27 30 32 33 37 ...
## $ residence_area : Factor w/ 3 levels "Rural","Total",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ improved_water : num 88 42 49 86 39 67 34 46 37 83 ...
## $ sanitation_facilities: num 77 7 0 22 2 42 27 12 4 11 ...
summary(water$year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1990 1995 2005 2002 2010 2012
summary(water[c("improved_water", "sanitation_facilities")])
## improved_water sanitation_facilities
## Min. : 3.0 Min. : 0.00
## 1st Qu.: 77.0 1st Qu.: 42.00
## Median : 93.0 Median : 81.00
## Mean : 84.9 Mean : 68.87
## 3rd Qu.: 99.0 3rd Qu.: 97.00
## Max. :100.0 Max. :100.00
## NA's :32 NA's :135
plot(density(water$improved_water,na.rm = T))
vec1<-c(40, 56, 99)
mean(vec1)
## [1] 65
mean(c(40, 56, 99))
## [1] 65
median(vec1)
## [1] 56
median(c(40, 56, 99))
## [1] 56
library(psych)
geometric.mean(vec1, na.rm=TRUE)
## [1] 60.52866
range(water$year)
## [1] 1990 2012
diff(range(water$year))
## [1] 22
IQR(water$year)
## [1] 15
summary(water$improved_water)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 3.0 77.0 93.0 84.9 99.0 100.0 32
IQR(water$improved_water, na.rm = T)
## [1] 22
quantile(water$improved_water, na.rm = T)
## 0% 25% 50% 75% 100%
## 3 77 93 99 100
quantile(water$improved_water, probs = c(0.2, 0.6), na.rm = T)
## 20% 60%
## 71 97
quantile(water$improved_water, seq(from=0, to=1, by=0.2), na.rm = T)
## 0% 20% 40% 60% 80% 100%
## 3 71 89 97 100 100
boxplot(water$improved_water, main="Boxplot for Percent improved_water", ylab="Percentage")
hist(water$improved_water, main = "Histogram of Percent improved_water", xlab="Percentage")
hist(water$sanitation_facilities, main = "Histogram of Percent sanitation_facilities", xlab = "Percentage")
x <- rnorm(100, 0, 1)
hist(x, probability=T, col='lightblue', xlab=' ', ylab=' ', axes=F, main='Normal Distribution')
lines(density(x, bw=0.4), col='red', lwd=3)
baseball<-read.table("https://umich.instructure.com/files/330381/download?download_frd=1", header=T)
hist(baseball$Weight, main = "Histogram for Baseball Player's Weight", xlab="weight")
hist(baseball$Height, main = "Histogram for Baseball Player's Height", xlab ="height")
mean(baseball$Weight)
## [1] 201.7166
mean(baseball$Height)
## [1] 73.69729
var(baseball$Weight)
## [1] 440.9913
sd(baseball$Weight)
## [1] 20.99979
var(baseball$Height)
## [1] 5.316798
sd(baseball$Height)
## [1] 2.305818
water <- read.csv('https://umich.instructure.com/files/399172/download?download_frd=1', header=T)
table(water$Year)
##
## 1990 1995 2000 2005 2010 2012
## 520 561 570 570 556 554
table(water$WHO.region)
##
## Africa Americas Eastern Mediterranean
## 797 613 373
## Europe South-East Asia Western Pacific
## 910 191 447
table(water$Residence.Area)
##
## Rural Total Urban
## 1095 1109 1127
year_table<-table(water$Year..string.)
prop.table(year_table)
##
## 1990 1995 2000 2005 2010 2012
## 0.1561093 0.1684179 0.1711198 0.1711198 0.1669168 0.1663164
year_pct<-prop.table(year_table)*100
round(year_pct, digits=1)
##
## 1990 1995 2000 2005 2010 2012
## 15.6 16.8 17.1 17.1 16.7 16.6
library(gmodels)
water$africa<-water$WHO.region=="Africa"
table(water$africa)
##
## FALSE TRUE
## 2534 797
CrossTable(x=water$Residence.Area, y=water$africa)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 3331
##
##
## | water$africa
## water$Residence.Area | FALSE | TRUE | Row Total |
## ---------------------|-----------|-----------|-----------|
## Rural | 828 | 267 | 1095 |
## | 0.030 | 0.096 | |
## | 0.756 | 0.244 | 0.329 |
## | 0.327 | 0.335 | |
## | 0.249 | 0.080 | |
## ---------------------|-----------|-----------|-----------|
## Total | 845 | 264 | 1109 |
## | 0.002 | 0.007 | |
## | 0.762 | 0.238 | 0.333 |
## | 0.333 | 0.331 | |
## | 0.254 | 0.079 | |
## ---------------------|-----------|-----------|-----------|
## Urban | 861 | 266 | 1127 |
## | 0.016 | 0.050 | |
## | 0.764 | 0.236 | 0.338 |
## | 0.340 | 0.334 | |
## | 0.258 | 0.080 | |
## ---------------------|-----------|-----------|-----------|
## Column Total | 2534 | 797 | 3331 |
## | 0.761 | 0.239 | |
## ---------------------|-----------|-----------|-----------|
##
##
m1<-mean(water$Population.using.improved.drinking, na.rm = T)
m2<-mean(water$Population.using.improved.sanitation, na.rm = T)
water_imp<-water
for(i in 1:3331){
if(is.na(water_imp$Population.using.improved.drinking[i])){
water_imp$Population.using.improved.drinking[i]=m1
}
if(is.na(water_imp$Population.using.improved.sanitation[i])){
water_imp$Population.using.improved.sanitation=m2
}
}
summary(water_imp)
## Year..string. WHO.region..string. Country..string.
## Min. :1990 Africa :797 Albania : 18
## 1st Qu.:1995 Americas :613 Algeria : 18
## Median :2005 Eastern Mediterranean:373 Andorra : 18
## Mean :2002 Europe :910 Angola : 18
## 3rd Qu.:2010 South-East Asia :191 Antigua and Barbuda: 18
## Max. :2012 Western Pacific :447 Argentina : 18
## (Other) :3223
## Residence.Area.Type..string.
## Rural:1095
## Total:1109
## Urban:1127
##
##
##
##
## Population.using.improved.drinking.water.sources......numeric.
## Min. : 3.0
## 1st Qu.: 77.0
## Median : 93.0
## Mean : 84.9
## 3rd Qu.: 99.0
## Max. :100.0
## NA's :32
## Population.using.improved.sanitation.facilities......numeric.
## Min. : 0.00
## 1st Qu.: 42.00
## Median : 81.00
## Mean : 68.87
## 3rd Qu.: 97.00
## Max. :100.00
## NA's :135
## africa Population.using.improved.sanitation
## Mode :logical Min. :68.87
## FALSE:2534 1st Qu.:68.87
## TRUE :797 Median :68.87
## Mean :68.87
## 3rd Qu.:68.87
## Max. :68.87
##
## Population.using.improved.drinking
## Min. : 3.0
## 1st Qu.: 77.0
## Median : 93.0
## Mean : 84.9
## 3rd Qu.: 99.0
## Max. :100.0
##
library(mi)
## Loading required package: Matrix
## Loading required package: stats4
## mi (Version 1.0, packaged: 2015-04-16 14:03:10 UTC; goodrich)
## mi Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
## This program comes with ABSOLUTELY NO WARRANTY.
## This is free software, and you are welcome to redistribute it
## under the General Public License version 2 or later.
## Execute RShowDoc('COPYING') for details.
mdf<-missing_data.frame(water)
head(mdf)
## Year..string. WHO.region..string. Country..string.
## 1 1990 Africa Algeria
## 2 1990 Africa Angola
## 3 1990 Africa Benin
## 4 1990 Africa Botswana
## 5 1990 Africa Burkina Faso
## 6 1990 Africa Burundi
## Residence.Area.Type..string.
## 1 Rural
## 2 Rural
## 3 Rural
## 4 Rural
## 5 Rural
## 6 Rural
## Population.using.improved.drinking.water.sources......numeric.
## 1 88
## 2 42
## 3 49
## 4 86
## 5 39
## 6 67
## Population.using.improved.sanitation.facilities......numeric. africa
## 1 77 TRUE
## 2 7 TRUE
## 3 0 TRUE
## 4 22 TRUE
## 5 2 TRUE
## 6 42 TRUE
## missing_Population.using.improved.drinking.water.sources......numeric.
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
## missing_Population.using.improved.sanitation.facilities......numeric.
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
show(mdf)
## Object of class missing_data.frame with 3331 observations on 7 variables
##
## There are 3 missing data patterns
##
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
##
## type
## Year..string. continuous
## WHO.region..string. unordered-categorical
## Country..string. unordered-categorical
## Residence.Area.Type..string. unordered-categorical
## Population.using.improved.drinking.water.sources......numeric. continuous
## Population.using.improved.sanitation.facilities......numeric. continuous
## africa binary
## missing
## Year..string. 0
## WHO.region..string. 0
## Country..string. 0
## Residence.Area.Type..string. 0
## Population.using.improved.drinking.water.sources......numeric. 32
## Population.using.improved.sanitation.facilities......numeric. 135
## africa 0
## method
## Year..string. <NA>
## WHO.region..string. <NA>
## Country..string. <NA>
## Residence.Area.Type..string. <NA>
## Population.using.improved.drinking.water.sources......numeric. ppd
## Population.using.improved.sanitation.facilities......numeric. ppd
## africa <NA>
## model
## Year..string. <NA>
## WHO.region..string. <NA>
## Country..string. <NA>
## Residence.Area.Type..string. <NA>
## Population.using.improved.drinking.water.sources......numeric. linear
## Population.using.improved.sanitation.facilities......numeric. linear
## africa <NA>
##
## family
## Year..string. <NA>
## WHO.region..string. <NA>
## Country..string. <NA>
## Residence.Area.Type..string. <NA>
## Population.using.improved.drinking.water.sources......numeric. gaussian
## Population.using.improved.sanitation.facilities......numeric. gaussian
## africa <NA>
## link
## Year..string. <NA>
## WHO.region..string. <NA>
## Country..string. <NA>
## Residence.Area.Type..string. <NA>
## Population.using.improved.drinking.water.sources......numeric. identity
## Population.using.improved.sanitation.facilities......numeric. identity
## africa <NA>
## transformation
## Year..string. standardize
## WHO.region..string. <NA>
## Country..string. <NA>
## Residence.Area.Type..string. <NA>
## Population.using.improved.drinking.water.sources......numeric. standardize
## Population.using.improved.sanitation.facilities......numeric. standardize
## africa <NA>
mdf<-change(mdf, y="Population.using.improved.drinking", what = "imputation_method", to="pmm")
mdf<-change(mdf, y="Population.using.improved.sanitation", what = "imputation_method", to="pmm")
imputations<-mi(mdf, n.iter=10, n.chains=3, verbose=T)
data.frames <- complete(imputations, 3)
summary(water)
## Year..string. WHO.region..string. Country..string.
## Min. :1990 Africa :797 Albania : 18
## 1st Qu.:1995 Americas :613 Algeria : 18
## Median :2005 Eastern Mediterranean:373 Andorra : 18
## Mean :2002 Europe :910 Angola : 18
## 3rd Qu.:2010 South-East Asia :191 Antigua and Barbuda: 18
## Max. :2012 Western Pacific :447 Argentina : 18
## (Other) :3223
## Residence.Area.Type..string.
## Rural:1095
## Total:1109
## Urban:1127
##
##
##
##
## Population.using.improved.drinking.water.sources......numeric.
## Min. : 3.0
## 1st Qu.: 77.0
## Median : 93.0
## Mean : 84.9
## 3rd Qu.: 99.0
## Max. :100.0
## NA's :32
## Population.using.improved.sanitation.facilities......numeric.
## Min. : 0.00
## 1st Qu.: 42.00
## Median : 81.00
## Mean : 68.87
## 3rd Qu.: 97.00
## Max. :100.00
## NA's :135
## africa
## Mode :logical
## FALSE:2534
## TRUE :797
##
##
##
##
set.seed(123)
# create MCAR missing-data generator
create.missing <- function (data, pct.mis = 10)
{
n <- nrow(data)
J <- ncol(data)
if (length(pct.mis) == 1) {
if(pct.mis>= 0 & pct.mis <=100) {
n.mis <- rep((n * (pct.mis/100)), J)
}
else {
warning("Percent missing values should be an integer between
0 and 100! Exiting"); break
}
}
else {
if (length(pct.mis) < J)
stop("The length of the missing-vector is not equal to the numbe
r of columns in the data! Exiting!")
n.mis <- n * (pct.mis/100)
}
for (i in 1:ncol(data)) {
if (n.mis[i] == 0) { # if column has no missing do nothing.
data[, i] <- data[, i]
}
else {
data[sample(1:n, n.mis[i], replace = FALSE), i] <- NA
# For each given column (i), sample the row indices (1:n),
# a number of indices to replace as "missing", n.mis[i], "NA",
# without replacement
}
}
return(as.data.frame(data))
}
n <- 1000; u1 <- rbinom(n, 1, .5); v1<-log(rnorm(n, 5, 1)); x1 <- u1*exp(v1)
u2 <- rbinom(n, 1, .5); v2 <- log(rnorm(n, 5, 1)); x2 <- u2*exp(v2)
x3 <- rbinom(n,1,prob=0.45); x4<-ordered(rep(seq(1, 5), n)[sample(1:n, n)]);
x5 <- rep(letters[1:10], n)[sample(1:n, n)]; x6 <- trunc(runif(n, 1, 10));
x7 <- rnorm(n); x8 <- factor(rep(seq(1, 10), n)[sample(1:n, n)]);
x9 <- runif(n,0.1,0.99); x10 <- rpois(n, 4); y<-x1 + x2 + x7 + x9 + rnorm(n)
# package the simulated data as a data frame object
sim_data <- cbind.data.frame(y, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10)
# randomly create missing values
sim_data_30pct_missing <- create.missing(sim_data, pct.mis=30);
head(sim_data_30pct_missing); summary(sim_data_30pct_missing)
## y x1 x2 x3 x4 x5 x6 x7 x8 x9
## 1 NA 0.000000 0.000000 0 <NA> d 6 1.2954432 4 0.5055421
## 2 10.4290124 4.006301 NA NA 2 <NA> 8 -0.6485997 <NA> NA
## 3 0.6498472 0.000000 0.000000 0 2 c NA -1.0443625 4 NA
## 4 11.4870266 5.751061 NA 0 4 <NA> 2 0.1675763 <NA> 0.1308716
## 5 NA 3.490833 NA 0 <NA> f NA 2.2337774 1 0.4483454
## 6 5.7667229 NA 4.384732 1 2 b 4 -0.6722819 5 0.8840504
## x10
## 1 6
## 2 NA
## 3 5
## 4 3
## 5 5
## 6 3
## y x1 x2 x3
## Min. :-3.886 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.: 2.340 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000
## Median : 5.438 Median :0.000 Median :2.582 Median :0.0000
## Mean : 5.425 Mean :2.493 Mean :2.499 Mean :0.4214
## 3rd Qu.: 8.136 3rd Qu.:4.993 3rd Qu.:4.960 3rd Qu.:1.0000
## Max. :16.526 Max. :8.390 Max. :8.421 Max. :1.0000
## NA's :300 NA's :300 NA's :300 NA's :300
## x4 x5 x6 x7 x8
## 1 :136 d : 77 Min. :1.000 Min. :-3.26215 5 : 76
## 2 :143 f : 73 1st Qu.:3.000 1st Qu.:-0.68042 2 : 74
## 3 :146 h : 73 Median :5.000 Median : 0.02927 6 : 73
## 4 :137 i : 72 Mean :5.079 Mean : 0.02138 1 : 71
## 5 :138 j : 72 3rd Qu.:7.000 3rd Qu.: 0.79094 4 : 70
## NA's:300 (Other):333 Max. :9.000 Max. : 3.71572 (Other):336
## NA's :300 NA's :300 NA's :300 NA's :300
## x9 x10
## Min. :0.1010 Min. : 0.000
## 1st Qu.:0.3076 1st Qu.: 2.000
## Median :0.5100 Median : 4.000
## Mean :0.5286 Mean : 4.029
## 3rd Qu.:0.7681 3rd Qu.: 5.000
## Max. :0.9881 Max. :11.000
## NA's :300 NA's :300
library("betareg"); library("mi")
# get show the missing information matrix
mdf <- missing_data.frame(sim_data_30pct_missing)
show(mdf)
## Object of class missing_data.frame with 1000 observations on 11 variables
##
## There are 538 missing data patterns
##
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
##
## type missing method model
## y continuous 300 ppd linear
## x1 continuous 300 ppd linear
## x2 continuous 300 ppd linear
## x3 binary 300 ppd logit
## x4 ordered-categorical 300 ppd ologit
## x5 unordered-categorical 300 ppd mlogit
## x6 continuous 300 ppd linear
## x7 continuous 300 ppd linear
## x8 unordered-categorical 300 ppd mlogit
## x9 proportion 300 ppd betareg
## x10 continuous 300 ppd linear
##
## family link transformation
## y gaussian identity standardize
## x1 gaussian identity standardize
## x2 gaussian identity standardize
## x3 binomial logit <NA>
## x4 multinomial logit <NA>
## x5 multinomial logit <NA>
## x6 gaussian identity standardize
## x7 gaussian identity standardize
## x8 multinomial logit <NA>
## x9 binomial logit identity
## x10 gaussian identity standardize
# mdf@patterns # to get the textual missing pattern
image(mdf) # remember the visual pattern of this MCAR
# Next, try to impute the missing values.
# Get the Graph Parameters (plotting canvas/margins)
# set to plot the histograms for the 3 imputation chains
# mfcol=c(nr, nc). Subsequent histograms are drawn as nr-by-nc arrays on
# the graphics device by columns (mfcol), or rows (mfrow)
# oma: oma=c(bottom, left, top, right) giving the size of the outer
# margins in lines of text
# mar=c(bottom, left, top, right) gives the number of lines of margin
# to be specified on the four sides of the plot.
# tcl=length of tick marks as a fraction of the height of a line of
# text (default=0.5)
par(mfcol=c(5, 5), oma=c(1, 1, 0, 0), mar=c(1, 1, 1, 0), tcl=-0.1,
mgp=c(0, 0, 0))
imputations <- mi(sim_data_30pct_missing, n.iter=5, n.chains=3,verbose=TRUE)
hist(imputations)
# Extracts several multiply imputed data.frames from "imputations" object
data.frames <- complete(imputations, 3)
# Compare the summaries for the original data (prior to introducing missing
# values) with missing data and the re-completed data following imputation
summary(sim_data)
## y x1 x2 x3 x4
## Min. :-3.886 Min. :0.000 Min. :0.000 Min. :0.000 1:200
## 1st Qu.: 2.579 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.000 2:200
## Median : 5.540 Median :0.000 Median :2.687 Median :0.000 3:200
## Mean : 5.558 Mean :2.472 Mean :2.516 Mean :0.431 4:200
## 3rd Qu.: 8.278 3rd Qu.:4.996 3rd Qu.:5.007 3rd Qu.:1.000 5:200
## Max. :16.526 Max. :8.390 Max. :8.421 Max. :1.000
##
## x5 x6 x7 x8
## a :100 Min. :1.000 Min. :-3.289376 1 :100
## b :100 1st Qu.:3.000 1st Qu.:-0.705687 2 :100
## c :100 Median :5.000 Median : 0.020011 3 :100
## d :100 Mean :5.021 Mean : 0.002811 4 :100
## e :100 3rd Qu.:7.000 3rd Qu.: 0.761157 5 :100
## f :100 Max. :9.000 Max. : 3.715721 6 :100
## (Other):400 (Other):400
## x9 x10
## Min. :0.1010 Min. : 0.000
## 1st Qu.:0.3106 1st Qu.: 3.000
## Median :0.5229 Median : 4.000
## Mean :0.5350 Mean : 3.951
## 3rd Qu.:0.7705 3rd Qu.: 5.000
## Max. :0.9881 Max. :11.000
##
summary(sim_data_30pct_missing)
## y x1 x2 x3
## Min. :-3.886 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.: 2.340 1st Qu.:0.000 1st Qu.:0.000 1st Qu.:0.0000
## Median : 5.438 Median :0.000 Median :2.582 Median :0.0000
## Mean : 5.425 Mean :2.493 Mean :2.499 Mean :0.4214
## 3rd Qu.: 8.136 3rd Qu.:4.993 3rd Qu.:4.960 3rd Qu.:1.0000
## Max. :16.526 Max. :8.390 Max. :8.421 Max. :1.0000
## NA's :300 NA's :300 NA's :300 NA's :300
## x4 x5 x6 x7 x8
## 1 :136 d : 77 Min. :1.000 Min. :-3.26215 5 : 76
## 2 :143 f : 73 1st Qu.:3.000 1st Qu.:-0.68042 2 : 74
## 3 :146 h : 73 Median :5.000 Median : 0.02927 6 : 73
## 4 :137 i : 72 Mean :5.079 Mean : 0.02138 1 : 71
## 5 :138 j : 72 3rd Qu.:7.000 3rd Qu.: 0.79094 4 : 70
## NA's:300 (Other):333 Max. :9.000 Max. : 3.71572 (Other):336
## NA's :300 NA's :300 NA's :300 NA's :300
## x9 x10
## Min. :0.1010 Min. : 0.000
## 1st Qu.:0.3076 1st Qu.: 2.000
## Median :0.5100 Median : 4.000
## Mean :0.5286 Mean : 4.029
## 3rd Qu.:0.7681 3rd Qu.: 5.000
## Max. :0.9881 Max. :11.000
## NA's :300 NA's :300
summary(data.frames[[1]])
## y x1 x2 x3 x4
## Min. :-3.886 Min. :-7.238 Min. :-4.927 0:567 1:193
## 1st Qu.: 2.340 1st Qu.: 0.000 1st Qu.: 0.000 1:433 2:208
## Median : 5.561 Median : 2.591 Median : 2.129 3:214
## Mean : 5.538 Mean : 2.526 Mean : 2.403 4:199
## 3rd Qu.: 8.301 3rd Qu.: 4.905 3rd Qu.: 4.777 5:186
## Max. :18.305 Max. : 9.462 Max. :10.276
##
## x5 x6 x7 x8
## b :111 Min. :-4.152 Min. :-3.42597 1 :116
## j :107 1st Qu.: 3.000 1st Qu.:-0.70742 8 :109
## i :105 Median : 5.000 Median : 0.02580 2 :108
## h :102 Mean : 5.074 Mean : 0.01439 5 :103
## c :101 3rd Qu.: 7.000 3rd Qu.: 0.80369 4 :102
## g :101 Max. :13.181 Max. : 3.71572 10 :102
## (Other):373 (Other):360
## x9 x10 missing_y missing_x1
## Min. :0.0009401 Min. :-3.157 Mode :logical Mode :logical
## 1st Qu.:0.3191679 1st Qu.: 2.527 FALSE:700 FALSE:700
## Median :0.5173536 Median : 4.000 TRUE :300 TRUE :300
## Mean :0.5315164 Mean : 4.057
## 3rd Qu.:0.7626689 3rd Qu.: 5.060
## Max. :0.9995029 Max. :11.075
##
## missing_x2 missing_x3 missing_x4 missing_x5
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x6 missing_x7 missing_x8 missing_x9
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x10
## Mode :logical
## FALSE:700
## TRUE :300
##
##
##
##
lapply(data.frames, summary)
## $`chain:1`
## y x1 x2 x3 x4
## Min. :-3.886 Min. :-7.238 Min. :-4.927 0:567 1:193
## 1st Qu.: 2.340 1st Qu.: 0.000 1st Qu.: 0.000 1:433 2:208
## Median : 5.561 Median : 2.591 Median : 2.129 3:214
## Mean : 5.538 Mean : 2.526 Mean : 2.403 4:199
## 3rd Qu.: 8.301 3rd Qu.: 4.905 3rd Qu.: 4.777 5:186
## Max. :18.305 Max. : 9.462 Max. :10.276
##
## x5 x6 x7 x8
## b :111 Min. :-4.152 Min. :-3.42597 1 :116
## j :107 1st Qu.: 3.000 1st Qu.:-0.70742 8 :109
## i :105 Median : 5.000 Median : 0.02580 2 :108
## h :102 Mean : 5.074 Mean : 0.01439 5 :103
## c :101 3rd Qu.: 7.000 3rd Qu.: 0.80369 4 :102
## g :101 Max. :13.181 Max. : 3.71572 10 :102
## (Other):373 (Other):360
## x9 x10 missing_y missing_x1
## Min. :0.0009401 Min. :-3.157 Mode :logical Mode :logical
## 1st Qu.:0.3191679 1st Qu.: 2.527 FALSE:700 FALSE:700
## Median :0.5173536 Median : 4.000 TRUE :300 TRUE :300
## Mean :0.5315164 Mean : 4.057
## 3rd Qu.:0.7626689 3rd Qu.: 5.060
## Max. :0.9995029 Max. :11.075
##
## missing_x2 missing_x3 missing_x4 missing_x5
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x6 missing_x7 missing_x8 missing_x9
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x10
## Mode :logical
## FALSE:700
## TRUE :300
##
##
##
##
##
## $`chain:2`
## y x1 x2 x3 x4
## Min. :-7.378 Min. :-4.901 Min. :-6.681 0:583 1:201
## 1st Qu.: 2.539 1st Qu.: 0.000 1st Qu.: 0.000 1:417 2:204
## Median : 5.573 Median : 2.707 Median : 1.883 3:202
## Mean : 5.521 Mean : 2.550 Mean : 2.418 4:179
## 3rd Qu.: 8.278 3rd Qu.: 4.885 3rd Qu.: 4.808 5:214
## Max. :16.526 Max. : 9.053 Max. : 9.752
##
## x5 x6 x7 x8
## d :116 Min. :-4.332 Min. :-3.262149 7 :109
## i :113 1st Qu.: 3.000 1st Qu.:-0.671050 5 :108
## b :111 Median : 5.000 Median : 0.009235 2 :105
## f :101 Mean : 5.051 Mean : 0.025900 6 :104
## h : 99 3rd Qu.: 7.000 3rd Qu.: 0.764619 10 :103
## g : 98 Max. :11.495 Max. : 3.932833 4 :102
## (Other):362 (Other):369
## x9 x10 missing_y missing_x1
## Min. :0.01094 Min. :-1.287 Mode :logical Mode :logical
## 1st Qu.:0.31848 1st Qu.: 2.537 FALSE:700 FALSE:700
## Median :0.52444 Median : 4.000 TRUE :300 TRUE :300
## Mean :0.53242 Mean : 4.007
## 3rd Qu.:0.76260 3rd Qu.: 5.000
## Max. :0.98809 Max. :11.703
##
## missing_x2 missing_x3 missing_x4 missing_x5
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x6 missing_x7 missing_x8 missing_x9
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x10
## Mode :logical
## FALSE:700
## TRUE :300
##
##
##
##
##
## $`chain:3`
## y x1 x2 x3 x4
## Min. :-3.886 Min. :-4.260 Min. :-5.877 0:577 1:203
## 1st Qu.: 2.463 1st Qu.: 0.000 1st Qu.: 0.000 1:423 2:215
## Median : 5.604 Median : 2.513 Median : 2.452 3:197
## Mean : 5.571 Mean : 2.483 Mean : 2.474 4:192
## 3rd Qu.: 8.266 3rd Qu.: 4.881 3rd Qu.: 4.817 5:193
## Max. :16.526 Max. : 8.835 Max. :12.073
##
## x5 x6 x7 x8
## d :114 Min. :-2.000 Min. :-3.26215 10 :110
## h :106 1st Qu.: 3.000 1st Qu.:-0.70352 5 :108
## i :105 Median : 5.000 Median : 0.02426 2 :107
## j :105 Mean : 5.064 Mean : 0.01301 7 :107
## b :101 3rd Qu.: 7.000 3rd Qu.: 0.79544 4 :106
## a :100 Max. :11.754 Max. : 3.71572 9 :101
## (Other):369 (Other):361
## x9 x10 missing_y missing_x1
## Min. :0.006127 Min. :-1.865 Mode :logical Mode :logical
## 1st Qu.:0.324624 1st Qu.: 2.634 FALSE:700 FALSE:700
## Median :0.540627 Median : 4.000 TRUE :300 TRUE :300
## Mean :0.538723 Mean : 4.020
## 3rd Qu.:0.768122 3rd Qu.: 5.000
## Max. :0.996987 Max. :11.299
##
## missing_x2 missing_x3 missing_x4 missing_x5
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x6 missing_x7 missing_x8 missing_x9
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x10
## Mode :logical
## FALSE:700
## TRUE :300
##
##
##
##
round(mipply(imputations, mean, to.matrix = TRUE), 3)
## chain:1 chain:2 chain:3
## y 0.015 0.012 0.019
## x1 0.006 0.011 -0.002
## x2 -0.019 -0.016 -0.005
## x3 1.433 1.417 1.423
## x4 2.977 3.001 2.957
## x5 5.587 5.546 5.553
## x6 -0.001 -0.005 -0.003
## x7 -0.003 0.002 -0.004
## x8 5.431 5.520 5.564
## x9 0.532 0.532 0.539
## x10 0.007 -0.005 -0.002
## missing_y 0.300 0.300 0.300
## missing_x1 0.300 0.300 0.300
## missing_x2 0.300 0.300 0.300
## missing_x3 0.300 0.300 0.300
## missing_x4 0.300 0.300 0.300
## missing_x5 0.300 0.300 0.300
## missing_x6 0.300 0.300 0.300
## missing_x7 0.300 0.300 0.300
## missing_x8 0.300 0.300 0.300
## missing_x9 0.300 0.300 0.300
## missing_x10 0.300 0.300 0.300
Rhats(imputations, statistic = "moments")
## mean_y mean_x1 mean_x2 mean_x3 mean_x4 mean_x5 mean_x6
## 1.2046635 1.1826790 1.6873019 1.1366042 0.9100276 1.0741083 1.1754183
## mean_x7 mean_x8 mean_x9 mean_x10 sd_y sd_x1 sd_x2
## 1.1584357 1.1582553 0.9866131 1.1507648 1.1433457 1.1623447 1.0019965
## sd_x3 sd_x4 sd_x5 sd_x6 sd_x7 sd_x8 sd_x9
## 1.1338756 0.9760816 1.1198121 0.9810286 1.3298025 0.9287844 1.5285230
## sd_x10
## 1.1078134
# assess the convergence of MI algorithm
plot(imputations);hist(imputations);image(imputations);summary(imputations)
## $y
## $y$is_missing
## missing
## FALSE TRUE
## 700 300
##
## $y$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.64194 -0.36281 0.06273 0.05068 0.41826 1.65190
##
## $y$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.194077 -0.395670 0.001664 0.000000 0.347725 1.423707
##
##
## $x1
## $x1$is_missing
## missing
## FALSE TRUE
## 700 300
##
## $x1$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.87556 -0.31922 0.02436 0.01698 0.35155 1.34305
##
## $x1$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.4805 -0.4805 -0.4805 0.0000 0.4817 1.1365
##
##
## $x2
## $x2$is_missing
## missing
## FALSE TRUE
## 700 300
##
## $x2$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.77847 -0.39314 -0.06708 -0.04383 0.32694 1.85462
##
## $x2$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.48420 -0.48420 0.01595 0.00000 0.47667 1.14717
##
##
## $x3
## $x3$crosstab
##
## observed imputed
## 0 1215 512
## 1 885 388
##
##
## $x4
## $x4$crosstab
##
## observed imputed
## 1 408 189
## 2 429 198
## 3 438 175
## 4 411 159
## 5 414 179
##
##
## $x5
## $x5$crosstab
##
## observed imputed
## a 186 80
## b 210 113
## c 195 89
## d 231 99
## e 198 81
## f 219 68
## g 210 86
## h 219 88
## i 216 107
## j 216 89
##
##
## $x6
## $x6$is_missing
## missing
## FALSE TRUE
## 700 300
##
## $x6$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.87057 -0.38528 -0.00860 -0.01031 0.33648 1.61064
##
## $x6$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.81073 -0.41317 -0.01562 0.00000 0.38194 0.77949
##
##
## $x7
## $x7$is_missing
## missing
## FALSE TRUE
## 700 300
##
## $x7$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.654793 -0.354776 -0.014823 -0.005786 0.362309 1.877568
##
## $x7$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.576156 -0.336879 0.003784 0.000000 0.369404 1.773350
##
##
## $x8
## $x8$crosstab
##
## observed imputed
## 1 213 91
## 2 222 98
## 3 186 64
## 4 210 100
## 5 228 91
## 6 219 80
## 7 210 97
## 8 207 84
## 9 198 87
## 10 207 108
##
##
## $x9
## $x9$is_missing
## missing
## FALSE TRUE
## 700 300
##
## $x9$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0009401 0.3599191 0.5675957 0.5472855 0.7452422 0.9995029
##
## $x9$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.1010 0.3077 0.5100 0.5286 0.7681 0.9881
##
##
## $x10
## $x10$is_missing
## missing
## FALSE TRUE
## 700 300
##
## $x10$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.7503436 -0.3498453 0.0038875 -0.0002989 0.3462585 1.8692424
##
## $x10$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.981263 -0.494111 -0.006959 0.000000 0.236617 1.698072
model_results<-pool(y~x1+x2+x3+x4+x5+x6+x7+x8+x9+x10,data=imputations, m=3)
display (model_results); summary (model_results)
## bayesglm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 +
## x9 + x10, data = imputations, m = 3)
## coef.est coef.se
## (Intercept) -0.35 0.43
## x1 0.95 0.03
## x2 0.96 0.02
## x31 0.05 0.27
## x4.L 0.07 0.32
## x4.Q 0.02 0.27
## x4.C -0.12 0.18
## x4^4 0.02 0.32
## x5b -0.11 0.53
## x5c 0.21 0.23
## x5d 0.19 0.35
## x5e 0.51 0.58
## x5f 0.66 0.60
## x5g 0.27 0.49
## x5h 0.22 0.82
## x5i 0.39 0.70
## x5j 0.04 0.33
## x6 0.03 0.03
## x7 0.82 0.12
## x82 -0.07 0.55
## x83 0.05 0.24
## x84 0.47 0.78
## x85 -0.13 0.25
## x86 -0.44 0.48
## x87 -0.43 0.68
## x88 0.17 0.35
## x89 -0.30 0.36
## x810 -0.38 0.33
## x9 1.17 0.23
## x10 0.07 0.06
## n = 970, k = 30
## residual deviance = 2093.7, null deviance = 15450.0 (difference = 13356.3)
## overdispersion parameter = 2.2
## residual sd is sqrt(overdispersion) = 1.47
##
## Call:
## pool(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 +
## x10, data = imputations, m = 3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5892 -0.7474 0.0207 0.7162 3.2188
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.34936 0.43477 -0.804 0.44813
## x1 0.94668 0.03208 29.512 3.15e-06 ***
## x2 0.95623 0.01889 50.610 < 2e-16 ***
## x31 0.05135 0.26727 0.192 0.86162
## x4.L 0.06670 0.32472 0.205 0.85301
## x4.Q 0.02008 0.26772 0.075 0.94526
## x4.C -0.11598 0.18421 -0.630 0.56013
## x4^4 0.02178 0.32147 0.068 0.95110
## x5b -0.10707 0.52572 -0.204 0.85226
## x5c 0.20986 0.22503 0.933 0.35126
## x5d 0.19091 0.34655 0.551 0.60385
## x5e 0.51065 0.57785 0.884 0.44819
## x5f 0.65886 0.59638 1.105 0.35924
## x5g 0.26945 0.48518 0.555 0.61587
## x5h 0.22178 0.81526 0.272 0.80859
## x5i 0.39250 0.69651 0.564 0.62147
## x5j 0.04330 0.32543 0.133 0.89806
## x6 0.02586 0.02853 0.907 0.39886
## x7 0.82225 0.11628 7.071 0.00735 **
## x82 -0.07311 0.55473 -0.132 0.90448
## x83 0.04611 0.24080 0.191 0.84848
## x84 0.46605 0.77985 0.598 0.60470
## x85 -0.12774 0.24709 -0.517 0.60974
## x86 -0.43783 0.48245 -0.908 0.42977
## x87 -0.42548 0.68379 -0.622 0.58818
## x88 0.16671 0.35027 0.476 0.65365
## x89 -0.30466 0.36069 -0.845 0.43845
## x810 -0.37982 0.32686 -1.162 0.29179
## x9 1.16823 0.22810 5.122 5.54e-05 ***
## x10 0.06603 0.05853 1.128 0.34750
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2.158413)
##
## Null deviance: 15450.0 on 999 degrees of freedom
## Residual deviance: 2093.7 on 970 degrees of freedom
## AIC: 3638.2
##
## Number of Fisher Scoring iterations: 7
# Report the summaries of the imputations
data.frames <- complete(imputations, 3) # extract the first 3 chains
lapply(data.frames, summary)
## $`chain:1`
## y x1 x2 x3 x4
## Min. :-3.886 Min. :-7.238 Min. :-4.927 0:567 1:193
## 1st Qu.: 2.340 1st Qu.: 0.000 1st Qu.: 0.000 1:433 2:208
## Median : 5.561 Median : 2.591 Median : 2.129 3:214
## Mean : 5.538 Mean : 2.526 Mean : 2.403 4:199
## 3rd Qu.: 8.301 3rd Qu.: 4.905 3rd Qu.: 4.777 5:186
## Max. :18.305 Max. : 9.462 Max. :10.276
##
## x5 x6 x7 x8
## b :111 Min. :-4.152 Min. :-3.42597 1 :116
## j :107 1st Qu.: 3.000 1st Qu.:-0.70742 8 :109
## i :105 Median : 5.000 Median : 0.02580 2 :108
## h :102 Mean : 5.074 Mean : 0.01439 5 :103
## c :101 3rd Qu.: 7.000 3rd Qu.: 0.80369 4 :102
## g :101 Max. :13.181 Max. : 3.71572 10 :102
## (Other):373 (Other):360
## x9 x10 missing_y missing_x1
## Min. :0.0009401 Min. :-3.157 Mode :logical Mode :logical
## 1st Qu.:0.3191679 1st Qu.: 2.527 FALSE:700 FALSE:700
## Median :0.5173536 Median : 4.000 TRUE :300 TRUE :300
## Mean :0.5315164 Mean : 4.057
## 3rd Qu.:0.7626689 3rd Qu.: 5.060
## Max. :0.9995029 Max. :11.075
##
## missing_x2 missing_x3 missing_x4 missing_x5
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x6 missing_x7 missing_x8 missing_x9
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x10
## Mode :logical
## FALSE:700
## TRUE :300
##
##
##
##
##
## $`chain:2`
## y x1 x2 x3 x4
## Min. :-7.378 Min. :-4.901 Min. :-6.681 0:583 1:201
## 1st Qu.: 2.539 1st Qu.: 0.000 1st Qu.: 0.000 1:417 2:204
## Median : 5.573 Median : 2.707 Median : 1.883 3:202
## Mean : 5.521 Mean : 2.550 Mean : 2.418 4:179
## 3rd Qu.: 8.278 3rd Qu.: 4.885 3rd Qu.: 4.808 5:214
## Max. :16.526 Max. : 9.053 Max. : 9.752
##
## x5 x6 x7 x8
## d :116 Min. :-4.332 Min. :-3.262149 7 :109
## i :113 1st Qu.: 3.000 1st Qu.:-0.671050 5 :108
## b :111 Median : 5.000 Median : 0.009235 2 :105
## f :101 Mean : 5.051 Mean : 0.025900 6 :104
## h : 99 3rd Qu.: 7.000 3rd Qu.: 0.764619 10 :103
## g : 98 Max. :11.495 Max. : 3.932833 4 :102
## (Other):362 (Other):369
## x9 x10 missing_y missing_x1
## Min. :0.01094 Min. :-1.287 Mode :logical Mode :logical
## 1st Qu.:0.31848 1st Qu.: 2.537 FALSE:700 FALSE:700
## Median :0.52444 Median : 4.000 TRUE :300 TRUE :300
## Mean :0.53242 Mean : 4.007
## 3rd Qu.:0.76260 3rd Qu.: 5.000
## Max. :0.98809 Max. :11.703
##
## missing_x2 missing_x3 missing_x4 missing_x5
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x6 missing_x7 missing_x8 missing_x9
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x10
## Mode :logical
## FALSE:700
## TRUE :300
##
##
##
##
##
## $`chain:3`
## y x1 x2 x3 x4
## Min. :-3.886 Min. :-4.260 Min. :-5.877 0:577 1:203
## 1st Qu.: 2.463 1st Qu.: 0.000 1st Qu.: 0.000 1:423 2:215
## Median : 5.604 Median : 2.513 Median : 2.452 3:197
## Mean : 5.571 Mean : 2.483 Mean : 2.474 4:192
## 3rd Qu.: 8.266 3rd Qu.: 4.881 3rd Qu.: 4.817 5:193
## Max. :16.526 Max. : 8.835 Max. :12.073
##
## x5 x6 x7 x8
## d :114 Min. :-2.000 Min. :-3.26215 10 :110
## h :106 1st Qu.: 3.000 1st Qu.:-0.70352 5 :108
## i :105 Median : 5.000 Median : 0.02426 2 :107
## j :105 Mean : 5.064 Mean : 0.01301 7 :107
## b :101 3rd Qu.: 7.000 3rd Qu.: 0.79544 4 :106
## a :100 Max. :11.754 Max. : 3.71572 9 :101
## (Other):369 (Other):361
## x9 x10 missing_y missing_x1
## Min. :0.006127 Min. :-1.865 Mode :logical Mode :logical
## 1st Qu.:0.324624 1st Qu.: 2.634 FALSE:700 FALSE:700
## Median :0.540627 Median : 4.000 TRUE :300 TRUE :300
## Mean :0.538723 Mean : 4.020
## 3rd Qu.:0.768122 3rd Qu.: 5.000
## Max. :0.996987 Max. :11.299
##
## missing_x2 missing_x3 missing_x4 missing_x5
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x6 missing_x7 missing_x8 missing_x9
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:700 FALSE:700 FALSE:700 FALSE:700
## TRUE :300 TRUE :300 TRUE :300 TRUE :300
##
##
##
##
## missing_x10
## Mode :logical
## FALSE:700
## TRUE :300
##
##
##
##
coef(summary(model_results))[, 1:2] # get the model coef's and their SE's
## Estimate Std. Error
## (Intercept) -0.34935528 0.43476723
## x1 0.94667669 0.03207716
## x2 0.95623011 0.01889411
## x31 0.05134643 0.26726832
## x4.L 0.06669926 0.32471930
## x4.Q 0.02008243 0.26771613
## x4.C -0.11597568 0.18421013
## x4^4 0.02178467 0.32147253
## x5b -0.10706720 0.52571850
## x5c 0.20986479 0.22503305
## x5d 0.19091395 0.34655472
## x5e 0.51065296 0.57784542
## x5f 0.65886317 0.59638279
## x5g 0.26944791 0.48517637
## x5h 0.22178212 0.81526130
## x5i 0.39250366 0.69651000
## x5j 0.04329837 0.32542720
## x6 0.02586243 0.02852896
## x7 0.82225007 0.11627861
## x82 -0.07310774 0.55473221
## x83 0.04611270 0.24080026
## x84 0.46605214 0.77984682
## x85 -0.12774269 0.24708639
## x86 -0.43783114 0.48245267
## x87 -0.42547522 0.68379391
## x88 0.16671493 0.35026743
## x89 -0.30466469 0.36068715
## x810 -0.37981975 0.32686287
## x9 1.16823126 0.22809937
## x10 0.06603382 0.05852858
library("lattice")
densityplot(y ~ x1 + x2, data=imputations)
TBI Data Example
# Load the (raw) data from the table into a plain text file "08_EpiBioSData_Incomplete.csv"
TBI_Data <- read.csv("https://umich.instructure.com/files/720782/download?download_frd=1",
na.strings=c("", ".", "NA"))
summary(TBI_Data)
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3 Min. : 3.000 Min. : 0.000 Min. : 0.0
## 1st Qu.: 3 1st Qu.: 4.000 1st Qu.: 3.000 1st Qu.: 3.0
## Median : 7 Median : 7.500 Median : 6.000 Median : 3.0
## Mean : 8 Mean : 8.182 Mean : 6.378 Mean : 5.4
## 3rd Qu.:12 3rd Qu.:12.250 3rd Qu.: 8.000 3rd Qu.: 7.0
## Max. :15 Max. :15.000 Max. :14.000 Max. :14.0
## NA's :2 NA's :2 NA's :1 NA's :1
## X6m.gose X2013.gose skull.fx temp.injury
## Min. :2.000 Min. :2.000 Min. :0.0000 Min. :0.000
## 1st Qu.:3.000 1st Qu.:5.000 1st Qu.:0.0000 1st Qu.:0.000
## Median :5.000 Median :7.000 Median :1.0000 Median :1.000
## Mean :4.805 Mean :5.804 Mean :0.6087 Mean :0.587
## 3rd Qu.:6.000 3rd Qu.:7.000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :8.000 Max. :8.000 Max. :1.0000 Max. :1.000
## NA's :5
## surgery spikes.hr min.hr max.hr
## Min. :0.0000 Min. : 1.280 Min. : 0.000 Min. : 12.00
## 1st Qu.:0.0000 1st Qu.: 5.357 1st Qu.: 0.000 1st Qu.: 35.25
## Median :1.0000 Median : 18.170 Median : 0.000 Median : 97.50
## Mean :0.6304 Mean : 52.872 Mean : 3.571 Mean : 241.89
## 3rd Qu.:1.0000 3rd Qu.: 57.227 3rd Qu.: 0.000 3rd Qu.: 312.75
## Max. :1.0000 Max. :294.000 Max. :42.000 Max. :1199.00
## NA's :18 NA's :18 NA's :18
## acute.sz late.sz ever.sz
## Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.1739 Mean :0.5652 Mean :0.587
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000 Max. :1.000
##
# Get information matrix of the data
library("betareg"); library("mi")
mdf <- missing_data.frame(TBI_Data) # warnings about missingness patterns
## NOTE: The following pairs of variables appear to have the same missingness pattern.
## Please verify whether they are in fact logically distinct variables.
## [,1] [,2]
## [1,] "icu.gcs" "worst.gcs"
show(mdf); mdf@patterns; image(mdf)
## Object of class missing_data.frame with 46 observations on 19 variables
##
## There are 7 missing data patterns
##
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
##
## type missing method model
## id irrelevant 0 <NA> <NA>
## age continuous 0 <NA> <NA>
## sex binary 0 <NA> <NA>
## mechanism unordered-categorical 0 <NA> <NA>
## field.gcs continuous 2 ppd linear
## er.gcs continuous 2 ppd linear
## icu.gcs continuous 1 ppd linear
## worst.gcs continuous 1 ppd linear
## X6m.gose continuous 5 ppd linear
## X2013.gose continuous 0 <NA> <NA>
## skull.fx binary 0 <NA> <NA>
## temp.injury binary 0 <NA> <NA>
## surgery binary 0 <NA> <NA>
## spikes.hr continuous 18 ppd linear
## min.hr continuous 18 ppd linear
## max.hr continuous 18 ppd linear
## acute.sz binary 0 <NA> <NA>
## late.sz binary 0 <NA> <NA>
## ever.sz binary 0 <NA> <NA>
##
## family link transformation
## id <NA> <NA> <NA>
## age <NA> <NA> standardize
## sex <NA> <NA> <NA>
## mechanism <NA> <NA> <NA>
## field.gcs gaussian identity standardize
## er.gcs gaussian identity standardize
## icu.gcs gaussian identity standardize
## worst.gcs gaussian identity standardize
## X6m.gose gaussian identity standardize
## X2013.gose <NA> <NA> standardize
## skull.fx <NA> <NA> <NA>
## temp.injury <NA> <NA> <NA>
## surgery <NA> <NA> <NA>
## spikes.hr gaussian identity standardize
## min.hr gaussian identity standardize
## max.hr gaussian identity standardize
## acute.sz <NA> <NA> <NA>
## late.sz <NA> <NA> <NA>
## ever.sz <NA> <NA> <NA>
## [1] spikes.hr, min.hr, max.hr
## [2] field.gcs
## [3] nothing
## [4] nothing
## [5] nothing
## [6] nothing
## [7] spikes.hr, min.hr, max.hr
## [8] nothing
## [9] nothing
## [10] nothing
## [11] nothing
## [12] nothing
## [13] spikes.hr, min.hr, max.hr
## [14] nothing
## [15] spikes.hr, min.hr, max.hr
## [16] spikes.hr, min.hr, max.hr
## [17] nothing
## [18] spikes.hr, min.hr, max.hr
## [19] spikes.hr, min.hr, max.hr
## [20] spikes.hr, min.hr, max.hr
## [21] X6m.gose, spikes.hr, min.hr, max.hr
## [22] nothing
## [23] spikes.hr, min.hr, max.hr
## [24] spikes.hr, min.hr, max.hr
## [25] spikes.hr, min.hr, max.hr
## [26] spikes.hr, min.hr, max.hr
## [27] spikes.hr, min.hr, max.hr
## [28] X6m.gose
## [29] spikes.hr, min.hr, max.hr
## [30] nothing
## [31] X6m.gose, spikes.hr, min.hr, max.hr
## [32] spikes.hr, min.hr, max.hr
## [33] nothing
## [34] nothing
## [35] nothing
## [36] nothing
## [37] field.gcs, er.gcs, icu.gcs, worst.gcs, X6m.gose
## [38] er.gcs
## [39] nothing
## [40] nothing
## [41] nothing
## [42] spikes.hr, min.hr, max.hr
## [43] nothing
## [44] nothing
## [45] nothing
## [46] X6m.gose
## 7 Levels: nothing field.gcs X6m.gose er.gcs ... field.gcs, er.gcs, icu.gcs, worst.gcs, X6m.gose
# mi::change() method changes the family imputation method,
# size, type, and so forth of a missing variable. It's called
# before calling mi to affect how the conditional expectation of each
# missing variable is modeled.
mdf <- change(mdf, y = "spikes.hr", what = "transformation", to="identity")
summary(mdf); hist(mdf);
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3 Min. : 3.000 Min. : 0.000 Min. : 0.0
## 1st Qu.: 3 1st Qu.: 4.000 1st Qu.: 3.000 1st Qu.: 3.0
## Median : 7 Median : 7.500 Median : 6.000 Median : 3.0
## Mean : 8 Mean : 8.182 Mean : 6.378 Mean : 5.4
## 3rd Qu.:12 3rd Qu.:12.250 3rd Qu.: 8.000 3rd Qu.: 7.0
## Max. :15 Max. :15.000 Max. :14.000 Max. :14.0
## NA's :2 NA's :2 NA's :1 NA's :1
## X6m.gose X2013.gose skull.fx temp.injury
## Min. :2.000 Min. :2.000 Min. :0.0000 Min. :0.000
## 1st Qu.:3.000 1st Qu.:5.000 1st Qu.:0.0000 1st Qu.:0.000
## Median :5.000 Median :7.000 Median :1.0000 Median :1.000
## Mean :4.805 Mean :5.804 Mean :0.6087 Mean :0.587
## 3rd Qu.:6.000 3rd Qu.:7.000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :8.000 Max. :8.000 Max. :1.0000 Max. :1.000
## NA's :5
## surgery spikes.hr min.hr max.hr
## Min. :0.0000 Min. : 1.280 Min. : 0.000 Min. : 12.00
## 1st Qu.:0.0000 1st Qu.: 5.357 1st Qu.: 0.000 1st Qu.: 35.25
## Median :1.0000 Median : 18.170 Median : 0.000 Median : 97.50
## Mean :0.6304 Mean : 52.872 Mean : 3.571 Mean : 241.89
## 3rd Qu.:1.0000 3rd Qu.: 57.227 3rd Qu.: 0.000 3rd Qu.: 312.75
## Max. :1.0000 Max. :294.000 Max. :42.000 Max. :1199.00
## NA's :18 NA's :18 NA's :18
## acute.sz late.sz ever.sz
## Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.1739 Mean :0.5652 Mean :0.587
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.000
## Max. :1.0000 Max. :1.0000 Max. :1.000
##
image(mdf)
imputations <- mi(mdf, n.iter=10, n.chains=5, verbose=TRUE)
hist(imputations)
data.frames <- complete(imputations, 5)
lapply(data.frames, summary)
## $`chain:1`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3.000 Min. : 3.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 3.250 1st Qu.: 4.250 1st Qu.: 3.000 1st Qu.: 3.00
## Median : 7.000 Median : 8.000 Median : 6.000 Median : 3.00
## Mean : 8.368 Mean : 8.403 Mean : 6.403 Mean : 5.49
## 3rd Qu.:12.750 3rd Qu.:12.750 3rd Qu.: 7.886 3rd Qu.: 7.75
## Max. :19.259 Max. :16.382 Max. :14.000 Max. :14.00
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :-0.477 Min. :2.000 0:18 0:19 0:17
## 1st Qu.: 3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median : 5.000 Median :7.000
## Mean : 4.682 Mean :5.804
## 3rd Qu.: 6.000 3rd Qu.:7.000
## Max. : 8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-137.889 Min. :-36.411 Min. :-528.98 0:38 0:20
## 1st Qu.: 3.805 1st Qu.: 0.000 1st Qu.: 16.25 1: 8 1:26
## Median : 28.125 Median : 0.000 Median : 84.00
## Mean : 38.266 Mean : 1.741 Mean : 188.99
## 3rd Qu.: 62.502 3rd Qu.: 3.750 3rd Qu.: 363.00
## Max. : 294.000 Max. : 42.000 Max. :1199.00
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## $`chain:2`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. :-0.8877 Min. : 3.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.0000 1st Qu.: 4.25 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 6.5000 Median : 8.00 Median : 6.000 Median : 3.000
## Mean : 7.6670 Mean : 8.56 Mean : 6.303 Mean : 5.452
## 3rd Qu.:12.0000 3rd Qu.:13.00 3rd Qu.: 7.750 3rd Qu.: 7.607
## Max. :15.0000 Max. :18.75 Max. :14.000 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :2.000 Min. :2.000 0:18 0:19 0:17
## 1st Qu.:3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median :5.000 Median :7.000
## Mean :4.904 Mean :5.804
## 3rd Qu.:6.253 3rd Qu.:7.000
## Max. :8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-219.254 Min. :-19.972 Min. :-924.66 0:38 0:20
## 1st Qu.: 3.953 1st Qu.: 0.000 1st Qu.: 28.75 1: 8 1:26
## Median : 33.490 Median : 0.000 Median : 150.00
## Mean : 73.034 Mean : 3.770 Mean : 255.84
## 3rd Qu.: 147.970 3rd Qu.: 6.278 3rd Qu.: 468.00
## Max. : 421.748 Max. : 42.000 Max. :1318.33
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## $`chain:3`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3.000 Min. : 3.000 Min. : 0.00 Min. :-3.824
## 1st Qu.: 3.250 1st Qu.: 4.250 1st Qu.: 3.00 1st Qu.: 3.000
## Median : 7.000 Median : 8.000 Median : 6.00 Median : 3.000
## Mean : 8.122 Mean : 8.525 Mean : 6.30 Mean : 5.199
## 3rd Qu.:12.000 3rd Qu.:13.000 3rd Qu.: 7.75 3rd Qu.: 7.000
## Max. :16.690 Max. :16.607 Max. :14.00 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :-0.02415 Min. :2.000 0:18 0:19 0:17
## 1st Qu.: 3.00000 1st Qu.:5.000 1:28 1:27 1:29
## Median : 5.00000 Median :7.000
## Mean : 4.53409 Mean :5.804
## 3rd Qu.: 6.00000 3rd Qu.:7.000
## Max. : 8.00000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-281.747 Min. :-33.445 Min. :-1099.01 0:38 0:20
## 1st Qu.: 4.577 1st Qu.: 0.000 1st Qu.: 33.75 1: 8 1:26
## Median : 29.403 Median : 0.000 Median : 163.03
## Mean : 51.345 Mean : 3.239 Mean : 252.11
## 3rd Qu.: 108.989 3rd Qu.: 6.773 3rd Qu.: 518.85
## Max. : 294.000 Max. : 42.000 Max. : 1199.00
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## $`chain:4`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 0.8178 Min. : 2.728 Min. : 0.00 Min. : 0.00
## 1st Qu.: 3.0000 1st Qu.: 4.000 1st Qu.: 3.00 1st Qu.: 3.00
## Median : 7.0000 Median : 7.000 Median : 6.00 Median : 3.00
## Mean : 8.0500 Mean : 8.016 Mean : 6.43 Mean : 5.48
## 3rd Qu.:12.0000 3rd Qu.:12.000 3rd Qu.: 8.00 3rd Qu.: 7.75
## Max. :17.4818 Max. :15.000 Max. :14.00 Max. :14.00
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :0.493 Min. :2.000 0:18 0:19 0:17
## 1st Qu.:3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median :5.000 Median :7.000
## Mean :4.604 Mean :5.804
## 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-124.622 Min. :-37.474 Min. :-417.67 0:38 0:20
## 1st Qu.: 3.805 1st Qu.: 0.000 1st Qu.: 28.75 1: 8 1:26
## Median : 28.125 Median : 0.000 Median : 116.00
## Mean : 54.517 Mean : 6.677 Mean : 230.91
## 3rd Qu.: 95.099 3rd Qu.: 11.369 3rd Qu.: 363.00
## Max. : 294.000 Max. : 64.397 Max. :1199.00
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## $`chain:5`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3.000 Min. : 0.7236 Min. :-4.872 Min. : 0.000
## 1st Qu.: 3.250 1st Qu.: 3.2500 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 7.000 Median : 7.0000 Median : 6.000 Median : 3.000
## Mean : 7.937 Mean : 7.8938 Mean : 6.133 Mean : 5.398
## 3rd Qu.:12.000 3rd Qu.:12.0000 3rd Qu.: 7.750 3rd Qu.: 7.000
## Max. :15.000 Max. :15.0000 Max. :14.000 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :2.000 Min. :2.000 0:18 0:19 0:17
## 1st Qu.:3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median :5.000 Median :7.000
## Mean :4.653 Mean :5.804
## 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-135.298 Min. :-38.620 Min. :-867.26 0:38 0:20
## 1st Qu.: 3.953 1st Qu.: 0.000 1st Qu.: 28.75 1: 8 1:26
## Median : 12.012 Median : 0.000 Median : 117.08
## Mean : 36.725 Mean : 3.011 Mean : 187.49
## 3rd Qu.: 62.502 3rd Qu.: 7.103 3rd Qu.: 338.25
## Max. : 294.000 Max. : 42.000 Max. :1199.00
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
# (not necessary, but may be useful)
indx <- sapply(data.frames[[5]], is.numeric) # get the indices of numeric columns
data.frames[[5]][indx] <- lapply(data.frames[[5]][indx], function(x)
as.numeric(as.integer(x)))
# cast each value as integer data.frames[[5]]$spikes.hr
library("mi")
# lm.mi(); glm.mi(); polr.mi(); bayesglm.mi(); bayespolr.mi(); lmer.mi(); glmer.mi()
# Also see Step above
##linear regression for each imputed data set - 5 regression models are fit
fit_lm1 <- glm(ever.sz ~ surgery + worst.gcs + factor(sex) + age, data.frames$`chain:1`, family = "binomial"); summary(fit_lm1); display(fit_lm1)
##
## Call:
## glm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + age,
## family = "binomial", data = data.frames$`chain:1`)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7010 -1.2148 0.8212 1.0009 1.3884
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.255915 1.357554 0.189 0.850
## surgery1 0.950585 0.685826 1.386 0.166
## worst.gcs -0.069793 0.098192 -0.711 0.477
## factor(sex)Male -0.330161 0.842880 -0.392 0.695
## age 0.004406 0.019433 0.227 0.821
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 62.371 on 45 degrees of freedom
## Residual deviance: 60.033 on 41 degrees of freedom
## AIC: 70.033
##
## Number of Fisher Scoring iterations: 4
## glm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + age,
## family = "binomial", data = data.frames$`chain:1`)
## coef.est coef.se
## (Intercept) 0.26 1.36
## surgery1 0.95 0.69
## worst.gcs -0.07 0.10
## factor(sex)Male -0.33 0.84
## age 0.00 0.02
## ---
## n = 46, k = 5
## residual deviance = 60.0, null deviance = 62.4 (difference = 2.3)
# (estimates over MI chains)
model_results <- pool(ever.sz ~ surgery + worst.gcs + factor(sex) + age,
family = "binomial", data=imputations, m=5)
display (model_results); summary (model_results)
## bayesglm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) +
## age, data = imputations, m = 5, family = "binomial")
## coef.est coef.se
## (Intercept) 0.37 1.27
## surgery1 0.89 0.65
## worst.gcs -0.08 0.10
## factor(sex)Male -0.31 0.76
## age 0.00 0.02
## n = 41, k = 5
## residual deviance = 59.7, null deviance = 62.4 (difference = 2.7)
##
## Call:
## pool(formula = ever.sz ~ surgery + worst.gcs + factor(sex) +
## age, data = imputations, m = 5, family = "binomial")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6668 -1.2037 0.8545 1.0150 1.3635
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.365188 1.272203 0.287 0.774
## surgery1 0.887991 0.651889 1.362 0.173
## worst.gcs -0.075537 0.096347 -0.784 0.433
## factor(sex)Male -0.311468 0.761916 -0.409 0.683
## age 0.002654 0.018246 0.145 0.884
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 62.371 on 45 degrees of freedom
## Residual deviance: 59.683 on 41 degrees of freedom
## AIC: 69.683
##
## Number of Fisher Scoring iterations: 6.2
data.frames <- complete(imputations, 3) # extract the first 3 chains
lapply(data.frames, summary)
## $`chain:1`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3.000 Min. : 3.000 Min. : 0.000 Min. : 0.00
## 1st Qu.: 3.250 1st Qu.: 4.250 1st Qu.: 3.000 1st Qu.: 3.00
## Median : 7.000 Median : 8.000 Median : 6.000 Median : 3.00
## Mean : 8.368 Mean : 8.403 Mean : 6.403 Mean : 5.49
## 3rd Qu.:12.750 3rd Qu.:12.750 3rd Qu.: 7.886 3rd Qu.: 7.75
## Max. :19.259 Max. :16.382 Max. :14.000 Max. :14.00
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :-0.477 Min. :2.000 0:18 0:19 0:17
## 1st Qu.: 3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median : 5.000 Median :7.000
## Mean : 4.682 Mean :5.804
## 3rd Qu.: 6.000 3rd Qu.:7.000
## Max. : 8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-137.889 Min. :-36.411 Min. :-528.98 0:38 0:20
## 1st Qu.: 3.805 1st Qu.: 0.000 1st Qu.: 16.25 1: 8 1:26
## Median : 28.125 Median : 0.000 Median : 84.00
## Mean : 38.266 Mean : 1.741 Mean : 188.99
## 3rd Qu.: 62.502 3rd Qu.: 3.750 3rd Qu.: 363.00
## Max. : 294.000 Max. : 42.000 Max. :1199.00
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## $`chain:2`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. :-0.8877 Min. : 3.00 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.0000 1st Qu.: 4.25 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 6.5000 Median : 8.00 Median : 6.000 Median : 3.000
## Mean : 7.6670 Mean : 8.56 Mean : 6.303 Mean : 5.452
## 3rd Qu.:12.0000 3rd Qu.:13.00 3rd Qu.: 7.750 3rd Qu.: 7.607
## Max. :15.0000 Max. :18.75 Max. :14.000 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :2.000 Min. :2.000 0:18 0:19 0:17
## 1st Qu.:3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median :5.000 Median :7.000
## Mean :4.904 Mean :5.804
## 3rd Qu.:6.253 3rd Qu.:7.000
## Max. :8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-219.254 Min. :-19.972 Min. :-924.66 0:38 0:20
## 1st Qu.: 3.953 1st Qu.: 0.000 1st Qu.: 28.75 1: 8 1:26
## Median : 33.490 Median : 0.000 Median : 150.00
## Mean : 73.034 Mean : 3.770 Mean : 255.84
## 3rd Qu.: 147.970 3rd Qu.: 6.278 3rd Qu.: 468.00
## Max. : 421.748 Max. : 42.000 Max. :1318.33
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## $`chain:3`
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3.000 Min. : 3.000 Min. : 0.00 Min. :-3.824
## 1st Qu.: 3.250 1st Qu.: 4.250 1st Qu.: 3.00 1st Qu.: 3.000
## Median : 7.000 Median : 8.000 Median : 6.00 Median : 3.000
## Mean : 8.122 Mean : 8.525 Mean : 6.30 Mean : 5.199
## 3rd Qu.:12.000 3rd Qu.:13.000 3rd Qu.: 7.75 3rd Qu.: 7.000
## Max. :16.690 Max. :16.607 Max. :14.00 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :-0.02415 Min. :2.000 0:18 0:19 0:17
## 1st Qu.: 3.00000 1st Qu.:5.000 1:28 1:27 1:29
## Median : 5.00000 Median :7.000
## Mean : 4.53409 Mean :5.804
## 3rd Qu.: 6.00000 3rd Qu.:7.000
## Max. : 8.00000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-281.747 Min. :-33.445 Min. :-1099.01 0:38 0:20
## 1st Qu.: 4.577 1st Qu.: 0.000 1st Qu.: 33.75 1: 8 1:26
## Median : 29.403 Median : 0.000 Median : 163.03
## Mean : 51.345 Mean : 3.239 Mean : 252.11
## 3rd Qu.: 108.989 3rd Qu.: 6.773 3rd Qu.: 518.85
## Max. : 294.000 Max. : 42.000 Max. : 1199.00
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
# should be similar for each of the k chains (in this case k=5).
# mipply is wrapper for sapply invoked on mi-class objects to compute the col means
round(mipply(imputations, mean, to.matrix = TRUE), 3)
## chain:1 chain:2 chain:3 chain:4 chain:5
## id 23.500 23.500 23.500 23.500 23.500
## age 0.000 0.000 0.000 0.000 0.000
## sex 1.804 1.804 1.804 1.804 1.804
## mechanism 4.261 4.261 4.261 4.261 4.261
## field.gcs 0.041 -0.037 0.014 0.006 -0.007
## er.gcs 0.026 0.045 0.041 -0.020 -0.035
## icu.gcs 0.004 -0.012 -0.012 0.008 -0.038
## worst.gcs 0.013 0.008 -0.030 0.012 0.000
## X6m.gose -0.035 0.028 -0.077 -0.057 -0.044
## X2013.gose 0.000 0.000 0.000 0.000 0.000
## skull.fx 1.609 1.609 1.609 1.609 1.609
## temp.injury 1.587 1.587 1.587 1.587 1.587
## surgery 1.630 1.630 1.630 1.630 1.630
## spikes.hr 38.266 73.034 51.345 54.517 36.725
## min.hr -0.094 0.010 -0.017 0.159 -0.029
## max.hr -0.087 0.023 0.017 -0.018 -0.090
## acute.sz 1.174 1.174 1.174 1.174 1.174
## late.sz 1.565 1.565 1.565 1.565 1.565
## ever.sz 1.587 1.587 1.587 1.587 1.587
## missing_field.gcs 0.043 0.043 0.043 0.043 0.043
## missing_er.gcs 0.043 0.043 0.043 0.043 0.043
## missing_icu.gcs 0.022 0.022 0.022 0.022 0.022
## missing_worst.gcs 0.022 0.022 0.022 0.022 0.022
## missing_X6m.gose 0.109 0.109 0.109 0.109 0.109
## missing_spikes.hr 0.391 0.391 0.391 0.391 0.391
## missing_min.hr 0.391 0.391 0.391 0.391 0.391
## missing_max.hr 0.391 0.391 0.391 0.391 0.391
# Rhat convergence statistics compares the variance between chains to the variance across chains.
# Rhat Values ~ 1.0 indicate likely convergence,
# Rhat Values > 1.1 indicate that the chains should be run longer (use large number of iterations)
Rhats(imputations, statistic = "moments") # assess the convergence of MI algorithm
## mean_field.gcs mean_er.gcs mean_icu.gcs mean_worst.gcs mean_X6m.gose
## 1.4787521 3.1877013 2.8303357 2.2686572 0.9932531
## mean_spikes.hr mean_min.hr mean_max.hr sd_field.gcs sd_er.gcs
## 2.5564563 1.7275399 2.1428859 1.2807005 1.2659777
## sd_icu.gcs sd_worst.gcs sd_X6m.gose sd_spikes.hr sd_min.hr
## 1.3829297 1.1412632 1.0487617 2.2336051 1.9834381
## sd_max.hr
## 2.2399720
# When convergence is unstable, we can continue the iterations for all chains, e.g.
imputations <- mi(imputations, n.iter=20) # add additional 20 iterations
# To plot the produced mi results, for all missing_variables we can generate
# a histogram of the observed, imputed, and completed data.
# We can compare of the completed data to the fitted values implied by the model for the completed #data, by plotting binned residuals.
# hist function works similarly as plot.
# image function gives a sense of the missingness patterns in the data
plot(imputations);hist(imputations)
image(imputations); summary(imputations)
## $id
## $id$is_missing
## [1] "all values observed"
##
## $id$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 12.25 23.50 23.50 34.75 46.00
##
##
## $age
## $age$is_missing
## [1] "all values observed"
##
## $age$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.6045 -0.4019 -0.1126 0.0000 0.2997 1.3341
##
##
## $sex
## $sex$is_missing
## [1] "all values observed"
##
## $sex$observed
##
## 1 2
## 9 37
##
##
## $mechanism
## $mechanism$is_missing
## [1] "all values observed"
##
## $mechanism$observed
##
## 1 2 3 4 5 6 7
## 4 4 13 2 7 10 6
##
##
## $field.gcs
## $field.gcs$is_missing
## missing
## FALSE TRUE
## 44 2
##
## $field.gcs$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.430462 -0.531767 0.001835 0.009681 0.459283 2.024220
##
## $field.gcs$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.5545 -0.5545 -0.1109 0.0000 0.4436 0.7763
##
##
## $er.gcs
## $er.gcs$is_missing
## missing
## FALSE TRUE
## 44 2
##
## $er.gcs$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.0493 -1.5811 -1.1128 -1.0759 -0.2557 0.3408
##
## $er.gcs$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.62098 -0.50114 -0.08171 0.00000 0.48752 0.81708
##
##
## $icu.gcs
## $icu.gcs$is_missing
## missing
## FALSE TRUE
## 45 1
##
## $icu.gcs$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1686 -1.7804 -1.6045 -1.5440 -0.9245 -0.2418
##
## $icu.gcs$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.98350 -0.52088 -0.05826 0.00000 0.25016 1.17540
##
##
## $worst.gcs
## $worst.gcs$is_missing
## missing
## FALSE TRUE
## 45 1
##
## $worst.gcs$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.7031 -0.3589 -0.3131 -0.1614 -0.2112 0.7795
##
## $worst.gcs$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.8068 -0.3586 -0.3586 0.0000 0.2390 1.2849
##
##
## $X6m.gose
## $X6m.gose$is_missing
## missing
## FALSE TRUE
## 41 5
##
## $X6m.gose$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.27850 -0.43691 -0.06858 -0.03898 0.33339 1.32618
##
## $X6m.gose$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.80159 -0.51581 0.05576 0.00000 0.34155 0.91312
##
##
## $X2013.gose
## $X2013.gose$is_missing
## [1] "all values observed"
##
## $X2013.gose$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.9987 -0.2112 0.3139 0.0000 0.3139 0.5764
##
##
## $skull.fx
## $skull.fx$is_missing
## [1] "all values observed"
##
## $skull.fx$observed
##
## 1 2
## 18 28
##
##
## $temp.injury
## $temp.injury$is_missing
## [1] "all values observed"
##
## $temp.injury$observed
##
## 1 2
## 19 27
##
##
## $surgery
## $surgery$is_missing
## [1] "all values observed"
##
## $surgery$observed
##
## 1 2
## 17 29
##
##
## $spikes.hr
## $spikes.hr$is_missing
## missing
## FALSE TRUE
## 28 18
##
## $spikes.hr$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -331.250 -79.870 -6.743 10.109 105.559 324.983
##
## $spikes.hr$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.280 5.357 18.170 52.872 57.227 294.000
##
##
## $min.hr
## $min.hr$is_missing
## missing
## FALSE TRUE
## 28 18
##
## $min.hr$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.5731 -1.0627 -0.1471 -0.1819 0.8233 2.9098
##
## $min.hr$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.1828 -0.1828 -0.1828 0.0000 -0.1828 1.9668
##
##
## $max.hr
## $max.hr$is_missing
## missing
## FALSE TRUE
## 28 18
##
## $max.hr$imputed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.49886 -0.53336 -0.01627 -0.05129 0.47479 1.56717
##
## $max.hr$observed
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.3783 -0.3400 -0.2376 0.0000 0.1166 1.5750
##
##
## $acute.sz
## $acute.sz$is_missing
## [1] "all values observed"
##
## $acute.sz$observed
##
## 1 2
## 38 8
##
##
## $late.sz
## $late.sz$is_missing
## [1] "all values observed"
##
## $late.sz$observed
##
## 1 2
## 20 26
##
##
## $ever.sz
## $ever.sz$is_missing
## [1] "all values observed"
##
## $ever.sz$observed
##
## 1 2
## 19 27
# regression model and impact of various predictors
model_results <- pool(ever.sz ~ surgery + worst.gcs + factor(sex) + age,
data = imputations, m=5); display (model_results); summary (model_results)
## bayesglm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) +
## age, data = imputations, m = 5)
## coef.est coef.se
## (Intercept) 0.42 1.27
## surgery1 0.91 0.65
## worst.gcs -0.08 0.10
## factor(sex)Male -0.32 0.76
## age 0.00 0.02
## n = 41, k = 5
## residual deviance = 59.5, null deviance = 62.4 (difference = 2.8)
##
## Call:
## pool(formula = ever.sz ~ surgery + worst.gcs + factor(sex) +
## age, data = imputations, m = 5)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.676 -1.188 0.844 1.016 1.375
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.417855 1.274720 0.328 0.743
## surgery1 0.914428 0.654134 1.398 0.162
## worst.gcs -0.084801 0.097286 -0.872 0.383
## factor(sex)Male -0.317796 0.761500 -0.417 0.676
## age 0.002258 0.018153 0.124 0.901
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 62.371 on 45 degrees of freedom
## Residual deviance: 59.545 on 41 degrees of freedom
## AIC: 69.545
##
## Number of Fisher Scoring iterations: 6.6
coef(summary(model_results))[, 1:2] # get the model coef's and their SE's
## Estimate Std. Error
## (Intercept) 0.417854649 1.27471972
## surgery1 0.914428111 0.65413413
## worst.gcs -0.084801326 0.09728582
## factor(sex)Male -0.317796027 0.76150004
## age 0.002257943 0.01815253
# Report the summaries of the imputations
data.frames <- complete(imputations, 3) # extract the first 3 chains
lapply(data.frames, summary) # report summaries
## [[1]]
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. :-1.742 Min. :-17.264 Min. : 0.000 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 3.250 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 6.500 Median : 7.000 Median : 6.000 Median : 3.000
## Mean : 7.704 Mean : 7.343 Mean : 6.344 Mean : 5.348
## 3rd Qu.:12.000 3rd Qu.: 12.000 3rd Qu.: 7.750 3rd Qu.: 7.000
## Max. :15.000 Max. : 15.000 Max. :14.000 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :-3.168 Min. :2.000 0:18 0:19 0:17
## 1st Qu.: 3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median : 5.000 Median :7.000
## Mean : 4.909 Mean :5.804
## 3rd Qu.: 7.000 3rd Qu.:7.000
## Max. : 9.445 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-331.250 Min. :-22.2183 Min. :-709.8 0:38 0:20
## 1st Qu.: 1.375 1st Qu.: -3.8339 1st Qu.: 31.5 1: 8 1:26
## Median : 7.450 Median : 0.0000 Median : 177.5
## Mean : 23.379 Mean : 0.7072 Mean : 270.4
## 3rd Qu.: 58.328 3rd Qu.: 0.0000 3rd Qu.: 414.2
## Max. : 294.000 Max. : 42.0000 Max. :1199.0
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## [[2]]
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. :-13.915 Min. : 3.000 Min. :-14.170 Min. : 0.000
## 1st Qu.: 3.000 1st Qu.: 4.250 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 7.000 Median : 7.700 Median : 6.000 Median : 3.000
## Mean : 7.558 Mean : 8.227 Mean : 5.931 Mean : 5.369
## 3rd Qu.: 12.000 3rd Qu.:12.000 3rd Qu.: 7.750 3rd Qu.: 7.000
## Max. : 15.000 Max. :15.000 Max. : 14.000 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :2.000 Min. :2.000 0:18 0:19 0:17
## 1st Qu.:3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median :5.000 Median :7.000
## Mean :4.875 Mean :5.804
## 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-265.67 Min. :-36.561 Min. :-1276.7 0:38 0:20
## 1st Qu.: 1.67 1st Qu.: 0.000 1st Qu.: 31.5 1: 8 1:26
## Median : 8.73 Median : 0.000 Median : 105.7
## Mean : 27.90 Mean : 1.811 Mean : 197.5
## 3rd Qu.: 55.26 3rd Qu.: 5.339 3rd Qu.: 358.2
## Max. : 294.00 Max. : 43.005 Max. : 1199.0
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
##
## [[3]]
## id age sex mechanism
## Min. : 1.00 Min. :16.00 Female: 9 Bike_vs_Auto: 4
## 1st Qu.:12.25 1st Qu.:23.00 Male :37 Blunt : 4
## Median :23.50 Median :33.00 Fall :13
## Mean :23.50 Mean :36.89 GSW : 2
## 3rd Qu.:34.75 3rd Qu.:47.25 MCA : 7
## Max. :46.00 Max. :83.00 MVA :10
## Peds_vs_Auto: 6
## field.gcs er.gcs icu.gcs worst.gcs
## Min. : 3.000 Min. :-5.035 Min. :-5.168 Min. : 0.000
## 1st Qu.: 3.250 1st Qu.: 3.250 1st Qu.: 3.000 1st Qu.: 3.000
## Median : 7.000 Median : 7.000 Median : 6.000 Median : 3.000
## Mean : 8.363 Mean : 7.760 Mean : 6.127 Mean : 5.513
## 3rd Qu.:12.000 3rd Qu.:12.000 3rd Qu.: 7.750 3rd Qu.: 7.750
## Max. :26.252 Max. :15.000 Max. :14.000 Max. :14.000
##
## X6m.gose X2013.gose skull.fx temp.injury surgery
## Min. :2.000 Min. :2.000 0:18 0:19 0:17
## 1st Qu.:3.000 1st Qu.:5.000 1:28 1:27 1:29
## Median :5.000 Median :7.000
## Mean :4.606 Mean :5.804
## 3rd Qu.:6.000 3rd Qu.:7.000
## Max. :8.000 Max. :8.000
##
## spikes.hr min.hr max.hr acute.sz late.sz
## Min. :-193.857 Min. :-36.587 Min. :-835.00 0:38 0:20
## 1st Qu.: 1.375 1st Qu.: 0.000 1st Qu.: 24.25 1: 8 1:26
## Median : 7.450 Median : 0.000 Median : 105.46
## Mean : 33.133 Mean : 3.202 Mean : 203.60
## 3rd Qu.: 64.057 3rd Qu.: 7.027 3rd Qu.: 393.73
## Max. : 294.000 Max. : 42.000 Max. :1199.00
##
## ever.sz missing_field.gcs missing_er.gcs missing_icu.gcs
## 0:19 Mode :logical Mode :logical Mode :logical
## 1:27 FALSE:44 FALSE:44 FALSE:45
## TRUE :2 TRUE :2 TRUE :1
##
##
##
##
## missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr
## Mode :logical Mode :logical Mode :logical Mode :logical
## FALSE:45 FALSE:41 FALSE:28 FALSE:28
## TRUE :1 TRUE :5 TRUE :18 TRUE :18
##
##
##
##
## missing_max.hr
## Mode :logical
## FALSE:28
## TRUE :18
##
##
##
##
A Simple Manual Implementation of EM-Based Imputation
set.seed(202227)
mu <- as.matrix(rep(2,20) )
sig <- diag(c(1:20) )
# Add a noise item. The noise is
# $ \epsilon ~ MVN(as.matrix(rep(0,20)), diag(rep(1,20)))$
library(MASS)
sim_data <- mvrnorm(n = 200, mu, sig) + mvrnorm(n=200, as.matrix(rep(0,20)), diag( rep(1,20) ))
# save these in the "original" object
sim_data.orig <- sim_data
# introduce 500 random missing indices (in the total of 4000=200*20)
# discrete distribution where the probability of the elements of values is proportional to probs, #which are normalized to add up to 1.
library(e1071)
rand.miss <- e1071::rdiscrete(500, probs = rep(1,length(sim_data)), values = seq(1, length(sim_data)))
sim_data[rand.miss] <- NA
sum(is.na(sim_data)) # check now many missing (NA) are there < 500
## [1] 466
# cast the data into a data.frame object and report 15*10 elements
sim_data.df <- data.frame(sim_data)
knitr::kable( sim_data.df[1:15, 1:10], caption = "The first 15 rows and first 10 columns of the simulation data")
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 |
|---|---|---|---|---|---|---|---|---|---|
| NA | 4.8500284 | 1.7282025 | 4.6387605 | 4.5511409 | 2.1699719 | 4.7118571 | 2.6972905 | 2.7895031 | -0.9250101 |
| 1.7721898 | -1.6955535 | 0.4234295 | 6.2619764 | -3.1339307 | -1.8043889 | 1.2932913 | 0.9906410 | 1.0790366 | 1.1638161 |
| 2.2859237 | 4.2713494 | 3.8900670 | 3.5104578 | 3.0025607 | 3.9058619 | 3.0548309 | 3.5258699 | NA | 1.3601231 |
| NA | 6.9253703 | NA | NA | NA | -2.0861790 | -0.8847125 | 5.6485105 | 1.3348689 | 0.6512754 |
| -0.0305062 | 2.3069337 | 0.7258556 | 1.2474398 | -0.2091862 | 1.1726298 | 1.4622491 | 0.6492390 | 0.1458781 | 7.2195026 |
| 1.9411674 | 3.6606019 | 6.3004943 | 0.3543817 | 1.0875460 | 1.3077307 | 0.0269935 | -0.0112352 | NA | 2.4616466 |
| 0.5209321 | 4.2225983 | NA | 4.3656730 | 5.7103015 | 0.4736086 | 4.5803136 | 0.2535158 | -0.9758147 | -2.6156309 |
| 3.3111727 | 0.1245427 | 0.9768425 | 4.7764851 | 4.9810860 | -0.0438375 | -0.3213523 | 4.3612903 | 2.7483497 | 3.7744105 |
| 2.1097813 | NA | NA | 3.6791032 | 3.7784795 | 5.5533794 | 1.8635898 | 0.1565858 | 4.6743012 | NA |
| 1.6203382 | 3.2422234 | 1.3625543 | 1.4006816 | -1.3467049 | NA | 3.1703070 | -3.4308300 | 0.2195447 | -2.2921107 |
| 2.4063177 | 0.5601779 | 3.9764884 | 2.1652769 | 0.1060700 | NA | -3.7629196 | -0.9718406 | 1.2196606 | 0.7038253 |
| NA | 1.6714614 | 0.2000255 | 2.3308968 | 0.4593543 | 0.1716613 | -0.5795159 | -1.3327330 | 6.4115262 | 1.2607465 |
| 2.5416914 | 1.0388186 | 4.7421120 | 2.3110187 | -0.2892515 | -1.5291668 | NA | 9.3103565 | 2.5364297 | 1.6862923 |
| 0.9393925 | 1.8668513 | 3.4702117 | NA | 0.7797323 | 2.5548284 | -3.5498813 | NA | 3.2124694 | 5.3120133 |
| -0.8334973 | 0.6466994 | 1.1949414 | -0.6573305 | NA | -1.7743495 | 1.7168340 | 2.1065878 | 2.9478528 | -0.5562349 |
EM_algorithm <- function(x, tol = 0.001) {
# identify the missing data entries (Boolean indices)
missvals <- is.na(x)
# instantialize the EM-iteration
new.impute <- x
old.impute <- x
count.iter <- 1
reach.tol <- 0
# compute \Sigma on complete data
sigma <- as.matrix(var(na.exclude(x)))
# compute the vector of feature (column) means
mean.vec <- as.matrix(apply(na.exclude(x), 2, mean))
while (reach.tol != 1) {
for (i in 1:nrow(x)) {
pick.miss <- (c(missvals[i, ]))
if (sum(pick.miss) != 0) {
# compute invese-Sigma_completeData, variance-covariance matrix
inv.S <- solve(sigma[!pick.miss, !pick.miss], tol = 1e-40)
# Expectation Step
# $$E(Y|X)=\mu_{mis}+\Sigma_{mo}\Sigma_{oo}^{-1}(X-\mu_{obs})$$
new.impute[i, pick.miss] <- mean.vec[pick.miss] +
sigma[pick.miss,!pick.miss] %*% inv.S %*%
(t(new.impute[i, !pick.miss]) - t(t(mean.vec[!pick.miss])))
}
}
# Maximization Step
# Compute the complete \Sigma complete vector of feature (column) means
# $$\Sigma^{(t+1)} = \frac{1}{n}\sum_{i=1}^nE(ZZ^T|X) -
# \mu^{(t+1)}{\mu^{(t+1)}}^T$$
sigma <- var((new.impute))
#$$\mu^{(t+1)} = \frac{1}{n}\sum_{i=1}^nE(Z|X)$$
mean.vec <- as.matrix(apply(new.impute, 2, mean))
# Inspect for convergence tolerance, start with the 2nd iteration
if (count.iter > 1) {
for (l in 1:nrow(new.impute)) {
for (m in 1:ncol(new.impute)) {
if (abs((old.impute[l, m] - new.impute[l, m])) > tol) {
reach.tol < -0
} else {
reach.tol <- 1
}
}
}
}
count.iter <- count.iter + 1
old.impute <- new.impute
}
sim_data.imputed <- EM_algorithm(sim_data.df, tol=0.0001)
# return the imputation output of the current iteration that passed the tolerance level
return(new.impute)
}
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
dim(sim_data.df)
## [1] 200 20
amelia.out <- amelia(sim_data.df, m = 5)
## -- Imputation 1 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 21
##
## -- Imputation 2 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
##
## -- Imputation 3 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
##
## -- Imputation 4 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
##
## -- Imputation 5 --
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 21 22 23
library("XML")
library("xml2")
library("rvest")
##
## Attaching package: 'rvest'
## The following object is masked from 'package:XML':
##
## xml
wiki_url <- read_html("http://wiki.socr.umich.edu/index.php/SOCR_Data_Dinov_021708_Earthquakes")
html_nodes(wiki_url, "#content")
## {xml_nodeset (1)}
## [1] <div id="content" class="mw-body-primary" role="main">\n\t<a id="top ...
## {xml_nodeset (1)}
## [1] <div id="content" class="mw-body-primary" role="main">\n\t<a id="top...
earthquake<- html_table(html_nodes(wiki_url, "table")[[2]])
library(ggplot2)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
plotweb <- ggplot(earthquake, aes(Longitude, Latitude, group = Magt, color = Magt))+
geom_point(data = earthquake, size = 4, mapping = aes(x = Longitude, y = Latitude, shape = Magt))
plotweb
plot1<-ggplot(earthquake, aes(Latitude, size=1))+
geom_density(aes(color=Magt))
plot1
## Warning: Groups with fewer than two data points have been dropped.
kernal_density <- with(earthquake, MASS::kde2d(Longitude, Latitude, n = 50))
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
with(kernal_density, plot_ly(x=x, y=y, z=z, type="surface"))
plot_ly(x = ~ earthquake$Longitude)
## No trace type specified:
## Based on info supplied, a 'histogram' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#histogram
plot_ly(x = ~ earthquake$Longitude, y = ~earthquake$Latitude)
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
plot_ly(x=~earthquake$Longitude, y=~earthquake$Latitude, z=~earthquake$Mag)
## No trace type specified:
## Based on info supplied, a 'scatter3d' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
df3D <- data.frame(x=earthquake$Longitude, y=earthquake$Latitude, z=earthquake$Mag)
# Convert the Long (X, Y, Z) Earthquake format data into a Matrix Format
library("Matrix")
matrix_EarthQuakes <- with(df3D, sparseMatrix(i = as.numeric(180-x),j=as.numeric(y), x=z, use.last.ij=T, dimnames=list(levels(x), levels(y))))
dim(matrix_EarthQuakes)
## [1] 307 44
View(as.matrix(matrix_EarthQuakes))
# view matrix is 2D heatmap :
library("ggplot2"); library("gplots")
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
heatmap.2( as.matrix(matrix_EarthQuakes[280:307, 30:44]), Rowv=FALSE,
Colv=FALSE, dendrogram='none', cellnote=as.matrix(matrix_EarthQuakes[280:307, 30:44]), notecol="black", trace='none', key=FALSE, lwid =
c(.01, .99), lhei = c(.01, .99), margins = c(5, 15 ))
# Long -180<x<-170, Lat: 30<y<45, Z: 5<Mag<8
matrix_EarthQuakes <- with(df3D, sparseMatrix(i = as.numeric(180+x),
j=as.numeric(y), x=z,use.last.ij=TRUE,dimnames=list(levels(x), levels(y))))
mat1 <- as.matrix(matrix_EarthQuakes)
plot_ly(z = ~mat1, type = "surface")
# To plot the Aggregate (Summed) Magnitudes at all Long/Lat:
matrix_EarthQuakes <- with(df3D, sparseMatrix(i = as.numeric(180+x),
j=as.numeric(y), x=z, dimnames=list(levels(x), levels(y))))
mat1 <- as.matrix(matrix_EarthQuakes)
plot_ly(z = ~mat1, type = "surface")
# plot_ly(z = ~mat1[30:60, 20:40], type = "surface")
Cohort rebalancing
# load the data: 06_PPMI_ClassificationValidationData_Short.csv
ppmi_data <-read.csv("https://umich.instructure.com/files/330400/download?download_frd=1", header=TRUE)
table(ppmi_data$ResearchGroup)
##
## Control PD SWEDD
## 166 434 61
# binarize the Dx classes
ppmi_data$ResearchGroup <- ifelse(ppmi_data$ResearchGroup == "Control", "Control", "Patient")
attach(ppmi_data)
head(ppmi_data)
## X FID_IID L_insular_cortex_AvgMeanCurvature L_insular_cortex_ComputeArea
## 1 2 3001 0.11044503 2917.938
## 2 3 3002 0.10091019 2505.515
## 3 4 3003 0.09726433 3284.796
## 4 5 3004 0.10955031 2802.992
## 5 6 3006 0.09733524 3188.515
## 6 7 3008 0.10340988 2548.982
## L_insular_cortex_Volume L_insular_cortex_ShapeIndex
## 1 8754.107 0.3207401
## 2 8089.185 0.3191578
## 3 11446.049 0.3150762
## 4 9000.509 0.3272781
## 5 11004.131 0.3040458
## 6 7613.921 0.3006974
## L_insular_cortex_Curvedness R_insular_cortex_AvgMeanCurvature
## 1 0.1033839 0.1160668
## 2 0.1099705 0.1077885
## 3 0.0991298 0.1014219
## 4 0.1144485 0.1126108
## 5 0.1091740 0.1086814
## 6 0.1154384 0.1232604
## R_insular_cortex_ComputeArea R_insular_cortex_Volume
## 1 2342.961 5732.367
## 2 2073.981 5321.370
## 3 2320.993 6395.721
## 4 2225.154 5567.333
## 5 2135.745 5583.177
## 6 1839.798 3953.644
## R_insular_cortex_ShapeIndex R_insular_cortex_Curvedness
## 1 0.3149570 0.1215921
## 2 0.2709259 0.1345651
## 3 0.3003472 0.1126387
## 4 0.3058957 0.1315195
## 5 0.3187932 0.1205962
## 6 0.2863870 0.1324961
## L_cingulate_gyrus_AvgMeanCurvature L_cingulate_gyrus_ComputeArea
## 1 0.1246343 4381.930
## 2 0.1355602 3221.540
## 3 0.1236226 4658.949
## 4 0.1274116 4016.731
## 5 0.1298253 4188.552
## 6 0.1406572 3366.857
## L_cingulate_gyrus_Volume L_cingulate_gyrus_ShapeIndex
## 1 11205.127 0.4349292
## 2 7439.645 0.4396813
## 3 12443.624 0.4556403
## 4 9922.555 0.4316057
## 5 10502.193 0.4510032
## 6 7630.507 0.4539803
## L_cingulate_gyrus_Curvedness R_cingulate_gyrus_AvgMeanCurvature
## 1 0.06714417 0.1249111
## 2 0.08652417 0.1458248
## 3 0.06749613 0.1361298
## 4 0.07150800 0.1288771
## 5 0.07495887 0.1277486
## 6 0.07871191 0.1431415
## R_cingulate_gyrus_ComputeArea R_cingulate_gyrus_Volume
## 1 4610.447 12246.549
## 2 3194.348 7264.683
## 3 4028.330 9681.935
## 4 3918.263 10227.022
## 5 4352.117 11528.466
## 6 3606.831 8659.683
## R_cingulate_gyrus_ShapeIndex R_cingulate_gyrus_Curvedness
## 1 0.4350192 0.07081687
## 2 0.4268223 0.09480283
## 3 0.4498362 0.08071711
## 4 0.4462917 0.07295120
## 5 0.4498547 0.07407571
## 6 0.4394006 0.08183549
## L_caudate_AvgMeanCurvature L_caudate_ComputeArea L_caudate_Volume
## 1 0.2392929 621.5344 821.8991
## 2 0.1833535 876.9414 1364.8601
## 3 0.1983211 1019.6664 1570.3055
## 4 0.1958644 1086.9388 1772.9241
## 5 0.2115839 863.4775 1341.3073
## 6 0.3143357 424.2490 473.3143
## L_caudate_ShapeIndex L_caudate_Curvedness R_caudate_AvgMeanCurvature
## 1 0.2473931 0.3429220 0.1558452
## 2 0.2580652 0.2308683 0.1763202
## 3 0.3164872 0.1760060 0.1703131
## 4 0.2960296 0.1854576 0.1739900
## 5 0.2780000 0.2513109 0.1441599
## 6 0.2509409 0.4493166 0.2014278
## R_caudate_ComputeArea R_caudate_Volume R_caudate_ShapeIndex
## 1 1302.1458 2526.248 0.2892424
## 2 1056.2200 1965.206 0.2970232
## 3 1208.7987 2268.701 0.2912487
## 4 1267.7040 2262.694 0.3027961
## 5 1247.1482 2678.534 0.3008528
## 6 811.4173 1302.117 0.2616859
## R_caudate_Curvedness L_putamen_AvgMeanCurvature L_putamen_ComputeArea
## 1 0.1496714 0.2022735 1029.175
## 2 0.1854200 0.1418179 1275.905
## 3 0.1568764 0.1409578 1482.684
## 4 0.1599818 0.1814492 1050.227
## 5 0.1360011 0.1787384 1229.513
## 6 0.2701702 0.1603679 1180.385
## L_putamen_Volume L_putamen_ShapeIndex L_putamen_Curvedness
## 1 1543.017 0.2853486 0.2320214
## 2 2696.695 0.2876249 0.1566615
## 3 3046.274 0.3041529 0.1523438
## 4 2028.140 0.2769987 0.1868594
## 5 2482.513 0.2923973 0.2064697
## 6 2105.129 0.2469977 0.1955575
## R_putamen_AvgMeanCurvature R_putamen_ComputeArea R_putamen_Volume
## 1 0.1111208 1680.197 3792.201
## 2 0.1446244 1375.725 2966.682
## 3 0.1402802 1310.875 2755.439
## 4 0.1214390 1511.374 3458.706
## 5 0.1628602 1298.760 2595.811
## 6 0.1191328 1468.178 3456.212
## R_putamen_ShapeIndex R_putamen_Curvedness L_hippocampus_AvgMeanCurvature
## 1 0.2670927 0.1332011 0.1226265
## 2 0.2936447 0.1546469 0.1304512
## 3 0.2913030 0.1606766 0.1191435
## 4 0.2978874 0.1362302 0.1242598
## 5 0.3136044 0.1666391 0.1116450
## 6 0.2860881 0.1425279 0.1266569
## L_hippocampus_ComputeArea L_hippocampus_Volume L_hippocampus_ShapeIndex
## 1 1769.672 4737.038 0.3201427
## 2 1529.759 3736.040 0.3189372
## 3 1679.536 4458.264 0.3512617
## 4 1552.218 3718.433 0.2814200
## 5 1878.617 5078.684 0.3338252
## 6 1580.434 3833.965 0.3145288
## L_hippocampus_Curvedness R_hippocampus_AvgMeanCurvature
## 1 0.1179703 0.1312485
## 2 0.1239492 0.1179286
## 3 0.1092478 0.1174198
## 4 0.1435228 0.1158403
## 5 0.1089977 0.1140703
## 6 0.1460567 0.1206285
## R_hippocampus_ComputeArea R_hippocampus_Volume R_hippocampus_ShapeIndex
## 1 1578.946 3817.621 0.2894075
## 2 1799.438 4665.167 0.3102413
## 3 1850.187 5177.618 0.3050725
## 4 1762.969 4529.123 0.3331539
## 5 1976.793 5508.145 0.3469782
## 6 1596.947 3811.093 0.2952811
## R_hippocampus_Curvedness L_fusiform_gyrus_AvgMeanCurvature
## 1 0.14824928 0.10385320
## 2 0.12306355 0.10405280
## 3 0.12038557 0.11326566
## 4 0.12558195 0.11218079
## 5 0.09903943 0.09365524
## 6 0.13272062 0.10741455
## L_fusiform_gyrus_ComputeArea L_fusiform_gyrus_Volume
## 1 4534.707 15830.32
## 2 4013.385 12677.99
## 3 3902.592 11525.38
## 4 4120.621 13775.00
## 5 4864.410 17467.13
## 6 3978.040 12779.50
## L_fusiform_gyrus_ShapeIndex L_fusiform_gyrus_Curvedness
## 1 0.3102792 0.08341972
## 2 0.2857870 0.10808808
## 3 0.2983204 0.09976534
## 4 0.3584661 0.09042557
## 5 0.2929722 0.09020279
## 6 0.3071964 0.10343642
## R_fusiform_gyrus_AvgMeanCurvature R_fusiform_gyrus_ComputeArea
## 1 0.1063175 3945.037
## 2 0.1267130 3551.876
## 3 0.1076625 4260.940
## 4 0.1018607 3919.842
## 5 0.1199351 5169.535
## 6 0.1170813 3331.151
## R_fusiform_gyrus_Volume R_fusiform_gyrus_ShapeIndex
## 1 14471.84 0.3592094
## 2 11263.23 0.3331986
## 3 14213.62 0.3515383
## 4 14405.84 0.3487578
## 5 17455.32 0.3855645
## 6 10131.39 0.3454159
## R_fusiform_gyrus_Curvedness Sex Weight ResearchGroup VisitID Age
## 1 0.07762280 1 74.2 Patient 1 65.1808
## 2 0.10994359 2 70.6 Patient 1 67.6247
## 3 0.08235058 2 82.0 Patient 1 56.7562
## 4 0.09018205 1 81.6 Control 1 59.4548
## 5 0.07418299 2 69.1 Patient 1 57.5781
## 6 0.09854799 2 69.1 Control 1 81.9452
## chr12_rs34637584_GT chr17_rs11868035_GT chr17_rs11012_GT
## 1 0 1 1
## 2 0 1 0
## 3 0 1 1
## 4 0 1 1
## 5 0 0 1
## 6 0 0 0
## chr17_rs393152_GT chr17_rs12185268_GT chr17_rs199533_GT
## 1 1 1 0
## 2 0 0 0
## 3 1 1 1
## 4 1 1 1
## 5 1 1 1
## 6 0 0 0
## chr17_rs199533_DP chr17_rs199533_GQ UPDRS_Part_I_Summary_Score_Baseline
## 1 20 60 0
## 2 26 66 3
## 3 65 99 1
## 4 55 99 0
## 5 33 99 1
## 6 20 60 1
## UPDRS_Part_I_Summary_Score_Month_03 UPDRS_Part_I_Summary_Score_Month_06
## 1 0.000000 0.000000
## 2 2.000000 3.407134
## 3 1.000000 0.000000
## 4 3.295365 1.498698
## 5 2.000000 0.000000
## 6 2.951738 1.309742
## UPDRS_Part_I_Summary_Score_Month_09 UPDRS_Part_I_Summary_Score_Month_12
## 1 1.000000 4
## 2 1.000000 3
## 3 0.000000 0
## 4 -0.259238 0
## 5 1.000000 2
## 6 4.120260 4
## UPDRS_Part_I_Summary_Score_Month_18 UPDRS_Part_I_Summary_Score_Month_24
## 1 1.0000000 0.00000
## 2 4.0000000 8.00000
## 3 0.6328668 0.00000
## 4 1.4534149 1.00000
## 5 2.7703873 1.70897
## 6 2.3499861 1.00000
## UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Baseline
## 1 2
## 2 15
## 3 6
## 4 0
## 5 4
## 6 2
## UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_03
## 1 2.000000
## 2 10.000000
## 3 5.000000
## 4 6.576687
## 5 10.000000
## 6 7.256589
## UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_06
## 1 3.000000
## 2 11.969026
## 3 3.000000
## 4 3.132565
## 5 10.000000
## 6 7.275325
## UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_09
## 1 3.000000
## 2 16.000000
## 3 8.000000
## 4 3.061787
## 5 8.000000
## 6 9.568565
## UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_12
## 1 3.00e+00
## 2 1.50e+01
## 3 5.00e+00
## 4 8.88e-16
## 5 1.30e+01
## 6 1.00e+00
## UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_18
## 1 4.000000
## 2 16.000000
## 3 13.035019
## 4 5.633763
## 5 16.601842
## 6 6.425239
## UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_24
## 1 2.000000e+00
## 2 1.400000e+01
## 3 7.000000e+00
## 4 8.880000e-16
## 5 1.465211e+01
## 6 2.000000e+00
## UPDRS_Part_III_Summary_Score_Baseline
## 1 12
## 2 17
## 3 29
## 4 2
## 5 22
## 6 10
## UPDRS_Part_III_Summary_Score_Month_03
## 1 18.00000
## 2 22.00000
## 3 37.00000
## 4 20.59587
## 5 30.00000
## 6 30.44123
## UPDRS_Part_III_Summary_Score_Month_06
## 1 23.00000
## 2 24.61497
## 3 37.00000
## 4 17.21627
## 5 31.00000
## 6 27.84771
## UPDRS_Part_III_Summary_Score_Month_09
## 1 19.00000
## 2 20.00000
## 3 33.00000
## 4 19.68433
## 5 32.00000
## 6 28.91509
## UPDRS_Part_III_Summary_Score_Month_12
## 1 20
## 2 27
## 3 44
## 4 3
## 5 33
## 6 12
## UPDRS_Part_III_Summary_Score_Month_18
## 1 29.00000
## 2 22.00000
## 3 59.19947
## 4 23.40963
## 5 47.79686
## 6 27.78102
## UPDRS_Part_III_Summary_Score_Month_24
## 1 39.00000
## 2 22.00000
## 3 43.00000
## 4 7.00000
## 5 42.49653
## 6 13.00000
## X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Baseline
## 1 6
## 2 14
## 3 8
## 4 6
## 5 0
## 6 4
## X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_06
## 1 7.000000
## 2 11.280820
## 3 8.000000
## 4 7.033210
## 5 0.000000
## 6 6.890178
## X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_12
## 1 3
## 2 14
## 3 6
## 4 6
## 5 6
## 6 6
## X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_24
## 1 7.00000
## 2 11.00000
## 3 7.00000
## 4 10.00000
## 5 6.12631
## 6 8.00000
## X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Baseline
## 1 6
## 2 6
## 3 4
## 4 5
## 5 5
## 6 5
## X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_06
## 1 4.000000
## 2 5.177094
## 3 4.000000
## 4 3.409930
## 5 5.000000
## 6 4.857484
## X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_12
## 1 5
## 2 6
## 3 4
## 4 4
## 5 5
## 6 5
## X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_24
## 1 5.000000
## 2 9.000000
## 3 4.000000
## 4 6.000000
## 5 5.435596
## 6 6.000000
# Model-free analysis, classification
library("crossval")
require(crossval)
require(ada)
## Loading required package: ada
## Loading required package: rpart
#set up adaboosting prediction function
# Define a new classification result-reporting function
my.ada <- function (train.x, train.y, test.x, test.y, negative, formula)
{
ada.fit <- ada(train.x, train.y)
predict.y <- predict(ada.fit, test.x)
#count TP, FP, TN, FN, Accuracy, etc.
out <- confusionMatrix(test.y, predict.y, negative = negative)
# negative is the label of a negative "null" sample (default: "control").
return (out)
}
# balance cases
# SMOTE: Synthetic Minority Oversampling Technique to handle class misbalance in binary #classification.
set.seed(1000)
require(unbalanced)
## Loading required package: unbalanced
## Loading required package: mlr
## Loading required package: ParamHelpers
##
## Attaching package: 'mlr'
## The following object is masked from 'package:crossval':
##
## crossval
## The following object is masked from 'package:e1071':
##
## impute
## Loading required package: foreach
## Loading required package: doParallel
## Loading required package: iterators
## Loading required package: parallel
ppmi_data$PD <- ifelse(ppmi_data$ResearchGroup=="Control", 1, 0)
uniqueID <- unique(ppmi_data$FID_IID)
ppmi_data <- ppmi_data[ppmi_data$VisitID==1, ]
ppmi_data$PD <- factor(ppmi_data$PD)
colnames(ppmi_data)
## [1] "X"
## [2] "FID_IID"
## [3] "L_insular_cortex_AvgMeanCurvature"
## [4] "L_insular_cortex_ComputeArea"
## [5] "L_insular_cortex_Volume"
## [6] "L_insular_cortex_ShapeIndex"
## [7] "L_insular_cortex_Curvedness"
## [8] "R_insular_cortex_AvgMeanCurvature"
## [9] "R_insular_cortex_ComputeArea"
## [10] "R_insular_cortex_Volume"
## [11] "R_insular_cortex_ShapeIndex"
## [12] "R_insular_cortex_Curvedness"
## [13] "L_cingulate_gyrus_AvgMeanCurvature"
## [14] "L_cingulate_gyrus_ComputeArea"
## [15] "L_cingulate_gyrus_Volume"
## [16] "L_cingulate_gyrus_ShapeIndex"
## [17] "L_cingulate_gyrus_Curvedness"
## [18] "R_cingulate_gyrus_AvgMeanCurvature"
## [19] "R_cingulate_gyrus_ComputeArea"
## [20] "R_cingulate_gyrus_Volume"
## [21] "R_cingulate_gyrus_ShapeIndex"
## [22] "R_cingulate_gyrus_Curvedness"
## [23] "L_caudate_AvgMeanCurvature"
## [24] "L_caudate_ComputeArea"
## [25] "L_caudate_Volume"
## [26] "L_caudate_ShapeIndex"
## [27] "L_caudate_Curvedness"
## [28] "R_caudate_AvgMeanCurvature"
## [29] "R_caudate_ComputeArea"
## [30] "R_caudate_Volume"
## [31] "R_caudate_ShapeIndex"
## [32] "R_caudate_Curvedness"
## [33] "L_putamen_AvgMeanCurvature"
## [34] "L_putamen_ComputeArea"
## [35] "L_putamen_Volume"
## [36] "L_putamen_ShapeIndex"
## [37] "L_putamen_Curvedness"
## [38] "R_putamen_AvgMeanCurvature"
## [39] "R_putamen_ComputeArea"
## [40] "R_putamen_Volume"
## [41] "R_putamen_ShapeIndex"
## [42] "R_putamen_Curvedness"
## [43] "L_hippocampus_AvgMeanCurvature"
## [44] "L_hippocampus_ComputeArea"
## [45] "L_hippocampus_Volume"
## [46] "L_hippocampus_ShapeIndex"
## [47] "L_hippocampus_Curvedness"
## [48] "R_hippocampus_AvgMeanCurvature"
## [49] "R_hippocampus_ComputeArea"
## [50] "R_hippocampus_Volume"
## [51] "R_hippocampus_ShapeIndex"
## [52] "R_hippocampus_Curvedness"
## [53] "L_fusiform_gyrus_AvgMeanCurvature"
## [54] "L_fusiform_gyrus_ComputeArea"
## [55] "L_fusiform_gyrus_Volume"
## [56] "L_fusiform_gyrus_ShapeIndex"
## [57] "L_fusiform_gyrus_Curvedness"
## [58] "R_fusiform_gyrus_AvgMeanCurvature"
## [59] "R_fusiform_gyrus_ComputeArea"
## [60] "R_fusiform_gyrus_Volume"
## [61] "R_fusiform_gyrus_ShapeIndex"
## [62] "R_fusiform_gyrus_Curvedness"
## [63] "Sex"
## [64] "Weight"
## [65] "ResearchGroup"
## [66] "VisitID"
## [67] "Age"
## [68] "chr12_rs34637584_GT"
## [69] "chr17_rs11868035_GT"
## [70] "chr17_rs11012_GT"
## [71] "chr17_rs393152_GT"
## [72] "chr17_rs12185268_GT"
## [73] "chr17_rs199533_GT"
## [74] "chr17_rs199533_DP"
## [75] "chr17_rs199533_GQ"
## [76] "UPDRS_Part_I_Summary_Score_Baseline"
## [77] "UPDRS_Part_I_Summary_Score_Month_03"
## [78] "UPDRS_Part_I_Summary_Score_Month_06"
## [79] "UPDRS_Part_I_Summary_Score_Month_09"
## [80] "UPDRS_Part_I_Summary_Score_Month_12"
## [81] "UPDRS_Part_I_Summary_Score_Month_18"
## [82] "UPDRS_Part_I_Summary_Score_Month_24"
## [83] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Baseline"
## [84] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_03"
## [85] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_06"
## [86] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_09"
## [87] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_12"
## [88] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_18"
## [89] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_24"
## [90] "UPDRS_Part_III_Summary_Score_Baseline"
## [91] "UPDRS_Part_III_Summary_Score_Month_03"
## [92] "UPDRS_Part_III_Summary_Score_Month_06"
## [93] "UPDRS_Part_III_Summary_Score_Month_09"
## [94] "UPDRS_Part_III_Summary_Score_Month_12"
## [95] "UPDRS_Part_III_Summary_Score_Month_18"
## [96] "UPDRS_Part_III_Summary_Score_Month_24"
## [97] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Baseline"
## [98] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_06"
## [99] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_12"
## [100] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_24"
## [101] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Baseline"
## [102] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_06"
## [103] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_12"
## [104] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_24"
## [105] "PD"
# ppmi_data.1<-ppmi_data[, c(3:281, 284, 287, 336:340, 341)]
n <- ncol(ppmi_data)
output.1 <- ppmi_data$PD
# remove Default Real Clinical subject classifications!
ppmi_data$PD <- ifelse(ppmi_data$ResearchGroup=="Control", 1, 0)
input <- ppmi_data[ , -which(names(ppmi_data) %in% c("ResearchGroup",
"PD", "X", "FID_IID"))]
# output <- as.matrix(ppmi_data[ , which(names(ppmi_data) %in% {"PD"})])
output <- as.factor(ppmi_data$PD)
c(dim(input), dim(output))
## [1] 422 101
#balance the dataset
data.1<-ubBalance(X= input, Y=output, type="ubSMOTE", percOver=300, percUnder=150, verbose=TRUE)
## Proportion of positives after ubSMOTE : 47.06 % of 1037 observations
# percOver = A number that drives the decision of how many extra cases from the minority class are #generated (known as over-sampling).
# k = A number indicating the number of nearest neighbors that are used to generate the new examples #of the minority class.
# percUnder = A number that drives the decision of how many extra cases from the majority classes are #selected for each case generated from the minority class (known as under-sampling)
balancedData<-cbind(data.1$X, data.1$Y)
table(data.1$Y)
##
## 0 1
## 549 488
nrow(data.1$X); ncol(data.1$X)
## [1] 1037
## [1] 101
nrow(balancedData); ncol(balancedData)
## [1] 1037
## [1] 102
nrow(input); ncol(input)
## [1] 422
## [1] 101
colnames(balancedData) <- c(colnames(input), "PD")
# check visually for differences between the distributions of the raw(input) and rebalanced data (for #only one variable, in this case)
qqplot(input[, 5], balancedData [, 5])
alpha.0.05 <- 0.05
test.results.bin <- NULL # binarized/dichotomized p-values
test.results.raw <- NULL # raw p-values
for (i in 1:(ncol(balancedData)-1))
{
test.results.raw[i]<-wilcox.test(input[, i], balancedData [, i])$p.value
test.results.bin [i] <- ifelse(test.results.raw [i] > alpha.0.05, 1, 0)
print(c("i=", i, "Wilcoxon-test=", test.results.raw [i]))
}
## [1] "i=" "1" "Wilcoxon-test="
## [4] "0.121916148645072"
## [1] "i=" "2" "Wilcoxon-test="
## [4] "0.0115555790092226"
## [1] "i=" "3" "Wilcoxon-test="
## [4] "0.00263706847152905"
## [1] "i=" "4" "Wilcoxon-test="
## [4] "0.0193825870829418"
## [1] "i=" "5" "Wilcoxon-test="
## [4] "0.465234458246381"
## [1] "i=" "6" "Wilcoxon-test="
## [4] "0.46498323084997"
## [1] "i=" "7" "Wilcoxon-test="
## [4] "0.0559803555615915"
## [1] "i=" "8" "Wilcoxon-test="
## [4] "0.0469706919902162"
## [1] "i=" "9" "Wilcoxon-test="
## [4] "0.0796428263747722"
## [1] "i=" "10" "Wilcoxon-test="
## [4] "0.867915750891381"
## [1] "i=" "11" "Wilcoxon-test="
## [4] "0.791763744004629"
## [1] "i=" "12" "Wilcoxon-test="
## [4] "0.252737493660177"
## [1] "i=" "13" "Wilcoxon-test="
## [4] "0.185613271988163"
## [1] "i=" "14" "Wilcoxon-test="
## [4] "0.921995769938857"
## [1] "i=" "15" "Wilcoxon-test="
## [4] "0.368163390166519"
## [1] "i=" "16" "Wilcoxon-test="
## [4] "0.871583919642137"
## [1] "i=" "17" "Wilcoxon-test="
## [4] "0.647092249431079"
## [1] "i=" "18" "Wilcoxon-test="
## [4] "0.409631033847553"
## [1] "i=" "19" "Wilcoxon-test="
## [4] "0.417301066910891"
## [1] "i=" "20" "Wilcoxon-test="
## [4] "0.749951942238898"
## [1] "i=" "21" "Wilcoxon-test="
## [4] "0.656276365210151"
## [1] "i=" "22" "Wilcoxon-test="
## [4] "0.15600349399911"
## [1] "i=" "23" "Wilcoxon-test="
## [4] "0.0486209830782161"
## [1] "i=" "24" "Wilcoxon-test="
## [4] "0.864465951024094"
## [1] "i=" "25" "Wilcoxon-test="
## [4] "0.657465243276207"
## [1] "i=" "26" "Wilcoxon-test="
## [4] "0.648471417036021"
## [1] "i=" "27" "Wilcoxon-test="
## [4] "0.19531148154071"
## [1] "i=" "28" "Wilcoxon-test="
## [4] "0.0594836689192933"
## [1] "i=" "29" "Wilcoxon-test="
## [4] "0.269303857585846"
## [1] "i=" "30" "Wilcoxon-test="
## [4] "0.422672530338941"
## [1] "i=" "31" "Wilcoxon-test="
## [4] "0.560402299892182"
## [1] "i=" "32" "Wilcoxon-test="
## [4] "0.0775475535039496"
## [1] "i=" "33" "Wilcoxon-test="
## [4] "0.0478438562128426"
## [1] "i=" "34" "Wilcoxon-test="
## [4] "0.859726704357985"
## [1] "i=" "35" "Wilcoxon-test="
## [4] "0.426408714781497"
## [1] "i=" "36" "Wilcoxon-test="
## [4] "0.853487254852529"
## [1] "i=" "37" "Wilcoxon-test="
## [4] "0.392114049970288"
## [1] "i=" "38" "Wilcoxon-test="
## [4] "0.156724308076248"
## [1] "i=" "39" "Wilcoxon-test="
## [4] "0.694130201824518"
## [1] "i=" "40" "Wilcoxon-test="
## [4] "0.723601415143246"
## [1] "i=" "41" "Wilcoxon-test="
## [4] "0.00619701387523487"
## [1] "i=" "42" "Wilcoxon-test="
## [4] "0.000286562753740492"
## [1] "i=" "43" "Wilcoxon-test="
## [4] "5.29983946602411e-05"
## [1] "i=" "44" "Wilcoxon-test="
## [4] "0.0707571581138932"
## [1] "i=" "45" "Wilcoxon-test="
## [4] "0.140737722531954"
## [1] "i=" "46" "Wilcoxon-test="
## [4] "0.0507074618738118"
## [1] "i=" "47" "Wilcoxon-test="
## [4] "0.020805101536845"
## [1] "i=" "48" "Wilcoxon-test="
## [4] "0.00484021037886222"
## [1] "i=" "49" "Wilcoxon-test="
## [4] "0.0273568640429652"
## [1] "i=" "50" "Wilcoxon-test="
## [4] "0.234512884956754"
## [1] "i=" "51" "Wilcoxon-test="
## [4] "0.0319316834536563"
## [1] "i=" "52" "Wilcoxon-test="
## [4] "8.47910363381624e-05"
## [1] "i=" "53" "Wilcoxon-test="
## [4] "0.000102471108880125"
## [1] "i=" "54" "Wilcoxon-test="
## [4] "0.706925544464841"
## [1] "i=" "55" "Wilcoxon-test="
## [4] "0.0146128471982993"
## [1] "i=" "56" "Wilcoxon-test="
## [4] "0.0723311086286497"
## [1] "i=" "57" "Wilcoxon-test="
## [4] "0.000213922368883682"
## [1] "i=" "58" "Wilcoxon-test="
## [4] "0.000142016596036652"
## [1] "i=" "59" "Wilcoxon-test="
## [4] "0.11681183091181"
## [1] "i=" "60" "Wilcoxon-test="
## [4] "0.0253973561846997"
## [1] "i=" "61" "Wilcoxon-test="
## [4] "0.506528137961723"
## [1] "i=" "62" "Wilcoxon-test="
## [4] "0.770981890446707"
## [1] "i=" "63" "Wilcoxon-test=" "NaN"
## [1] "i=" "64" "Wilcoxon-test="
## [4] "0.204494745121826"
## [1] "i=" "65" "Wilcoxon-test="
## [4] "0.438687802825894"
## [1] "i=" "66" "Wilcoxon-test="
## [4] "0.143044903491988"
## [1] "i=" "67" "Wilcoxon-test="
## [4] "0.208214945007494"
## [1] "i=" "68" "Wilcoxon-test="
## [4] "0.0827002916147158"
## [1] "i=" "69" "Wilcoxon-test="
## [4] "0.768385666605511"
## [1] "i=" "70" "Wilcoxon-test="
## [4] "0.967575952568832"
## [1] "i=" "71" "Wilcoxon-test="
## [4] "0.48546165207412"
## [1] "i=" "72" "Wilcoxon-test="
## [4] "0.764338124700727"
## [1] "i=" "73" "Wilcoxon-test="
## [4] "0.701672829552746"
## [1] "i=" "74" "Wilcoxon-test="
## [4] "0.182127870815918"
## [1] "i=" "75" "Wilcoxon-test="
## [4] "0.783908000869157"
## [1] "i=" "76" "Wilcoxon-test="
## [4] "0.0300275504051034"
## [1] "i=" "77" "Wilcoxon-test="
## [4] "0.0652937253636983"
## [1] "i=" "78" "Wilcoxon-test="
## [4] "0.886630291736222"
## [1] "i=" "79" "Wilcoxon-test="
## [4] "0.00304995330937243"
## [1] "i=" "80" "Wilcoxon-test="
## [4] "0.000193269737895467"
## [1] "i=" "81" "Wilcoxon-test="
## [4] "0.263594088910847"
## [1] "i=" "82" "Wilcoxon-test="
## [4] "0.55249462248568"
## [1] "i=" "83" "Wilcoxon-test="
## [4] "0.840180337999498"
## [1] "i=" "84" "Wilcoxon-test="
## [4] "1.018634517112e-07"
## [1] "i=" "85" "Wilcoxon-test="
## [4] "0.809284153154193"
## [1] "i=" "86" "Wilcoxon-test="
## [4] "0.000202012794771608"
## [1] "i=" "87" "Wilcoxon-test="
## [4] "0.000257390393155121"
## [1] "i=" "88" "Wilcoxon-test="
## [4] "0.151668005666288"
## [1] "i=" "89" "Wilcoxon-test="
## [4] "0.610548017567455"
## [1] "i=" "90" "Wilcoxon-test="
## [4] "0.177766076673501"
## [1] "i=" "91" "Wilcoxon-test="
## [4] "7.52724170827059e-05"
## [1] "i=" "92" "Wilcoxon-test="
## [4] "0.472935618407835"
## [1] "i=" "93" "Wilcoxon-test="
## [4] "0.000191281916588349"
## [1] "i=" "94" "Wilcoxon-test="
## [4] "0.710116019415709"
## [1] "i=" "95" "Wilcoxon-test="
## [4] "0.777249640877696"
## [1] "i=" "96" "Wilcoxon-test="
## [4] "0.394345301658289"
## [1] "i=" "97" "Wilcoxon-test="
## [4] "0.622722439182102"
## [1] "i=" "98" "Wilcoxon-test="
## [4] "0.333295057251737"
## [1] "i=" "99" "Wilcoxon-test="
## [4] "9.31683399292477e-05"
## [1] "i=" "100" "Wilcoxon-test="
## [4] "0.0108452571605789"
## [1] "i=" "101" "Wilcoxon-test="
## [4] "0.0715649084105231"
print(c("Wilcoxon test results: ", test.results.bin))
## [1] "Wilcoxon test results: " "1"
## [3] "0" "0"
## [5] "0" "1"
## [7] "1" "1"
## [9] "0" "1"
## [11] "1" "1"
## [13] "1" "1"
## [15] "1" "1"
## [17] "1" "1"
## [19] "1" "1"
## [21] "1" "1"
## [23] "1" "0"
## [25] "1" "1"
## [27] "1" "1"
## [29] "1" "1"
## [31] "1" "1"
## [33] "1" "0"
## [35] "1" "1"
## [37] "1" "1"
## [39] "1" "1"
## [41] "1" "0"
## [43] "0" "0"
## [45] "1" "1"
## [47] "1" "0"
## [49] "0" "0"
## [51] "1" "0"
## [53] "0" "0"
## [55] "1" "0"
## [57] "1" "0"
## [59] "0" "1"
## [61] "0" "1"
## [63] "1" NA
## [65] "1" "1"
## [67] "1" "1"
## [69] "1" "1"
## [71] "1" "1"
## [73] "1" "1"
## [75] "1" "1"
## [77] "0" "1"
## [79] "1" "0"
## [81] "0" "1"
## [83] "1" "1"
## [85] "0" "1"
## [87] "0" "0"
## [89] "1" "1"
## [91] "1" "0"
## [93] "1" "0"
## [95] "1" "1"
## [97] "1" "1"
## [99] "1" "0"
## [101] "0" "1"
test.results.corr <- stats::p.adjust(test.results.raw, method = "fdr", n = length(test.results.raw))
# where methods are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY","fdr", "none")
plot(test.results.raw, test.results.corr)
# zeros (0) are significant independent between-group T-test differences,ones (1) are insignificant
# Check the Differences between the rate of significance between the raw and FDR-corrected p-values
test.results.bin <- ifelse(test.results.raw > alpha.0.05, 1, 0)
table(test.results.bin)
## test.results.bin
## 0 1
## 29 71
test.results.corr.bin <- ifelse(test.results.corr > alpha.0.05, 1, 0)
table(test.results.corr.bin)
## test.results.corr.bin
## 0 1
## 17 83
#Right Skewed
N <- 10000
x <- rnbinom(N, 10, .5)
hist(x,
xlim=c(min(x), max(x)), probability=T, nclass=max(x)-min(x)+1,
col='lightblue', xlab=' ', ylab=' ', axes=F,
main='Right Skewed')
lines(density(x, bw=1), col='red', lwd=3)
#No Skew
N <- 10000
x <- rnorm(N, 0, 1)
hist(x, probability=T,
col='lightblue', xlab=' ', ylab=' ', axes=F,
main='No Skew')
lines(density(x, bw=0.4), col='red', lwd=3)
#Uniform density
x<-runif(1000, 1, 50)
hist(x, col='lightblue', main="Uniform Distribution", probability = T, xlab=
"", ylab="Density", axes=F)
abline(h=0.02, col='red', lwd=3)
#68-95-99.7 rule
x <- rnorm(N, 0, 1)
hist(x, probability=T,
col='lightblue', xlab=' ', ylab=' ', axes = F,
main='68-95-99.7 Rule')
lines(density(x, bw=0.4), col='red', lwd=3)
axis(1, at=c(-3, -2, -1, 0, 1, 2, 3), labels = expression(mu-3*sigma,
mu-2*sigma, mu-sigma, mu, mu+sigma, mu+2*sigma, mu+3*sigma))
abline(v=-1, lwd=3, lty=2)
abline(v=1, lwd=3, lty=2)
abline(v=-2, lwd=3, lty=2)
abline(v=2, lwd=3, lty=2)
abline(v=-3, lwd=3, lty=2)
abline(v=3, lwd=3, lty=2)
text(0, 0.2, "68%")
segments(-1, 0.2, -0.3, 0.2, col = 'red', lwd=2)
segments(1, 0.2, 0.3, 0.2, col = 'red', lwd=2)
text(0, 0.15, "95%")
segments(-2, 0.15, -0.3, 0.15, col = 'red', lwd=2)
segments(2, 0.15, 0.3, 0.15, col = 'red', lwd=2)
text(0, 0.1, "99.7%")
segments(-3, 0.1, -0.3, 0.1, col = 'red', lwd=2)
segments(3, 0.1, 0.3, 0.1, col = 'red', lwd=2)
Temp_Data <- as.data.frame(read.csv("https://umich.instructure.com/files/706163/download?download_frd=1", header=T, na.strings=c("", ".", "NA", "NR")))
summary(Temp_Data)
## Year Jan Feb Mar
## Min. :1900 Min. :11.40 Min. :14.00 Min. :24.70
## 1st Qu.:1929 1st Qu.:20.95 1st Qu.:22.35 1st Qu.:32.65
## Median :1958 Median :24.20 Median :26.10 Median :35.30
## Mean :1958 Mean :24.22 Mean :25.45 Mean :35.39
## 3rd Qu.:1986 3rd Qu.:27.65 3rd Qu.:28.90 3rd Qu.:38.15
## Max. :2015 Max. :35.00 Max. :35.60 Max. :50.70
## NA's :1 NA's :1 NA's :1
## Apr May Jun Jul
## Min. :37.50 Min. :49.40 Min. :61.90 Min. :67.80
## 1st Qu.:45.15 1st Qu.:56.60 1st Qu.:66.45 1st Qu.:71.10
## Median :47.60 Median :58.70 Median :68.40 Median :72.60
## Mean :47.55 Mean :58.85 Mean :68.22 Mean :72.66
## 3rd Qu.:50.15 3rd Qu.:61.65 3rd Qu.:70.50 3rd Qu.:73.70
## Max. :54.50 Max. :66.60 Max. :74.30 Max. :78.90
## NA's :1 NA's :1
## Aug Sep Oct Nov
## Min. :64.90 Min. :54.80 Min. :42.70 Min. :32.90
## 1st Qu.:68.85 1st Qu.:62.20 1st Qu.:50.20 1st Qu.:37.40
## Median :70.90 Median :63.80 Median :52.20 Median :39.80
## Mean :70.74 Mean :63.82 Mean :52.29 Mean :39.76
## 3rd Qu.:72.40 3rd Qu.:65.75 3rd Qu.:54.30 3rd Qu.:41.85
## Max. :76.20 Max. :68.80 Max. :62.10 Max. :47.50
## NA's :1 NA's :1 NA's :1 NA's :1
## Dec
## Min. :17.70
## 1st Qu.:25.55
## Median :28.40
## Mean :28.39
## 3rd Qu.:31.65
## Max. :36.80
## NA's :1
# View(Temp_Data); colnames(Temp_Data)
# Wide-to-Long transformation: reshape arguments include
# (1) list of variable names that define the different times or metrics (varying),
# (2) the name we wish to give the variable containing these values in our long dataset (v.names),
# (3) the name we wish to give the variable describing the different times or metrics (timevar),
# (4) the values this variable will have (times), and
# (5) the end format for the data (direction)
# Before reshaping make sure all data types are the same as putting them in 1 column will
# otherwise generate inconsistencies/errors
colN <- colnames(Temp_Data[,-1])
longTempData <- reshape(Temp_Data, varying = colN, v.names = "Temps", timevar="Months", times = colN, direction = "long")
# View(longTempData)
bar2 <- ggplot(longTempData, aes(x = Months, y = Temps, fill = Months)) +
geom_bar(stat = "identity")
print(bar2)
## Warning: Removed 10 rows containing missing values (position_stack).
bar3 <- ggplot(longTempData, aes(x = Year, y = Temps, fill = Months)) +
geom_bar(stat = "identity")
print(bar3)
## Warning: Removed 10 rows containing missing values (position_stack).
p <- ggplot(longTempData, aes(x=Year, y=as.integer(Temps), colour=Months)) +
geom_line()
p