Managing Data in R

R Markdown

water <- read.csv('https://umich.instructure.com/files/399172/download?download_frd=1', header=T)
water[1:3, ]

##   Year..string. WHO.region..string. Country..string.
## 1          1990              Africa          Algeria
## 2          1990              Africa           Angola
## 3          1990              Africa            Benin
##   Residence.Area.Type..string.
## 1                        Rural
## 2                        Rural
## 3                        Rural
##   Population.using.improved.drinking.water.sources......numeric.
## 1                                                             88
## 2                                                             42
## 3                                                             49
##   Population.using.improved.sanitation.facilities......numeric.
## 1                                                            77
## 2                                                             7
## 3                                                             0

colnames(water)<-c("year", "region", "country", "residence_area", "improved_water", "sanitation_facilities")
water[1:3, ]

##   year region country residence_area improved_water sanitation_facilities
## 1 1990 Africa Algeria          Rural             88                    77
## 2 1990 Africa  Angola          Rural             42                     7
## 3 1990 Africa   Benin          Rural             49                     0

which.max(water$year);

## [1] 913

# rowMeans(water[,5:6])
mean(water[,6], trim=0.08, na.rm=T)

## [1] 71.63629

Simulation <- read.csv("https://umich.instructure.com/files/354289/download?download_frd=1", header = FALSE)
Simulation[1:3, ]

##   V1 V2  V3    V4       V5  V6  V7   V8     V9    V10      V11     V12
## 1 ID i2 age treat homeless pcs mcs cesd indtot pss_fr drugrisk sexrisk
## 2  1  0  25     0        0  49   7   46     37      0        1       6
## 3  2 18  31     0        0  48  34   17     48      0        0      11
##       V13    V14       V15     V16
## 1 satreat female substance racegrp
## 2       0      0   cocaine   black
## 3       0      0   alcohol   white

str(water)

## 'data.frame':    3331 obs. of  6 variables:
##  $ year                 : int  1990 1990 1990 1990 1990 1990 1990 1990 1990 1990 ...
##  $ region               : Factor w/ 6 levels "Africa","Americas",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ country              : Factor w/ 192 levels "Afghanistan",..: 3 5 19 23 26 27 30 32 33 37 ...
##  $ residence_area       : Factor w/ 3 levels "Rural","Total",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ improved_water       : num  88 42 49 86 39 67 34 46 37 83 ...
##  $ sanitation_facilities: num  77 7 0 22 2 42 27 12 4 11 ...

summary(water$year)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1990    1995    2005    2002    2010    2012

summary(water[c("improved_water", "sanitation_facilities")])

##  improved_water  sanitation_facilities
##  Min.   :  3.0   Min.   :  0.00       
##  1st Qu.: 77.0   1st Qu.: 42.00       
##  Median : 93.0   Median : 81.00       
##  Mean   : 84.9   Mean   : 68.87       
##  3rd Qu.: 99.0   3rd Qu.: 97.00       
##  Max.   :100.0   Max.   :100.00       
##  NA's   :32      NA's   :135

plot(density(water$improved_water,na.rm = T))

vec1<-c(40, 56, 99)
mean(vec1)

## [1] 65

mean(c(40, 56, 99))

## [1] 65

median(vec1)

## [1] 56

median(c(40, 56, 99))

## [1] 56

library(psych)

geometric.mean(vec1, na.rm=TRUE)

## [1] 60.52866

range(water$year)

## [1] 1990 2012

diff(range(water$year))

## [1] 22

IQR(water$year)

## [1] 15

summary(water$improved_water)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     3.0    77.0    93.0    84.9    99.0   100.0      32

IQR(water$improved_water, na.rm = T)

## [1] 22

quantile(water$improved_water, na.rm = T)

##   0%  25%  50%  75% 100% 
##    3   77   93   99  100

quantile(water$improved_water, probs = c(0.2, 0.6), na.rm = T)

## 20% 60% 
##  71  97

quantile(water$improved_water, seq(from=0, to=1, by=0.2), na.rm = T)

##   0%  20%  40%  60%  80% 100% 
##    3   71   89   97  100  100

boxplot(water$improved_water, main="Boxplot for Percent improved_water", ylab="Percentage")

hist(water$improved_water, main = "Histogram of Percent improved_water", xlab="Percentage")

hist(water$sanitation_facilities, main = "Histogram of Percent sanitation_facilities", xlab = "Percentage")

x <- rnorm(100, 0, 1)
hist(x, probability=T, col='lightblue', xlab=' ', ylab=' ', axes=F, main='Normal Distribution')
lines(density(x, bw=0.4), col='red', lwd=3)

baseball<-read.table("https://umich.instructure.com/files/330381/download?download_frd=1", header=T)
hist(baseball$Weight, main = "Histogram for Baseball Player's Weight", xlab="weight")

hist(baseball$Height, main = "Histogram for Baseball Player's Height", xlab ="height")

mean(baseball$Weight)

## [1] 201.7166

mean(baseball$Height)

## [1] 73.69729

var(baseball$Weight)

## [1] 440.9913

sd(baseball$Weight)

## [1] 20.99979

var(baseball$Height)

## [1] 5.316798

sd(baseball$Height)

## [1] 2.305818

water <- read.csv('https://umich.instructure.com/files/399172/download?download_frd=1', header=T)
table(water$Year)

## 
## 1990 1995 2000 2005 2010 2012 
##  520  561  570  570  556  554

table(water$WHO.region)

## 
##                Africa              Americas Eastern Mediterranean 
##                   797                   613                   373 
##                Europe       South-East Asia       Western Pacific 
##                   910                   191                   447

table(water$Residence.Area)

## 
## Rural Total Urban 
##  1095  1109  1127

year_table<-table(water$Year..string.)
prop.table(year_table)

## 
##      1990      1995      2000      2005      2010      2012 
## 0.1561093 0.1684179 0.1711198 0.1711198 0.1669168 0.1663164

year_pct<-prop.table(year_table)*100
round(year_pct, digits=1)

## 
## 1990 1995 2000 2005 2010 2012 
## 15.6 16.8 17.1 17.1 16.7 16.6

library(gmodels)
water$africa<-water$WHO.region=="Africa"
table(water$africa)

## 
## FALSE  TRUE 
##  2534   797

CrossTable(x=water$Residence.Area, y=water$africa)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  3331 
## 
##  
##                      | water$africa 
## water$Residence.Area |     FALSE |      TRUE | Row Total | 
## ---------------------|-----------|-----------|-----------|
##                Rural |       828 |       267 |      1095 | 
##                      |     0.030 |     0.096 |           | 
##                      |     0.756 |     0.244 |     0.329 | 
##                      |     0.327 |     0.335 |           | 
##                      |     0.249 |     0.080 |           | 
## ---------------------|-----------|-----------|-----------|
##                Total |       845 |       264 |      1109 | 
##                      |     0.002 |     0.007 |           | 
##                      |     0.762 |     0.238 |     0.333 | 
##                      |     0.333 |     0.331 |           | 
##                      |     0.254 |     0.079 |           | 
## ---------------------|-----------|-----------|-----------|
##                Urban |       861 |       266 |      1127 | 
##                      |     0.016 |     0.050 |           | 
##                      |     0.764 |     0.236 |     0.338 | 
##                      |     0.340 |     0.334 |           | 
##                      |     0.258 |     0.080 |           | 
## ---------------------|-----------|-----------|-----------|
##         Column Total |      2534 |       797 |      3331 | 
##                      |     0.761 |     0.239 |           | 
## ---------------------|-----------|-----------|-----------|
## 
##

m1<-mean(water$Population.using.improved.drinking, na.rm = T)
m2<-mean(water$Population.using.improved.sanitation, na.rm = T)
water_imp<-water
for(i in 1:3331){
if(is.na(water_imp$Population.using.improved.drinking[i])){
water_imp$Population.using.improved.drinking[i]=m1
}
if(is.na(water_imp$Population.using.improved.sanitation[i])){
water_imp$Population.using.improved.sanitation=m2
}
}
summary(water_imp)

##  Year..string.             WHO.region..string.            Country..string.
##  Min.   :1990   Africa               :797      Albania            :  18   
##  1st Qu.:1995   Americas             :613      Algeria            :  18   
##  Median :2005   Eastern Mediterranean:373      Andorra            :  18   
##  Mean   :2002   Europe               :910      Angola             :  18   
##  3rd Qu.:2010   South-East Asia      :191      Antigua and Barbuda:  18   
##  Max.   :2012   Western Pacific      :447      Argentina          :  18   
##                                                (Other)            :3223   
##  Residence.Area.Type..string.
##  Rural:1095                  
##  Total:1109                  
##  Urban:1127                  
##                              
##                              
##                              
##                              
##  Population.using.improved.drinking.water.sources......numeric.
##  Min.   :  3.0                                                 
##  1st Qu.: 77.0                                                 
##  Median : 93.0                                                 
##  Mean   : 84.9                                                 
##  3rd Qu.: 99.0                                                 
##  Max.   :100.0                                                 
##  NA's   :32                                                    
##  Population.using.improved.sanitation.facilities......numeric.
##  Min.   :  0.00                                               
##  1st Qu.: 42.00                                               
##  Median : 81.00                                               
##  Mean   : 68.87                                               
##  3rd Qu.: 97.00                                               
##  Max.   :100.00                                               
##  NA's   :135                                                  
##    africa        Population.using.improved.sanitation
##  Mode :logical   Min.   :68.87                       
##  FALSE:2534      1st Qu.:68.87                       
##  TRUE :797       Median :68.87                       
##                  Mean   :68.87                       
##                  3rd Qu.:68.87                       
##                  Max.   :68.87                       
##                                                      
##  Population.using.improved.drinking
##  Min.   :  3.0                     
##  1st Qu.: 77.0                     
##  Median : 93.0                     
##  Mean   : 84.9                     
##  3rd Qu.: 99.0                     
##  Max.   :100.0                     
##

library(mi)

## Loading required package: Matrix

## Loading required package: stats4

## mi (Version 1.0, packaged: 2015-04-16 14:03:10 UTC; goodrich)

## mi  Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University

## This program comes with ABSOLUTELY NO WARRANTY.

## This is free software, and you are welcome to redistribute it

## under the General Public License version 2 or later.

## Execute RShowDoc('COPYING') for details.

mdf<-missing_data.frame(water)
head(mdf)

##   Year..string. WHO.region..string. Country..string.
## 1          1990              Africa          Algeria
## 2          1990              Africa           Angola
## 3          1990              Africa            Benin
## 4          1990              Africa         Botswana
## 5          1990              Africa     Burkina Faso
## 6          1990              Africa          Burundi
##   Residence.Area.Type..string.
## 1                        Rural
## 2                        Rural
## 3                        Rural
## 4                        Rural
## 5                        Rural
## 6                        Rural
##   Population.using.improved.drinking.water.sources......numeric.
## 1                                                             88
## 2                                                             42
## 3                                                             49
## 4                                                             86
## 5                                                             39
## 6                                                             67
##   Population.using.improved.sanitation.facilities......numeric. africa
## 1                                                            77   TRUE
## 2                                                             7   TRUE
## 3                                                             0   TRUE
## 4                                                            22   TRUE
## 5                                                             2   TRUE
## 6                                                            42   TRUE
##   missing_Population.using.improved.drinking.water.sources......numeric.
## 1                                                                  FALSE
## 2                                                                  FALSE
## 3                                                                  FALSE
## 4                                                                  FALSE
## 5                                                                  FALSE
## 6                                                                  FALSE
##   missing_Population.using.improved.sanitation.facilities......numeric.
## 1                                                                 FALSE
## 2                                                                 FALSE
## 3                                                                 FALSE
## 4                                                                 FALSE
## 5                                                                 FALSE
## 6                                                                 FALSE

show(mdf)

## Object of class missing_data.frame with 3331 observations on 7 variables
## 
## There are 3 missing data patterns
## 
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
## 
##                                                                                 type
## Year..string.                                                             continuous
## WHO.region..string.                                            unordered-categorical
## Country..string.                                               unordered-categorical
## Residence.Area.Type..string.                                   unordered-categorical
## Population.using.improved.drinking.water.sources......numeric.            continuous
## Population.using.improved.sanitation.facilities......numeric.             continuous
## africa                                                                        binary
##                                                                missing
## Year..string.                                                        0
## WHO.region..string.                                                  0
## Country..string.                                                     0
## Residence.Area.Type..string.                                         0
## Population.using.improved.drinking.water.sources......numeric.      32
## Population.using.improved.sanitation.facilities......numeric.      135
## africa                                                               0
##                                                                method
## Year..string.                                                    <NA>
## WHO.region..string.                                              <NA>
## Country..string.                                                 <NA>
## Residence.Area.Type..string.                                     <NA>
## Population.using.improved.drinking.water.sources......numeric.    ppd
## Population.using.improved.sanitation.facilities......numeric.     ppd
## africa                                                           <NA>
##                                                                 model
## Year..string.                                                    <NA>
## WHO.region..string.                                              <NA>
## Country..string.                                                 <NA>
## Residence.Area.Type..string.                                     <NA>
## Population.using.improved.drinking.water.sources......numeric. linear
## Population.using.improved.sanitation.facilities......numeric.  linear
## africa                                                           <NA>
## 
##                                                                  family
## Year..string.                                                      <NA>
## WHO.region..string.                                                <NA>
## Country..string.                                                   <NA>
## Residence.Area.Type..string.                                       <NA>
## Population.using.improved.drinking.water.sources......numeric. gaussian
## Population.using.improved.sanitation.facilities......numeric.  gaussian
## africa                                                             <NA>
##                                                                    link
## Year..string.                                                      <NA>
## WHO.region..string.                                                <NA>
## Country..string.                                                   <NA>
## Residence.Area.Type..string.                                       <NA>
## Population.using.improved.drinking.water.sources......numeric. identity
## Population.using.improved.sanitation.facilities......numeric.  identity
## africa                                                             <NA>
##                                                                transformation
## Year..string.                                                     standardize
## WHO.region..string.                                                      <NA>
## Country..string.                                                         <NA>
## Residence.Area.Type..string.                                             <NA>
## Population.using.improved.drinking.water.sources......numeric.    standardize
## Population.using.improved.sanitation.facilities......numeric.     standardize
## africa                                                                   <NA>

mdf<-change(mdf, y="Population.using.improved.drinking", what = "imputation_method", to="pmm")
mdf<-change(mdf, y="Population.using.improved.sanitation", what = "imputation_method", to="pmm")

imputations<-mi(mdf, n.iter=10, n.chains=3, verbose=T)

data.frames <- complete(imputations, 3)
summary(water)

##  Year..string.             WHO.region..string.            Country..string.
##  Min.   :1990   Africa               :797      Albania            :  18   
##  1st Qu.:1995   Americas             :613      Algeria            :  18   
##  Median :2005   Eastern Mediterranean:373      Andorra            :  18   
##  Mean   :2002   Europe               :910      Angola             :  18   
##  3rd Qu.:2010   South-East Asia      :191      Antigua and Barbuda:  18   
##  Max.   :2012   Western Pacific      :447      Argentina          :  18   
##                                                (Other)            :3223   
##  Residence.Area.Type..string.
##  Rural:1095                  
##  Total:1109                  
##  Urban:1127                  
##                              
##                              
##                              
##                              
##  Population.using.improved.drinking.water.sources......numeric.
##  Min.   :  3.0                                                 
##  1st Qu.: 77.0                                                 
##  Median : 93.0                                                 
##  Mean   : 84.9                                                 
##  3rd Qu.: 99.0                                                 
##  Max.   :100.0                                                 
##  NA's   :32                                                    
##  Population.using.improved.sanitation.facilities......numeric.
##  Min.   :  0.00                                               
##  1st Qu.: 42.00                                               
##  Median : 81.00                                               
##  Mean   : 68.87                                               
##  3rd Qu.: 97.00                                               
##  Max.   :100.00                                               
##  NA's   :135                                                  
##    africa       
##  Mode :logical  
##  FALSE:2534     
##  TRUE :797      
##                 
##                 
##                 
##

set.seed(123)
# create MCAR missing-data generator
create.missing <- function (data, pct.mis = 10)
{
n <- nrow(data)
J <- ncol(data)
if (length(pct.mis) == 1) {
if(pct.mis>= 0 & pct.mis <=100) {
n.mis <- rep((n * (pct.mis/100)), J)
}
else {
warning("Percent missing values should be an integer between
0 and 100! Exiting"); break
}
}
else {
if (length(pct.mis) < J)
stop("The length of the missing-vector is not equal to the numbe
r of columns in the data! Exiting!")
n.mis <- n * (pct.mis/100)
}
for (i in 1:ncol(data)) {
if (n.mis[i] == 0) { # if column has no missing do nothing.
data[, i] <- data[, i]
}
else {
data[sample(1:n, n.mis[i], replace = FALSE), i] <- NA
# For each given column (i), sample the row indices (1:n),
# a number of indices to replace as "missing", n.mis[i], "NA",
# without replacement
}
}
return(as.data.frame(data))
}

n <- 1000; u1 <- rbinom(n, 1, .5); v1<-log(rnorm(n, 5, 1)); x1 <- u1*exp(v1)
u2 <- rbinom(n, 1, .5); v2 <- log(rnorm(n, 5, 1)); x2 <- u2*exp(v2)
x3 <- rbinom(n,1,prob=0.45); x4<-ordered(rep(seq(1, 5), n)[sample(1:n, n)]);
x5 <- rep(letters[1:10], n)[sample(1:n, n)]; x6 <- trunc(runif(n, 1, 10));
x7 <- rnorm(n); x8 <- factor(rep(seq(1, 10), n)[sample(1:n, n)]);
x9 <- runif(n,0.1,0.99); x10 <- rpois(n, 4); y<-x1 + x2 + x7 + x9 + rnorm(n)

# package the simulated data as a data frame object
sim_data <- cbind.data.frame(y, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10)

# randomly create missing values
sim_data_30pct_missing <- create.missing(sim_data, pct.mis=30);
head(sim_data_30pct_missing); summary(sim_data_30pct_missing)

##            y       x1       x2 x3   x4   x5 x6         x7   x8        x9
## 1         NA 0.000000 0.000000  0 <NA>    d  6  1.2954432    4 0.5055421
## 2 10.4290124 4.006301       NA NA    2 <NA>  8 -0.6485997 <NA>        NA
## 3  0.6498472 0.000000 0.000000  0    2    c NA -1.0443625    4        NA
## 4 11.4870266 5.751061       NA  0    4 <NA>  2  0.1675763 <NA> 0.1308716
## 5         NA 3.490833       NA  0 <NA>    f NA  2.2337774    1 0.4483454
## 6  5.7667229       NA 4.384732  1    2    b  4 -0.6722819    5 0.8840504
##   x10
## 1   6
## 2  NA
## 3   5
## 4   3
## 5   5
## 6   3

##        y                x1              x2              x3        
##  Min.   :-3.886   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 2.340   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median : 5.438   Median :0.000   Median :2.582   Median :0.0000  
##  Mean   : 5.425   Mean   :2.493   Mean   :2.499   Mean   :0.4214  
##  3rd Qu.: 8.136   3rd Qu.:4.993   3rd Qu.:4.960   3rd Qu.:1.0000  
##  Max.   :16.526   Max.   :8.390   Max.   :8.421   Max.   :1.0000  
##  NA's   :300      NA's   :300     NA's   :300     NA's   :300     
##     x4            x5            x6              x7                 x8     
##  1   :136   d      : 77   Min.   :1.000   Min.   :-3.26215   5      : 76  
##  2   :143   f      : 73   1st Qu.:3.000   1st Qu.:-0.68042   2      : 74  
##  3   :146   h      : 73   Median :5.000   Median : 0.02927   6      : 73  
##  4   :137   i      : 72   Mean   :5.079   Mean   : 0.02138   1      : 71  
##  5   :138   j      : 72   3rd Qu.:7.000   3rd Qu.: 0.79094   4      : 70  
##  NA's:300   (Other):333   Max.   :9.000   Max.   : 3.71572   (Other):336  
##             NA's   :300   NA's   :300     NA's   :300        NA's   :300  
##        x9              x10        
##  Min.   :0.1010   Min.   : 0.000  
##  1st Qu.:0.3076   1st Qu.: 2.000  
##  Median :0.5100   Median : 4.000  
##  Mean   :0.5286   Mean   : 4.029  
##  3rd Qu.:0.7681   3rd Qu.: 5.000  
##  Max.   :0.9881   Max.   :11.000  
##  NA's   :300      NA's   :300

library("betareg"); library("mi")

# get show the missing information matrix
mdf <- missing_data.frame(sim_data_30pct_missing)
show(mdf)

## Object of class missing_data.frame with 1000 observations on 11 variables
## 
## There are 538 missing data patterns
## 
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
## 
##                      type missing method   model
## y              continuous     300    ppd  linear
## x1             continuous     300    ppd  linear
## x2             continuous     300    ppd  linear
## x3                 binary     300    ppd   logit
## x4    ordered-categorical     300    ppd  ologit
## x5  unordered-categorical     300    ppd  mlogit
## x6             continuous     300    ppd  linear
## x7             continuous     300    ppd  linear
## x8  unordered-categorical     300    ppd  mlogit
## x9             proportion     300    ppd betareg
## x10            continuous     300    ppd  linear
## 
##          family     link transformation
## y      gaussian identity    standardize
## x1     gaussian identity    standardize
## x2     gaussian identity    standardize
## x3     binomial    logit           <NA>
## x4  multinomial    logit           <NA>
## x5  multinomial    logit           <NA>
## x6     gaussian identity    standardize
## x7     gaussian identity    standardize
## x8  multinomial    logit           <NA>
## x9     binomial    logit       identity
## x10    gaussian identity    standardize

# mdf@patterns # to get the textual missing pattern
image(mdf) # remember the visual pattern of this MCAR

# Next, try to impute the missing values.
# Get the Graph Parameters (plotting canvas/margins)
# set to plot the histograms for the 3 imputation chains
# mfcol=c(nr, nc). Subsequent histograms are drawn as nr-by-nc arrays on
# the graphics device by columns (mfcol), or rows (mfrow)
# oma: oma=c(bottom, left, top, right) giving the size of the outer
# margins in lines of text
# mar=c(bottom, left, top, right) gives the number of lines of margin
# to be specified on the four sides of the plot.
# tcl=length of tick marks as a fraction of the height of a line of
# text (default=0.5)
par(mfcol=c(5, 5), oma=c(1, 1, 0, 0), mar=c(1, 1, 1, 0), tcl=-0.1,
mgp=c(0, 0, 0))
imputations <- mi(sim_data_30pct_missing, n.iter=5, n.chains=3,verbose=TRUE)
hist(imputations)

# Extracts several multiply imputed data.frames from "imputations" object
data.frames <- complete(imputations, 3)

# Compare the summaries for the original data (prior to introducing missing
# values) with missing data and the re-completed data following imputation
summary(sim_data)

##        y                x1              x2              x3        x4     
##  Min.   :-3.886   Min.   :0.000   Min.   :0.000   Min.   :0.000   1:200  
##  1st Qu.: 2.579   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.000   2:200  
##  Median : 5.540   Median :0.000   Median :2.687   Median :0.000   3:200  
##  Mean   : 5.558   Mean   :2.472   Mean   :2.516   Mean   :0.431   4:200  
##  3rd Qu.: 8.278   3rd Qu.:4.996   3rd Qu.:5.007   3rd Qu.:1.000   5:200  
##  Max.   :16.526   Max.   :8.390   Max.   :8.421   Max.   :1.000          
##                                                                          
##        x5            x6              x7                  x8     
##  a      :100   Min.   :1.000   Min.   :-3.289376   1      :100  
##  b      :100   1st Qu.:3.000   1st Qu.:-0.705687   2      :100  
##  c      :100   Median :5.000   Median : 0.020011   3      :100  
##  d      :100   Mean   :5.021   Mean   : 0.002811   4      :100  
##  e      :100   3rd Qu.:7.000   3rd Qu.: 0.761157   5      :100  
##  f      :100   Max.   :9.000   Max.   : 3.715721   6      :100  
##  (Other):400                                       (Other):400  
##        x9              x10        
##  Min.   :0.1010   Min.   : 0.000  
##  1st Qu.:0.3106   1st Qu.: 3.000  
##  Median :0.5229   Median : 4.000  
##  Mean   :0.5350   Mean   : 3.951  
##  3rd Qu.:0.7705   3rd Qu.: 5.000  
##  Max.   :0.9881   Max.   :11.000  
##

summary(sim_data_30pct_missing)

##        y                x1              x2              x3        
##  Min.   :-3.886   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.: 2.340   1st Qu.:0.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median : 5.438   Median :0.000   Median :2.582   Median :0.0000  
##  Mean   : 5.425   Mean   :2.493   Mean   :2.499   Mean   :0.4214  
##  3rd Qu.: 8.136   3rd Qu.:4.993   3rd Qu.:4.960   3rd Qu.:1.0000  
##  Max.   :16.526   Max.   :8.390   Max.   :8.421   Max.   :1.0000  
##  NA's   :300      NA's   :300     NA's   :300     NA's   :300     
##     x4            x5            x6              x7                 x8     
##  1   :136   d      : 77   Min.   :1.000   Min.   :-3.26215   5      : 76  
##  2   :143   f      : 73   1st Qu.:3.000   1st Qu.:-0.68042   2      : 74  
##  3   :146   h      : 73   Median :5.000   Median : 0.02927   6      : 73  
##  4   :137   i      : 72   Mean   :5.079   Mean   : 0.02138   1      : 71  
##  5   :138   j      : 72   3rd Qu.:7.000   3rd Qu.: 0.79094   4      : 70  
##  NA's:300   (Other):333   Max.   :9.000   Max.   : 3.71572   (Other):336  
##             NA's   :300   NA's   :300     NA's   :300        NA's   :300  
##        x9              x10        
##  Min.   :0.1010   Min.   : 0.000  
##  1st Qu.:0.3076   1st Qu.: 2.000  
##  Median :0.5100   Median : 4.000  
##  Mean   :0.5286   Mean   : 4.029  
##  3rd Qu.:0.7681   3rd Qu.: 5.000  
##  Max.   :0.9881   Max.   :11.000  
##  NA's   :300      NA's   :300

summary(data.frames[[1]])

##        y                x1               x2         x3      x4     
##  Min.   :-3.886   Min.   :-7.238   Min.   :-4.927   0:567   1:193  
##  1st Qu.: 2.340   1st Qu.: 0.000   1st Qu.: 0.000   1:433   2:208  
##  Median : 5.561   Median : 2.591   Median : 2.129           3:214  
##  Mean   : 5.538   Mean   : 2.526   Mean   : 2.403           4:199  
##  3rd Qu.: 8.301   3rd Qu.: 4.905   3rd Qu.: 4.777           5:186  
##  Max.   :18.305   Max.   : 9.462   Max.   :10.276                  
##                                                                    
##        x5            x6               x7                 x8     
##  b      :111   Min.   :-4.152   Min.   :-3.42597   1      :116  
##  j      :107   1st Qu.: 3.000   1st Qu.:-0.70742   8      :109  
##  i      :105   Median : 5.000   Median : 0.02580   2      :108  
##  h      :102   Mean   : 5.074   Mean   : 0.01439   5      :103  
##  c      :101   3rd Qu.: 7.000   3rd Qu.: 0.80369   4      :102  
##  g      :101   Max.   :13.181   Max.   : 3.71572   10     :102  
##  (Other):373                                       (Other):360  
##        x9                 x10         missing_y       missing_x1     
##  Min.   :0.0009401   Min.   :-3.157   Mode :logical   Mode :logical  
##  1st Qu.:0.3191679   1st Qu.: 2.527   FALSE:700       FALSE:700      
##  Median :0.5173536   Median : 4.000   TRUE :300       TRUE :300      
##  Mean   :0.5315164   Mean   : 4.057                                  
##  3rd Qu.:0.7626689   3rd Qu.: 5.060                                  
##  Max.   :0.9995029   Max.   :11.075                                  
##                                                                      
##  missing_x2      missing_x3      missing_x4      missing_x5     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x6      missing_x7      missing_x8      missing_x9     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x10    
##  Mode :logical  
##  FALSE:700      
##  TRUE :300      
##                 
##                 
##                 
##

lapply(data.frames, summary)

## $`chain:1`
##        y                x1               x2         x3      x4     
##  Min.   :-3.886   Min.   :-7.238   Min.   :-4.927   0:567   1:193  
##  1st Qu.: 2.340   1st Qu.: 0.000   1st Qu.: 0.000   1:433   2:208  
##  Median : 5.561   Median : 2.591   Median : 2.129           3:214  
##  Mean   : 5.538   Mean   : 2.526   Mean   : 2.403           4:199  
##  3rd Qu.: 8.301   3rd Qu.: 4.905   3rd Qu.: 4.777           5:186  
##  Max.   :18.305   Max.   : 9.462   Max.   :10.276                  
##                                                                    
##        x5            x6               x7                 x8     
##  b      :111   Min.   :-4.152   Min.   :-3.42597   1      :116  
##  j      :107   1st Qu.: 3.000   1st Qu.:-0.70742   8      :109  
##  i      :105   Median : 5.000   Median : 0.02580   2      :108  
##  h      :102   Mean   : 5.074   Mean   : 0.01439   5      :103  
##  c      :101   3rd Qu.: 7.000   3rd Qu.: 0.80369   4      :102  
##  g      :101   Max.   :13.181   Max.   : 3.71572   10     :102  
##  (Other):373                                       (Other):360  
##        x9                 x10         missing_y       missing_x1     
##  Min.   :0.0009401   Min.   :-3.157   Mode :logical   Mode :logical  
##  1st Qu.:0.3191679   1st Qu.: 2.527   FALSE:700       FALSE:700      
##  Median :0.5173536   Median : 4.000   TRUE :300       TRUE :300      
##  Mean   :0.5315164   Mean   : 4.057                                  
##  3rd Qu.:0.7626689   3rd Qu.: 5.060                                  
##  Max.   :0.9995029   Max.   :11.075                                  
##                                                                      
##  missing_x2      missing_x3      missing_x4      missing_x5     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x6      missing_x7      missing_x8      missing_x9     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x10    
##  Mode :logical  
##  FALSE:700      
##  TRUE :300      
##                 
##                 
##                 
##                 
## 
## $`chain:2`
##        y                x1               x2         x3      x4     
##  Min.   :-7.378   Min.   :-4.901   Min.   :-6.681   0:583   1:201  
##  1st Qu.: 2.539   1st Qu.: 0.000   1st Qu.: 0.000   1:417   2:204  
##  Median : 5.573   Median : 2.707   Median : 1.883           3:202  
##  Mean   : 5.521   Mean   : 2.550   Mean   : 2.418           4:179  
##  3rd Qu.: 8.278   3rd Qu.: 4.885   3rd Qu.: 4.808           5:214  
##  Max.   :16.526   Max.   : 9.053   Max.   : 9.752                  
##                                                                    
##        x5            x6               x7                  x8     
##  d      :116   Min.   :-4.332   Min.   :-3.262149   7      :109  
##  i      :113   1st Qu.: 3.000   1st Qu.:-0.671050   5      :108  
##  b      :111   Median : 5.000   Median : 0.009235   2      :105  
##  f      :101   Mean   : 5.051   Mean   : 0.025900   6      :104  
##  h      : 99   3rd Qu.: 7.000   3rd Qu.: 0.764619   10     :103  
##  g      : 98   Max.   :11.495   Max.   : 3.932833   4      :102  
##  (Other):362                                        (Other):369  
##        x9               x10         missing_y       missing_x1     
##  Min.   :0.01094   Min.   :-1.287   Mode :logical   Mode :logical  
##  1st Qu.:0.31848   1st Qu.: 2.537   FALSE:700       FALSE:700      
##  Median :0.52444   Median : 4.000   TRUE :300       TRUE :300      
##  Mean   :0.53242   Mean   : 4.007                                  
##  3rd Qu.:0.76260   3rd Qu.: 5.000                                  
##  Max.   :0.98809   Max.   :11.703                                  
##                                                                    
##  missing_x2      missing_x3      missing_x4      missing_x5     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x6      missing_x7      missing_x8      missing_x9     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x10    
##  Mode :logical  
##  FALSE:700      
##  TRUE :300      
##                 
##                 
##                 
##                 
## 
## $`chain:3`
##        y                x1               x2         x3      x4     
##  Min.   :-3.886   Min.   :-4.260   Min.   :-5.877   0:577   1:203  
##  1st Qu.: 2.463   1st Qu.: 0.000   1st Qu.: 0.000   1:423   2:215  
##  Median : 5.604   Median : 2.513   Median : 2.452           3:197  
##  Mean   : 5.571   Mean   : 2.483   Mean   : 2.474           4:192  
##  3rd Qu.: 8.266   3rd Qu.: 4.881   3rd Qu.: 4.817           5:193  
##  Max.   :16.526   Max.   : 8.835   Max.   :12.073                  
##                                                                    
##        x5            x6               x7                 x8     
##  d      :114   Min.   :-2.000   Min.   :-3.26215   10     :110  
##  h      :106   1st Qu.: 3.000   1st Qu.:-0.70352   5      :108  
##  i      :105   Median : 5.000   Median : 0.02426   2      :107  
##  j      :105   Mean   : 5.064   Mean   : 0.01301   7      :107  
##  b      :101   3rd Qu.: 7.000   3rd Qu.: 0.79544   4      :106  
##  a      :100   Max.   :11.754   Max.   : 3.71572   9      :101  
##  (Other):369                                       (Other):361  
##        x9                x10         missing_y       missing_x1     
##  Min.   :0.006127   Min.   :-1.865   Mode :logical   Mode :logical  
##  1st Qu.:0.324624   1st Qu.: 2.634   FALSE:700       FALSE:700      
##  Median :0.540627   Median : 4.000   TRUE :300       TRUE :300      
##  Mean   :0.538723   Mean   : 4.020                                  
##  3rd Qu.:0.768122   3rd Qu.: 5.000                                  
##  Max.   :0.996987   Max.   :11.299                                  
##                                                                     
##  missing_x2      missing_x3      missing_x4      missing_x5     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x6      missing_x7      missing_x8      missing_x9     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x10    
##  Mode :logical  
##  FALSE:700      
##  TRUE :300      
##                 
##                 
##                 
##

round(mipply(imputations, mean, to.matrix = TRUE), 3)

##             chain:1 chain:2 chain:3
## y             0.015   0.012   0.019
## x1            0.006   0.011  -0.002
## x2           -0.019  -0.016  -0.005
## x3            1.433   1.417   1.423
## x4            2.977   3.001   2.957
## x5            5.587   5.546   5.553
## x6           -0.001  -0.005  -0.003
## x7           -0.003   0.002  -0.004
## x8            5.431   5.520   5.564
## x9            0.532   0.532   0.539
## x10           0.007  -0.005  -0.002
## missing_y     0.300   0.300   0.300
## missing_x1    0.300   0.300   0.300
## missing_x2    0.300   0.300   0.300
## missing_x3    0.300   0.300   0.300
## missing_x4    0.300   0.300   0.300
## missing_x5    0.300   0.300   0.300
## missing_x6    0.300   0.300   0.300
## missing_x7    0.300   0.300   0.300
## missing_x8    0.300   0.300   0.300
## missing_x9    0.300   0.300   0.300
## missing_x10   0.300   0.300   0.300

Rhats(imputations, statistic = "moments")

##    mean_y   mean_x1   mean_x2   mean_x3   mean_x4   mean_x5   mean_x6 
## 1.2046635 1.1826790 1.6873019 1.1366042 0.9100276 1.0741083 1.1754183 
##   mean_x7   mean_x8   mean_x9  mean_x10      sd_y     sd_x1     sd_x2 
## 1.1584357 1.1582553 0.9866131 1.1507648 1.1433457 1.1623447 1.0019965 
##     sd_x3     sd_x4     sd_x5     sd_x6     sd_x7     sd_x8     sd_x9 
## 1.1338756 0.9760816 1.1198121 0.9810286 1.3298025 0.9287844 1.5285230 
##    sd_x10 
## 1.1078134

# assess the convergence of MI algorithm

plot(imputations);hist(imputations);image(imputations);summary(imputations)

## $y
## $y$is_missing
## missing
## FALSE  TRUE 
##   700   300 
## 
## $y$imputed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.64194 -0.36281  0.06273  0.05068  0.41826  1.65190 
## 
## $y$observed
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.194077 -0.395670  0.001664  0.000000  0.347725  1.423707 
## 
## 
## $x1
## $x1$is_missing
## missing
## FALSE  TRUE 
##   700   300 
## 
## $x1$imputed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.87556 -0.31922  0.02436  0.01698  0.35155  1.34305 
## 
## $x1$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4805 -0.4805 -0.4805  0.0000  0.4817  1.1365 
## 
## 
## $x2
## $x2$is_missing
## missing
## FALSE  TRUE 
##   700   300 
## 
## $x2$imputed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.77847 -0.39314 -0.06708 -0.04383  0.32694  1.85462 
## 
## $x2$observed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.48420 -0.48420  0.01595  0.00000  0.47667  1.14717 
## 
## 
## $x3
## $x3$crosstab
##    
##     observed imputed
##   0     1215     512
##   1      885     388
## 
## 
## $x4
## $x4$crosstab
##    
##     observed imputed
##   1      408     189
##   2      429     198
##   3      438     175
##   4      411     159
##   5      414     179
## 
## 
## $x5
## $x5$crosstab
##    
##     observed imputed
##   a      186      80
##   b      210     113
##   c      195      89
##   d      231      99
##   e      198      81
##   f      219      68
##   g      210      86
##   h      219      88
##   i      216     107
##   j      216      89
## 
## 
## $x6
## $x6$is_missing
## missing
## FALSE  TRUE 
##   700   300 
## 
## $x6$imputed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.87057 -0.38528 -0.00860 -0.01031  0.33648  1.61064 
## 
## $x6$observed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.81073 -0.41317 -0.01562  0.00000  0.38194  0.77949 
## 
## 
## $x7
## $x7$is_missing
## missing
## FALSE  TRUE 
##   700   300 
## 
## $x7$imputed
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.654793 -0.354776 -0.014823 -0.005786  0.362309  1.877568 
## 
## $x7$observed
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -1.576156 -0.336879  0.003784  0.000000  0.369404  1.773350 
## 
## 
## $x8
## $x8$crosstab
##     
##      observed imputed
##   1       213      91
##   2       222      98
##   3       186      64
##   4       210     100
##   5       228      91
##   6       219      80
##   7       210      97
##   8       207      84
##   9       198      87
##   10      207     108
## 
## 
## $x9
## $x9$is_missing
## missing
## FALSE  TRUE 
##   700   300 
## 
## $x9$imputed
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0009401 0.3599191 0.5675957 0.5472855 0.7452422 0.9995029 
## 
## $x9$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.1010  0.3077  0.5100  0.5286  0.7681  0.9881 
## 
## 
## $x10
## $x10$is_missing
## missing
## FALSE  TRUE 
##   700   300 
## 
## $x10$imputed
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -1.7503436 -0.3498453  0.0038875 -0.0002989  0.3462585  1.8692424 
## 
## $x10$observed
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.981263 -0.494111 -0.006959  0.000000  0.236617  1.698072

model_results<-pool(y~x1+x2+x3+x4+x5+x6+x7+x8+x9+x10,data=imputations, m=3)
display (model_results); summary (model_results)

## bayesglm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + 
##     x9 + x10, data = imputations, m = 3)
##             coef.est coef.se
## (Intercept) -0.35     0.43  
## x1           0.95     0.03  
## x2           0.96     0.02  
## x31          0.05     0.27  
## x4.L         0.07     0.32  
## x4.Q         0.02     0.27  
## x4.C        -0.12     0.18  
## x4^4         0.02     0.32  
## x5b         -0.11     0.53  
## x5c          0.21     0.23  
## x5d          0.19     0.35  
## x5e          0.51     0.58  
## x5f          0.66     0.60  
## x5g          0.27     0.49  
## x5h          0.22     0.82  
## x5i          0.39     0.70  
## x5j          0.04     0.33  
## x6           0.03     0.03  
## x7           0.82     0.12  
## x82         -0.07     0.55  
## x83          0.05     0.24  
## x84          0.47     0.78  
## x85         -0.13     0.25  
## x86         -0.44     0.48  
## x87         -0.43     0.68  
## x88          0.17     0.35  
## x89         -0.30     0.36  
## x810        -0.38     0.33  
## x9           1.17     0.23  
## x10          0.07     0.06  
## n = 970, k = 30
## residual deviance = 2093.7, null deviance = 15450.0 (difference = 13356.3)
## overdispersion parameter = 2.2
## residual sd is sqrt(overdispersion) = 1.47

## 
## Call:
## pool(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7 + x8 + x9 + 
##     x10, data = imputations, m = 3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5892  -0.7474   0.0207   0.7162   3.2188  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.34936    0.43477  -0.804  0.44813    
## x1           0.94668    0.03208  29.512 3.15e-06 ***
## x2           0.95623    0.01889  50.610  < 2e-16 ***
## x31          0.05135    0.26727   0.192  0.86162    
## x4.L         0.06670    0.32472   0.205  0.85301    
## x4.Q         0.02008    0.26772   0.075  0.94526    
## x4.C        -0.11598    0.18421  -0.630  0.56013    
## x4^4         0.02178    0.32147   0.068  0.95110    
## x5b         -0.10707    0.52572  -0.204  0.85226    
## x5c          0.20986    0.22503   0.933  0.35126    
## x5d          0.19091    0.34655   0.551  0.60385    
## x5e          0.51065    0.57785   0.884  0.44819    
## x5f          0.65886    0.59638   1.105  0.35924    
## x5g          0.26945    0.48518   0.555  0.61587    
## x5h          0.22178    0.81526   0.272  0.80859    
## x5i          0.39250    0.69651   0.564  0.62147    
## x5j          0.04330    0.32543   0.133  0.89806    
## x6           0.02586    0.02853   0.907  0.39886    
## x7           0.82225    0.11628   7.071  0.00735 ** 
## x82         -0.07311    0.55473  -0.132  0.90448    
## x83          0.04611    0.24080   0.191  0.84848    
## x84          0.46605    0.77985   0.598  0.60470    
## x85         -0.12774    0.24709  -0.517  0.60974    
## x86         -0.43783    0.48245  -0.908  0.42977    
## x87         -0.42548    0.68379  -0.622  0.58818    
## x88          0.16671    0.35027   0.476  0.65365    
## x89         -0.30466    0.36069  -0.845  0.43845    
## x810        -0.37982    0.32686  -1.162  0.29179    
## x9           1.16823    0.22810   5.122 5.54e-05 ***
## x10          0.06603    0.05853   1.128  0.34750    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2.158413)
## 
##     Null deviance: 15450.0  on 999  degrees of freedom
## Residual deviance:  2093.7  on 970  degrees of freedom
## AIC: 3638.2
## 
## Number of Fisher Scoring iterations: 7

# Report the summaries of the imputations
data.frames <- complete(imputations, 3) # extract the first 3 chains
lapply(data.frames, summary)

## $`chain:1`
##        y                x1               x2         x3      x4     
##  Min.   :-3.886   Min.   :-7.238   Min.   :-4.927   0:567   1:193  
##  1st Qu.: 2.340   1st Qu.: 0.000   1st Qu.: 0.000   1:433   2:208  
##  Median : 5.561   Median : 2.591   Median : 2.129           3:214  
##  Mean   : 5.538   Mean   : 2.526   Mean   : 2.403           4:199  
##  3rd Qu.: 8.301   3rd Qu.: 4.905   3rd Qu.: 4.777           5:186  
##  Max.   :18.305   Max.   : 9.462   Max.   :10.276                  
##                                                                    
##        x5            x6               x7                 x8     
##  b      :111   Min.   :-4.152   Min.   :-3.42597   1      :116  
##  j      :107   1st Qu.: 3.000   1st Qu.:-0.70742   8      :109  
##  i      :105   Median : 5.000   Median : 0.02580   2      :108  
##  h      :102   Mean   : 5.074   Mean   : 0.01439   5      :103  
##  c      :101   3rd Qu.: 7.000   3rd Qu.: 0.80369   4      :102  
##  g      :101   Max.   :13.181   Max.   : 3.71572   10     :102  
##  (Other):373                                       (Other):360  
##        x9                 x10         missing_y       missing_x1     
##  Min.   :0.0009401   Min.   :-3.157   Mode :logical   Mode :logical  
##  1st Qu.:0.3191679   1st Qu.: 2.527   FALSE:700       FALSE:700      
##  Median :0.5173536   Median : 4.000   TRUE :300       TRUE :300      
##  Mean   :0.5315164   Mean   : 4.057                                  
##  3rd Qu.:0.7626689   3rd Qu.: 5.060                                  
##  Max.   :0.9995029   Max.   :11.075                                  
##                                                                      
##  missing_x2      missing_x3      missing_x4      missing_x5     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x6      missing_x7      missing_x8      missing_x9     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x10    
##  Mode :logical  
##  FALSE:700      
##  TRUE :300      
##                 
##                 
##                 
##                 
## 
## $`chain:2`
##        y                x1               x2         x3      x4     
##  Min.   :-7.378   Min.   :-4.901   Min.   :-6.681   0:583   1:201  
##  1st Qu.: 2.539   1st Qu.: 0.000   1st Qu.: 0.000   1:417   2:204  
##  Median : 5.573   Median : 2.707   Median : 1.883           3:202  
##  Mean   : 5.521   Mean   : 2.550   Mean   : 2.418           4:179  
##  3rd Qu.: 8.278   3rd Qu.: 4.885   3rd Qu.: 4.808           5:214  
##  Max.   :16.526   Max.   : 9.053   Max.   : 9.752                  
##                                                                    
##        x5            x6               x7                  x8     
##  d      :116   Min.   :-4.332   Min.   :-3.262149   7      :109  
##  i      :113   1st Qu.: 3.000   1st Qu.:-0.671050   5      :108  
##  b      :111   Median : 5.000   Median : 0.009235   2      :105  
##  f      :101   Mean   : 5.051   Mean   : 0.025900   6      :104  
##  h      : 99   3rd Qu.: 7.000   3rd Qu.: 0.764619   10     :103  
##  g      : 98   Max.   :11.495   Max.   : 3.932833   4      :102  
##  (Other):362                                        (Other):369  
##        x9               x10         missing_y       missing_x1     
##  Min.   :0.01094   Min.   :-1.287   Mode :logical   Mode :logical  
##  1st Qu.:0.31848   1st Qu.: 2.537   FALSE:700       FALSE:700      
##  Median :0.52444   Median : 4.000   TRUE :300       TRUE :300      
##  Mean   :0.53242   Mean   : 4.007                                  
##  3rd Qu.:0.76260   3rd Qu.: 5.000                                  
##  Max.   :0.98809   Max.   :11.703                                  
##                                                                    
##  missing_x2      missing_x3      missing_x4      missing_x5     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x6      missing_x7      missing_x8      missing_x9     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x10    
##  Mode :logical  
##  FALSE:700      
##  TRUE :300      
##                 
##                 
##                 
##                 
## 
## $`chain:3`
##        y                x1               x2         x3      x4     
##  Min.   :-3.886   Min.   :-4.260   Min.   :-5.877   0:577   1:203  
##  1st Qu.: 2.463   1st Qu.: 0.000   1st Qu.: 0.000   1:423   2:215  
##  Median : 5.604   Median : 2.513   Median : 2.452           3:197  
##  Mean   : 5.571   Mean   : 2.483   Mean   : 2.474           4:192  
##  3rd Qu.: 8.266   3rd Qu.: 4.881   3rd Qu.: 4.817           5:193  
##  Max.   :16.526   Max.   : 8.835   Max.   :12.073                  
##                                                                    
##        x5            x6               x7                 x8     
##  d      :114   Min.   :-2.000   Min.   :-3.26215   10     :110  
##  h      :106   1st Qu.: 3.000   1st Qu.:-0.70352   5      :108  
##  i      :105   Median : 5.000   Median : 0.02426   2      :107  
##  j      :105   Mean   : 5.064   Mean   : 0.01301   7      :107  
##  b      :101   3rd Qu.: 7.000   3rd Qu.: 0.79544   4      :106  
##  a      :100   Max.   :11.754   Max.   : 3.71572   9      :101  
##  (Other):369                                       (Other):361  
##        x9                x10         missing_y       missing_x1     
##  Min.   :0.006127   Min.   :-1.865   Mode :logical   Mode :logical  
##  1st Qu.:0.324624   1st Qu.: 2.634   FALSE:700       FALSE:700      
##  Median :0.540627   Median : 4.000   TRUE :300       TRUE :300      
##  Mean   :0.538723   Mean   : 4.020                                  
##  3rd Qu.:0.768122   3rd Qu.: 5.000                                  
##  Max.   :0.996987   Max.   :11.299                                  
##                                                                     
##  missing_x2      missing_x3      missing_x4      missing_x5     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x6      missing_x7      missing_x8      missing_x9     
##  Mode :logical   Mode :logical   Mode :logical   Mode :logical  
##  FALSE:700       FALSE:700       FALSE:700       FALSE:700      
##  TRUE :300       TRUE :300       TRUE :300       TRUE :300      
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##  missing_x10    
##  Mode :logical  
##  FALSE:700      
##  TRUE :300      
##                 
##                 
##                 
##

coef(summary(model_results))[, 1:2] # get the model coef's and their SE's

##                Estimate Std. Error
## (Intercept) -0.34935528 0.43476723
## x1           0.94667669 0.03207716
## x2           0.95623011 0.01889411
## x31          0.05134643 0.26726832
## x4.L         0.06669926 0.32471930
## x4.Q         0.02008243 0.26771613
## x4.C        -0.11597568 0.18421013
## x4^4         0.02178467 0.32147253
## x5b         -0.10706720 0.52571850
## x5c          0.20986479 0.22503305
## x5d          0.19091395 0.34655472
## x5e          0.51065296 0.57784542
## x5f          0.65886317 0.59638279
## x5g          0.26944791 0.48517637
## x5h          0.22178212 0.81526130
## x5i          0.39250366 0.69651000
## x5j          0.04329837 0.32542720
## x6           0.02586243 0.02852896
## x7           0.82225007 0.11627861
## x82         -0.07310774 0.55473221
## x83          0.04611270 0.24080026
## x84          0.46605214 0.77984682
## x85         -0.12774269 0.24708639
## x86         -0.43783114 0.48245267
## x87         -0.42547522 0.68379391
## x88          0.16671493 0.35026743
## x89         -0.30466469 0.36068715
## x810        -0.37981975 0.32686287
## x9           1.16823126 0.22809937
## x10          0.06603382 0.05852858

library("lattice")
densityplot(y ~ x1 + x2, data=imputations)

TBI Data Example

# Load the (raw) data from the table into a plain text file "08_EpiBioSData_Incomplete.csv"
TBI_Data <- read.csv("https://umich.instructure.com/files/720782/download?download_frd=1",
                     na.strings=c("", ".", "NA"))
summary(TBI_Data)

##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs      er.gcs          icu.gcs         worst.gcs   
##  Min.   : 3   Min.   : 3.000   Min.   : 0.000   Min.   : 0.0  
##  1st Qu.: 3   1st Qu.: 4.000   1st Qu.: 3.000   1st Qu.: 3.0  
##  Median : 7   Median : 7.500   Median : 6.000   Median : 3.0  
##  Mean   : 8   Mean   : 8.182   Mean   : 6.378   Mean   : 5.4  
##  3rd Qu.:12   3rd Qu.:12.250   3rd Qu.: 8.000   3rd Qu.: 7.0  
##  Max.   :15   Max.   :15.000   Max.   :14.000   Max.   :14.0  
##  NA's   :2    NA's   :2        NA's   :1        NA's   :1     
##     X6m.gose       X2013.gose       skull.fx       temp.injury   
##  Min.   :2.000   Min.   :2.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:3.000   1st Qu.:5.000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :5.000   Median :7.000   Median :1.0000   Median :1.000  
##  Mean   :4.805   Mean   :5.804   Mean   :0.6087   Mean   :0.587  
##  3rd Qu.:6.000   3rd Qu.:7.000   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :8.000   Max.   :8.000   Max.   :1.0000   Max.   :1.000  
##  NA's   :5                                                       
##     surgery         spikes.hr           min.hr           max.hr       
##  Min.   :0.0000   Min.   :  1.280   Min.   : 0.000   Min.   :  12.00  
##  1st Qu.:0.0000   1st Qu.:  5.357   1st Qu.: 0.000   1st Qu.:  35.25  
##  Median :1.0000   Median : 18.170   Median : 0.000   Median :  97.50  
##  Mean   :0.6304   Mean   : 52.872   Mean   : 3.571   Mean   : 241.89  
##  3rd Qu.:1.0000   3rd Qu.: 57.227   3rd Qu.: 0.000   3rd Qu.: 312.75  
##  Max.   :1.0000   Max.   :294.000   Max.   :42.000   Max.   :1199.00  
##                   NA's   :18        NA's   :18       NA's   :18       
##     acute.sz         late.sz          ever.sz     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :1.0000   Median :1.000  
##  Mean   :0.1739   Mean   :0.5652   Mean   :0.587  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##

# Get information matrix of the data
library("betareg"); library("mi")
mdf <- missing_data.frame(TBI_Data) # warnings about missingness patterns

## NOTE: The following pairs of variables appear to have the same missingness pattern.
##  Please verify whether they are in fact logically distinct variables.
##      [,1]      [,2]       
## [1,] "icu.gcs" "worst.gcs"

show(mdf); mdf@patterns; image(mdf)

## Object of class missing_data.frame with 46 observations on 19 variables
## 
## There are 7 missing data patterns
## 
## Append '@patterns' to this missing_data.frame to access the corresponding pattern for every observation or perhaps use table()
## 
##                              type missing method  model
## id                     irrelevant       0   <NA>   <NA>
## age                    continuous       0   <NA>   <NA>
## sex                        binary       0   <NA>   <NA>
## mechanism   unordered-categorical       0   <NA>   <NA>
## field.gcs              continuous       2    ppd linear
## er.gcs                 continuous       2    ppd linear
## icu.gcs                continuous       1    ppd linear
## worst.gcs              continuous       1    ppd linear
## X6m.gose               continuous       5    ppd linear
## X2013.gose             continuous       0   <NA>   <NA>
## skull.fx                   binary       0   <NA>   <NA>
## temp.injury                binary       0   <NA>   <NA>
## surgery                    binary       0   <NA>   <NA>
## spikes.hr              continuous      18    ppd linear
## min.hr                 continuous      18    ppd linear
## max.hr                 continuous      18    ppd linear
## acute.sz                   binary       0   <NA>   <NA>
## late.sz                    binary       0   <NA>   <NA>
## ever.sz                    binary       0   <NA>   <NA>
## 
##               family     link transformation
## id              <NA>     <NA>           <NA>
## age             <NA>     <NA>    standardize
## sex             <NA>     <NA>           <NA>
## mechanism       <NA>     <NA>           <NA>
## field.gcs   gaussian identity    standardize
## er.gcs      gaussian identity    standardize
## icu.gcs     gaussian identity    standardize
## worst.gcs   gaussian identity    standardize
## X6m.gose    gaussian identity    standardize
## X2013.gose      <NA>     <NA>    standardize
## skull.fx        <NA>     <NA>           <NA>
## temp.injury     <NA>     <NA>           <NA>
## surgery         <NA>     <NA>           <NA>
## spikes.hr   gaussian identity    standardize
## min.hr      gaussian identity    standardize
## max.hr      gaussian identity    standardize
## acute.sz        <NA>     <NA>           <NA>
## late.sz         <NA>     <NA>           <NA>
## ever.sz         <NA>     <NA>           <NA>

##  [1] spikes.hr, min.hr, max.hr                      
##  [2] field.gcs                                      
##  [3] nothing                                        
##  [4] nothing                                        
##  [5] nothing                                        
##  [6] nothing                                        
##  [7] spikes.hr, min.hr, max.hr                      
##  [8] nothing                                        
##  [9] nothing                                        
## [10] nothing                                        
## [11] nothing                                        
## [12] nothing                                        
## [13] spikes.hr, min.hr, max.hr                      
## [14] nothing                                        
## [15] spikes.hr, min.hr, max.hr                      
## [16] spikes.hr, min.hr, max.hr                      
## [17] nothing                                        
## [18] spikes.hr, min.hr, max.hr                      
## [19] spikes.hr, min.hr, max.hr                      
## [20] spikes.hr, min.hr, max.hr                      
## [21] X6m.gose, spikes.hr, min.hr, max.hr            
## [22] nothing                                        
## [23] spikes.hr, min.hr, max.hr                      
## [24] spikes.hr, min.hr, max.hr                      
## [25] spikes.hr, min.hr, max.hr                      
## [26] spikes.hr, min.hr, max.hr                      
## [27] spikes.hr, min.hr, max.hr                      
## [28] X6m.gose                                       
## [29] spikes.hr, min.hr, max.hr                      
## [30] nothing                                        
## [31] X6m.gose, spikes.hr, min.hr, max.hr            
## [32] spikes.hr, min.hr, max.hr                      
## [33] nothing                                        
## [34] nothing                                        
## [35] nothing                                        
## [36] nothing                                        
## [37] field.gcs, er.gcs, icu.gcs, worst.gcs, X6m.gose
## [38] er.gcs                                         
## [39] nothing                                        
## [40] nothing                                        
## [41] nothing                                        
## [42] spikes.hr, min.hr, max.hr                      
## [43] nothing                                        
## [44] nothing                                        
## [45] nothing                                        
## [46] X6m.gose                                       
## 7 Levels: nothing field.gcs X6m.gose er.gcs ... field.gcs, er.gcs, icu.gcs, worst.gcs, X6m.gose

# mi::change() method changes the family imputation method,
# size, type, and so forth of a missing variable. It's called
# before calling mi to affect how the conditional expectation of each
# missing variable is modeled.
mdf <- change(mdf, y = "spikes.hr", what = "transformation", to="identity")

summary(mdf); hist(mdf);

##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs      er.gcs          icu.gcs         worst.gcs   
##  Min.   : 3   Min.   : 3.000   Min.   : 0.000   Min.   : 0.0  
##  1st Qu.: 3   1st Qu.: 4.000   1st Qu.: 3.000   1st Qu.: 3.0  
##  Median : 7   Median : 7.500   Median : 6.000   Median : 3.0  
##  Mean   : 8   Mean   : 8.182   Mean   : 6.378   Mean   : 5.4  
##  3rd Qu.:12   3rd Qu.:12.250   3rd Qu.: 8.000   3rd Qu.: 7.0  
##  Max.   :15   Max.   :15.000   Max.   :14.000   Max.   :14.0  
##  NA's   :2    NA's   :2        NA's   :1        NA's   :1     
##     X6m.gose       X2013.gose       skull.fx       temp.injury   
##  Min.   :2.000   Min.   :2.000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:3.000   1st Qu.:5.000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :5.000   Median :7.000   Median :1.0000   Median :1.000  
##  Mean   :4.805   Mean   :5.804   Mean   :0.6087   Mean   :0.587  
##  3rd Qu.:6.000   3rd Qu.:7.000   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :8.000   Max.   :8.000   Max.   :1.0000   Max.   :1.000  
##  NA's   :5                                                       
##     surgery         spikes.hr           min.hr           max.hr       
##  Min.   :0.0000   Min.   :  1.280   Min.   : 0.000   Min.   :  12.00  
##  1st Qu.:0.0000   1st Qu.:  5.357   1st Qu.: 0.000   1st Qu.:  35.25  
##  Median :1.0000   Median : 18.170   Median : 0.000   Median :  97.50  
##  Mean   :0.6304   Mean   : 52.872   Mean   : 3.571   Mean   : 241.89  
##  3rd Qu.:1.0000   3rd Qu.: 57.227   3rd Qu.: 0.000   3rd Qu.: 312.75  
##  Max.   :1.0000   Max.   :294.000   Max.   :42.000   Max.   :1199.00  
##                   NA's   :18        NA's   :18       NA's   :18       
##     acute.sz         late.sz          ever.sz     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :1.0000   Median :1.000  
##  Mean   :0.1739   Mean   :0.5652   Mean   :0.587  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.000  
##

image(mdf)

imputations <- mi(mdf, n.iter=10, n.chains=5, verbose=TRUE)
hist(imputations)

data.frames <- complete(imputations, 5)

lapply(data.frames, summary)

## $`chain:1`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs          er.gcs          icu.gcs         worst.gcs    
##  Min.   : 3.000   Min.   : 3.000   Min.   : 0.000   Min.   : 0.00  
##  1st Qu.: 3.250   1st Qu.: 4.250   1st Qu.: 3.000   1st Qu.: 3.00  
##  Median : 7.000   Median : 8.000   Median : 6.000   Median : 3.00  
##  Mean   : 8.368   Mean   : 8.403   Mean   : 6.403   Mean   : 5.49  
##  3rd Qu.:12.750   3rd Qu.:12.750   3rd Qu.: 7.886   3rd Qu.: 7.75  
##  Max.   :19.259   Max.   :16.382   Max.   :14.000   Max.   :14.00  
##                                                                    
##     X6m.gose        X2013.gose    skull.fx temp.injury surgery
##  Min.   :-0.477   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.: 3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median : 5.000   Median :7.000                               
##  Mean   : 4.682   Mean   :5.804                               
##  3rd Qu.: 6.000   3rd Qu.:7.000                               
##  Max.   : 8.000   Max.   :8.000                               
##                                                               
##    spikes.hr            min.hr            max.hr        acute.sz late.sz
##  Min.   :-137.889   Min.   :-36.411   Min.   :-528.98   0:38     0:20   
##  1st Qu.:   3.805   1st Qu.:  0.000   1st Qu.:  16.25   1: 8     1:26   
##  Median :  28.125   Median :  0.000   Median :  84.00                   
##  Mean   :  38.266   Mean   :  1.741   Mean   : 188.99                   
##  3rd Qu.:  62.502   3rd Qu.:  3.750   3rd Qu.: 363.00                   
##  Max.   : 294.000   Max.   : 42.000   Max.   :1199.00                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## $`chain:2`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs           er.gcs         icu.gcs         worst.gcs     
##  Min.   :-0.8877   Min.   : 3.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.0000   1st Qu.: 4.25   1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 6.5000   Median : 8.00   Median : 6.000   Median : 3.000  
##  Mean   : 7.6670   Mean   : 8.56   Mean   : 6.303   Mean   : 5.452  
##  3rd Qu.:12.0000   3rd Qu.:13.00   3rd Qu.: 7.750   3rd Qu.: 7.607  
##  Max.   :15.0000   Max.   :18.75   Max.   :14.000   Max.   :14.000  
##                                                                     
##     X6m.gose       X2013.gose    skull.fx temp.injury surgery
##  Min.   :2.000   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.:3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median :5.000   Median :7.000                               
##  Mean   :4.904   Mean   :5.804                               
##  3rd Qu.:6.253   3rd Qu.:7.000                               
##  Max.   :8.000   Max.   :8.000                               
##                                                              
##    spikes.hr            min.hr            max.hr        acute.sz late.sz
##  Min.   :-219.254   Min.   :-19.972   Min.   :-924.66   0:38     0:20   
##  1st Qu.:   3.953   1st Qu.:  0.000   1st Qu.:  28.75   1: 8     1:26   
##  Median :  33.490   Median :  0.000   Median : 150.00                   
##  Mean   :  73.034   Mean   :  3.770   Mean   : 255.84                   
##  3rd Qu.: 147.970   3rd Qu.:  6.278   3rd Qu.: 468.00                   
##  Max.   : 421.748   Max.   : 42.000   Max.   :1318.33                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## $`chain:3`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs          er.gcs          icu.gcs        worst.gcs     
##  Min.   : 3.000   Min.   : 3.000   Min.   : 0.00   Min.   :-3.824  
##  1st Qu.: 3.250   1st Qu.: 4.250   1st Qu.: 3.00   1st Qu.: 3.000  
##  Median : 7.000   Median : 8.000   Median : 6.00   Median : 3.000  
##  Mean   : 8.122   Mean   : 8.525   Mean   : 6.30   Mean   : 5.199  
##  3rd Qu.:12.000   3rd Qu.:13.000   3rd Qu.: 7.75   3rd Qu.: 7.000  
##  Max.   :16.690   Max.   :16.607   Max.   :14.00   Max.   :14.000  
##                                                                    
##     X6m.gose          X2013.gose    skull.fx temp.injury surgery
##  Min.   :-0.02415   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.: 3.00000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median : 5.00000   Median :7.000                               
##  Mean   : 4.53409   Mean   :5.804                               
##  3rd Qu.: 6.00000   3rd Qu.:7.000                               
##  Max.   : 8.00000   Max.   :8.000                               
##                                                                 
##    spikes.hr            min.hr            max.hr         acute.sz late.sz
##  Min.   :-281.747   Min.   :-33.445   Min.   :-1099.01   0:38     0:20   
##  1st Qu.:   4.577   1st Qu.:  0.000   1st Qu.:   33.75   1: 8     1:26   
##  Median :  29.403   Median :  0.000   Median :  163.03                   
##  Mean   :  51.345   Mean   :  3.239   Mean   :  252.11                   
##  3rd Qu.: 108.989   3rd Qu.:  6.773   3rd Qu.:  518.85                   
##  Max.   : 294.000   Max.   : 42.000   Max.   : 1199.00                   
##                                                                          
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## $`chain:4`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs           er.gcs          icu.gcs        worst.gcs    
##  Min.   : 0.8178   Min.   : 2.728   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 3.0000   1st Qu.: 4.000   1st Qu.: 3.00   1st Qu.: 3.00  
##  Median : 7.0000   Median : 7.000   Median : 6.00   Median : 3.00  
##  Mean   : 8.0500   Mean   : 8.016   Mean   : 6.43   Mean   : 5.48  
##  3rd Qu.:12.0000   3rd Qu.:12.000   3rd Qu.: 8.00   3rd Qu.: 7.75  
##  Max.   :17.4818   Max.   :15.000   Max.   :14.00   Max.   :14.00  
##                                                                    
##     X6m.gose       X2013.gose    skull.fx temp.injury surgery
##  Min.   :0.493   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.:3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median :5.000   Median :7.000                               
##  Mean   :4.604   Mean   :5.804                               
##  3rd Qu.:6.000   3rd Qu.:7.000                               
##  Max.   :8.000   Max.   :8.000                               
##                                                              
##    spikes.hr            min.hr            max.hr        acute.sz late.sz
##  Min.   :-124.622   Min.   :-37.474   Min.   :-417.67   0:38     0:20   
##  1st Qu.:   3.805   1st Qu.:  0.000   1st Qu.:  28.75   1: 8     1:26   
##  Median :  28.125   Median :  0.000   Median : 116.00                   
##  Mean   :  54.517   Mean   :  6.677   Mean   : 230.91                   
##  3rd Qu.:  95.099   3rd Qu.: 11.369   3rd Qu.: 363.00                   
##  Max.   : 294.000   Max.   : 64.397   Max.   :1199.00                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## $`chain:5`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs          er.gcs           icu.gcs         worst.gcs     
##  Min.   : 3.000   Min.   : 0.7236   Min.   :-4.872   Min.   : 0.000  
##  1st Qu.: 3.250   1st Qu.: 3.2500   1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 7.000   Median : 7.0000   Median : 6.000   Median : 3.000  
##  Mean   : 7.937   Mean   : 7.8938   Mean   : 6.133   Mean   : 5.398  
##  3rd Qu.:12.000   3rd Qu.:12.0000   3rd Qu.: 7.750   3rd Qu.: 7.000  
##  Max.   :15.000   Max.   :15.0000   Max.   :14.000   Max.   :14.000  
##                                                                      
##     X6m.gose       X2013.gose    skull.fx temp.injury surgery
##  Min.   :2.000   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.:3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median :5.000   Median :7.000                               
##  Mean   :4.653   Mean   :5.804                               
##  3rd Qu.:6.000   3rd Qu.:7.000                               
##  Max.   :8.000   Max.   :8.000                               
##                                                              
##    spikes.hr            min.hr            max.hr        acute.sz late.sz
##  Min.   :-135.298   Min.   :-38.620   Min.   :-867.26   0:38     0:20   
##  1st Qu.:   3.953   1st Qu.:  0.000   1st Qu.:  28.75   1: 8     1:26   
##  Median :  12.012   Median :  0.000   Median : 117.08                   
##  Mean   :  36.725   Mean   :  3.011   Mean   : 187.49                   
##  3rd Qu.:  62.502   3rd Qu.:  7.103   3rd Qu.: 338.25                   
##  Max.   : 294.000   Max.   : 42.000   Max.   :1199.00                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##

# (not necessary, but may be useful)
indx <- sapply(data.frames[[5]], is.numeric) # get the indices of numeric columns
data.frames[[5]][indx] <- lapply(data.frames[[5]][indx], function(x)
as.numeric(as.integer(x)))
# cast each value as integer data.frames[[5]]$spikes.hr

library("mi")
# lm.mi(); glm.mi(); polr.mi(); bayesglm.mi(); bayespolr.mi(); lmer.mi(); glmer.mi()

# Also see Step above
##linear regression for each imputed data set - 5 regression models are fit
fit_lm1 <- glm(ever.sz ~ surgery + worst.gcs + factor(sex) + age, data.frames$`chain:1`, family = "binomial"); summary(fit_lm1); display(fit_lm1)

## 
## Call:
## glm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + age, 
##     family = "binomial", data = data.frames$`chain:1`)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7010  -1.2148   0.8212   1.0009   1.3884  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)      0.255915   1.357554   0.189    0.850
## surgery1         0.950585   0.685826   1.386    0.166
## worst.gcs       -0.069793   0.098192  -0.711    0.477
## factor(sex)Male -0.330161   0.842880  -0.392    0.695
## age              0.004406   0.019433   0.227    0.821
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.371  on 45  degrees of freedom
## Residual deviance: 60.033  on 41  degrees of freedom
## AIC: 70.033
## 
## Number of Fisher Scoring iterations: 4

## glm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + age, 
##     family = "binomial", data = data.frames$`chain:1`)
##                 coef.est coef.se
## (Intercept)      0.26     1.36  
## surgery1         0.95     0.69  
## worst.gcs       -0.07     0.10  
## factor(sex)Male -0.33     0.84  
## age              0.00     0.02  
## ---
##   n = 46, k = 5
##   residual deviance = 60.0, null deviance = 62.4 (difference = 2.3)

# (estimates over MI chains)
model_results <- pool(ever.sz ~ surgery + worst.gcs + factor(sex) + age,
family = "binomial", data=imputations, m=5)
display (model_results); summary (model_results)

## bayesglm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + 
##     age, data = imputations, m = 5, family = "binomial")
##                 coef.est coef.se
## (Intercept)      0.37     1.27  
## surgery1         0.89     0.65  
## worst.gcs       -0.08     0.10  
## factor(sex)Male -0.31     0.76  
## age              0.00     0.02  
## n = 41, k = 5
## residual deviance = 59.7, null deviance = 62.4 (difference = 2.7)

## 
## Call:
## pool(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + 
##     age, data = imputations, m = 5, family = "binomial")
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6668  -1.2037   0.8545   1.0150   1.3635  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)      0.365188   1.272203   0.287    0.774
## surgery1         0.887991   0.651889   1.362    0.173
## worst.gcs       -0.075537   0.096347  -0.784    0.433
## factor(sex)Male -0.311468   0.761916  -0.409    0.683
## age              0.002654   0.018246   0.145    0.884
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.371  on 45  degrees of freedom
## Residual deviance: 59.683  on 41  degrees of freedom
## AIC: 69.683
## 
## Number of Fisher Scoring iterations: 6.2

data.frames <- complete(imputations, 3) # extract the first 3 chains
lapply(data.frames, summary)

## $`chain:1`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs          er.gcs          icu.gcs         worst.gcs    
##  Min.   : 3.000   Min.   : 3.000   Min.   : 0.000   Min.   : 0.00  
##  1st Qu.: 3.250   1st Qu.: 4.250   1st Qu.: 3.000   1st Qu.: 3.00  
##  Median : 7.000   Median : 8.000   Median : 6.000   Median : 3.00  
##  Mean   : 8.368   Mean   : 8.403   Mean   : 6.403   Mean   : 5.49  
##  3rd Qu.:12.750   3rd Qu.:12.750   3rd Qu.: 7.886   3rd Qu.: 7.75  
##  Max.   :19.259   Max.   :16.382   Max.   :14.000   Max.   :14.00  
##                                                                    
##     X6m.gose        X2013.gose    skull.fx temp.injury surgery
##  Min.   :-0.477   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.: 3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median : 5.000   Median :7.000                               
##  Mean   : 4.682   Mean   :5.804                               
##  3rd Qu.: 6.000   3rd Qu.:7.000                               
##  Max.   : 8.000   Max.   :8.000                               
##                                                               
##    spikes.hr            min.hr            max.hr        acute.sz late.sz
##  Min.   :-137.889   Min.   :-36.411   Min.   :-528.98   0:38     0:20   
##  1st Qu.:   3.805   1st Qu.:  0.000   1st Qu.:  16.25   1: 8     1:26   
##  Median :  28.125   Median :  0.000   Median :  84.00                   
##  Mean   :  38.266   Mean   :  1.741   Mean   : 188.99                   
##  3rd Qu.:  62.502   3rd Qu.:  3.750   3rd Qu.: 363.00                   
##  Max.   : 294.000   Max.   : 42.000   Max.   :1199.00                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## $`chain:2`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs           er.gcs         icu.gcs         worst.gcs     
##  Min.   :-0.8877   Min.   : 3.00   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.0000   1st Qu.: 4.25   1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 6.5000   Median : 8.00   Median : 6.000   Median : 3.000  
##  Mean   : 7.6670   Mean   : 8.56   Mean   : 6.303   Mean   : 5.452  
##  3rd Qu.:12.0000   3rd Qu.:13.00   3rd Qu.: 7.750   3rd Qu.: 7.607  
##  Max.   :15.0000   Max.   :18.75   Max.   :14.000   Max.   :14.000  
##                                                                     
##     X6m.gose       X2013.gose    skull.fx temp.injury surgery
##  Min.   :2.000   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.:3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median :5.000   Median :7.000                               
##  Mean   :4.904   Mean   :5.804                               
##  3rd Qu.:6.253   3rd Qu.:7.000                               
##  Max.   :8.000   Max.   :8.000                               
##                                                              
##    spikes.hr            min.hr            max.hr        acute.sz late.sz
##  Min.   :-219.254   Min.   :-19.972   Min.   :-924.66   0:38     0:20   
##  1st Qu.:   3.953   1st Qu.:  0.000   1st Qu.:  28.75   1: 8     1:26   
##  Median :  33.490   Median :  0.000   Median : 150.00                   
##  Mean   :  73.034   Mean   :  3.770   Mean   : 255.84                   
##  3rd Qu.: 147.970   3rd Qu.:  6.278   3rd Qu.: 468.00                   
##  Max.   : 421.748   Max.   : 42.000   Max.   :1318.33                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## $`chain:3`
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs          er.gcs          icu.gcs        worst.gcs     
##  Min.   : 3.000   Min.   : 3.000   Min.   : 0.00   Min.   :-3.824  
##  1st Qu.: 3.250   1st Qu.: 4.250   1st Qu.: 3.00   1st Qu.: 3.000  
##  Median : 7.000   Median : 8.000   Median : 6.00   Median : 3.000  
##  Mean   : 8.122   Mean   : 8.525   Mean   : 6.30   Mean   : 5.199  
##  3rd Qu.:12.000   3rd Qu.:13.000   3rd Qu.: 7.75   3rd Qu.: 7.000  
##  Max.   :16.690   Max.   :16.607   Max.   :14.00   Max.   :14.000  
##                                                                    
##     X6m.gose          X2013.gose    skull.fx temp.injury surgery
##  Min.   :-0.02415   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.: 3.00000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median : 5.00000   Median :7.000                               
##  Mean   : 4.53409   Mean   :5.804                               
##  3rd Qu.: 6.00000   3rd Qu.:7.000                               
##  Max.   : 8.00000   Max.   :8.000                               
##                                                                 
##    spikes.hr            min.hr            max.hr         acute.sz late.sz
##  Min.   :-281.747   Min.   :-33.445   Min.   :-1099.01   0:38     0:20   
##  1st Qu.:   4.577   1st Qu.:  0.000   1st Qu.:   33.75   1: 8     1:26   
##  Median :  29.403   Median :  0.000   Median :  163.03                   
##  Mean   :  51.345   Mean   :  3.239   Mean   :  252.11                   
##  3rd Qu.: 108.989   3rd Qu.:  6.773   3rd Qu.:  518.85                   
##  Max.   : 294.000   Max.   : 42.000   Max.   : 1199.00                   
##                                                                          
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##

# should be similar for each of the k chains (in this case k=5).
# mipply is wrapper for sapply invoked on mi-class objects to compute the col means
round(mipply(imputations, mean, to.matrix = TRUE), 3)

##                   chain:1 chain:2 chain:3 chain:4 chain:5
## id                 23.500  23.500  23.500  23.500  23.500
## age                 0.000   0.000   0.000   0.000   0.000
## sex                 1.804   1.804   1.804   1.804   1.804
## mechanism           4.261   4.261   4.261   4.261   4.261
## field.gcs           0.041  -0.037   0.014   0.006  -0.007
## er.gcs              0.026   0.045   0.041  -0.020  -0.035
## icu.gcs             0.004  -0.012  -0.012   0.008  -0.038
## worst.gcs           0.013   0.008  -0.030   0.012   0.000
## X6m.gose           -0.035   0.028  -0.077  -0.057  -0.044
## X2013.gose          0.000   0.000   0.000   0.000   0.000
## skull.fx            1.609   1.609   1.609   1.609   1.609
## temp.injury         1.587   1.587   1.587   1.587   1.587
## surgery             1.630   1.630   1.630   1.630   1.630
## spikes.hr          38.266  73.034  51.345  54.517  36.725
## min.hr             -0.094   0.010  -0.017   0.159  -0.029
## max.hr             -0.087   0.023   0.017  -0.018  -0.090
## acute.sz            1.174   1.174   1.174   1.174   1.174
## late.sz             1.565   1.565   1.565   1.565   1.565
## ever.sz             1.587   1.587   1.587   1.587   1.587
## missing_field.gcs   0.043   0.043   0.043   0.043   0.043
## missing_er.gcs      0.043   0.043   0.043   0.043   0.043
## missing_icu.gcs     0.022   0.022   0.022   0.022   0.022
## missing_worst.gcs   0.022   0.022   0.022   0.022   0.022
## missing_X6m.gose    0.109   0.109   0.109   0.109   0.109
## missing_spikes.hr   0.391   0.391   0.391   0.391   0.391
## missing_min.hr      0.391   0.391   0.391   0.391   0.391
## missing_max.hr      0.391   0.391   0.391   0.391   0.391

# Rhat convergence statistics compares the variance between chains to the variance across chains.
# Rhat Values ~ 1.0 indicate likely convergence,
# Rhat Values > 1.1 indicate that the chains should be run longer (use large number of iterations)
Rhats(imputations, statistic = "moments") # assess the convergence of MI algorithm

## mean_field.gcs    mean_er.gcs   mean_icu.gcs mean_worst.gcs  mean_X6m.gose 
##      1.4787521      3.1877013      2.8303357      2.2686572      0.9932531 
## mean_spikes.hr    mean_min.hr    mean_max.hr   sd_field.gcs      sd_er.gcs 
##      2.5564563      1.7275399      2.1428859      1.2807005      1.2659777 
##     sd_icu.gcs   sd_worst.gcs    sd_X6m.gose   sd_spikes.hr      sd_min.hr 
##      1.3829297      1.1412632      1.0487617      2.2336051      1.9834381 
##      sd_max.hr 
##      2.2399720

# When convergence is unstable, we can continue the iterations for all chains, e.g.
imputations <- mi(imputations, n.iter=20) # add additional 20 iterations
# To plot the produced mi results, for all missing_variables we can generate
# a histogram of the observed, imputed, and completed data.
# We can compare of the completed data to the fitted values implied by the model for the completed #data, by plotting binned residuals.
# hist function works similarly as plot.
# image function gives a sense of the missingness patterns in the data
plot(imputations);hist(imputations)

image(imputations); summary(imputations)

## $id
## $id$is_missing
## [1] "all values observed"
## 
## $id$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   12.25   23.50   23.50   34.75   46.00 
## 
## 
## $age
## $age$is_missing
## [1] "all values observed"
## 
## $age$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.6045 -0.4019 -0.1126  0.0000  0.2997  1.3341 
## 
## 
## $sex
## $sex$is_missing
## [1] "all values observed"
## 
## $sex$observed
## 
##  1  2 
##  9 37 
## 
## 
## $mechanism
## $mechanism$is_missing
## [1] "all values observed"
## 
## $mechanism$observed
## 
##  1  2  3  4  5  6  7 
##  4  4 13  2  7 10  6 
## 
## 
## $field.gcs
## $field.gcs$is_missing
## missing
## FALSE  TRUE 
##    44     2 
## 
## $field.gcs$imputed
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2.430462 -0.531767  0.001835  0.009681  0.459283  2.024220 
## 
## $field.gcs$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.5545 -0.5545 -0.1109  0.0000  0.4436  0.7763 
## 
## 
## $er.gcs
## $er.gcs$is_missing
## missing
## FALSE  TRUE 
##    44     2 
## 
## $er.gcs$imputed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.0493 -1.5811 -1.1128 -1.0759 -0.2557  0.3408 
## 
## $er.gcs$observed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.62098 -0.50114 -0.08171  0.00000  0.48752  0.81708 
## 
## 
## $icu.gcs
## $icu.gcs$is_missing
## missing
## FALSE  TRUE 
##    45     1 
## 
## $icu.gcs$imputed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.1686 -1.7804 -1.6045 -1.5440 -0.9245 -0.2418 
## 
## $icu.gcs$observed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.98350 -0.52088 -0.05826  0.00000  0.25016  1.17540 
## 
## 
## $worst.gcs
## $worst.gcs$is_missing
## missing
## FALSE  TRUE 
##    45     1 
## 
## $worst.gcs$imputed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.7031 -0.3589 -0.3131 -0.1614 -0.2112  0.7795 
## 
## $worst.gcs$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.8068 -0.3586 -0.3586  0.0000  0.2390  1.2849 
## 
## 
## $X6m.gose
## $X6m.gose$is_missing
## missing
## FALSE  TRUE 
##    41     5 
## 
## $X6m.gose$imputed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.27850 -0.43691 -0.06858 -0.03898  0.33339  1.32618 
## 
## $X6m.gose$observed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.80159 -0.51581  0.05576  0.00000  0.34155  0.91312 
## 
## 
## $X2013.gose
## $X2013.gose$is_missing
## [1] "all values observed"
## 
## $X2013.gose$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.9987 -0.2112  0.3139  0.0000  0.3139  0.5764 
## 
## 
## $skull.fx
## $skull.fx$is_missing
## [1] "all values observed"
## 
## $skull.fx$observed
## 
##  1  2 
## 18 28 
## 
## 
## $temp.injury
## $temp.injury$is_missing
## [1] "all values observed"
## 
## $temp.injury$observed
## 
##  1  2 
## 19 27 
## 
## 
## $surgery
## $surgery$is_missing
## [1] "all values observed"
## 
## $surgery$observed
## 
##  1  2 
## 17 29 
## 
## 
## $spikes.hr
## $spikes.hr$is_missing
## missing
## FALSE  TRUE 
##    28    18 
## 
## $spikes.hr$imputed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -331.250  -79.870   -6.743   10.109  105.559  324.983 
## 
## $spikes.hr$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.280   5.357  18.170  52.872  57.227 294.000 
## 
## 
## $min.hr
## $min.hr$is_missing
## missing
## FALSE  TRUE 
##    28    18 
## 
## $min.hr$imputed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -4.5731 -1.0627 -0.1471 -0.1819  0.8233  2.9098 
## 
## $min.hr$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.1828 -0.1828 -0.1828  0.0000 -0.1828  1.9668 
## 
## 
## $max.hr
## $max.hr$is_missing
## missing
## FALSE  TRUE 
##    28    18 
## 
## $max.hr$imputed
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.49886 -0.53336 -0.01627 -0.05129  0.47479  1.56717 
## 
## $max.hr$observed
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.3783 -0.3400 -0.2376  0.0000  0.1166  1.5750 
## 
## 
## $acute.sz
## $acute.sz$is_missing
## [1] "all values observed"
## 
## $acute.sz$observed
## 
##  1  2 
## 38  8 
## 
## 
## $late.sz
## $late.sz$is_missing
## [1] "all values observed"
## 
## $late.sz$observed
## 
##  1  2 
## 20 26 
## 
## 
## $ever.sz
## $ever.sz$is_missing
## [1] "all values observed"
## 
## $ever.sz$observed
## 
##  1  2 
## 19 27

# regression model and impact of various predictors
model_results <- pool(ever.sz ~ surgery + worst.gcs + factor(sex) + age,
data = imputations, m=5); display (model_results); summary (model_results)

## bayesglm(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + 
##     age, data = imputations, m = 5)
##                 coef.est coef.se
## (Intercept)      0.42     1.27  
## surgery1         0.91     0.65  
## worst.gcs       -0.08     0.10  
## factor(sex)Male -0.32     0.76  
## age              0.00     0.02  
## n = 41, k = 5
## residual deviance = 59.5, null deviance = 62.4 (difference = 2.8)

## 
## Call:
## pool(formula = ever.sz ~ surgery + worst.gcs + factor(sex) + 
##     age, data = imputations, m = 5)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.676  -1.188   0.844   1.016   1.375  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)      0.417855   1.274720   0.328    0.743
## surgery1         0.914428   0.654134   1.398    0.162
## worst.gcs       -0.084801   0.097286  -0.872    0.383
## factor(sex)Male -0.317796   0.761500  -0.417    0.676
## age              0.002258   0.018153   0.124    0.901
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 62.371  on 45  degrees of freedom
## Residual deviance: 59.545  on 41  degrees of freedom
## AIC: 69.545
## 
## Number of Fisher Scoring iterations: 6.6

coef(summary(model_results))[, 1:2] # get the model coef's and their SE's

##                     Estimate Std. Error
## (Intercept)      0.417854649 1.27471972
## surgery1         0.914428111 0.65413413
## worst.gcs       -0.084801326 0.09728582
## factor(sex)Male -0.317796027 0.76150004
## age              0.002257943 0.01815253

# Report the summaries of the imputations
data.frames <- complete(imputations, 3) # extract the first 3 chains
lapply(data.frames, summary) # report summaries

## [[1]]
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs          er.gcs           icu.gcs         worst.gcs     
##  Min.   :-1.742   Min.   :-17.264   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 3.000   1st Qu.:  3.250   1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 6.500   Median :  7.000   Median : 6.000   Median : 3.000  
##  Mean   : 7.704   Mean   :  7.343   Mean   : 6.344   Mean   : 5.348  
##  3rd Qu.:12.000   3rd Qu.: 12.000   3rd Qu.: 7.750   3rd Qu.: 7.000  
##  Max.   :15.000   Max.   : 15.000   Max.   :14.000   Max.   :14.000  
##                                                                      
##     X6m.gose        X2013.gose    skull.fx temp.injury surgery
##  Min.   :-3.168   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.: 3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median : 5.000   Median :7.000                               
##  Mean   : 4.909   Mean   :5.804                               
##  3rd Qu.: 7.000   3rd Qu.:7.000                               
##  Max.   : 9.445   Max.   :8.000                               
##                                                               
##    spikes.hr            min.hr             max.hr       acute.sz late.sz
##  Min.   :-331.250   Min.   :-22.2183   Min.   :-709.8   0:38     0:20   
##  1st Qu.:   1.375   1st Qu.: -3.8339   1st Qu.:  31.5   1: 8     1:26   
##  Median :   7.450   Median :  0.0000   Median : 177.5                   
##  Mean   :  23.379   Mean   :  0.7072   Mean   : 270.4                   
##  3rd Qu.:  58.328   3rd Qu.:  0.0000   3rd Qu.: 414.2                   
##  Max.   : 294.000   Max.   : 42.0000   Max.   :1199.0                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## [[2]]
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs           er.gcs          icu.gcs          worst.gcs     
##  Min.   :-13.915   Min.   : 3.000   Min.   :-14.170   Min.   : 0.000  
##  1st Qu.:  3.000   1st Qu.: 4.250   1st Qu.:  3.000   1st Qu.: 3.000  
##  Median :  7.000   Median : 7.700   Median :  6.000   Median : 3.000  
##  Mean   :  7.558   Mean   : 8.227   Mean   :  5.931   Mean   : 5.369  
##  3rd Qu.: 12.000   3rd Qu.:12.000   3rd Qu.:  7.750   3rd Qu.: 7.000  
##  Max.   : 15.000   Max.   :15.000   Max.   : 14.000   Max.   :14.000  
##                                                                       
##     X6m.gose       X2013.gose    skull.fx temp.injury surgery
##  Min.   :2.000   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.:3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median :5.000   Median :7.000                               
##  Mean   :4.875   Mean   :5.804                               
##  3rd Qu.:6.000   3rd Qu.:7.000                               
##  Max.   :8.000   Max.   :8.000                               
##                                                              
##    spikes.hr           min.hr            max.hr        acute.sz late.sz
##  Min.   :-265.67   Min.   :-36.561   Min.   :-1276.7   0:38     0:20   
##  1st Qu.:   1.67   1st Qu.:  0.000   1st Qu.:   31.5   1: 8     1:26   
##  Median :   8.73   Median :  0.000   Median :  105.7                   
##  Mean   :  27.90   Mean   :  1.811   Mean   :  197.5                   
##  3rd Qu.:  55.26   3rd Qu.:  5.339   3rd Qu.:  358.2                   
##  Max.   : 294.00   Max.   : 43.005   Max.   : 1199.0                   
##                                                                        
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##                 
## 
## [[3]]
##        id             age            sex            mechanism 
##  Min.   : 1.00   Min.   :16.00   Female: 9   Bike_vs_Auto: 4  
##  1st Qu.:12.25   1st Qu.:23.00   Male  :37   Blunt       : 4  
##  Median :23.50   Median :33.00               Fall        :13  
##  Mean   :23.50   Mean   :36.89               GSW         : 2  
##  3rd Qu.:34.75   3rd Qu.:47.25               MCA         : 7  
##  Max.   :46.00   Max.   :83.00               MVA         :10  
##                                              Peds_vs_Auto: 6  
##    field.gcs          er.gcs          icu.gcs         worst.gcs     
##  Min.   : 3.000   Min.   :-5.035   Min.   :-5.168   Min.   : 0.000  
##  1st Qu.: 3.250   1st Qu.: 3.250   1st Qu.: 3.000   1st Qu.: 3.000  
##  Median : 7.000   Median : 7.000   Median : 6.000   Median : 3.000  
##  Mean   : 8.363   Mean   : 7.760   Mean   : 6.127   Mean   : 5.513  
##  3rd Qu.:12.000   3rd Qu.:12.000   3rd Qu.: 7.750   3rd Qu.: 7.750  
##  Max.   :26.252   Max.   :15.000   Max.   :14.000   Max.   :14.000  
##                                                                     
##     X6m.gose       X2013.gose    skull.fx temp.injury surgery
##  Min.   :2.000   Min.   :2.000   0:18     0:19        0:17   
##  1st Qu.:3.000   1st Qu.:5.000   1:28     1:27        1:29   
##  Median :5.000   Median :7.000                               
##  Mean   :4.606   Mean   :5.804                               
##  3rd Qu.:6.000   3rd Qu.:7.000                               
##  Max.   :8.000   Max.   :8.000                               
##                                                              
##    spikes.hr            min.hr            max.hr        acute.sz late.sz
##  Min.   :-193.857   Min.   :-36.587   Min.   :-835.00   0:38     0:20   
##  1st Qu.:   1.375   1st Qu.:  0.000   1st Qu.:  24.25   1: 8     1:26   
##  Median :   7.450   Median :  0.000   Median : 105.46                   
##  Mean   :  33.133   Mean   :  3.202   Mean   : 203.60                   
##  3rd Qu.:  64.057   3rd Qu.:  7.027   3rd Qu.: 393.73                   
##  Max.   : 294.000   Max.   : 42.000   Max.   :1199.00                   
##                                                                         
##  ever.sz missing_field.gcs missing_er.gcs  missing_icu.gcs
##  0:19    Mode :logical     Mode :logical   Mode :logical  
##  1:27    FALSE:44          FALSE:44        FALSE:45       
##          TRUE :2           TRUE :2         TRUE :1        
##                                                           
##                                                           
##                                                           
##                                                           
##  missing_worst.gcs missing_X6m.gose missing_spikes.hr missing_min.hr 
##  Mode :logical     Mode :logical    Mode :logical     Mode :logical  
##  FALSE:45          FALSE:41         FALSE:28          FALSE:28       
##  TRUE :1           TRUE :5          TRUE :18          TRUE :18       
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  missing_max.hr 
##  Mode :logical  
##  FALSE:28       
##  TRUE :18       
##                 
##                 
##                 
##

A Simple Manual Implementation of EM-Based Imputation

set.seed(202227)
mu <- as.matrix(rep(2,20) )
sig <- diag(c(1:20) )
# Add a noise item. The noise is
# $ \epsilon ~ MVN(as.matrix(rep(0,20)), diag(rep(1,20)))$
library(MASS)
sim_data <- mvrnorm(n = 200, mu, sig) + mvrnorm(n=200, as.matrix(rep(0,20)), diag( rep(1,20) ))
# save these in the "original" object
sim_data.orig <- sim_data
# introduce 500 random missing indices (in the total of 4000=200*20)
# discrete distribution where the probability of the elements of values is proportional to probs, #which are normalized to add up to 1.

library(e1071)
rand.miss <- e1071::rdiscrete(500, probs = rep(1,length(sim_data)), values = seq(1, length(sim_data)))
sim_data[rand.miss] <- NA
sum(is.na(sim_data)) # check now many missing (NA) are there < 500

## [1] 466

# cast the data into a data.frame object and report 15*10 elements
sim_data.df <- data.frame(sim_data)
knitr::kable( sim_data.df[1:15, 1:10], caption = "The first 15 rows and first 10 columns of the simulation data")

The first 15 rows and first 10 columns of the simulation data
X1	X2	X3	X4	X5	X6	X7	X8	X9	X10
NA	4.8500284	1.7282025	4.6387605	4.5511409	2.1699719	4.7118571	2.6972905	2.7895031	-0.9250101
1.7721898	-1.6955535	0.4234295	6.2619764	-3.1339307	-1.8043889	1.2932913	0.9906410	1.0790366	1.1638161
2.2859237	4.2713494	3.8900670	3.5104578	3.0025607	3.9058619	3.0548309	3.5258699	NA	1.3601231
NA	6.9253703	NA	NA	NA	-2.0861790	-0.8847125	5.6485105	1.3348689	0.6512754
-0.0305062	2.3069337	0.7258556	1.2474398	-0.2091862	1.1726298	1.4622491	0.6492390	0.1458781	7.2195026
1.9411674	3.6606019	6.3004943	0.3543817	1.0875460	1.3077307	0.0269935	-0.0112352	NA	2.4616466
0.5209321	4.2225983	NA	4.3656730	5.7103015	0.4736086	4.5803136	0.2535158	-0.9758147	-2.6156309
3.3111727	0.1245427	0.9768425	4.7764851	4.9810860	-0.0438375	-0.3213523	4.3612903	2.7483497	3.7744105
2.1097813	NA	NA	3.6791032	3.7784795	5.5533794	1.8635898	0.1565858	4.6743012	NA
1.6203382	3.2422234	1.3625543	1.4006816	-1.3467049	NA	3.1703070	-3.4308300	0.2195447	-2.2921107
2.4063177	0.5601779	3.9764884	2.1652769	0.1060700	NA	-3.7629196	-0.9718406	1.2196606	0.7038253
NA	1.6714614	0.2000255	2.3308968	0.4593543	0.1716613	-0.5795159	-1.3327330	6.4115262	1.2607465
2.5416914	1.0388186	4.7421120	2.3110187	-0.2892515	-1.5291668	NA	9.3103565	2.5364297	1.6862923
0.9393925	1.8668513	3.4702117	NA	0.7797323	2.5548284	-3.5498813	NA	3.2124694	5.3120133
-0.8334973	0.6466994	1.1949414	-0.6573305	NA	-1.7743495	1.7168340	2.1065878	2.9478528	-0.5562349

EM_algorithm <- function(x, tol = 0.001) {
# identify the missing data entries (Boolean indices)
missvals <- is.na(x)
# instantialize the EM-iteration
new.impute <- x
old.impute <- x
count.iter <- 1
reach.tol <- 0
# compute \Sigma on complete data
sigma <- as.matrix(var(na.exclude(x)))
# compute the vector of feature (column) means
mean.vec <- as.matrix(apply(na.exclude(x), 2, mean))
while (reach.tol != 1) {
for (i in 1:nrow(x)) {
pick.miss <- (c(missvals[i, ]))
if (sum(pick.miss) != 0) {
# compute invese-Sigma_completeData, variance-covariance matrix
inv.S <- solve(sigma[!pick.miss, !pick.miss], tol = 1e-40)
# Expectation Step
# $$E(Y|X)=\mu_{mis}+\Sigma_{mo}\Sigma_{oo}^{-1}(X-\mu_{obs})$$
new.impute[i, pick.miss] <- mean.vec[pick.miss] +
sigma[pick.miss,!pick.miss] %*% inv.S %*%
(t(new.impute[i, !pick.miss]) - t(t(mean.vec[!pick.miss])))
}
}
# Maximization Step
# Compute the complete \Sigma complete vector of feature (column) means
# $$\Sigma^{(t+1)} = \frac{1}{n}\sum_{i=1}^nE(ZZ^T|X) -
# \mu^{(t+1)}{\mu^{(t+1)}}^T$$
sigma <- var((new.impute))
#$$\mu^{(t+1)} = \frac{1}{n}\sum_{i=1}^nE(Z|X)$$
mean.vec <- as.matrix(apply(new.impute, 2, mean))
# Inspect for convergence tolerance, start with the 2nd iteration
if (count.iter > 1) {
for (l in 1:nrow(new.impute)) {
for (m in 1:ncol(new.impute)) {
if (abs((old.impute[l, m] - new.impute[l, m])) > tol) {
reach.tol < -0
} else {
reach.tol <- 1
}
}
}
}
count.iter <- count.iter + 1
old.impute <- new.impute
}
sim_data.imputed <- EM_algorithm(sim_data.df, tol=0.0001)
# return the imputation output of the current iteration that passed the tolerance level
return(new.impute)
}

library(Amelia)

## Loading required package: Rcpp

## ## 
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2019 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##

dim(sim_data.df)

## [1] 200  20

amelia.out <- amelia(sim_data.df, m = 5)

## -- Imputation 1 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21
## 
## -- Imputation 2 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
## 
## -- Imputation 3 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17
## 
## -- Imputation 4 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15
## 
## -- Imputation 5 --
## 
##   1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
##  21 22 23

library("XML")
library("xml2")
library("rvest")

## 
## Attaching package: 'rvest'

## The following object is masked from 'package:XML':
## 
##     xml

wiki_url <- read_html("http://wiki.socr.umich.edu/index.php/SOCR_Data_Dinov_021708_Earthquakes")
html_nodes(wiki_url, "#content")

## {xml_nodeset (1)}
## [1] <div id="content" class="mw-body-primary" role="main">\n\t<a id="top ...

## {xml_nodeset (1)}
## [1] <div id="content" class="mw-body-primary" role="main">\n\t<a id="top...
earthquake<- html_table(html_nodes(wiki_url, "table")[[2]])


library(ggplot2)

## Registered S3 methods overwritten by 'ggplot2':
##   method         from 
##   [.quosures     rlang
##   c.quosures     rlang
##   print.quosures rlang

## 
## Attaching package: 'ggplot2'

## The following objects are masked from 'package:psych':
## 
##     %+%, alpha

plotweb <- ggplot(earthquake, aes(Longitude, Latitude, group = Magt, color = Magt))+
  geom_point(data = earthquake, size = 4, mapping = aes(x = Longitude, y = Latitude, shape = Magt))
plotweb

plot1<-ggplot(earthquake, aes(Latitude, size=1))+
geom_density(aes(color=Magt))
plot1

## Warning: Groups with fewer than two data points have been dropped.

kernal_density <- with(earthquake, MASS::kde2d(Longitude, Latitude, n = 50))

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:MASS':
## 
##     select

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

with(kernal_density, plot_ly(x=x, y=y, z=z, type="surface"))

plot_ly(x = ~ earthquake$Longitude)

## No trace type specified:
##   Based on info supplied, a 'histogram' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#histogram

plot_ly(x = ~ earthquake$Longitude, y = ~earthquake$Latitude)

## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter

## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

plot_ly(x=~earthquake$Longitude, y=~earthquake$Latitude, z=~earthquake$Mag)

## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d

## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

df3D <- data.frame(x=earthquake$Longitude, y=earthquake$Latitude, z=earthquake$Mag)

# Convert the Long (X, Y, Z) Earthquake format data into a Matrix Format
library("Matrix")
matrix_EarthQuakes <- with(df3D, sparseMatrix(i = as.numeric(180-x),j=as.numeric(y), x=z, use.last.ij=T, dimnames=list(levels(x), levels(y))))
dim(matrix_EarthQuakes)

## [1] 307  44

View(as.matrix(matrix_EarthQuakes))

# view matrix is 2D heatmap :
library("ggplot2"); library("gplots")

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

heatmap.2( as.matrix(matrix_EarthQuakes[280:307, 30:44]), Rowv=FALSE,
Colv=FALSE, dendrogram='none', cellnote=as.matrix(matrix_EarthQuakes[280:307, 30:44]), notecol="black", trace='none', key=FALSE, lwid =
c(.01, .99), lhei = c(.01, .99), margins = c(5, 15 ))

# Long -180<x<-170, Lat: 30<y<45, Z: 5<Mag<8
matrix_EarthQuakes <- with(df3D, sparseMatrix(i = as.numeric(180+x),
j=as.numeric(y), x=z,use.last.ij=TRUE,dimnames=list(levels(x), levels(y))))
mat1 <- as.matrix(matrix_EarthQuakes)
plot_ly(z = ~mat1, type = "surface")

# To plot the Aggregate (Summed) Magnitudes at all Long/Lat:
matrix_EarthQuakes <- with(df3D, sparseMatrix(i = as.numeric(180+x),
j=as.numeric(y), x=z, dimnames=list(levels(x), levels(y))))
mat1 <- as.matrix(matrix_EarthQuakes)
plot_ly(z = ~mat1, type = "surface")

# plot_ly(z = ~mat1[30:60, 20:40], type = "surface")

Cohort rebalancing

# load the data: 06_PPMI_ClassificationValidationData_Short.csv
ppmi_data <-read.csv("https://umich.instructure.com/files/330400/download?download_frd=1", header=TRUE)
table(ppmi_data$ResearchGroup)

## 
## Control      PD   SWEDD 
##     166     434      61

# binarize the Dx classes
ppmi_data$ResearchGroup <- ifelse(ppmi_data$ResearchGroup == "Control", "Control", "Patient")
attach(ppmi_data)
head(ppmi_data)

##   X FID_IID L_insular_cortex_AvgMeanCurvature L_insular_cortex_ComputeArea
## 1 2    3001                        0.11044503                     2917.938
## 2 3    3002                        0.10091019                     2505.515
## 3 4    3003                        0.09726433                     3284.796
## 4 5    3004                        0.10955031                     2802.992
## 5 6    3006                        0.09733524                     3188.515
## 6 7    3008                        0.10340988                     2548.982
##   L_insular_cortex_Volume L_insular_cortex_ShapeIndex
## 1                8754.107                   0.3207401
## 2                8089.185                   0.3191578
## 3               11446.049                   0.3150762
## 4                9000.509                   0.3272781
## 5               11004.131                   0.3040458
## 6                7613.921                   0.3006974
##   L_insular_cortex_Curvedness R_insular_cortex_AvgMeanCurvature
## 1                   0.1033839                         0.1160668
## 2                   0.1099705                         0.1077885
## 3                   0.0991298                         0.1014219
## 4                   0.1144485                         0.1126108
## 5                   0.1091740                         0.1086814
## 6                   0.1154384                         0.1232604
##   R_insular_cortex_ComputeArea R_insular_cortex_Volume
## 1                     2342.961                5732.367
## 2                     2073.981                5321.370
## 3                     2320.993                6395.721
## 4                     2225.154                5567.333
## 5                     2135.745                5583.177
## 6                     1839.798                3953.644
##   R_insular_cortex_ShapeIndex R_insular_cortex_Curvedness
## 1                   0.3149570                   0.1215921
## 2                   0.2709259                   0.1345651
## 3                   0.3003472                   0.1126387
## 4                   0.3058957                   0.1315195
## 5                   0.3187932                   0.1205962
## 6                   0.2863870                   0.1324961
##   L_cingulate_gyrus_AvgMeanCurvature L_cingulate_gyrus_ComputeArea
## 1                          0.1246343                      4381.930
## 2                          0.1355602                      3221.540
## 3                          0.1236226                      4658.949
## 4                          0.1274116                      4016.731
## 5                          0.1298253                      4188.552
## 6                          0.1406572                      3366.857
##   L_cingulate_gyrus_Volume L_cingulate_gyrus_ShapeIndex
## 1                11205.127                    0.4349292
## 2                 7439.645                    0.4396813
## 3                12443.624                    0.4556403
## 4                 9922.555                    0.4316057
## 5                10502.193                    0.4510032
## 6                 7630.507                    0.4539803
##   L_cingulate_gyrus_Curvedness R_cingulate_gyrus_AvgMeanCurvature
## 1                   0.06714417                          0.1249111
## 2                   0.08652417                          0.1458248
## 3                   0.06749613                          0.1361298
## 4                   0.07150800                          0.1288771
## 5                   0.07495887                          0.1277486
## 6                   0.07871191                          0.1431415
##   R_cingulate_gyrus_ComputeArea R_cingulate_gyrus_Volume
## 1                      4610.447                12246.549
## 2                      3194.348                 7264.683
## 3                      4028.330                 9681.935
## 4                      3918.263                10227.022
## 5                      4352.117                11528.466
## 6                      3606.831                 8659.683
##   R_cingulate_gyrus_ShapeIndex R_cingulate_gyrus_Curvedness
## 1                    0.4350192                   0.07081687
## 2                    0.4268223                   0.09480283
## 3                    0.4498362                   0.08071711
## 4                    0.4462917                   0.07295120
## 5                    0.4498547                   0.07407571
## 6                    0.4394006                   0.08183549
##   L_caudate_AvgMeanCurvature L_caudate_ComputeArea L_caudate_Volume
## 1                  0.2392929              621.5344         821.8991
## 2                  0.1833535              876.9414        1364.8601
## 3                  0.1983211             1019.6664        1570.3055
## 4                  0.1958644             1086.9388        1772.9241
## 5                  0.2115839              863.4775        1341.3073
## 6                  0.3143357              424.2490         473.3143
##   L_caudate_ShapeIndex L_caudate_Curvedness R_caudate_AvgMeanCurvature
## 1            0.2473931            0.3429220                  0.1558452
## 2            0.2580652            0.2308683                  0.1763202
## 3            0.3164872            0.1760060                  0.1703131
## 4            0.2960296            0.1854576                  0.1739900
## 5            0.2780000            0.2513109                  0.1441599
## 6            0.2509409            0.4493166                  0.2014278
##   R_caudate_ComputeArea R_caudate_Volume R_caudate_ShapeIndex
## 1             1302.1458         2526.248            0.2892424
## 2             1056.2200         1965.206            0.2970232
## 3             1208.7987         2268.701            0.2912487
## 4             1267.7040         2262.694            0.3027961
## 5             1247.1482         2678.534            0.3008528
## 6              811.4173         1302.117            0.2616859
##   R_caudate_Curvedness L_putamen_AvgMeanCurvature L_putamen_ComputeArea
## 1            0.1496714                  0.2022735              1029.175
## 2            0.1854200                  0.1418179              1275.905
## 3            0.1568764                  0.1409578              1482.684
## 4            0.1599818                  0.1814492              1050.227
## 5            0.1360011                  0.1787384              1229.513
## 6            0.2701702                  0.1603679              1180.385
##   L_putamen_Volume L_putamen_ShapeIndex L_putamen_Curvedness
## 1         1543.017            0.2853486            0.2320214
## 2         2696.695            0.2876249            0.1566615
## 3         3046.274            0.3041529            0.1523438
## 4         2028.140            0.2769987            0.1868594
## 5         2482.513            0.2923973            0.2064697
## 6         2105.129            0.2469977            0.1955575
##   R_putamen_AvgMeanCurvature R_putamen_ComputeArea R_putamen_Volume
## 1                  0.1111208              1680.197         3792.201
## 2                  0.1446244              1375.725         2966.682
## 3                  0.1402802              1310.875         2755.439
## 4                  0.1214390              1511.374         3458.706
## 5                  0.1628602              1298.760         2595.811
## 6                  0.1191328              1468.178         3456.212
##   R_putamen_ShapeIndex R_putamen_Curvedness L_hippocampus_AvgMeanCurvature
## 1            0.2670927            0.1332011                      0.1226265
## 2            0.2936447            0.1546469                      0.1304512
## 3            0.2913030            0.1606766                      0.1191435
## 4            0.2978874            0.1362302                      0.1242598
## 5            0.3136044            0.1666391                      0.1116450
## 6            0.2860881            0.1425279                      0.1266569
##   L_hippocampus_ComputeArea L_hippocampus_Volume L_hippocampus_ShapeIndex
## 1                  1769.672             4737.038                0.3201427
## 2                  1529.759             3736.040                0.3189372
## 3                  1679.536             4458.264                0.3512617
## 4                  1552.218             3718.433                0.2814200
## 5                  1878.617             5078.684                0.3338252
## 6                  1580.434             3833.965                0.3145288
##   L_hippocampus_Curvedness R_hippocampus_AvgMeanCurvature
## 1                0.1179703                      0.1312485
## 2                0.1239492                      0.1179286
## 3                0.1092478                      0.1174198
## 4                0.1435228                      0.1158403
## 5                0.1089977                      0.1140703
## 6                0.1460567                      0.1206285
##   R_hippocampus_ComputeArea R_hippocampus_Volume R_hippocampus_ShapeIndex
## 1                  1578.946             3817.621                0.2894075
## 2                  1799.438             4665.167                0.3102413
## 3                  1850.187             5177.618                0.3050725
## 4                  1762.969             4529.123                0.3331539
## 5                  1976.793             5508.145                0.3469782
## 6                  1596.947             3811.093                0.2952811
##   R_hippocampus_Curvedness L_fusiform_gyrus_AvgMeanCurvature
## 1               0.14824928                        0.10385320
## 2               0.12306355                        0.10405280
## 3               0.12038557                        0.11326566
## 4               0.12558195                        0.11218079
## 5               0.09903943                        0.09365524
## 6               0.13272062                        0.10741455
##   L_fusiform_gyrus_ComputeArea L_fusiform_gyrus_Volume
## 1                     4534.707                15830.32
## 2                     4013.385                12677.99
## 3                     3902.592                11525.38
## 4                     4120.621                13775.00
## 5                     4864.410                17467.13
## 6                     3978.040                12779.50
##   L_fusiform_gyrus_ShapeIndex L_fusiform_gyrus_Curvedness
## 1                   0.3102792                  0.08341972
## 2                   0.2857870                  0.10808808
## 3                   0.2983204                  0.09976534
## 4                   0.3584661                  0.09042557
## 5                   0.2929722                  0.09020279
## 6                   0.3071964                  0.10343642
##   R_fusiform_gyrus_AvgMeanCurvature R_fusiform_gyrus_ComputeArea
## 1                         0.1063175                     3945.037
## 2                         0.1267130                     3551.876
## 3                         0.1076625                     4260.940
## 4                         0.1018607                     3919.842
## 5                         0.1199351                     5169.535
## 6                         0.1170813                     3331.151
##   R_fusiform_gyrus_Volume R_fusiform_gyrus_ShapeIndex
## 1                14471.84                   0.3592094
## 2                11263.23                   0.3331986
## 3                14213.62                   0.3515383
## 4                14405.84                   0.3487578
## 5                17455.32                   0.3855645
## 6                10131.39                   0.3454159
##   R_fusiform_gyrus_Curvedness Sex Weight ResearchGroup VisitID     Age
## 1                  0.07762280   1   74.2       Patient       1 65.1808
## 2                  0.10994359   2   70.6       Patient       1 67.6247
## 3                  0.08235058   2   82.0       Patient       1 56.7562
## 4                  0.09018205   1   81.6       Control       1 59.4548
## 5                  0.07418299   2   69.1       Patient       1 57.5781
## 6                  0.09854799   2   69.1       Control       1 81.9452
##   chr12_rs34637584_GT chr17_rs11868035_GT chr17_rs11012_GT
## 1                   0                   1                1
## 2                   0                   1                0
## 3                   0                   1                1
## 4                   0                   1                1
## 5                   0                   0                1
## 6                   0                   0                0
##   chr17_rs393152_GT chr17_rs12185268_GT chr17_rs199533_GT
## 1                 1                   1                 0
## 2                 0                   0                 0
## 3                 1                   1                 1
## 4                 1                   1                 1
## 5                 1                   1                 1
## 6                 0                   0                 0
##   chr17_rs199533_DP chr17_rs199533_GQ UPDRS_Part_I_Summary_Score_Baseline
## 1                20                60                                   0
## 2                26                66                                   3
## 3                65                99                                   1
## 4                55                99                                   0
## 5                33                99                                   1
## 6                20                60                                   1
##   UPDRS_Part_I_Summary_Score_Month_03 UPDRS_Part_I_Summary_Score_Month_06
## 1                            0.000000                            0.000000
## 2                            2.000000                            3.407134
## 3                            1.000000                            0.000000
## 4                            3.295365                            1.498698
## 5                            2.000000                            0.000000
## 6                            2.951738                            1.309742
##   UPDRS_Part_I_Summary_Score_Month_09 UPDRS_Part_I_Summary_Score_Month_12
## 1                            1.000000                                   4
## 2                            1.000000                                   3
## 3                            0.000000                                   0
## 4                           -0.259238                                   0
## 5                            1.000000                                   2
## 6                            4.120260                                   4
##   UPDRS_Part_I_Summary_Score_Month_18 UPDRS_Part_I_Summary_Score_Month_24
## 1                           1.0000000                             0.00000
## 2                           4.0000000                             8.00000
## 3                           0.6328668                             0.00000
## 4                           1.4534149                             1.00000
## 5                           2.7703873                             1.70897
## 6                           2.3499861                             1.00000
##   UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Baseline
## 1                                                          2
## 2                                                         15
## 3                                                          6
## 4                                                          0
## 5                                                          4
## 6                                                          2
##   UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_03
## 1                                                   2.000000
## 2                                                  10.000000
## 3                                                   5.000000
## 4                                                   6.576687
## 5                                                  10.000000
## 6                                                   7.256589
##   UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_06
## 1                                                   3.000000
## 2                                                  11.969026
## 3                                                   3.000000
## 4                                                   3.132565
## 5                                                  10.000000
## 6                                                   7.275325
##   UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_09
## 1                                                   3.000000
## 2                                                  16.000000
## 3                                                   8.000000
## 4                                                   3.061787
## 5                                                   8.000000
## 6                                                   9.568565
##   UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_12
## 1                                                   3.00e+00
## 2                                                   1.50e+01
## 3                                                   5.00e+00
## 4                                                   8.88e-16
## 5                                                   1.30e+01
## 6                                                   1.00e+00
##   UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_18
## 1                                                   4.000000
## 2                                                  16.000000
## 3                                                  13.035019
## 4                                                   5.633763
## 5                                                  16.601842
## 6                                                   6.425239
##   UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_24
## 1                                               2.000000e+00
## 2                                               1.400000e+01
## 3                                               7.000000e+00
## 4                                               8.880000e-16
## 5                                               1.465211e+01
## 6                                               2.000000e+00
##   UPDRS_Part_III_Summary_Score_Baseline
## 1                                    12
## 2                                    17
## 3                                    29
## 4                                     2
## 5                                    22
## 6                                    10
##   UPDRS_Part_III_Summary_Score_Month_03
## 1                              18.00000
## 2                              22.00000
## 3                              37.00000
## 4                              20.59587
## 5                              30.00000
## 6                              30.44123
##   UPDRS_Part_III_Summary_Score_Month_06
## 1                              23.00000
## 2                              24.61497
## 3                              37.00000
## 4                              17.21627
## 5                              31.00000
## 6                              27.84771
##   UPDRS_Part_III_Summary_Score_Month_09
## 1                              19.00000
## 2                              20.00000
## 3                              33.00000
## 4                              19.68433
## 5                              32.00000
## 6                              28.91509
##   UPDRS_Part_III_Summary_Score_Month_12
## 1                                    20
## 2                                    27
## 3                                    44
## 4                                     3
## 5                                    33
## 6                                    12
##   UPDRS_Part_III_Summary_Score_Month_18
## 1                              29.00000
## 2                              22.00000
## 3                              59.19947
## 4                              23.40963
## 5                              47.79686
## 6                              27.78102
##   UPDRS_Part_III_Summary_Score_Month_24
## 1                              39.00000
## 2                              22.00000
## 3                              43.00000
## 4                               7.00000
## 5                              42.49653
## 6                              13.00000
##   X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Baseline
## 1                                                                      6
## 2                                                                     14
## 3                                                                      8
## 4                                                                      6
## 5                                                                      0
## 6                                                                      4
##   X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_06
## 1                                                               7.000000
## 2                                                              11.280820
## 3                                                               8.000000
## 4                                                               7.033210
## 5                                                               0.000000
## 6                                                               6.890178
##   X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_12
## 1                                                                      3
## 2                                                                     14
## 3                                                                      6
## 4                                                                      6
## 5                                                                      6
## 6                                                                      6
##   X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_24
## 1                                                                7.00000
## 2                                                               11.00000
## 3                                                                7.00000
## 4                                                               10.00000
## 5                                                                6.12631
## 6                                                                8.00000
##   X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Baseline
## 1                                                                                  6
## 2                                                                                  6
## 3                                                                                  4
## 4                                                                                  5
## 5                                                                                  5
## 6                                                                                  5
##   X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_06
## 1                                                                           4.000000
## 2                                                                           5.177094
## 3                                                                           4.000000
## 4                                                                           3.409930
## 5                                                                           5.000000
## 6                                                                           4.857484
##   X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_12
## 1                                                                                  5
## 2                                                                                  6
## 3                                                                                  4
## 4                                                                                  4
## 5                                                                                  5
## 6                                                                                  5
##   X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_24
## 1                                                                           5.000000
## 2                                                                           9.000000
## 3                                                                           4.000000
## 4                                                                           6.000000
## 5                                                                           5.435596
## 6                                                                           6.000000

# Model-free analysis, classification
library("crossval")
require(crossval)
require(ada)

## Loading required package: ada

## Loading required package: rpart

#set up adaboosting prediction function
# Define a new classification result-reporting function
my.ada <- function (train.x, train.y, test.x, test.y, negative, formula)
{
  ada.fit <- ada(train.x, train.y)
predict.y <- predict(ada.fit, test.x)
#count TP, FP, TN, FN, Accuracy, etc.
out <- confusionMatrix(test.y, predict.y, negative = negative)
# negative is the label of a negative "null" sample (default: "control").
return (out)
}

# balance cases
# SMOTE: Synthetic Minority Oversampling Technique to handle class misbalance in binary #classification.
set.seed(1000)
require(unbalanced)

## Loading required package: unbalanced

## Loading required package: mlr

## Loading required package: ParamHelpers

## 
## Attaching package: 'mlr'

## The following object is masked from 'package:crossval':
## 
##     crossval

## The following object is masked from 'package:e1071':
## 
##     impute

## Loading required package: foreach

## Loading required package: doParallel

## Loading required package: iterators

## Loading required package: parallel

ppmi_data$PD <- ifelse(ppmi_data$ResearchGroup=="Control", 1, 0)
uniqueID <- unique(ppmi_data$FID_IID)
ppmi_data <- ppmi_data[ppmi_data$VisitID==1, ]
ppmi_data$PD <- factor(ppmi_data$PD)
colnames(ppmi_data)

##   [1] "X"                                                                                 
##   [2] "FID_IID"                                                                           
##   [3] "L_insular_cortex_AvgMeanCurvature"                                                 
##   [4] "L_insular_cortex_ComputeArea"                                                      
##   [5] "L_insular_cortex_Volume"                                                           
##   [6] "L_insular_cortex_ShapeIndex"                                                       
##   [7] "L_insular_cortex_Curvedness"                                                       
##   [8] "R_insular_cortex_AvgMeanCurvature"                                                 
##   [9] "R_insular_cortex_ComputeArea"                                                      
##  [10] "R_insular_cortex_Volume"                                                           
##  [11] "R_insular_cortex_ShapeIndex"                                                       
##  [12] "R_insular_cortex_Curvedness"                                                       
##  [13] "L_cingulate_gyrus_AvgMeanCurvature"                                                
##  [14] "L_cingulate_gyrus_ComputeArea"                                                     
##  [15] "L_cingulate_gyrus_Volume"                                                          
##  [16] "L_cingulate_gyrus_ShapeIndex"                                                      
##  [17] "L_cingulate_gyrus_Curvedness"                                                      
##  [18] "R_cingulate_gyrus_AvgMeanCurvature"                                                
##  [19] "R_cingulate_gyrus_ComputeArea"                                                     
##  [20] "R_cingulate_gyrus_Volume"                                                          
##  [21] "R_cingulate_gyrus_ShapeIndex"                                                      
##  [22] "R_cingulate_gyrus_Curvedness"                                                      
##  [23] "L_caudate_AvgMeanCurvature"                                                        
##  [24] "L_caudate_ComputeArea"                                                             
##  [25] "L_caudate_Volume"                                                                  
##  [26] "L_caudate_ShapeIndex"                                                              
##  [27] "L_caudate_Curvedness"                                                              
##  [28] "R_caudate_AvgMeanCurvature"                                                        
##  [29] "R_caudate_ComputeArea"                                                             
##  [30] "R_caudate_Volume"                                                                  
##  [31] "R_caudate_ShapeIndex"                                                              
##  [32] "R_caudate_Curvedness"                                                              
##  [33] "L_putamen_AvgMeanCurvature"                                                        
##  [34] "L_putamen_ComputeArea"                                                             
##  [35] "L_putamen_Volume"                                                                  
##  [36] "L_putamen_ShapeIndex"                                                              
##  [37] "L_putamen_Curvedness"                                                              
##  [38] "R_putamen_AvgMeanCurvature"                                                        
##  [39] "R_putamen_ComputeArea"                                                             
##  [40] "R_putamen_Volume"                                                                  
##  [41] "R_putamen_ShapeIndex"                                                              
##  [42] "R_putamen_Curvedness"                                                              
##  [43] "L_hippocampus_AvgMeanCurvature"                                                    
##  [44] "L_hippocampus_ComputeArea"                                                         
##  [45] "L_hippocampus_Volume"                                                              
##  [46] "L_hippocampus_ShapeIndex"                                                          
##  [47] "L_hippocampus_Curvedness"                                                          
##  [48] "R_hippocampus_AvgMeanCurvature"                                                    
##  [49] "R_hippocampus_ComputeArea"                                                         
##  [50] "R_hippocampus_Volume"                                                              
##  [51] "R_hippocampus_ShapeIndex"                                                          
##  [52] "R_hippocampus_Curvedness"                                                          
##  [53] "L_fusiform_gyrus_AvgMeanCurvature"                                                 
##  [54] "L_fusiform_gyrus_ComputeArea"                                                      
##  [55] "L_fusiform_gyrus_Volume"                                                           
##  [56] "L_fusiform_gyrus_ShapeIndex"                                                       
##  [57] "L_fusiform_gyrus_Curvedness"                                                       
##  [58] "R_fusiform_gyrus_AvgMeanCurvature"                                                 
##  [59] "R_fusiform_gyrus_ComputeArea"                                                      
##  [60] "R_fusiform_gyrus_Volume"                                                           
##  [61] "R_fusiform_gyrus_ShapeIndex"                                                       
##  [62] "R_fusiform_gyrus_Curvedness"                                                       
##  [63] "Sex"                                                                               
##  [64] "Weight"                                                                            
##  [65] "ResearchGroup"                                                                     
##  [66] "VisitID"                                                                           
##  [67] "Age"                                                                               
##  [68] "chr12_rs34637584_GT"                                                               
##  [69] "chr17_rs11868035_GT"                                                               
##  [70] "chr17_rs11012_GT"                                                                  
##  [71] "chr17_rs393152_GT"                                                                 
##  [72] "chr17_rs12185268_GT"                                                               
##  [73] "chr17_rs199533_GT"                                                                 
##  [74] "chr17_rs199533_DP"                                                                 
##  [75] "chr17_rs199533_GQ"                                                                 
##  [76] "UPDRS_Part_I_Summary_Score_Baseline"                                               
##  [77] "UPDRS_Part_I_Summary_Score_Month_03"                                               
##  [78] "UPDRS_Part_I_Summary_Score_Month_06"                                               
##  [79] "UPDRS_Part_I_Summary_Score_Month_09"                                               
##  [80] "UPDRS_Part_I_Summary_Score_Month_12"                                               
##  [81] "UPDRS_Part_I_Summary_Score_Month_18"                                               
##  [82] "UPDRS_Part_I_Summary_Score_Month_24"                                               
##  [83] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Baseline"                        
##  [84] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_03"                        
##  [85] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_06"                        
##  [86] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_09"                        
##  [87] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_12"                        
##  [88] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_18"                        
##  [89] "UPDRS_Part_II_Patient_Questionnaire_Summary_Score_Month_24"                        
##  [90] "UPDRS_Part_III_Summary_Score_Baseline"                                             
##  [91] "UPDRS_Part_III_Summary_Score_Month_03"                                             
##  [92] "UPDRS_Part_III_Summary_Score_Month_06"                                             
##  [93] "UPDRS_Part_III_Summary_Score_Month_09"                                             
##  [94] "UPDRS_Part_III_Summary_Score_Month_12"                                             
##  [95] "UPDRS_Part_III_Summary_Score_Month_18"                                             
##  [96] "UPDRS_Part_III_Summary_Score_Month_24"                                             
##  [97] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Baseline"            
##  [98] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_06"            
##  [99] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_12"            
## [100] "X_Assessment_Non.Motor_Epworth_Sleepiness_Scale_Summary_Score_Month_24"            
## [101] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Baseline"
## [102] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_06"
## [103] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_12"
## [104] "X_Assessment_Non.Motor_Geriatric_Depression_Scale_GDS_Short_Summary_Score_Month_24"
## [105] "PD"

# ppmi_data.1<-ppmi_data[, c(3:281, 284, 287, 336:340, 341)]
n <- ncol(ppmi_data)
output.1 <- ppmi_data$PD
# remove Default Real Clinical subject classifications!
ppmi_data$PD <- ifelse(ppmi_data$ResearchGroup=="Control", 1, 0)
input <- ppmi_data[ , -which(names(ppmi_data) %in% c("ResearchGroup",
"PD", "X", "FID_IID"))]
# output <- as.matrix(ppmi_data[ , which(names(ppmi_data) %in% {"PD"})])
output <- as.factor(ppmi_data$PD)
c(dim(input), dim(output))

## [1] 422 101

#balance the dataset
data.1<-ubBalance(X= input, Y=output, type="ubSMOTE", percOver=300, percUnder=150, verbose=TRUE)

## Proportion of positives after ubSMOTE : 47.06 % of 1037 observations

# percOver = A number that drives the decision of how many extra cases from the minority class are #generated (known as over-sampling).
# k = A number indicating the number of nearest neighbors that are used to generate the new examples #of the minority class.
# percUnder = A number that drives the decision of how many extra cases from the majority classes are #selected for each case generated from the minority class (known as under-sampling)

balancedData<-cbind(data.1$X, data.1$Y)
table(data.1$Y)

## 
##   0   1 
## 549 488

nrow(data.1$X); ncol(data.1$X)

## [1] 1037

## [1] 101

nrow(balancedData); ncol(balancedData)

## [1] 1037

## [1] 102

nrow(input); ncol(input)

## [1] 422

## [1] 101

colnames(balancedData) <- c(colnames(input), "PD")
# check visually for differences between the distributions of the raw(input) and rebalanced data (for #only one variable, in this case)
qqplot(input[, 5], balancedData [, 5])

alpha.0.05 <- 0.05
test.results.bin <- NULL # binarized/dichotomized p-values
test.results.raw <- NULL # raw p-values
for (i in 1:(ncol(balancedData)-1))
{
test.results.raw[i]<-wilcox.test(input[, i], balancedData [, i])$p.value
test.results.bin [i] <- ifelse(test.results.raw [i] > alpha.0.05, 1, 0)
print(c("i=", i, "Wilcoxon-test=", test.results.raw [i]))
}

## [1] "i="                "1"                 "Wilcoxon-test="   
## [4] "0.121916148645072"
## [1] "i="                 "2"                  "Wilcoxon-test="    
## [4] "0.0115555790092226"
## [1] "i="                  "3"                   "Wilcoxon-test="     
## [4] "0.00263706847152905"
## [1] "i="                 "4"                  "Wilcoxon-test="    
## [4] "0.0193825870829418"
## [1] "i="                "5"                 "Wilcoxon-test="   
## [4] "0.465234458246381"
## [1] "i="               "6"                "Wilcoxon-test="  
## [4] "0.46498323084997"
## [1] "i="                 "7"                  "Wilcoxon-test="    
## [4] "0.0559803555615915"
## [1] "i="                 "8"                  "Wilcoxon-test="    
## [4] "0.0469706919902162"
## [1] "i="                 "9"                  "Wilcoxon-test="    
## [4] "0.0796428263747722"
## [1] "i="                "10"                "Wilcoxon-test="   
## [4] "0.867915750891381"
## [1] "i="                "11"                "Wilcoxon-test="   
## [4] "0.791763744004629"
## [1] "i="                "12"                "Wilcoxon-test="   
## [4] "0.252737493660177"
## [1] "i="                "13"                "Wilcoxon-test="   
## [4] "0.185613271988163"
## [1] "i="                "14"                "Wilcoxon-test="   
## [4] "0.921995769938857"
## [1] "i="                "15"                "Wilcoxon-test="   
## [4] "0.368163390166519"
## [1] "i="                "16"                "Wilcoxon-test="   
## [4] "0.871583919642137"
## [1] "i="                "17"                "Wilcoxon-test="   
## [4] "0.647092249431079"
## [1] "i="                "18"                "Wilcoxon-test="   
## [4] "0.409631033847553"
## [1] "i="                "19"                "Wilcoxon-test="   
## [4] "0.417301066910891"
## [1] "i="                "20"                "Wilcoxon-test="   
## [4] "0.749951942238898"
## [1] "i="                "21"                "Wilcoxon-test="   
## [4] "0.656276365210151"
## [1] "i="               "22"               "Wilcoxon-test="  
## [4] "0.15600349399911"
## [1] "i="                 "23"                 "Wilcoxon-test="    
## [4] "0.0486209830782161"
## [1] "i="                "24"                "Wilcoxon-test="   
## [4] "0.864465951024094"
## [1] "i="                "25"                "Wilcoxon-test="   
## [4] "0.657465243276207"
## [1] "i="                "26"                "Wilcoxon-test="   
## [4] "0.648471417036021"
## [1] "i="               "27"               "Wilcoxon-test="  
## [4] "0.19531148154071"
## [1] "i="                 "28"                 "Wilcoxon-test="    
## [4] "0.0594836689192933"
## [1] "i="                "29"                "Wilcoxon-test="   
## [4] "0.269303857585846"
## [1] "i="                "30"                "Wilcoxon-test="   
## [4] "0.422672530338941"
## [1] "i="                "31"                "Wilcoxon-test="   
## [4] "0.560402299892182"
## [1] "i="                 "32"                 "Wilcoxon-test="    
## [4] "0.0775475535039496"
## [1] "i="                 "33"                 "Wilcoxon-test="    
## [4] "0.0478438562128426"
## [1] "i="                "34"                "Wilcoxon-test="   
## [4] "0.859726704357985"
## [1] "i="                "35"                "Wilcoxon-test="   
## [4] "0.426408714781497"
## [1] "i="                "36"                "Wilcoxon-test="   
## [4] "0.853487254852529"
## [1] "i="                "37"                "Wilcoxon-test="   
## [4] "0.392114049970288"
## [1] "i="                "38"                "Wilcoxon-test="   
## [4] "0.156724308076248"
## [1] "i="                "39"                "Wilcoxon-test="   
## [4] "0.694130201824518"
## [1] "i="                "40"                "Wilcoxon-test="   
## [4] "0.723601415143246"
## [1] "i="                  "41"                  "Wilcoxon-test="     
## [4] "0.00619701387523487"
## [1] "i="                   "42"                   "Wilcoxon-test="      
## [4] "0.000286562753740492"
## [1] "i="                   "43"                   "Wilcoxon-test="      
## [4] "5.29983946602411e-05"
## [1] "i="                 "44"                 "Wilcoxon-test="    
## [4] "0.0707571581138932"
## [1] "i="                "45"                "Wilcoxon-test="   
## [4] "0.140737722531954"
## [1] "i="                 "46"                 "Wilcoxon-test="    
## [4] "0.0507074618738118"
## [1] "i="                "47"                "Wilcoxon-test="   
## [4] "0.020805101536845"
## [1] "i="                  "48"                  "Wilcoxon-test="     
## [4] "0.00484021037886222"
## [1] "i="                 "49"                 "Wilcoxon-test="    
## [4] "0.0273568640429652"
## [1] "i="                "50"                "Wilcoxon-test="   
## [4] "0.234512884956754"
## [1] "i="                 "51"                 "Wilcoxon-test="    
## [4] "0.0319316834536563"
## [1] "i="                   "52"                   "Wilcoxon-test="      
## [4] "8.47910363381624e-05"
## [1] "i="                   "53"                   "Wilcoxon-test="      
## [4] "0.000102471108880125"
## [1] "i="                "54"                "Wilcoxon-test="   
## [4] "0.706925544464841"
## [1] "i="                 "55"                 "Wilcoxon-test="    
## [4] "0.0146128471982993"
## [1] "i="                 "56"                 "Wilcoxon-test="    
## [4] "0.0723311086286497"
## [1] "i="                   "57"                   "Wilcoxon-test="      
## [4] "0.000213922368883682"
## [1] "i="                   "58"                   "Wilcoxon-test="      
## [4] "0.000142016596036652"
## [1] "i="               "59"               "Wilcoxon-test="  
## [4] "0.11681183091181"
## [1] "i="                 "60"                 "Wilcoxon-test="    
## [4] "0.0253973561846997"
## [1] "i="                "61"                "Wilcoxon-test="   
## [4] "0.506528137961723"
## [1] "i="                "62"                "Wilcoxon-test="   
## [4] "0.770981890446707"
## [1] "i="             "63"             "Wilcoxon-test=" "NaN"           
## [1] "i="                "64"                "Wilcoxon-test="   
## [4] "0.204494745121826"
## [1] "i="                "65"                "Wilcoxon-test="   
## [4] "0.438687802825894"
## [1] "i="                "66"                "Wilcoxon-test="   
## [4] "0.143044903491988"
## [1] "i="                "67"                "Wilcoxon-test="   
## [4] "0.208214945007494"
## [1] "i="                 "68"                 "Wilcoxon-test="    
## [4] "0.0827002916147158"
## [1] "i="                "69"                "Wilcoxon-test="   
## [4] "0.768385666605511"
## [1] "i="                "70"                "Wilcoxon-test="   
## [4] "0.967575952568832"
## [1] "i="               "71"               "Wilcoxon-test="  
## [4] "0.48546165207412"
## [1] "i="                "72"                "Wilcoxon-test="   
## [4] "0.764338124700727"
## [1] "i="                "73"                "Wilcoxon-test="   
## [4] "0.701672829552746"
## [1] "i="                "74"                "Wilcoxon-test="   
## [4] "0.182127870815918"
## [1] "i="                "75"                "Wilcoxon-test="   
## [4] "0.783908000869157"
## [1] "i="                 "76"                 "Wilcoxon-test="    
## [4] "0.0300275504051034"
## [1] "i="                 "77"                 "Wilcoxon-test="    
## [4] "0.0652937253636983"
## [1] "i="                "78"                "Wilcoxon-test="   
## [4] "0.886630291736222"
## [1] "i="                  "79"                  "Wilcoxon-test="     
## [4] "0.00304995330937243"
## [1] "i="                   "80"                   "Wilcoxon-test="      
## [4] "0.000193269737895467"
## [1] "i="                "81"                "Wilcoxon-test="   
## [4] "0.263594088910847"
## [1] "i="               "82"               "Wilcoxon-test="  
## [4] "0.55249462248568"
## [1] "i="                "83"                "Wilcoxon-test="   
## [4] "0.840180337999498"
## [1] "i="                 "84"                 "Wilcoxon-test="    
## [4] "1.018634517112e-07"
## [1] "i="                "85"                "Wilcoxon-test="   
## [4] "0.809284153154193"
## [1] "i="                   "86"                   "Wilcoxon-test="      
## [4] "0.000202012794771608"
## [1] "i="                   "87"                   "Wilcoxon-test="      
## [4] "0.000257390393155121"
## [1] "i="                "88"                "Wilcoxon-test="   
## [4] "0.151668005666288"
## [1] "i="                "89"                "Wilcoxon-test="   
## [4] "0.610548017567455"
## [1] "i="                "90"                "Wilcoxon-test="   
## [4] "0.177766076673501"
## [1] "i="                   "91"                   "Wilcoxon-test="      
## [4] "7.52724170827059e-05"
## [1] "i="                "92"                "Wilcoxon-test="   
## [4] "0.472935618407835"
## [1] "i="                   "93"                   "Wilcoxon-test="      
## [4] "0.000191281916588349"
## [1] "i="                "94"                "Wilcoxon-test="   
## [4] "0.710116019415709"
## [1] "i="                "95"                "Wilcoxon-test="   
## [4] "0.777249640877696"
## [1] "i="                "96"                "Wilcoxon-test="   
## [4] "0.394345301658289"
## [1] "i="                "97"                "Wilcoxon-test="   
## [4] "0.622722439182102"
## [1] "i="                "98"                "Wilcoxon-test="   
## [4] "0.333295057251737"
## [1] "i="                   "99"                   "Wilcoxon-test="      
## [4] "9.31683399292477e-05"
## [1] "i="                 "100"                "Wilcoxon-test="    
## [4] "0.0108452571605789"
## [1] "i="                 "101"                "Wilcoxon-test="    
## [4] "0.0715649084105231"

print(c("Wilcoxon test results: ", test.results.bin))

##   [1] "Wilcoxon test results: " "1"                      
##   [3] "0"                       "0"                      
##   [5] "0"                       "1"                      
##   [7] "1"                       "1"                      
##   [9] "0"                       "1"                      
##  [11] "1"                       "1"                      
##  [13] "1"                       "1"                      
##  [15] "1"                       "1"                      
##  [17] "1"                       "1"                      
##  [19] "1"                       "1"                      
##  [21] "1"                       "1"                      
##  [23] "1"                       "0"                      
##  [25] "1"                       "1"                      
##  [27] "1"                       "1"                      
##  [29] "1"                       "1"                      
##  [31] "1"                       "1"                      
##  [33] "1"                       "0"                      
##  [35] "1"                       "1"                      
##  [37] "1"                       "1"                      
##  [39] "1"                       "1"                      
##  [41] "1"                       "0"                      
##  [43] "0"                       "0"                      
##  [45] "1"                       "1"                      
##  [47] "1"                       "0"                      
##  [49] "0"                       "0"                      
##  [51] "1"                       "0"                      
##  [53] "0"                       "0"                      
##  [55] "1"                       "0"                      
##  [57] "1"                       "0"                      
##  [59] "0"                       "1"                      
##  [61] "0"                       "1"                      
##  [63] "1"                       NA                       
##  [65] "1"                       "1"                      
##  [67] "1"                       "1"                      
##  [69] "1"                       "1"                      
##  [71] "1"                       "1"                      
##  [73] "1"                       "1"                      
##  [75] "1"                       "1"                      
##  [77] "0"                       "1"                      
##  [79] "1"                       "0"                      
##  [81] "0"                       "1"                      
##  [83] "1"                       "1"                      
##  [85] "0"                       "1"                      
##  [87] "0"                       "0"                      
##  [89] "1"                       "1"                      
##  [91] "1"                       "0"                      
##  [93] "1"                       "0"                      
##  [95] "1"                       "1"                      
##  [97] "1"                       "1"                      
##  [99] "1"                       "0"                      
## [101] "0"                       "1"

test.results.corr <- stats::p.adjust(test.results.raw, method = "fdr", n = length(test.results.raw))
# where methods are "holm", "hochberg", "hommel", "bonferroni", "BH", "BY","fdr", "none")
plot(test.results.raw, test.results.corr)

# zeros (0) are significant independent between-group T-test differences,ones (1) are insignificant
# Check the Differences between the rate of significance between the raw and FDR-corrected p-values
test.results.bin <- ifelse(test.results.raw > alpha.0.05, 1, 0)
table(test.results.bin)

## test.results.bin
##  0  1 
## 29 71

test.results.corr.bin <- ifelse(test.results.corr > alpha.0.05, 1, 0)
table(test.results.corr.bin)

## test.results.corr.bin
##  0  1 
## 17 83

#Right Skewed
N <- 10000
x <- rnbinom(N, 10, .5)
hist(x,
xlim=c(min(x), max(x)), probability=T, nclass=max(x)-min(x)+1,
col='lightblue', xlab=' ', ylab=' ', axes=F,
main='Right Skewed')
lines(density(x, bw=1), col='red', lwd=3)

#No Skew
N <- 10000
x <- rnorm(N, 0, 1)
hist(x, probability=T,
col='lightblue', xlab=' ', ylab=' ', axes=F,
main='No Skew')
lines(density(x, bw=0.4), col='red', lwd=3)

#Uniform density
x<-runif(1000, 1, 50)
hist(x, col='lightblue', main="Uniform Distribution", probability = T, xlab=
"", ylab="Density", axes=F)
abline(h=0.02, col='red', lwd=3)

#68-95-99.7 rule
x <- rnorm(N, 0, 1)
hist(x, probability=T,
col='lightblue', xlab=' ', ylab=' ', axes = F,
main='68-95-99.7 Rule')
lines(density(x, bw=0.4), col='red', lwd=3)
axis(1, at=c(-3, -2, -1, 0, 1, 2, 3), labels = expression(mu-3*sigma,
mu-2*sigma, mu-sigma, mu, mu+sigma, mu+2*sigma, mu+3*sigma))
abline(v=-1, lwd=3, lty=2)
abline(v=1, lwd=3, lty=2)
abline(v=-2, lwd=3, lty=2)
abline(v=2, lwd=3, lty=2)
abline(v=-3, lwd=3, lty=2)
abline(v=3, lwd=3, lty=2)
text(0, 0.2, "68%")
segments(-1, 0.2, -0.3, 0.2, col = 'red', lwd=2)
segments(1, 0.2, 0.3, 0.2, col = 'red', lwd=2)
text(0, 0.15, "95%")
segments(-2, 0.15, -0.3, 0.15, col = 'red', lwd=2)
segments(2, 0.15, 0.3, 0.15, col = 'red', lwd=2)
text(0, 0.1, "99.7%")
segments(-3, 0.1, -0.3, 0.1, col = 'red', lwd=2)
segments(3, 0.1, 0.3, 0.1, col = 'red', lwd=2)

Temp_Data <- as.data.frame(read.csv("https://umich.instructure.com/files/706163/download?download_frd=1", header=T, na.strings=c("", ".", "NA", "NR")))
summary(Temp_Data)

##       Year           Jan             Feb             Mar       
##  Min.   :1900   Min.   :11.40   Min.   :14.00   Min.   :24.70  
##  1st Qu.:1929   1st Qu.:20.95   1st Qu.:22.35   1st Qu.:32.65  
##  Median :1958   Median :24.20   Median :26.10   Median :35.30  
##  Mean   :1958   Mean   :24.22   Mean   :25.45   Mean   :35.39  
##  3rd Qu.:1986   3rd Qu.:27.65   3rd Qu.:28.90   3rd Qu.:38.15  
##  Max.   :2015   Max.   :35.00   Max.   :35.60   Max.   :50.70  
##                 NA's   :1       NA's   :1       NA's   :1      
##       Apr             May             Jun             Jul       
##  Min.   :37.50   Min.   :49.40   Min.   :61.90   Min.   :67.80  
##  1st Qu.:45.15   1st Qu.:56.60   1st Qu.:66.45   1st Qu.:71.10  
##  Median :47.60   Median :58.70   Median :68.40   Median :72.60  
##  Mean   :47.55   Mean   :58.85   Mean   :68.22   Mean   :72.66  
##  3rd Qu.:50.15   3rd Qu.:61.65   3rd Qu.:70.50   3rd Qu.:73.70  
##  Max.   :54.50   Max.   :66.60   Max.   :74.30   Max.   :78.90  
##  NA's   :1       NA's   :1                                      
##       Aug             Sep             Oct             Nov       
##  Min.   :64.90   Min.   :54.80   Min.   :42.70   Min.   :32.90  
##  1st Qu.:68.85   1st Qu.:62.20   1st Qu.:50.20   1st Qu.:37.40  
##  Median :70.90   Median :63.80   Median :52.20   Median :39.80  
##  Mean   :70.74   Mean   :63.82   Mean   :52.29   Mean   :39.76  
##  3rd Qu.:72.40   3rd Qu.:65.75   3rd Qu.:54.30   3rd Qu.:41.85  
##  Max.   :76.20   Max.   :68.80   Max.   :62.10   Max.   :47.50  
##  NA's   :1       NA's   :1       NA's   :1       NA's   :1      
##       Dec       
##  Min.   :17.70  
##  1st Qu.:25.55  
##  Median :28.40  
##  Mean   :28.39  
##  3rd Qu.:31.65  
##  Max.   :36.80  
##  NA's   :1

# View(Temp_Data); colnames(Temp_Data)
# Wide-to-Long transformation: reshape arguments include
# (1) list of variable names that define the different times or metrics (varying),
# (2) the name we wish to give the variable containing these values in our long dataset (v.names),
# (3) the name we wish to give the variable describing the different times or metrics (timevar),
# (4) the values this variable will have (times), and
# (5) the end format for the data (direction)
# Before reshaping make sure all data types are the same as putting them in 1 column will
# otherwise generate inconsistencies/errors

colN <- colnames(Temp_Data[,-1])
longTempData <- reshape(Temp_Data, varying = colN, v.names = "Temps", timevar="Months", times = colN, direction = "long")

# View(longTempData)
bar2 <- ggplot(longTempData, aes(x = Months, y = Temps, fill = Months)) +
geom_bar(stat = "identity")
print(bar2)

## Warning: Removed 10 rows containing missing values (position_stack).

bar3 <- ggplot(longTempData, aes(x = Year, y = Temps, fill = Months)) +
geom_bar(stat = "identity")
print(bar3)

## Warning: Removed 10 rows containing missing values (position_stack).

p <- ggplot(longTempData, aes(x=Year, y=as.integer(Temps), colour=Months)) +
geom_line()
p

Managing Data in R

Kushan De Silva

22 May 2019

R Markdown