This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
## Parallel Computing HW2 Assignment - Rachel Konshok
#First I install the packages and load the libraries I need
install.packages("foreach")
## Installing package into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'foreach' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
install.packages("doParallel")
## Installing package into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'doParallel' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
library(foreach)
## Warning: package 'foreach' was built under R version 4.4.3
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.3
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.4.3
## Loading required package: parallel
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
#for the regression of sepal width on sepal length we first extract the two columns
x1 <- iris[c(1,2)]   # keep only Sepal.Length and Sepal.Width
head(x1)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
# fit the regression model: glm() with its default gaussian family is ordinary linear regression
result <- glm(x1[,2] ~ x1[,1])
#cannot use family = binomial here because Sepal.Width is not a binary variable, so this is not a true logistic regression
result
##
## Call: glm(formula = x1[, 2] ~ x1[, 1])
##
## Coefficients:
## (Intercept) x1[, 1]
## 3.41895 -0.06188
##
## Degrees of Freedom: 149 Total (i.e. Null); 148 Residual
## Null Deviance: 28.31
## Residual Deviance: 27.92 AIC: 179.5
coefficients(result)
## (Intercept) x1[, 1]
## 3.4189468 -0.0618848
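#For contrast, an actual logistic regression needs a binary response; a minimal sketch (not part of the original run) using an indicator for setosa:
logit_fit <- glm(I(Species == "setosa") ~ Sepal.Length, family = binomial, data = iris)
coefficients(logit_fit)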
#Question 4: run B = 20000 bootstrap replicates for the iris data set variables Sepal.Length and Sepal.Width
trials <- 20000
res <- data.frame()
system.time({
  trial <- 1
  while(trial <= trials) {
    ind <- sample(100, 100, replace=TRUE)   # bootstrap indices (note: draws from the first 100 rows only)
    result1 <- glm(x1[ind,2] ~ x1[ind,1])
    r <- coefficients(result1)
    res <- rbind(res, r)
    trial <- trial + 1
  }
})
## user system elapsed
## 18.99 0.94 19.97
head(res)
## X3.61915094339623 X.0.0867924528301887
## 1 3.619151 -0.08679245
## 2 3.577478 -0.10403206
## 3 3.975554 -0.15643406
## 4 3.480705 -0.06918479
## 5 3.387607 -0.04428623
## 6 3.604432 -0.09996884
mean(res[,2])
## [1] -0.1557452
sd(res[,2])
## [1] 0.05527194
#The loop above times the bootstrap estimate of the variability of the regression slope without using parallel computing
#running this code, the elapsed time for B = 20000 was 19.97 seconds
#the standard deviation of the slope estimates was 0.05527194 (note this is sd(), not the variance)
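#for reference, var() is the square of sd(); a quick check on the bootstrap slopes (sketch, output not shown here):
var(res[,2])   # equals sd(res[,2])^2, about 0.0553^2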
#Now load the libraries for parallel computing
library(foreach)
library(doParallel)
detectCores()
## [1] 8
#I have eight cores on my computer
numCores <- 6   # use 6 of the 8 available cores
for(i in 1:3){
  print(i^2)
}
## [1] 1
## [1] 4
## [1] 9
foreach (i=1:3) %do% {
  i^2
}
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
registerDoParallel(numCores)
foreach (i=1:3) %dopar% {
  i^2
}
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
foreach (i=1:3, .combine=c) %dopar% {
  i^2
}
## [1] 1 4 9
foreach (i=1:3, .combine=rbind) %dopar% {
  i^2
}
## [,1]
## result.1 1
## result.2 4
## result.3 9
#Below, B = 20000 bootstrap replicates are run with parallel computing using foreach and doParallel (question 5)
#the elapsed time was 7.48 seconds, much faster than the 19.97 seconds the sequential loop took above
#the standard deviation of the slope estimates for this run was 0.05505604
#overall the estimates were very close, and parallel computing greatly reduced the run time
trials <- 20000
system.time({
  r <- foreach(icount(trials), .combine=rbind) %dopar% {
    ind <- sample(100, 100, replace=TRUE)   # resample data
    result1 <- glm(x1[ind,2] ~ x1[ind,1])
    coefficients(result1)
  }
})
## user system elapsed
## 4.23 0.56 7.48
head(r)
## (Intercept) x1[ind, 1]
## result.1 3.584939 -0.10141376
## result.2 3.624672 -0.08240969
## result.3 3.957852 -0.17349472
## result.4 3.803692 -0.13319014
## result.5 3.602386 -0.11036206
## result.6 4.167545 -0.18661773
mean(r[,2])
## [1] -0.1552835
sd(r[,2])
## [1] 0.05505604
stopImplicitCluster()
#question 6: using parLapply to compute in parallel
#using parLapply the elapsed time was 1.94 seconds
#overall parLapply was the fastest method, and both parallel methods were far faster than running the code without parallel computing
model.mse <- function(x) {   # x is a dummy index supplied by parLapply
  id <- sample(1:nrow(x1), 200, replace = TRUE)              # bootstrap row indices
  mod <- lm(Sepal.Width ~ ., data = x1[id,])                 # fit on the resampled rows
  mse <- mean((fitted.values(mod) - x1$Sepal.Width[id])^2)   # training MSE against the observed Sepal.Width
  return(mse)
}
x.list <- sapply(1:10000, list)
n.cores <- detectCores()
# 8 cores
system.time({
  clust <- makeCluster(n.cores)
  clusterExport(clust, "x1")
  a <- parLapply(clust, x.list, model.mse)
})
## user system elapsed
## 0.05 0.05 1.94
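#the bootstrap MSEs are returned in the list a; a small sketch (output not shown here) to summarize them:
mse_vals <- unlist(a)
mean(mse_vals)
var(mse_vals)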
stopCluster(clust)
#I accidentally did the regression analysis for questions 4, 5, and 6 first; now I'll do the correlation coefficient.
#I couldn't find examples in the HW, lectures, or code, so I assume a simple Pearson correlation coefficient is intended
s.length <- iris$Sepal.Length
s.width <- iris$Sepal.Width
correlation <- cor(s.length, s.width, method = "pearson")
correlation
## [1] -0.1175698
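#Pearson's r is the covariance scaled by the two standard deviations; a quick check (sketch, output not shown here):
cov(s.length, s.width) / (sd(s.length) * sd(s.width))   # matches cor(s.length, s.width)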
#question 1: bootstrap the correlation coefficient B = 20000 times without parallel computing
B <- 20000
n <- length(s.length)
runtime <- system.time({
  r <- numeric(B)
  for (i in 1:B) {
    idx <- sample(1:n, size = n, replace = TRUE)
    r[i] <- cor(s.length[idx], s.width[idx])
  }
  # Calculate results
  mean_r <- mean(r)
  var_r <- var(r)
})
runtime
## user system elapsed
## 0.61 0.00 0.61
mean_r
## [1] -0.1195381
var_r
## [1] 0.005441184
#without parallel computation the elapsed time for this calculation was 0.61 seconds, the mean was -0.1195381, and the variance was 0.005441184
#question 2 below
#using foreach and %dopar% the elapsed time was 7.48 seconds, the mean was -0.1196421, and the variance was 0.005619631
n.cores <- detectCores()
cl <- makeCluster(n.cores)
registerDoParallel(cl)
# Run bootstrap with foreach + dopar
runtime <- system.time({
  cor_vals <- foreach(i = 1:B, .combine = c) %dopar% {
    idx <- sample(1:n, n, replace = TRUE)
    cor(s.length[idx], s.width[idx])
  }
})
# Stop the cluster
stopCluster(cl)
# Compute results
mean_corr <- mean(cor_vals)
var_corr <- var(cor_vals)
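#the timing and bootstrap summaries computed above were not printed in the original run; they can be displayed with:
runtime
mean_corr
var_corr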
#question 3: using parLapply the elapsed time was 0.3 seconds, the mean was -0.1190302, and the variance was 0.005518913
#overall, for Pearson's coefficient, parLapply took the shortest processing time, but foreach with %dopar% actually took more time than not using parallel computing
#the means were all similar to one another, and so were the variances
boot_cor <- function(dummy) {
  idx <- sample(1:nrow(x1), replace = TRUE)
  return(cor(x1$Sepal.Length[idx], x1$Sepal.Width[idx]))
}
# Create a list of length B
B <- 20000
x.list <- vector("list", B)
# Setup parallel cluster
n.cores <- detectCores()
clust <- makeCluster(n.cores)
# Export required variables to each cluster worker
clusterExport(clust, varlist = c("x1", "boot_cor"))
runtime <- system.time({
  cor_vals <- parLapply(clust, x.list, boot_cor)
})
stopCluster(clust)
cor_vals <- unlist(cor_vals) # convert list to numeric vector
mean_corr <- mean(cor_vals)
var_corr <- var(cor_vals)
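#as before, the timing and summaries computed above can be printed for inspection:
runtime
mean_corr
var_corr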
#missing data values: question 2 (parts 1-5)
irisrk = read.csv("C:/Users/Rache/OneDrive/Documents/R/R folder/Stat713/Hw2/iris.csv")
irisrk
## s.length s.width p.length p.width class
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 NA 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
## 7 4.6 3.4 1.4 0.3 Iris-setosa
## 8 5.0 NA 1.5 0.2 Iris-setosa
## 9 4.4 2.9 1.4 0.2 Iris-setosa
## 10 7.0 3.2 4.7 1.4 Iris-versicolor
## 11 6.4 3.2 4.5 1.5 Iris-versicolor
## 12 6.9 3.1 4.9 1.5 Iris-versicolor
## 13 5.5 2.3 4.0 1.3 Iris-versicolor
## 14 6.5 2.8 4.6 1.5 Iris-versicolor
## 15 5.7 2.8 4.5 1.3 Iris-versicolor
## 16 6.3 3.3 4.7 1.6 Iris-versicolor
## 17 6.3 NA 6.0 2.5 Iris-virginica
## 18 5.8 2.7 5.1 1.9 Iris-virginica
## 19 7.1 3.0 5.9 2.1 Iris-virginica
## 20 6.3 2.9 5.6 1.8 Iris-virginica
## 21 6.5 NA 5.8 NA Iris-virginica
## 22 7.6 3.0 6.6 2.1 Iris-virginica
## 23 4.9 NA 4.5 NA Iris-virginica
## 24 7.3 2.9 6.3 1.8 Iris-virginica
dim(irisrk)
## [1] 24 5
summary(irisrk)
## s.length s.width p.length p.width
## Min. :4.400 Min. :2.300 Min. :1.300 Min. :0.200
## 1st Qu.:4.975 1st Qu.:2.900 1st Qu.:1.475 1st Qu.:0.200
## Median :5.750 Median :3.000 Median :4.500 Median :1.350
## Mean :5.825 Mean :3.089 Mean :3.779 Mean :1.109
## 3rd Qu.:6.500 3rd Qu.:3.250 3rd Qu.:5.225 3rd Qu.:1.750
## Max. :7.600 Max. :3.900 Max. :6.600 Max. :2.500
## NA's :5 NA's :2
## class
## Length:24
## Class :character
## Mode :character
##
##
##
##
sapply(irisrk, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 5 0 2 0
# Overall missing value count
sum(is.na(irisrk))
## [1] 7
#there are 7 total missing values
# Calculate the percentage of missing values in each column
sapply(irisrk, function(x) mean(is.na(x)) * 100)
## s.length s.width p.length p.width class
## 0.000000 20.833333 0.000000 8.333333 0.000000
#s.width has 20.83% missing values and p.width has 8.33% missing values; the rest have none
# Count missing values in each column
sapply(irisrk, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 5 0 2 0
#in total, s.width has 5 missing values and p.width has 2
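#missingness can also be counted by row; a quick sketch (output not shown here) counting rows with at least one NA:
sum(!complete.cases(irisrk))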
# For a single column
# Create a copy of the original dataframe
iris_imputed_one <- irisrk
iris_imputed_one$p.width[is.na(irisrk$p.width)] <- mean(irisrk$p.width, na.rm=TRUE)
# Count missing values in each column
sapply(iris_imputed_one, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 5 0 0 0
# whole dataset
# For all columns in a dataset
airquality_imputed <- irisrk   # name kept from a previous airquality example; this copy holds the imputed iris data
for(i in 1:ncol(airquality_imputed)) {
  if(is.numeric(airquality_imputed[[i]])) {
    airquality_imputed[[i]][is.na(airquality_imputed[[i]])] <- mean(airquality_imputed[[i]], na.rm=TRUE)
  }
}
sapply(airquality_imputed, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 0 0 0 0
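#mean imputation keeps the column mean but shrinks the spread; a quick check (sketch, output not shown here):
sd(irisrk$s.width, na.rm = TRUE)   # spread of the observed values
sd(airquality_imputed$s.width)     # smaller after the NAs are replaced by the mean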
# compare the imputed/unimputed variable
install.packages("ggplot2")
## Installing package into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.4.3
# Density plots
ggplot(irisrk, aes(x=p.width, fill="Original")) +
  geom_density(alpha=0.5) +
  geom_density(data=airquality_imputed, aes(x=p.width, fill="Imputed"), alpha=0.5) +
  labs(title="Density Plot of p.width: Original vs. Imputed")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
#the density plot of p.width shows the imputed values adding mass near the mean, whereas before the distribution skewed toward 0-2
#however, the imputed values do look very different from the original
ggplot(irisrk, aes(x=s.width, fill="Original")) +
  geom_density(alpha=0.5) +
  geom_density(data=airquality_imputed, aes(x=s.width, fill="Imputed"), alpha=0.5) +
  labs(title="Density Plot of s.width: Original vs. Imputed")
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).
#the imputed s.width values match the original more closely than p.width did, but they are centered more around the mean than the originals
#overall the imputed values differ noticeably from the original values; with so few observations in this dataset, even one changed value looks drastic on the graph
#we now compare two different linear models, fitted before and after imputation
# Linear model before imputation
lm_original <- lm(s.length ~ s.width, data=na.omit(irisrk))
# Linear model after imputation
lm_imputed <- lm(s.length ~ s.width, data=airquality_imputed)
summary(lm_original)
##
## Call:
## lm(formula = s.length ~ s.width, data = na.omit(irisrk))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63823 -0.75731 -0.01981 0.76638 1.62362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.8316 2.0226 3.872 0.00122 **
## s.width -0.6184 0.6505 -0.951 0.35509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9946 on 17 degrees of freedom
## Multiple R-squared: 0.05048, Adjusted R-squared: -0.005373
## F-statistic: 0.9038 on 1 and 17 DF, p-value: 0.3551
summary(lm_imputed)
##
## Call:
## lm(formula = s.length ~ s.width, data = airquality_imputed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54217 -0.85000 -0.09481 0.65126 1.71967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.7356 1.9704 3.926 0.000723 ***
## s.width -0.6184 0.6346 -0.975 0.340378
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9703 on 22 degrees of freedom
## Multiple R-squared: 0.04138, Adjusted R-squared: -0.00219
## F-statistic: 0.9497 on 1 and 22 DF, p-value: 0.3404
# Diagnostic plots before imputation
par(mfrow=c(2,2))
plot(lm_original)
# Diagnostic plots after imputation
par(mfrow=c(2,2))
plot(lm_imputed)
#Imputation with MICE (Multivariate Imputation by Chained Equations)
# Install packages if they are not already installed
install.packages(c("mice", "ggplot2", "naniar"))
## Warning: package 'ggplot2' is in use and will not be installed
## Installing packages into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'mice' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'mice'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Rache\AppData\Local\R\win-library\4.4\00LOCK\mice\libs\x64\mice.dll to
## C:\Users\Rache\AppData\Local\R\win-library\4.4\mice\libs\x64\mice.dll:
## Permission denied
## Warning: restored 'mice'
## package 'naniar' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
# Load the packages
library(mice)
## Warning: package 'mice' was built under R version 4.4.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggplot2)
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
vis_miss(irisrk)
#from the graph we can see s.width has 21% missing values while p.width has 8% missing values
#we can also see from the graph that the other columns do not have any missing values
#overall 94.2% of the data is present while missing values only make up 5.8% of the data
set.seed(12345)
# Perform Multiple Imputation
imputed_data <- mice(irisrk, m=5, method='pmm', print=FALSE)
## Warning: Number of logged events: 1
# m = 5: impute the data 5 times to get 5 different imputed data sets.
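#the imputed draws themselves can be inspected; for example, the 5 candidate values mice generated for each missing s.width cell (output not shown here):
imputed_data$imp$s.width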
model_original <- lm(s.length ~ s.width, data=na.omit(irisrk))
# Summarize the model
summary(model_original)
##
## Call:
## lm(formula = s.length ~ s.width, data = na.omit(irisrk))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63823 -0.75731 -0.01981 0.76638 1.62362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.8316 2.0226 3.872 0.00122 **
## s.width -0.6184 0.6505 -0.951 0.35509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9946 on 17 degrees of freedom
## Multiple R-squared: 0.05048, Adjusted R-squared: -0.005373
## F-statistic: 0.9038 on 1 and 17 DF, p-value: 0.3551
# Fit the same model to each of the imputed datasets and pool the results
model_imputed <- with(data=imputed_data, exp=lm(s.length ~ s.width))
model_imputed
## call :
## with.mids(data = imputed_data, expr = lm(s.length ~ s.width))
##
## call1 :
## mice(data = irisrk, m = 5, method = "pmm", printFlag = FALSE)
##
## nmis :
## [1] 0 5 0 2 0
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 6.735 -0.300
##
##
## [[2]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 7.5434 -0.5619
##
##
## [[3]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 8.1682 -0.7599
##
##
## [[4]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 7.2688 -0.4753
##
##
## [[5]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 7.5490 -0.5599
# Pool the results: pool() combines the estimates from each imputed dataset using Rubin's rules (averaging the estimates and combining within- and between-imputation variance)
pooled_results <- pool(model_imputed)
summary(pooled_results)
## term estimate std.error statistic df p.value
## 1 (Intercept) 7.4528883 1.8225471 4.0892706 17.51034 0.0007238649
## 2 s.width -0.5314124 0.5907044 -0.8996249 17.58535 0.3804742797
#check whether the imputation distorted the distribution of the variable too much
par(mfrow=c(1,2))
hist(na.omit(irisrk)$s.length, main="Original s.length", xlab="sepal length", col="blue")
imputed_data_second <- complete(imputed_data, action=1, include=FALSE)   # the 1st imputed data set
hist(imputed_data_second$s.length, main="Imputed s.length", xlab="sepal length", col="red")
#there is distortion specifically from 4.5-5.5 and slightly from 6-6.5, but overall the data looks similar
#also, with so few observations, slight changes (for example, one extra observation) drastically affect the graphs
# Linear model before imputation
lm_original <- lm(s.length ~ s.width, data=na.omit(irisrk))
# Linear model after imputation
# Note: pooled_results is a mice pooling object, not a data frame, so lm() cannot find
# s.length and s.width in it; it falls back to the full-length vectors defined from iris
# earlier in the session (hence the 148 residual df below). The pooled summary above is
# the proper post-imputation analysis.
lm_imputed <- lm(s.length ~ s.width, data=pooled_results)
summary(lm_original)
##
## Call:
## lm(formula = s.length ~ s.width, data = na.omit(irisrk))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63823 -0.75731 -0.01981 0.76638 1.62362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.8316 2.0226 3.872 0.00122 **
## s.width -0.6184 0.6505 -0.951 0.35509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9946 on 17 degrees of freedom
## Multiple R-squared: 0.05048, Adjusted R-squared: -0.005373
## F-statistic: 0.9038 on 1 and 17 DF, p-value: 0.3551
summary(lm_imputed)
##
## Call:
## lm(formula = s.length ~ s.width, data = pooled_results)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5561 -0.6333 -0.1120 0.5579 2.2226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5262 0.4789 13.63 <2e-16 ***
## s.width -0.2234 0.1551 -1.44 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8251 on 148 degrees of freedom
## Multiple R-squared: 0.01382, Adjusted R-squared: 0.007159
## F-statistic: 2.074 on 1 and 148 DF, p-value: 0.1519
# Diagnostic plots before imputation
par(mfrow=c(2,2))
plot(lm_original)
# Diagnostic plots after imputation
par(mfrow=c(2,2))
plot(lm_imputed)
#after using MICE the residuals show much better homoscedasticity
#every plot shows data that fits the regression line far better