This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
## Parallel Computing HW2 Assignment - Rachel Konshok
#First I install the packages and load the libraries I need
install.packages("foreach")
## Installing package into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'foreach' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
install.packages("doParallel")
## Installing package into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'doParallel' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
library(foreach)
## Warning: package 'foreach' was built under R version 4.4.3
library(doParallel)
## Warning: package 'doParallel' was built under R version 4.4.3
## Loading required package: iterators
## Warning: package 'iterators' was built under R version 4.4.3
## Loading required package: parallel
data(iris)
head(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
#for the regression of sepal width on sepal length we first extract the two columns
x1 <- iris[c(1,2)]   # keep only Sepal.Length and Sepal.Width
head(x1)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
# fit the regression model: glm() with its default gaussian family is ordinary linear regression
result <- glm(x1[,2] ~ x1[,1])
#cannot use family = binomial here because Sepal.Width is not a binary variable, so this is not a true logistic regression
result
##
## Call: glm(formula = x1[, 2] ~ x1[, 1])
##
## Coefficients:
## (Intercept) x1[, 1]
## 3.41895 -0.06188
##
## Degrees of Freedom: 149 Total (i.e. Null); 148 Residual
## Null Deviance: 28.31
## Residual Deviance: 27.92 AIC: 179.5
coefficients(result)
## (Intercept) x1[, 1]
## 3.4189468 -0.0618848
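#For contrast, an actual logistic regression needs a binary response; a minimal sketch (not part of the original run) using an indicator for setosa:
logit_fit <- glm(I(Species == "setosa") ~ Sepal.Length, family = binomial, data = iris)
coefficients(logit_fit)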
#Question 4: run B = 20000 bootstrap replicates for the iris data set variables Sepal.Length and Sepal.Width
trials <- 20000
res <- data.frame()
system.time({
  trial <- 1
  while(trial <= trials) {
    ind <- sample(100, 100, replace=TRUE)   # bootstrap indices (note: draws from the first 100 rows only)
    result1 <- glm(x1[ind,2] ~ x1[ind,1])
    r <- coefficients(result1)
    res <- rbind(res, r)
    trial <- trial + 1
  }
})
## user system elapsed
## 18.99 0.94 19.97
head(res)
## X3.61915094339623 X.0.0867924528301887
## 1 3.619151 -0.08679245
## 2 3.577478 -0.10403206
## 3 3.975554 -0.15643406
## 4 3.480705 -0.06918479
## 5 3.387607 -0.04428623
## 6 3.604432 -0.09996884
mean(res[,2])
## [1] -0.1557452
sd(res[,2])
## [1] 0.05527194
#The loop above times the bootstrap estimate of the variability of the regression slope without using parallel computing
#running this code, the elapsed time for B = 20000 was 19.97 seconds
#the standard deviation of the slope estimates was 0.05527194 (note this is sd(), not the variance)
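#for reference, var() is the square of sd(); a quick check on the bootstrap slopes (sketch, output not shown here):
var(res[,2])   # equals sd(res[,2])^2, about 0.0553^2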
#Now load the libraries for parallel computing
library(foreach)
library(doParallel)
detectCores()
## [1] 8
#I have eight cores on my computer
numCores <- 6   # use 6 of the 8 available cores
for(i in 1:3){
  print(i^2)
}
## [1] 1
## [1] 4
## [1] 9
foreach (i=1:3) %do% {
  i^2
}
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
registerDoParallel(numCores)
foreach (i=1:3) %dopar% {
  i^2
}
## [[1]]
## [1] 1
##
## [[2]]
## [1] 4
##
## [[3]]
## [1] 9
foreach (i=1:3, .combine=c) %dopar% {
  i^2
}
## [1] 1 4 9
foreach (i=1:3, .combine=rbind) %dopar% {
  i^2
}
## [,1]
## result.1 1
## result.2 4
## result.3 9
#Below, B = 20000 bootstrap replicates are run with parallel computing using foreach and doParallel (question 5)
#the elapsed time was 7.48 seconds, much faster than the 19.97 seconds the sequential loop took above
#the standard deviation of the slope estimates for this run was 0.05505604
#overall the estimates were very close, and parallel computing greatly reduced the run time
trials <- 20000
system.time({
  r <- foreach(icount(trials), .combine=rbind) %dopar% {
    ind <- sample(100, 100, replace=TRUE)   # resample data
    result1 <- glm(x1[ind,2] ~ x1[ind,1])
    coefficients(result1)
  }
})
## user system elapsed
## 4.23 0.56 7.48
head(r)
## (Intercept) x1[ind, 1]
## result.1 3.584939 -0.10141376
## result.2 3.624672 -0.08240969
## result.3 3.957852 -0.17349472
## result.4 3.803692 -0.13319014
## result.5 3.602386 -0.11036206
## result.6 4.167545 -0.18661773
mean(r[,2])
## [1] -0.1552835
sd(r[,2])
## [1] 0.05505604
stopImplicitCluster()
#question 6: using parLapply to compute in parallel
#using parLapply the elapsed time was 1.94 seconds
#overall parLapply was the fastest method, and both parallel methods were far faster than running the code without parallel computing
model.mse <- function(x) {   # x is a dummy index supplied by parLapply
  id <- sample(1:nrow(x1), 200, replace = TRUE)              # bootstrap row indices
  mod <- lm(Sepal.Width ~ ., data = x1[id,])                 # fit on the resampled rows
  mse <- mean((fitted.values(mod) - x1$Sepal.Width[id])^2)   # training MSE against the observed Sepal.Width
  return(mse)
}
x.list <- sapply(1:10000, list)
n.cores <- detectCores()
# 8 cores
system.time({
  clust <- makeCluster(n.cores)
  clusterExport(clust, "x1")
  a <- parLapply(clust, x.list, model.mse)
})
## user system elapsed
## 0.05 0.05 1.94
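#the bootstrap MSEs are returned in the list a; a small sketch (output not shown here) to summarize them:
mse_vals <- unlist(a)
mean(mse_vals)
var(mse_vals)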
stopCluster(clust)
#I accidentally did the regression analysis for questions 4, 5, and 6 first; now I'll do the correlation coefficient.
#I couldn't find examples in the HW, lectures, or code, so I assume a simple Pearson correlation coefficient is intended
s.length <- iris$Sepal.Length
s.width <- iris$Sepal.Width
correlation <- cor(s.length, s.width, method = "pearson")
correlation
## [1] -0.1175698
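#Pearson's r is the covariance scaled by the two standard deviations; a quick check (sketch, output not shown here):
cov(s.length, s.width) / (sd(s.length) * sd(s.width))   # matches cor(s.length, s.width)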
#question 1: bootstrap the correlation coefficient B = 20000 times without parallel computing
B <- 20000
n <- length(s.length)
runtime <- system.time({
  r <- numeric(B)
  for (i in 1:B) {
    idx <- sample(1:n, size = n, replace = TRUE)
    r[i] <- cor(s.length[idx], s.width[idx])
  }
  # Calculate results
  mean_r <- mean(r)
  var_r <- var(r)
})
runtime
## user system elapsed
## 0.61 0.00 0.61
mean_r
## [1] -0.1195381
var_r
## [1] 0.005441184
#without parallel computation the elapsed time for this calculation was 0.61 seconds, the mean was -0.1195381, and the variance was 0.005441184
#question 2 below
#using foreach and %dopar% the elapsed time was 7.48 seconds, the mean was -0.1196421, and the variance was 0.005619631
n.cores <- detectCores()
cl <- makeCluster(n.cores)
registerDoParallel(cl)
# Run bootstrap with foreach + dopar
runtime <- system.time({
  cor_vals <- foreach(i = 1:B, .combine = c) %dopar% {
    idx <- sample(1:n, n, replace = TRUE)
    cor(s.length[idx], s.width[idx])
  }
})
# Stop the cluster
stopCluster(cl)
# Compute results
mean_corr <- mean(cor_vals)
var_corr <- var(cor_vals)
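#the timing and bootstrap summaries computed above were not printed in the original run; they can be displayed with:
runtime
mean_corr
var_corr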
#question 3: using parLapply the elapsed time was 0.3 seconds, the mean was -0.1190302, and the variance was 0.005518913
#overall, for Pearson's coefficient, parLapply took the shortest processing time, but foreach with %dopar% actually took more time than not using parallel computing
#the means were all similar to one another, and so were the variances
boot_cor <- function(dummy) {
  idx <- sample(1:nrow(x1), replace = TRUE)
  return(cor(x1$Sepal.Length[idx], x1$Sepal.Width[idx]))
}
# Create a list of length B
B <- 20000
x.list <- vector("list", B)
# Setup parallel cluster
n.cores <- detectCores()
clust <- makeCluster(n.cores)
# Export required variables to each cluster worker
clusterExport(clust, varlist = c("x1", "boot_cor"))
runtime <- system.time({
  cor_vals <- parLapply(clust, x.list, boot_cor)
})
stopCluster(clust)
cor_vals <- unlist(cor_vals) # convert list to numeric vector
mean_corr <- mean(cor_vals)
var_corr <- var(cor_vals)
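#as before, the timing and summaries computed above can be printed for inspection:
runtime
mean_corr
var_corr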
#missing data values: question 2 (parts 1-5)
irisrk = read.csv("C:/Users/Rache/OneDrive/Documents/R/R folder/Stat713/Hw2/iris.csv")
irisrk
## s.length s.width p.length p.width class
## 1 5.1 3.5 1.4 0.2 Iris-setosa
## 2 4.9 3.0 1.4 0.2 Iris-setosa
## 3 4.7 3.2 1.3 0.2 Iris-setosa
## 4 4.6 NA 1.5 0.2 Iris-setosa
## 5 5.0 3.6 1.4 0.2 Iris-setosa
## 6 5.4 3.9 1.7 0.4 Iris-setosa
## 7 4.6 3.4 1.4 0.3 Iris-setosa
## 8 5.0 NA 1.5 0.2 Iris-setosa
## 9 4.4 2.9 1.4 0.2 Iris-setosa
## 10 7.0 3.2 4.7 1.4 Iris-versicolor
## 11 6.4 3.2 4.5 1.5 Iris-versicolor
## 12 6.9 3.1 4.9 1.5 Iris-versicolor
## 13 5.5 2.3 4.0 1.3 Iris-versicolor
## 14 6.5 2.8 4.6 1.5 Iris-versicolor
## 15 5.7 2.8 4.5 1.3 Iris-versicolor
## 16 6.3 3.3 4.7 1.6 Iris-versicolor
## 17 6.3 NA 6.0 2.5 Iris-virginica
## 18 5.8 2.7 5.1 1.9 Iris-virginica
## 19 7.1 3.0 5.9 2.1 Iris-virginica
## 20 6.3 2.9 5.6 1.8 Iris-virginica
## 21 6.5 NA 5.8 NA Iris-virginica
## 22 7.6 3.0 6.6 2.1 Iris-virginica
## 23 4.9 NA 4.5 NA Iris-virginica
## 24 7.3 2.9 6.3 1.8 Iris-virginica
dim(irisrk)
## [1] 24 5
summary(irisrk)
## s.length s.width p.length p.width
## Min. :4.400 Min. :2.300 Min. :1.300 Min. :0.200
## 1st Qu.:4.975 1st Qu.:2.900 1st Qu.:1.475 1st Qu.:0.200
## Median :5.750 Median :3.000 Median :4.500 Median :1.350
## Mean :5.825 Mean :3.089 Mean :3.779 Mean :1.109
## 3rd Qu.:6.500 3rd Qu.:3.250 3rd Qu.:5.225 3rd Qu.:1.750
## Max. :7.600 Max. :3.900 Max. :6.600 Max. :2.500
## NA's :5 NA's :2
## class
## Length:24
## Class :character
## Mode :character
##
##
##
##
sapply(irisrk, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 5 0 2 0
# Overall missing value count
sum(is.na(irisrk))
## [1] 7
#there are 7 total missing values
# Calculate the percentage of missing values in each column
sapply(irisrk, function(x) mean(is.na(x)) * 100)
## s.length s.width p.length p.width class
## 0.000000 20.833333 0.000000 8.333333 0.000000
#s.width has 20.83% missing values and p.width has 8.33% missing values; the rest have none
# Count missing values in each column
sapply(irisrk, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 5 0 2 0
#in total, s.width has 5 missing values and p.width has 2
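#missingness can also be counted by row; a quick sketch (output not shown here) counting rows with at least one NA:
sum(!complete.cases(irisrk))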
# For a single column
# Create a copy of the original dataframe
iris_imputed_one <- irisrk
iris_imputed_one$p.width[is.na(irisrk$p.width)] <- mean(irisrk$p.width, na.rm=TRUE)
# Count missing values in each column
sapply(iris_imputed_one, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 5 0 0 0
# whole dataset
# For all columns in a dataset
airquality_imputed <- irisrk   # name kept from a previous airquality example; this copy holds the imputed iris data
for(i in 1:ncol(airquality_imputed)) {
  if(is.numeric(airquality_imputed[[i]])) {
    airquality_imputed[[i]][is.na(airquality_imputed[[i]])] <- mean(airquality_imputed[[i]], na.rm=TRUE)
  }
}
sapply(airquality_imputed, function(x) sum(is.na(x)))
## s.length s.width p.length p.width class
## 0 0 0 0 0
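#mean imputation keeps the column mean but shrinks the spread; a quick check (sketch, output not shown here):
sd(irisrk$s.width, na.rm = TRUE)   # spread of the observed values
sd(airquality_imputed$s.width)     # smaller after the NAs are replaced by the mean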
# compare the imputed/unimputed variable
install.packages("ggplot2")
## Installing package into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'ggplot2' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.4.3
# Density plots
ggplot(irisrk, aes(x=p.width, fill="Original")) +
  geom_density(alpha=0.5) +
  geom_density(data=airquality_imputed, aes(x=p.width, fill="Imputed"), alpha=0.5) +
  labs(title="Density Plot of p.width: Original vs. Imputed")
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_density()`).
#the density plot of p.width shows the imputed values adding mass near the mean, whereas before the distribution skewed toward 0-2
#however, the imputed values do look very different from the original
ggplot(irisrk, aes(x=s.width, fill="Original")) +
  geom_density(alpha=0.5) +
  geom_density(data=airquality_imputed, aes(x=s.width, fill="Imputed"), alpha=0.5) +
  labs(title="Density Plot of s.width: Original vs. Imputed")
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).
#the imputed s.width values match the original more closely than p.width did, but they are centered more around the mean than the originals
#overall the imputed values differ noticeably from the original values; with so few observations in this dataset, even one changed value looks drastic on the graph
#we now compare two different linear models, fitted before and after imputation
# Linear model before imputation
lm_original <- lm(s.length ~ s.width, data=na.omit(irisrk))
# Linear model after imputation
lm_imputed <- lm(s.length ~ s.width, data=airquality_imputed)
summary(lm_original)
##
## Call:
## lm(formula = s.length ~ s.width, data = na.omit(irisrk))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63823 -0.75731 -0.01981 0.76638 1.62362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.8316 2.0226 3.872 0.00122 **
## s.width -0.6184 0.6505 -0.951 0.35509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9946 on 17 degrees of freedom
## Multiple R-squared: 0.05048, Adjusted R-squared: -0.005373
## F-statistic: 0.9038 on 1 and 17 DF, p-value: 0.3551
summary(lm_imputed)
##
## Call:
## lm(formula = s.length ~ s.width, data = airquality_imputed)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.54217 -0.85000 -0.09481 0.65126 1.71967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.7356 1.9704 3.926 0.000723 ***
## s.width -0.6184 0.6346 -0.975 0.340378
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9703 on 22 degrees of freedom
## Multiple R-squared: 0.04138, Adjusted R-squared: -0.00219
## F-statistic: 0.9497 on 1 and 22 DF, p-value: 0.3404
# Diagnostic plots before imputation
par(mfrow=c(2,2))
plot(lm_original)
# Diagnostic plots after imputation
par(mfrow=c(2,2))
plot(lm_imputed)
#Imputation with MICE (Multivariate Imputation by Chained Equations)
# Install packages if they are not already installed
install.packages(c("mice", "ggplot2", "naniar"))
## Warning: package 'ggplot2' is in use and will not be installed
## Installing packages into 'C:/Users/Rache/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'mice' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'mice'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problem copying
## C:\Users\Rache\AppData\Local\R\win-library\4.4\00LOCK\mice\libs\x64\mice.dll to
## C:\Users\Rache\AppData\Local\R\win-library\4.4\mice\libs\x64\mice.dll:
## Permission denied
## Warning: restored 'mice'
## package 'naniar' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Rache\AppData\Local\Temp\RtmpshELSH\downloaded_packages
# Load the packages
library(mice)
## Warning: package 'mice' was built under R version 4.4.3
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggplot2)
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
vis_miss(irisrk)
#from the graph we can see s.width has 21% missing values while p.width has 8% missing values
#we can also see from the graph that the other columns do not have any missing values
#overall 94.2% of the data is present while missing values only make up 5.8% of the data
set.seed(12345)
# Perform Multiple Imputation
imputed_data <- mice(irisrk, m=5, method='pmm', print=FALSE)
## Warning: Number of logged events: 1
# m = 5: impute the data 5 times to get 5 different imputed data sets.
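#the imputed draws themselves can be inspected; for example, the 5 candidate values mice generated for each missing s.width cell (output not shown here):
imputed_data$imp$s.width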
model_original <- lm(s.length ~ s.width, data=na.omit(irisrk))
# Summarize the model
summary(model_original)
##
## Call:
## lm(formula = s.length ~ s.width, data = na.omit(irisrk))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63823 -0.75731 -0.01981 0.76638 1.62362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.8316 2.0226 3.872 0.00122 **
## s.width -0.6184 0.6505 -0.951 0.35509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9946 on 17 degrees of freedom
## Multiple R-squared: 0.05048, Adjusted R-squared: -0.005373
## F-statistic: 0.9038 on 1 and 17 DF, p-value: 0.3551
# Fit the same model to each of the imputed datasets and pool the results
model_imputed <- with(data=imputed_data, exp=lm(s.length ~ s.width))
model_imputed
## call :
## with.mids(data = imputed_data, expr = lm(s.length ~ s.width))
##
## call1 :
## mice(data = irisrk, m = 5, method = "pmm", printFlag = FALSE)
##
## nmis :
## [1] 0 5 0 2 0
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 6.735 -0.300
##
##
## [[2]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 7.5434 -0.5619
##
##
## [[3]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 8.1682 -0.7599
##
##
## [[4]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 7.2688 -0.4753
##
##
## [[5]]
##
## Call:
## lm(formula = s.length ~ s.width)
##
## Coefficients:
## (Intercept) s.width
## 7.5490 -0.5599
# Pool the results: pool() combines the estimates from each imputed dataset using Rubin's rules (averaging the estimates and combining within- and between-imputation variance)
pooled_results <- pool(model_imputed)
summary(pooled_results)
## term estimate std.error statistic df p.value
## 1 (Intercept) 7.4528883 1.8225471 4.0892706 17.51034 0.0007238649
## 2 s.width -0.5314124 0.5907044 -0.8996249 17.58535 0.3804742797
#check whether the imputation distorted the distribution of the variable too much
par(mfrow=c(1,2))
hist(na.omit(irisrk)$s.length, main="Original s.length", xlab="sepal length", col="blue")
imputed_data_second <- complete(imputed_data, action=1, include=FALSE)   # the 1st imputed data set
hist(imputed_data_second$s.length, main="Imputed s.length", xlab="sepal length", col="red")
#there is distortion specifically from 4.5-5.5 and slightly from 6-6.5, but overall the data looks similar
#also, with so few observations, slight changes (for example, one extra observation) drastically affect the graphs
# Linear model before imputation
lm_original <- lm(s.length ~ s.width, data=na.omit(irisrk))
# Linear model after imputation
# Note: pooled_results is a mice pooling object, not a data frame, so lm() cannot find
# s.length and s.width in it; it falls back to the full-length vectors defined from iris
# earlier in the session (hence the 148 residual df below). The pooled summary above is
# the proper post-imputation analysis.
lm_imputed <- lm(s.length ~ s.width, data=pooled_results)
summary(lm_original)
##
## Call:
## lm(formula = s.length ~ s.width, data = na.omit(irisrk))
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.63823 -0.75731 -0.01981 0.76638 1.62362
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.8316 2.0226 3.872 0.00122 **
## s.width -0.6184 0.6505 -0.951 0.35509
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9946 on 17 degrees of freedom
## Multiple R-squared: 0.05048, Adjusted R-squared: -0.005373
## F-statistic: 0.9038 on 1 and 17 DF, p-value: 0.3551
summary(lm_imputed)
##
## Call:
## lm(formula = s.length ~ s.width, data = pooled_results)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.5561 -0.6333 -0.1120 0.5579 2.2226
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5262 0.4789 13.63 <2e-16 ***
## s.width -0.2234 0.1551 -1.44 0.152
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8251 on 148 degrees of freedom
## Multiple R-squared: 0.01382, Adjusted R-squared: 0.007159
## F-statistic: 2.074 on 1 and 148 DF, p-value: 0.1519
# Diagnostic plots before imputation
par(mfrow=c(2,2))
plot(lm_original)
# Diagnostic plots after imputation
par(mfrow=c(2,2))
plot(lm_imputed)
#after using MICE the residuals show much better homoscedasticity
#every plot shows data that fits the regression line far better