library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mice)
## 
## Attaching package: 'mice'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 7")
df <- read.csv("skin-cancer.csv", header = TRUE)

simple_lm <- lm(Mort ~  Lat, data = df)
plot(Mort~Lat, data = df)
abline(simple_lm)

summary(simple_lm)
## 
## Call:
## lm(formula = Mort ~ Lat, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.972 -13.185   0.972  12.006  43.938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 389.1894    23.8123   16.34  < 2e-16 ***
## Lat          -5.9776     0.5984   -9.99 3.31e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared:  0.6798, Adjusted R-squared:  0.673 
## F-statistic:  99.8 on 1 and 47 DF,  p-value: 3.309e-13
set.seed(530306627)
amputed <- ampute(df, patterns = data.frame(Mort = 1, Lat = 0)) #pattern says which value gets filled in with NA

missing_df <- amputed$amp
missing_lm <- lm(Lat ~Mort, data = missing_df, )
plot(Lat~Mort, data = missing_df)
abline(missing_lm)

summary(missing_lm)
## 
## Call:
## lm(formula = Lat ~ Mort, data = missing_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.6988 -1.6270  0.4466  1.7237  3.3803 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 54.53173    3.52323  15.478 1.88e-11 ***
## Mort        -0.09811    0.02507  -3.914  0.00112 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.394 on 17 degrees of freedom
##   (30 observations deleted due to missingness)
## Multiple R-squared:  0.474,  Adjusted R-squared:  0.4431 
## F-statistic: 15.32 on 1 and 17 DF,  p-value: 0.001117
amputed_lm <- lm(Mort~Lat, data = missing_df)
plot(Mort~Lat, data = df, col = 'black')
points(Mort ~Lat, data = missing_df, pch= 18, col = 'blue')
abline(simple_lm, lty = 'dotted', col = 'black')
abline(amputed_lm, col = 'blue')

fill_in <- filter(missing_df, is.na(Lat))
imputed_vals <- predict(missing_lm, newdata = fill_in)

## get estimated variability from 1.3
missing_sd <- sigma(missing_lm)

noisy <- rnorm(length(imputed_vals), mean = 0, sd= missing_sd)

#adding to the imputed vals

imputed_vals <- data_frame(imp =imputed_vals, noisy =  noisy)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
imputed_vals<-  mutate(imputed_vals, final_imp = imp+ noisy)


#find NA index

na_index = is.na(missing_df$Lat)

impute_df <- missing_df

impute_df$Lat[na_index] <- imputed_vals$final_imp

Question 1.5

impute_lm <- lm(Mort ~Lat, data = impute_df)
summary(impute_lm)
## 
## Call:
## lm(formula = Mort ~ Lat, data = impute_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.979 -11.041   2.795  12.363  34.774 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 413.1741    23.0517   17.92  < 2e-16 ***
## Lat          -6.6209     0.5829  -11.36 4.48e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.46 on 47 degrees of freedom
## Multiple R-squared:  0.733,  Adjusted R-squared:  0.7273 
## F-statistic:   129 on 1 and 47 DF,  p-value: 4.484e-15
plot(Mort~Lat, data = df, col = 'black')
points(Mort ~Lat, data = missing_df, pch= 21, col = 'blue')
abline(simple_lm, lty = 'dotted', col = 'black')
abline(amputed_lm, col = 'blue')
points(Mort~Lat, data = impute_df, pch= 18, col = 'red')
abline(impute_lm, lty = 'dashed', col = 'red')
legend("bottomleft", legend = c("Full data", "Available data", "Imputed data", "Full reg", "Available reg", "Imputed reg"), col = rep(c("black", "blue", "red"), 2), pch = c(21, 21, 18, rep(NA, 3)), lty = c("dotted", "solid", "dashed"))
title("Impact of Imputing Data")

Discussion

As you can see by imputing the data the line of best fit (red line) is much closer to the full data line (black line) than the line with availed data (blue line). We have dampened the effect of the bias in our data from the missing values

summary(impute_lm)
## 
## Call:
## lm(formula = Mort ~ Lat, data = impute_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.979 -11.041   2.795  12.363  34.774 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 413.1741    23.0517   17.92  < 2e-16 ***
## Lat          -6.6209     0.5829  -11.36 4.48e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.46 on 47 degrees of freedom
## Multiple R-squared:  0.733,  Adjusted R-squared:  0.7273 
## F-statistic:   129 on 1 and 47 DF,  p-value: 4.484e-15