library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
setwd("C:/Users/gabeg/Documents/Uni/Stat 5003/Week 7")
df <- read.csv("skin-cancer.csv", header = TRUE)
simple_lm <- lm(Mort ~ Lat, data = df)
plot(Mort~Lat, data = df)
abline(simple_lm)
summary(simple_lm)
##
## Call:
## lm(formula = Mort ~ Lat, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.972 -13.185 0.972 12.006 43.938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.1894 23.8123 16.34 < 2e-16 ***
## Lat -5.9776 0.5984 -9.99 3.31e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared: 0.6798, Adjusted R-squared: 0.673
## F-statistic: 99.8 on 1 and 47 DF, p-value: 3.309e-13
set.seed(530306627)
amputed <- ampute(df, patterns = data.frame(Mort = 1, Lat = 0)) #pattern says which value gets filled in with NA
missing_df <- amputed$amp
missing_lm <- lm(Lat ~Mort, data = missing_df, )
plot(Lat~Mort, data = missing_df)
abline(missing_lm)
summary(missing_lm)
##
## Call:
## lm(formula = Lat ~ Mort, data = missing_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.6988 -1.6270 0.4466 1.7237 3.3803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 54.53173 3.52323 15.478 1.88e-11 ***
## Mort -0.09811 0.02507 -3.914 0.00112 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.394 on 17 degrees of freedom
## (30 observations deleted due to missingness)
## Multiple R-squared: 0.474, Adjusted R-squared: 0.4431
## F-statistic: 15.32 on 1 and 17 DF, p-value: 0.001117
amputed_lm <- lm(Mort~Lat, data = missing_df)
plot(Mort~Lat, data = df, col = 'black')
points(Mort ~Lat, data = missing_df, pch= 18, col = 'blue')
abline(simple_lm, lty = 'dotted', col = 'black')
abline(amputed_lm, col = 'blue')
fill_in <- filter(missing_df, is.na(Lat))
imputed_vals <- predict(missing_lm, newdata = fill_in)
## get estimated variability from 1.3
missing_sd <- sigma(missing_lm)
noisy <- rnorm(length(imputed_vals), mean = 0, sd= missing_sd)
#adding to the imputed vals
imputed_vals <- data_frame(imp =imputed_vals, noisy = noisy)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
imputed_vals<- mutate(imputed_vals, final_imp = imp+ noisy)
#find NA index
na_index = is.na(missing_df$Lat)
impute_df <- missing_df
impute_df$Lat[na_index] <- imputed_vals$final_imp
impute_lm <- lm(Mort ~Lat, data = impute_df)
summary(impute_lm)
##
## Call:
## lm(formula = Mort ~ Lat, data = impute_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.979 -11.041 2.795 12.363 34.774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 413.1741 23.0517 17.92 < 2e-16 ***
## Lat -6.6209 0.5829 -11.36 4.48e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.46 on 47 degrees of freedom
## Multiple R-squared: 0.733, Adjusted R-squared: 0.7273
## F-statistic: 129 on 1 and 47 DF, p-value: 4.484e-15
plot(Mort~Lat, data = df, col = 'black')
points(Mort ~Lat, data = missing_df, pch= 21, col = 'blue')
abline(simple_lm, lty = 'dotted', col = 'black')
abline(amputed_lm, col = 'blue')
points(Mort~Lat, data = impute_df, pch= 18, col = 'red')
abline(impute_lm, lty = 'dashed', col = 'red')
legend("bottomleft", legend = c("Full data", "Available data", "Imputed data", "Full reg", "Available reg", "Imputed reg"), col = rep(c("black", "blue", "red"), 2), pch = c(21, 21, 18, rep(NA, 3)), lty = c("dotted", "solid", "dashed"))
title("Impact of Imputing Data")
As you can see by imputing the data the line of best fit (red line) is much closer to the full data line (black line) than the line with availed data (blue line). We have dampened the effect of the bias in our data from the missing values
summary(impute_lm)
##
## Call:
## lm(formula = Mort ~ Lat, data = impute_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45.979 -11.041 2.795 12.363 34.774
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 413.1741 23.0517 17.92 < 2e-16 ***
## Lat -6.6209 0.5829 -11.36 4.48e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17.46 on 47 degrees of freedom
## Multiple R-squared: 0.733, Adjusted R-squared: 0.7273
## F-statistic: 129 on 1 and 47 DF, p-value: 4.484e-15