load data:
cancer <- full.data %>% filter(!is.na(full.data))
lmfit <- lm(Mort ~ Lat, data = cancer)
cancer %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>%
add_lines(x = ~Lat, y = fitted(lmfit), name = "linear regression") %>% layout(showlegend = F,
title = "Standard Simple regression without missingness", xaxis = list(title = "Latitude"),
yaxis = list(title = "Mortality rates"))summary(lmfit)
##
## Call:
## lm(formula = Mort ~ Lat, data = cancer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.972 -13.185 0.972 12.006 43.938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.1894 23.8123 16.34 < 2e-16 ***
## Lat -5.9776 0.5984 -9.99 3.31e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared: 0.6798, Adjusted R-squared: 0.673
## F-statistic: 99.8 on 1 and 47 DF, p-value: 3.309e-13The Goodness of fit statistic is 0.6798 which implies that 67.98% of the variation in Mort is explained by the linear regression on Lat
output <- mice::ampute(data = cancer, patterns = c(1, 0), mech = "MAR")
class(output)
## [1] "mads"
head(output$amp)
## Mort Lat
## 1 219 NA
## 2 160 NA
## 3 170 35
## 4 182 NA
## 5 149 NA
## 6 159 NA
aggr(output$amp)cancer2 <- missing_val %>% filter(!is.na(missing_val$Lat))
lmfit2 <- lm(Lat ~ Mort, data = cancer2)
cancer2 %>% plot_ly(x = ~Mort) %>% add_markers(y = ~Lat, name = "skin-cancer") %>%
add_lines(x = ~Mort, y = fitted(lmfit2), name = "linear regression") %>% layout(showlegend = F,
title = "Simple regression on the missing data as the response", xaxis = list(title = "Mortality rates"),
yaxis = list(title = "Latitude"))summary(lmfit2)
##
## Call:
## lm(formula = Lat ~ Mort, data = cancer2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8610 -1.2013 0.4247 1.3270 2.2532
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.53423 1.99740 26.802 5.85e-16 ***
## Mort -0.09293 0.01400 -6.636 3.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.695 on 18 degrees of freedom
## Multiple R-squared: 0.7099, Adjusted R-squared: 0.6937
## F-statistic: 44.04 on 1 and 18 DF, p-value: 3.145e-06The Goodness of fit statistic is 0.4833 which implies that 48.33% of the variation in Lat is explained by the linear regression on Mort
cancer3 <- missing_val %>% filter(!is.na(missing_val$Lat))
lmfit3 <- lm(Mort ~ Lat, data = cancer3)
cancer3 %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>%
add_lines(x = ~Lat, y = fitted(lmfit3), name = "linear regression") %>% layout(showlegend = F,
title = "Original with missingness complete-case regression", xaxis = list(title = "Latitude"),
yaxis = list(title = "Mortality rates"))summary(lmfit3)
##
## Call:
## lm(formula = Mort ~ Lat, data = cancer3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -24.481 -12.787 1.978 11.113 25.339
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 449.582 46.770 9.613 1.63e-08 ***
## Lat -7.639 1.151 -6.636 3.15e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.37 on 18 degrees of freedom
## Multiple R-squared: 0.7099, Adjusted R-squared: 0.6937
## F-statistic: 44.04 on 1 and 18 DF, p-value: 3.145e-06The Goodness of fit statistic is 0.4833 which implies that 48.33% of the variation in Mort is explained by the linear regression on Lat.
So when compared with 1.1 which uses using the full data, 1.4 with missingness is not consistent with the original regression. We can see that from the two model’s summary.
# missing index
missing.inds <- which(rowSums(is.na(missing_val)) == 1)
# construct a new feature for legend
full.data[["Missing"]] <- missing_val[["Missing"]] <- c("Not missing", "Missing")[as.numeric(is.na(missing_val[["Lat"]])) +
1]
# Do a simple linear regression imputation
obs.Mort <- missing_val %>% filter(is.na(missing_val$Lat)) %>% select(Mort)
Lat.preds <- predict(lmfit2, newdata = obs.Mort)
# Do a random imputation based on the estimated variability in the regression
# model
est.sigma <- sigma(lmfit2)
rand.preds <- Lat.preds + rnorm(length(Lat.preds), sd = est.sigma)
dat.rand <- missing_val
dat.rand[["Lat"]][missing.inds] <- rand.preds %>% unname
# Create the plots
fig1 <- plot_ly(data = full.data, x = ~Lat, y = ~Mort, color = ~Missing)
fig2 <- plot_ly(data = dat.rand, x = ~Lat, y = ~Mort, color = ~Missing) %>% add_markers(showlegend = F)
fig <- subplot(fig1, fig2, shareY = TRUE, shareX = TRUE) %>% layout(annotations = list(list(x = 0.2,
y = 1.05, text = "Original Data", showarrow = F, xref = "paper", yref = "paper"),
list(x = 0.8, y = 1.05, text = "Random imputation", showarrow = F, xref = "paper",
yref = "paper")))
figThen we will conduct the regression on the imputed dataset and compare with the orginal full model:
# lm model with impute data
lmfit4 <- lm(Mort ~ Lat, data = dat.rand)
fig1 <- full.data %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>%
add_lines(x = ~Lat, y = fitted(lmfit), name = "the original full data with linear regression") %>%
layout(showlegend = F)
fig2 <- dat.rand %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>%
add_lines(x = ~Lat, y = fitted(lmfit4), name = "the random imputed linear regression") %>%
layout(showlegend = F)
fig <- subplot(fig1, fig2, shareY = TRUE, shareX = TRUE) %>% layout(annotations = list(list(x = 0.05,
y = 1.05, text = "the original full data with lm", showarrow = F, xref = "paper",
yref = "paper"), list(x = 0.9, y = 1.05, text = "the random imputed with lm",
showarrow = F, xref = "paper", yref = "paper")))
fig##
## Call:
## lm(formula = Mort ~ Lat, data = cancer)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38.972 -13.185 0.972 12.006 43.938
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 389.1894 23.8123 16.34 < 2e-16 ***
## Lat -5.9776 0.5984 -9.99 3.31e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared: 0.6798, Adjusted R-squared: 0.673
## F-statistic: 99.8 on 1 and 47 DF, p-value: 3.309e-13
##
## Call:
## lm(formula = Mort ~ Lat, data = dat.rand)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.478 -6.714 1.597 9.000 28.244
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 490.7187 23.4341 20.94 <2e-16 ***
## Lat -8.5501 0.5908 -14.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.46 on 47 degrees of freedom
## Multiple R-squared: 0.8167, Adjusted R-squared: 0.8128
## F-statistic: 209.5 on 1 and 47 DF, p-value: < 2.2e-16
Here we can see the goodness of fit metrics (both raw and adjusted) increase on the imputed model, when compared with the original full model. It may caused by the missing imputed method: Do a random imputation based on the estimated variability in the regression model.
Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 7
Student Name: Yujun Yao(June Yao)
SID: 500316995
Email: yyao2983@uni.sydney.edu.au