Libraries to load

load data:

full.data <- read.csv("skin-cancer.csv", header = TRUE)
head(full.data)
##   Mort  Lat
## 1  219 33.0
## 2  160 34.5
## 3  170 35.0
## 4  182 37.5
## 5  149 39.0
## 6  159 41.8
dim(full.data)
## [1] 49  2

1 Simple regression with and without missingness

1.1 Standard Simple regression without missingness

cancer <- full.data %>% filter(!is.na(full.data))
lmfit <- lm(Mort ~ Lat, data = cancer)

cancer %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>% 
    add_lines(x = ~Lat, y = fitted(lmfit), name = "linear regression") %>% layout(showlegend = F, 
    title = "Standard Simple regression without missingness", xaxis = list(title = "Latitude"), 
    yaxis = list(title = "Mortality rates"))
summary(lmfit)
## 
## Call:
## lm(formula = Mort ~ Lat, data = cancer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.972 -13.185   0.972  12.006  43.938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 389.1894    23.8123   16.34  < 2e-16 ***
## Lat          -5.9776     0.5984   -9.99 3.31e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared:  0.6798, Adjusted R-squared:  0.673 
## F-statistic:  99.8 on 1 and 47 DF,  p-value: 3.309e-13

The Goodness of fit statistic is 0.6798 which implies that 67.98% of the variation in Mort is explained by the linear regression on Lat

1.2 Simulate missingness in the latitudes

output <- mice::ampute(data = cancer, patterns = c(1, 0), mech = "MAR")
class(output)
## [1] "mads"
head(output$amp)
##   Mort Lat
## 1  219  NA
## 2  160  NA
## 3  170  35
## 4  182  NA
## 5  149  NA
## 6  159  NA
aggr(output$amp)

missing_val <- output$amp

1.3 Simple regression on the missing data as the response

cancer2 <- missing_val %>% filter(!is.na(missing_val$Lat))

lmfit2 <- lm(Lat ~ Mort, data = cancer2)

cancer2 %>% plot_ly(x = ~Mort) %>% add_markers(y = ~Lat, name = "skin-cancer") %>% 
    add_lines(x = ~Mort, y = fitted(lmfit2), name = "linear regression") %>% layout(showlegend = F, 
    title = "Simple regression on the missing data as the response", xaxis = list(title = "Mortality rates"), 
    yaxis = list(title = "Latitude"))
summary(lmfit2)
## 
## Call:
## lm(formula = Lat ~ Mort, data = cancer2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8610 -1.2013  0.4247  1.3270  2.2532 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 53.53423    1.99740  26.802 5.85e-16 ***
## Mort        -0.09293    0.01400  -6.636 3.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.695 on 18 degrees of freedom
## Multiple R-squared:  0.7099, Adjusted R-squared:  0.6937 
## F-statistic: 44.04 on 1 and 18 DF,  p-value: 3.145e-06

The Goodness of fit statistic is 0.4833 which implies that 48.33% of the variation in Lat is explained by the linear regression on Mort

1.4 Original with missingness complete-case regression

cancer3 <- missing_val %>% filter(!is.na(missing_val$Lat))

lmfit3 <- lm(Mort ~ Lat, data = cancer3)

cancer3 %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>% 
    add_lines(x = ~Lat, y = fitted(lmfit3), name = "linear regression") %>% layout(showlegend = F, 
    title = "Original with missingness complete-case regression", xaxis = list(title = "Latitude"), 
    yaxis = list(title = "Mortality rates"))
summary(lmfit3)
## 
## Call:
## lm(formula = Mort ~ Lat, data = cancer3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.481 -12.787   1.978  11.113  25.339 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  449.582     46.770   9.613 1.63e-08 ***
## Lat           -7.639      1.151  -6.636 3.15e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.37 on 18 degrees of freedom
## Multiple R-squared:  0.7099, Adjusted R-squared:  0.6937 
## F-statistic: 44.04 on 1 and 18 DF,  p-value: 3.145e-06

The Goodness of fit statistic is 0.4833 which implies that 48.33% of the variation in Mort is explained by the linear regression on Lat.

So when compared with 1.1 which uses using the full data, 1.4 with missingness is not consistent with the original regression. We can see that from the two model’s summary.

1.5 Random imputation

# missing index
missing.inds <- which(rowSums(is.na(missing_val)) == 1)

# construct a new feature for legend
full.data[["Missing"]] <- missing_val[["Missing"]] <- c("Not missing", "Missing")[as.numeric(is.na(missing_val[["Lat"]])) + 
    1]

# Do a simple linear regression imputation
obs.Mort <- missing_val %>% filter(is.na(missing_val$Lat)) %>% select(Mort)
Lat.preds <- predict(lmfit2, newdata = obs.Mort)

# Do a random imputation based on the estimated variability in the regression
# model
est.sigma <- sigma(lmfit2)
rand.preds <- Lat.preds + rnorm(length(Lat.preds), sd = est.sigma)
dat.rand <- missing_val
dat.rand[["Lat"]][missing.inds] <- rand.preds %>% unname

# Create the plots
fig1 <- plot_ly(data = full.data, x = ~Lat, y = ~Mort, color = ~Missing)

fig2 <- plot_ly(data = dat.rand, x = ~Lat, y = ~Mort, color = ~Missing) %>% add_markers(showlegend = F)


fig <- subplot(fig1, fig2, shareY = TRUE, shareX = TRUE) %>% layout(annotations = list(list(x = 0.2, 
    y = 1.05, text = "Original Data", showarrow = F, xref = "paper", yref = "paper"), 
    list(x = 0.8, y = 1.05, text = "Random imputation", showarrow = F, xref = "paper", 
        yref = "paper")))


fig

After randomly impute the missing Latitude observations, we can view the missing point distribution above. They are a bit different, but the overall distribution is close.

Then we will conduct the regression on the imputed dataset and compare with the orginal full model:

# lm model with impute data
lmfit4 <- lm(Mort ~ Lat, data = dat.rand)

fig1 <- full.data %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>% 
    add_lines(x = ~Lat, y = fitted(lmfit), name = "the original full data with linear regression") %>% 
    layout(showlegend = F)

fig2 <- dat.rand %>% plot_ly(x = ~Lat) %>% add_markers(y = ~Mort, name = "skin-cancer") %>% 
    add_lines(x = ~Lat, y = fitted(lmfit4), name = "the random imputed linear regression") %>% 
    layout(showlegend = F)

fig <- subplot(fig1, fig2, shareY = TRUE, shareX = TRUE) %>% layout(annotations = list(list(x = 0.05, 
    y = 1.05, text = "the original full data with lm", showarrow = F, xref = "paper", 
    yref = "paper"), list(x = 0.9, y = 1.05, text = "the random imputed with lm", 
    showarrow = F, xref = "paper", yref = "paper")))

fig
# compare two lm model performance
summary(lmfit)
## 
## Call:
## lm(formula = Mort ~ Lat, data = cancer)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -38.972 -13.185   0.972  12.006  43.938 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 389.1894    23.8123   16.34  < 2e-16 ***
## Lat          -5.9776     0.5984   -9.99 3.31e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.12 on 47 degrees of freedom
## Multiple R-squared:  0.6798, Adjusted R-squared:  0.673 
## F-statistic:  99.8 on 1 and 47 DF,  p-value: 3.309e-13
summary(lmfit4)
## 
## Call:
## lm(formula = Mort ~ Lat, data = dat.rand)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -36.478  -6.714   1.597   9.000  28.244 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 490.7187    23.4341   20.94   <2e-16 ***
## Lat          -8.5501     0.5908  -14.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.46 on 47 degrees of freedom
## Multiple R-squared:  0.8167, Adjusted R-squared:  0.8128 
## F-statistic: 209.5 on 1 and 47 DF,  p-value: < 2.2e-16

Here we can see the goodness of fit metrics (both raw and adjusted) increase on the imputed model, when compared with the original full model. It may caused by the missing imputed method: Do a random imputation based on the estimated variability in the regression model.

APP. Student Info

Course: STAT5003_Computational Statistical Methods
Assignment: Lab Week 7
Student Name: Yujun Yao(June Yao)
SID: 500316995
Email: