Objective: Determine missing observation using Linear Regression model.

Setting up environment

library(dplyr)
library(lubridate)
library(ggvis)


# Initializing working directory
dir_Data        = "/Users/richleung/Dropbox/Projects/Granular/1_Input"
dir_Main      = "/Users/richleung/Dropbox/Projects/Granular/2_Code"
dir_Output    = "/Users/richleung/Dropbox/Projects/Granular/3_Output"



# Import raw data file
setwd(dir_Data)
Data_orig <- read.table("IAValues.csv", sep=",", header=TRUE)
# convert to local data frame
Data_orig <- tbl_df(Data_orig)
# Rename column name
names(Data_orig)[names(Data_orig) == "rootznaws"] <- "temperature"



# Subset dataset with variables of interests (NOTE: Randomly choosen following variables as I have no context of these variables)
#   rootznemc
#   musumcpcts
#   pctearthmc
# 
#   NOTE01: Removes any missing observations across variables (temperature has 2000 missing observations)
Data_master <- Data_orig %>% 
  select(mukey, temperature, rootznemc, musumcpcts, pctearthmc) %>% 
  na.omit()



# quick review of variables' statistics
summary(Data_master)
##      mukey          temperature      rootznemc     musumcpcts    
##  Min.   : 399270   Min.   :  0.0   Min.   :  0   Min.   :  0.00  
##  1st Qu.: 405795   1st Qu.:178.0   1st Qu.:150   1st Qu.:100.00  
##  Median : 409358   Median :260.0   Median :150   Median :100.00  
##  Mean   : 658956   Mean   :226.5   Mean   :132   Mean   : 92.17  
##  3rd Qu.: 412866   3rd Qu.:293.0   3rd Qu.:150   3rd Qu.:100.00  
##  Max.   :2884321   Max.   :600.0   Max.   :150   Max.   :100.00  
##    pctearthmc    
##  Min.   :  0.00  
##  1st Qu.: 85.00  
##  Median : 95.00  
##  Mean   : 86.64  
##  3rd Qu.:100.00  
##  Max.   :100.00
# Subset dataset for missing Temperature
Data_missing <- Data_orig %>% 
  select(mukey, temperature, rootznemc, musumcpcts, pctearthmc) %>% 
  filter(is.na(temperature))

1.) Initial regression

model1 = lm(temperature ~ rootznemc + musumcpcts + pctearthmc, data = Data_master)


Data Screening (part 1)

library(QuantPsyc)
library(ppcor)

2.) Outliers (Mahalanobis)

Cut-off Score = chi-square distribution with 0.999 of probabilities on N observations.

mahal = mahalanobis(Data_master, colMeans(Data_master), cov(Data_master))
cutoff_mahal = qchisq(1-0.001, ncol(Data_master))
bad_mahal = as.numeric(mahal > cutoff_mahal)                  # people over the cutoff scores
table(bad_mahal)
## bad_mahal
##    0    1 
## 8574  227

3.) Leverage

Defn: Influence of that person on slope of the regression line

Cut-off score: (2k+2) / N

k = 3                                                         # number of IVs
leverage = hatvalues(model1)
cutoff_leverage = (2*k+2) / nrow(Data_master)
bad_leverage = as.numeric(leverage > cutoff_leverage)         # people over cutoff scores (undue influence on slope of regression line)
table(bad_leverage)
## bad_leverage
##    0    1 
## 7553 1248

4.) Influence (Cook’s)

Defn: Effects that a single case possess on the whole model. Often described as Leverage + Discrepancy.

Cut-off score: 4 / (N-k-1)

cooks = cooks.distance(model1)
cutoff_cooks = 4 / (nrow(Data_master)-k-1)
bad_cooks = as.numeric(cooks > cutoff_cooks)                  # people over cutoff scores
table(bad_cooks)
## bad_cooks
##    0    1 
## 8665  136

5.) Overall Outliers (Mahalanobis + Leverage + Influence)

Outliers_total = bad_mahal + bad_leverage + bad_cooks
table(Outliers_total)
## Outliers_total
##    0    1    2    3 
## 7487 1053  225   36

Remove observation with 2+ outlier criterias. Set as new dataset.

Data_master_NoOutliers = subset(Data_master, Outliers_total < 2)

6.) Regression post removal of outliers

model2 = lm(temperature ~ rootznemc + musumcpcts + pctearthmc, data = Data_master_NoOutliers)
summary(model2)
## 
## Call:
## lm(formula = temperature ~ rootznemc + musumcpcts + pctearthmc, 
##     data = Data_master_NoOutliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -207.45  -19.88   11.81   37.09  237.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.19365    2.56415   0.856   0.3923    
## rootznemc    1.67231    0.02643  63.279   <2e-16 ***
## musumcpcts   0.19779    0.08725   2.267   0.0234 *  
## pctearthmc  -0.16914    0.07965  -2.124   0.0337 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.92 on 8536 degrees of freedom
## Multiple R-squared:  0.5823, Adjusted R-squared:  0.5822 
## F-statistic:  3967 on 3 and 8536 DF,  p-value: < 2.2e-16

RESULT:

variable Coefficient (b) t-Value p-Value pr^2 Notes
[rootzemc] 1.67287 60.501 0 0.3006833565 IV is significant
[musumcpcts] 0.19627 2.221 0.0264 0.0005790369 IV is significant
[pctearthmc] -0.16780 -2.098 0.0360 0.0005165851 IV is significant

F(3, 8513) = 3879, p < 0.001, R^2 = 0.58 –> Model is significant



Data Screening part 2

7.) Multicollinerity

Objective: Want X and Y to be correlated | Do NOT want Xs to be highly correlated (waste power of Degree-of-Freedoms == Supression)

summary(model2, correlation = TRUE)
## 
## Call:
## lm(formula = temperature ~ rootznemc + musumcpcts + pctearthmc, 
##     data = Data_master_NoOutliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -207.45  -19.88   11.81   37.09  237.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.19365    2.56415   0.856   0.3923    
## rootznemc    1.67231    0.02643  63.279   <2e-16 ***
## musumcpcts   0.19779    0.08725   2.267   0.0234 *  
## pctearthmc  -0.16914    0.07965  -2.124   0.0337 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.92 on 8536 degrees of freedom
## Multiple R-squared:  0.5823, Adjusted R-squared:  0.5822 
## F-statistic:  3967 on 3 and 8536 DF,  p-value: < 2.2e-16
## 
## Correlation of Coefficients:
##            (Intercept) rootznemc musumcpcts
## rootznemc  -0.05                           
## musumcpcts -0.27       -0.45               
## pctearthmc  0.00        0.04     -0.86

8.) Linearity

standardized = rstudent(model2)                               # Standarized Residuals
fitted = scale(model2$fitted.values)                          # Fitted values

qqnorm(standardized)
abline(0,1)

9.) Normality

hist(standardized)

10.) Homogeneity & Homoscedasticity

plot(fitted, standardized)
abline(0,0)
abline(v = 0)



11.) Review Model IVs for signifiance using Beta == Z-Score == Standardize Regression Coefficient

lm.beta(model2)
##   rootznemc  musumcpcts  pctearthmc 
##  0.75364864  0.05223103 -0.04375178

12.) Review Model IVs for signifiance using Partial Correlation (pr^2) of dataset (sans outliers).

options(scipen = 999)                                         # turn off scientific notation
partials = pcor(Data_master_NoOutliers)
partials$estimate^2
##                   mukey temperature       rootznemc  musumcpcts
## mukey       1.000000000 0.004718941 0.0027640499979 0.046855954
## temperature 0.004718941 1.000000000 0.3210020257049 0.001502421
## rootznemc   0.002764050 0.321002026 1.0000000000000 0.112896628
## musumcpcts  0.046855954 0.001502421 0.1128966282330 1.000000000
## pctearthmc  0.105289931 0.001933741 0.0000009978752 0.741349011
##                  pctearthmc
## mukey       0.1052899308031
## temperature 0.0019337405750
## rootznemc   0.0000009978752
## musumcpcts  0.7413490114020
## pctearthmc  1.0000000000000


13.) Final Model

options(scipen = 0)                                           # turn on scientific notation
model3 = lm(temperature ~ rootznemc, data = Data_master_NoOutliers)
summary(model3)
## 
## Call:
## lm(formula = temperature ~ rootznemc, data = Data_master_NoOutliers)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -207.29  -19.29   12.12   36.71  236.71 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.35017    2.17426   1.541    0.123    
## rootznemc    1.69291    0.01552 109.046   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 55.93 on 8538 degrees of freedom
## Multiple R-squared:  0.5821, Adjusted R-squared:  0.582 
## F-statistic: 1.189e+04 on 1 and 8538 DF,  p-value: < 2.2e-16

Given strong correlation between [musumcpcts] and [pctearthmc], and also that Beta for these variables [musumcpcts] > [pctearthmc]. I drop [pctearthmc] and potentially [musumcpcts] from Linear Regression.

By removing only [pctearthmc], p-Value for [musumcpcts] increased to 0.406 == Still not signficant.

As a result, I removed all IV except for [rootznemc]. RESULT from only 1 IV, model’s fit improve.

RESULT:

temperature = (1.69394 * rootznemc) + 3.22723

variable | Coefficient (b) | t-Value | p-Value
———— | ————— | ———– | ———– [rootzemc] | 1.69394 | 107.826 | 0

Finally, I calculate each of the missing temperature observation with predict().

IV.model3 <- data.frame(rootznemc = Data_missing$rootznemc)
Data_missing$temp_predict <- predict(model3, IV.model3)