Objective: Estimate the missing observations using a linear regression model.
library(dplyr)
library(lubridate)
library(ggvis)
# Initializing working directory
dir_Data = "/Users/richleung/Dropbox/Projects/Granular/1_Input"
dir_Main = "/Users/richleung/Dropbox/Projects/Granular/2_Code"
dir_Output = "/Users/richleung/Dropbox/Projects/Granular/3_Output"
# Import raw data file
setwd(dir_Data)
Data_orig <- read.table("IAValues.csv", sep=",", header=TRUE)
# convert to local data frame
Data_orig <- tbl_df(Data_orig)
# Rename column name
names(Data_orig)[names(Data_orig) == "rootznaws"] <- "temperature"
# Subset dataset with variables of interest (NOTE: the following variables were chosen arbitrarily, as I have no context for them)
# rootznemc
# musumcpcts
# pctearthmc
#
# NOTE01: Remove any observations with missing values across these variables (temperature has 2000 missing observations)
Data_master <- Data_orig %>%
select(mukey, temperature, rootznemc, musumcpcts, pctearthmc) %>%
na.omit()
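As a quick sanity check (a minimal sketch using only the raw data already loaded), the count of missing temperature values noted above can be confirmed directly:
# count missing temperature values in the raw data (NOTE01 reports 2000)
sum(is.na(Data_orig$temperature))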
# quick review of variables' statistics
summary(Data_master)
## mukey temperature rootznemc musumcpcts
## Min. : 399270 Min. : 0.0 Min. : 0 Min. : 0.00
## 1st Qu.: 405795 1st Qu.:178.0 1st Qu.:150 1st Qu.:100.00
## Median : 409358 Median :260.0 Median :150 Median :100.00
## Mean : 658956 Mean :226.5 Mean :132 Mean : 92.17
## 3rd Qu.: 412866 3rd Qu.:293.0 3rd Qu.:150 3rd Qu.:100.00
## Max. :2884321 Max. :600.0 Max. :150 Max. :100.00
## pctearthmc
## Min. : 0.00
## 1st Qu.: 85.00
## Median : 95.00
## Mean : 86.64
## 3rd Qu.:100.00
## Max. :100.00
# Subset dataset for missing Temperature
Data_missing <- Data_orig %>%
select(mukey, temperature, rootznemc, musumcpcts, pctearthmc) %>%
filter(is.na(temperature))
model1 = lm(temperature ~ rootznemc + musumcpcts + pctearthmc, data = Data_master)
library(QuantPsyc)
library(ppcor)
Defn: Mahalanobis distance, the multivariate distance of each observation from the centroid of all variables.
Cut-off score: the 0.999 quantile (probability 1 - 0.001) of the chi-square distribution, with degrees of freedom equal to the number of variables.
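With ncol(Data_master) = 5 columns (mukey included), this works out to qchisq(1 - 0.001, 5) ≈ 20.5.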
mahal = mahalanobis(Data_master, colMeans(Data_master), cov(Data_master))
cutoff_mahal = qchisq(1-0.001, ncol(Data_master))
bad_mahal = as.numeric(mahal > cutoff_mahal) # observations over the cutoff score
table(bad_mahal)
## bad_mahal
## 0 1
## 8574 227
Defn: Leverage, the influence of a single observation on the slope of the regression line.
Cut-off score: (2k+2) / N
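With k = 3 IVs and N = 8801 observations in Data_master, this works out to (2*3 + 2) / 8801 ≈ 0.00091.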
k = 3 # number of IVs
leverage = hatvalues(model1)
cutoff_leverage = (2*k+2) / nrow(Data_master)
bad_leverage = as.numeric(leverage > cutoff_leverage) # observations over the cutoff score (undue influence on the slope of the regression line)
table(bad_leverage)
## bad_leverage
## 0 1
## 7553 1248
Defn: Cook's distance, the effect that a single case has on the model as a whole; often described as leverage + discrepancy.
Cut-off score: 4 / (N-k-1)
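With N = 8801 and k = 3, this works out to 4 / (8801 - 3 - 1) ≈ 0.00045.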
cooks = cooks.distance(model1)
cutoff_cooks = 4 / (nrow(Data_master)-k-1)
bad_cooks = as.numeric(cooks > cutoff_cooks) # observations over the cutoff score
table(bad_cooks)
## bad_cooks
## 0 1
## 8665 136
Outliers_total = bad_mahal + bad_leverage + bad_cooks
table(Outliers_total)
## Outliers_total
## 0 1 2 3
## 7487 1053 225 36
Remove observations flagged by two or more outlier criteria and set the result as a new dataset.
Data_master_NoOutliers = subset(Data_master, Outliers_total < 2)
model2 = lm(temperature ~ rootznemc + musumcpcts + pctearthmc, data = Data_master_NoOutliers)
summary(model2)
##
## Call:
## lm(formula = temperature ~ rootznemc + musumcpcts + pctearthmc,
## data = Data_master_NoOutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -207.45 -19.88 11.81 37.09 237.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.19365 2.56415 0.856 0.3923
## rootznemc 1.67231 0.02643 63.279 <2e-16 ***
## musumcpcts 0.19779 0.08725 2.267 0.0234 *
## pctearthmc -0.16914 0.07965 -2.124 0.0337 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.92 on 8536 degrees of freedom
## Multiple R-squared: 0.5823, Adjusted R-squared: 0.5822
## F-statistic: 3967 on 3 and 8536 DF, p-value: < 2.2e-16
RESULT:
| variable | Coefficient (b) | t-Value | p-Value | pr^2 | Notes |
|---|---|---|---|---|---|
| [rootznemc] | 1.67231 | 63.279 | < 2e-16 | 0.3006833565 | IV is significant |
| [musumcpcts] | 0.19779 | 2.267 | 0.0234 | 0.0005790369 | IV is significant |
| [pctearthmc] | -0.16914 | -2.124 | 0.0337 | 0.0005165851 | IV is significant |
F(3, 8536) = 3967, p < 0.001, R^2 = 0.58 -> Model is significant
Objective: We want each X to be correlated with Y, but we do NOT want the Xs to be highly correlated with each other (that wastes degrees of freedom and can produce suppression).
summary(model2, correlation = TRUE)
##
## Call:
## lm(formula = temperature ~ rootznemc + musumcpcts + pctearthmc,
## data = Data_master_NoOutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -207.45 -19.88 11.81 37.09 237.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.19365 2.56415 0.856 0.3923
## rootznemc 1.67231 0.02643 63.279 <2e-16 ***
## musumcpcts 0.19779 0.08725 2.267 0.0234 *
## pctearthmc -0.16914 0.07965 -2.124 0.0337 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.92 on 8536 degrees of freedom
## Multiple R-squared: 0.5823, Adjusted R-squared: 0.5822
## F-statistic: 3967 on 3 and 8536 DF, p-value: < 2.2e-16
##
## Correlation of Coefficients:
## (Intercept) rootznemc musumcpcts
## rootznemc -0.05
## musumcpcts -0.27 -0.45
## pctearthmc 0.00 0.04 -0.86
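The -0.86 correlation between the musumcpcts and pctearthmc coefficients hints at collinearity. As an additional check (a sketch, assuming the car package is installed; it is not used elsewhere in this script), variance inflation factors quantify how strongly each IV is related to the other IVs:
library(car)
# VIF for each IV in model2; values well above ~5-10 typically indicate problematic collinearity
vif(model2)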
standardized = rstudent(model2)       # studentized (standardized) residuals
fitted = scale(model2$fitted.values)  # scaled fitted values
qqnorm(standardized)                  # Q-Q plot: check normality of residuals
abline(0, 1)
hist(standardized)                    # histogram: check normality of residuals
plot(fitted, standardized)            # residuals vs. fitted: check linearity and homoscedasticity
abline(0, 0)
abline(v = 0)
lm.beta(model2)
## rootznemc musumcpcts pctearthmc
## 0.75364864 0.05223103 -0.04375178
options(scipen = 999) # turn off scientific notation
partials = pcor(Data_master_NoOutliers)
partials$estimate^2
## mukey temperature rootznemc musumcpcts
## mukey 1.000000000 0.004718941 0.0027640499979 0.046855954
## temperature 0.004718941 1.000000000 0.3210020257049 0.001502421
## rootznemc 0.002764050 0.321002026 1.0000000000000 0.112896628
## musumcpcts 0.046855954 0.001502421 0.1128966282330 1.000000000
## pctearthmc 0.105289931 0.001933741 0.0000009978752 0.741349011
## pctearthmc
## mukey 0.1052899308031
## temperature 0.0019337405750
## rootznemc 0.0000009978752
## musumcpcts 0.7413490114020
## pctearthmc 1.0000000000000
options(scipen = 0) # restore the default scientific-notation behavior
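Note that the partial-correlation matrix above includes mukey, which is an identifier rather than a model variable. A sketch restricted to the DV and the three IVs (using dplyr::select, loaded earlier):
# squared partial correlations among the model variables only
partials_model = pcor(select(Data_master_NoOutliers, temperature, rootznemc, musumcpcts, pctearthmc))
partials_model$estimate^2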
model3 = lm(temperature ~ rootznemc, data = Data_master_NoOutliers)
summary(model3)
##
## Call:
## lm(formula = temperature ~ rootznemc, data = Data_master_NoOutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -207.29 -19.29 12.12 36.71 236.71
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.35017 2.17426 1.541 0.123
## rootznemc 1.69291 0.01552 109.046 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 55.93 on 8538 degrees of freedom
## Multiple R-squared: 0.5821, Adjusted R-squared: 0.582
## F-statistic: 1.189e+04 on 1 and 8538 DF, p-value: < 2.2e-16
Given the strong correlation between [musumcpcts] and [pctearthmc], and since the standardized beta for [musumcpcts] is larger than that for [pctearthmc], I drop [pctearthmc] and potentially [musumcpcts] from the regression.
By removing only [pctearthmc], the p-value for [musumcpcts] increases to 0.406, which is not significant.
As a result, I remove all IVs except [rootznemc]. With this single IV the model's fit is essentially unchanged (adjusted R^2 ≈ 0.582) and the model is simpler.
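For reference, the intermediate model mentioned above (dropping only [pctearthmc]) can be refit as below; model2b is a name introduced here for illustration and its output is not reproduced.
# refit without pctearthmc; musumcpcts is no longer significant (p ≈ 0.406, as noted above)
model2b = lm(temperature ~ rootznemc + musumcpcts, data = Data_master_NoOutliers)
summary(model2b)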
RESULT:
temperature = (1.69291 * rootznemc) + 3.35017
| variable | Coefficient (b) | t-Value | p-Value |
|---|---|---|---|
| [rootznemc] | 1.69291 | 109.046 | < 2e-16 |
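For illustration, an observation at the median rootznemc of 150 (from the earlier summary) would be predicted as temperature ≈ 1.69291 * 150 + 3.35017 ≈ 257.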
Finally, I calculate each of the missing temperature observations with predict().
IV.model3 <- data.frame(rootznemc = Data_missing$rootznemc)
Data_missing$temp_predict <- predict(model3, IV.model3)
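The filled-in predictions can then be exported to the output directory defined at the top of the script; the file name below is an assumption, not part of the original pipeline.
setwd(dir_Output)
# write out the previously missing observations together with their predicted temperature (assumed file name)
write.csv(Data_missing, "IAValues_missing_predicted.csv", row.names = FALSE)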