Usage of Indicator and Interaction Variables in a Dataset. Download the dataset from http://www1.aucegypt.edu/faculty/hadi/RABE5/
rm(list=ls())
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.5.1
pacman::p_load("data.table", "devtools", "dplyr","ggplot2", "shiny", "MASS", "dummies", "moments")
#install_github("StatsWithR/statsr")
load("C:/RSTATS/StatsWithR/RABE5.RData")
#data(P130)
str(P130)
## 'data.frame': 46 obs. of 4 variables:
## $ S: num 13876 11608 18701 11283 11767 ...
## $ X: num 1 1 1 1 1 2 2 2 2 3 ...
## $ E: num 1 3 3 2 3 2 2 1 3 2 ...
## $ M: num 1 0 1 0 0 1 0 0 0 0 ...
# S => salary, X => years of experience, E => education level, M => holds a managerial position.
salaryDF <- dummy.data.frame(P130, names = c("E","M"), sep = "_")
# We don't need E_3 and M_1 in the regression; they serve as the baseline categories.
salary_ml <- lm(S ~ X + M_0 + E_1 + E_2, data = salaryDF)
summary(salary_ml)
##
## Call:
## lm(formula = S ~ X + M_0 + E_1 + E_2, data = salaryDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1884.60 -653.60 22.23 844.85 1716.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17915.34 352.84 50.774 < 2e-16 ***
## X 546.18 30.52 17.896 < 2e-16 ***
## M_0 -6883.53 313.92 -21.928 < 2e-16 ***
## E_1 -2996.21 411.75 -7.277 6.72e-09 ***
## E_2 147.82 387.66 0.381 0.705
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1027 on 41 degrees of freedom
## Multiple R-squared: 0.9568, Adjusted R-squared: 0.9525
## F-statistic: 226.8 on 4 and 41 DF, p-value: < 2.2e-16
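# Aside (not in the original analysis): a sketch of the same model using base R
# factors instead of the dummies package. relevel() makes E = 3 and M = 1 the
# baselines, matching the dummy coding above; coefficients should match salary_ml
# up to labeling.
P130f <- within(P130, {
  E <- relevel(factor(E), ref = "3")
  M <- relevel(factor(M), ref = "1")
})
salary_factor_ml <- lm(S ~ X + M + E, data = P130f)
# summary(salary_factor_ml)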
# To assess whether the linear model is reliable, we need to check for (1) linearity, (2) nearly normal residuals, and (3) constant variability.
# First, plot the residuals against the fitted values.
ggplot(data = salary_ml, aes(x = .fitted, y = .resid)) + geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") + xlab("Fitted values") + ylab("Residuals")
ggplot(data = salary_ml, aes(x = .resid)) +
geom_histogram(binwidth = 25) +
xlab("Residuals")
# Or use a normal probability plot of the residuals: a QQ (quantile-quantile) plot, using the 'sample' aesthetic instead of 'x' here.
ggplot(data = salary_ml, aes(sample = .resid)) +
stat_qq()
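# Optionally add a reference line to make departures from normality easier to see
# (stat_qq_line() requires ggplot2 >= 3.0.0; an addition to the original code):
ggplot(data = salary_ml, aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line()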
print(skewness(resid(salary_ml)))
## [1] -0.07731677
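# As a complementary check (not in the original analysis), a Shapiro-Wilk test
# from base R formally tests residual normality; a large p-value is consistent
# with nearly normal residuals.
shapiro.test(resid(salary_ml))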
############################### Using Interaction Variables ###############################
# The residuals-vs-fitted plot above hints at interaction effects. Let's plot the
# standardized residuals against each education/management category.
std_res <- rstandard(salary_ml) # standardized residuals
# Encode each of the six education/management combinations as a single code
# (e.g. "101" means E_1 = 1, E_2 = 0, M_0 = 1) so all groups can be plotted at once.
salaryDF <- salaryDF %>% mutate(category = as.character(100*E_1 + 10*E_2 + M_0))
str(salaryDF)
## 'data.frame': 46 obs. of 8 variables:
## $ S : num 13876 11608 18701 11283 11767 ...
## $ X : num 1 1 1 1 1 2 2 2 2 3 ...
## $ E_1 : int 1 0 0 0 0 0 0 1 0 0 ...
## $ E_2 : int 0 0 0 1 0 1 1 0 0 1 ...
## $ E_3 : int 0 1 1 0 1 0 0 0 1 0 ...
## $ M_0 : int 0 1 0 1 1 0 1 1 1 1 ...
## $ M_1 : int 1 0 1 0 0 1 0 0 0 0 ...
## $ category: chr "100" "1" "0" "11" ...
df <- data.frame(CAT = salaryDF$category, stdres = std_res, stringsAsFactors = FALSE)
str(df)
## 'data.frame': 46 obs. of 2 variables:
## $ CAT : chr "100" "1" "0" "11" ...
## $ stdres: num -1.6831 0.0313 0.2469 -0.4576 0.1973 ...
ggplot(data = df, aes(x = CAT, y = stdres)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Category") +
  ylab("Standardized residuals")
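# Optional variant (not in the original code): with several points per category,
# a boxplot can make the comparison of group spreads clearer.
ggplot(data = df, aes(x = CAT, y = stdres)) +
  geom_boxplot() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Category") +
  ylab("Standardized residuals")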
# The plot above shows a strong relationship between education level and managerial
# status that is not additive in nature.
# Create two interaction columns: E_1 * M_0 and E_2 * M_0.
salaryDF <- salaryDF %>% mutate(E_1.M_0 = E_1 * M_0)
salaryDF <- salaryDF %>% mutate(E_2.M_0 = E_2 * M_0)
# Now fit a new model that includes the interaction terms.
salary_new_ml <- lm(S ~ X + M_0 + E_1 + E_2 + E_1.M_0 + E_2.M_0, data = salaryDF)
summary(salary_new_ml)
##
## Call:
## lm(formula = S ~ X + M_0 + E_1 + E_2 + E_1.M_0 + E_2.M_0, data = salaryDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -928.13 -46.21 24.33 65.88 204.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18250.846 73.902 246.96 <2e-16 ***
## X 496.987 5.566 89.28 <2e-16 ***
## M_0 -7047.412 102.589 -68.69 <2e-16 ***
## E_1 -4796.783 100.118 -47.91 <2e-16 ***
## E_2 1487.410 90.263 16.48 <2e-16 ***
## E_1.M_0 3066.035 149.330 20.53 <2e-16 ***
## E_2.M_0 -1836.488 131.167 -14.00 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 173.8 on 39 degrees of freedom
## Multiple R-squared: 0.9988, Adjusted R-squared: 0.9986
## F-statistic: 5517 on 6 and 39 DF, p-value: < 2.2e-16
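# Equivalent sketch: R's formula interface can build the interaction terms
# directly with the ':' operator, so the product columns need not be created by
# hand. Coefficient values should match salary_new_ml; only the labels differ.
salary_int_ml <- lm(S ~ X + M_0 + E_1 + E_2 + E_1:M_0 + E_2:M_0, data = salaryDF)
# all.equal(unname(coef(salary_new_ml)), unname(coef(salary_int_ml)))  # expect TRUE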
# Re-print the original model summary for side-by-side comparison.
summary(salary_ml)
##
## Call:
## lm(formula = S ~ X + M_0 + E_1 + E_2, data = salaryDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1884.60 -653.60 22.23 844.85 1716.47
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17915.34 352.84 50.774 < 2e-16 ***
## X 546.18 30.52 17.896 < 2e-16 ***
## M_0 -6883.53 313.92 -21.928 < 2e-16 ***
## E_1 -2996.21 411.75 -7.277 6.72e-09 ***
## E_2 147.82 387.66 0.381 0.705
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1027 on 41 degrees of freedom
## Multiple R-squared: 0.9568, Adjusted R-squared: 0.9525
## F-statistic: 226.8 on 4 and 41 DF, p-value: < 2.2e-16
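# A formal comparison of the two nested models (an addition to the original
# analysis): the F-test below checks whether the interaction terms jointly
# improve the fit.
anova(salary_ml, salary_new_ml)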
# The interaction model's residual standard error drops from 1027 to 173.8.
# To assess whether the new model is reliable, we again check for (1) linearity, (2) nearly normal residuals, and (3) constant variability.
# Linearity was already checked with scatterplots; now check residuals against fitted values.
ggplot(data = salary_new_ml, aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0, linetype = "dashed") +
xlab("Fitted values") +
ylab("Residuals")
ggplot(data = salary_new_ml, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")
# Or use a normal probability plot of the residuals: a QQ (quantile-quantile) plot, using the 'sample' aesthetic instead of 'x' here.
ggplot(data = salary_new_ml, aes(sample = .resid)) +
stat_qq()
skewness(resid(salary_new_ml))
## [1] -4.102518
#################################################################
# The plots above show one data point that may be an outlier; the strong negative
# skewness (-4.1) reflects it. Removing that point should yield an even better R-squared.
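# Sketch (not in the original code): identify the observation with the largest
# standardized residual and refit without it to see whether the fit improves.
# Variable names here are illustrative.
out_idx <- which.max(abs(rstandard(salary_new_ml)))
salary_trim_ml <- update(salary_new_ml, data = salaryDF[-out_idx, ])
summary(salary_trim_ml)  # expect a higher R-squared and less skewed residuals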