R Markdown

Usage of Indicator and Interaction Variables in a Dataset. Download the dataset from http://www1.aucegypt.edu/faculty/hadi/RABE5/

rm(list=ls())
if (!require("pacman")) install.packages("pacman")
## Loading required package: pacman
## Warning: package 'pacman' was built under R version 3.5.1
pacman::p_load("data.table", "devtools", "dplyr","ggplot2", "shiny", "MASS", "dummies", "moments")
#install_github("StatsWithR/statsr")

load("C:/RSTATS/StatsWithR/RABE5.RData")
#data(P130)
str(P130)
## 'data.frame':    46 obs. of  4 variables:
##  $ S: num  13876 11608 18701 11283 11767 ...
##  $ X: num  1 1 1 1 1 2 2 2 2 3 ...
##  $ E: num  1 3 3 2 3 2 2 1 3 2 ...
##  $ M: num  1 0 1 0 0 1 0 0 0 0 ...
# S => salary, X => years of experience, E => education level, M => holds a managerial position
salaryDF <- dummy.data.frame(P130, names = c("E","M"), sep = "_")
# we don't need E_3 and M_1 in the regression; they serve as the baseline levels (see the factor-based sketch after the model summary)
salary_ml <- lm(S ~ X + M_0 + E_1 + E_2, data = salaryDF)
summary(salary_ml)
## 
## Call:
## lm(formula = S ~ X + M_0 + E_1 + E_2, data = salaryDF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1884.60  -653.60    22.23   844.85  1716.47 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17915.34     352.84  50.774  < 2e-16 ***
## X             546.18      30.52  17.896  < 2e-16 ***
## M_0         -6883.53     313.92 -21.928  < 2e-16 ***
## E_1         -2996.21     411.75  -7.277 6.72e-09 ***
## E_2           147.82     387.66   0.381    0.705    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1027 on 41 degrees of freedom
## Multiple R-squared:  0.9568, Adjusted R-squared:  0.9525 
## F-statistic: 226.8 on 4 and 41 DF,  p-value: < 2.2e-16
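# A sketch of an equivalent fit (illustrative, not part of the original
# analysis): lm() can build the indicator variables itself when E and M are
# declared as factors; relevel() sets E = 3 and M = 1 as the baselines so the
# coefficients match the dummy-coded model above. salary_factor_ml is a
# hypothetical name.
salary_factor_ml <- lm(S ~ X + relevel(factor(M), ref = "1") +
                         relevel(factor(E), ref = "3"), data = P130)
summary(salary_factor_ml)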
# To assess whether the linear model is reliable, we need to check for (1) linearity, (2) nearly normal residuals, and (3) constant variability.
# now check residuals vs. fitted values
ggplot(data = salary_ml, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = salary_ml, aes(x = .resid)) +
  geom_histogram(binwidth = 25) +
  xlab("Residuals")

# or a normal probability plot of the residuals (QQ, quantile-quantile plot); note that aes() uses sample instead of x here
ggplot(data = salary_ml, aes(sample = .resid)) +
  stat_qq()
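# Optional sketch: adding a reference line makes departures from normality
# easier to see (assumes ggplot2 >= 3.0.0, which provides stat_qq_line()).
ggplot(data = salary_ml, aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line()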

print(skewness(resid(salary_ml)))
## [1] -0.07731677
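# A formal normality check on the residuals (sketch, base R's Shapiro-Wilk test):
shapiro.test(resid(salary_ml))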
############################## Using Interaction Variables ##############################
# The residuals-vs-fitted plot above hints at an interaction effect; let's plot
# the standardized residuals against the education/management categories.
std_red <- rstandard(salary_ml) # standardized residuals
# encode all six education-by-management combinations in a single category
# variable so they can be plotted against the standardized residuals in one go
# (an interaction()-based alternative is sketched after the plot below)
salaryDF <- salaryDF %>% mutate(category = as.character(100*E_1 + 10*E_2 + M_0)) 
str(salaryDF)
## 'data.frame':    46 obs. of  8 variables:
##  $ S       : num  13876 11608 18701 11283 11767 ...
##  $ X       : num  1 1 1 1 1 2 2 2 2 3 ...
##  $ E_1     : int  1 0 0 0 0 0 0 1 0 0 ...
##  $ E_2     : int  0 0 0 1 0 1 1 0 0 1 ...
##  $ E_3     : int  0 1 1 0 1 0 0 0 1 0 ...
##  $ M_0     : int  0 1 0 1 1 0 1 1 1 1 ...
##  $ M_1     : int  1 0 1 0 0 1 0 0 0 0 ...
##  $ category: chr  "100" "1" "0" "11" ...
df <- data.frame("CAT" = salaryDF$category, "stdres" = std_red,stringsAsFactors = FALSE )
str(df)
## 'data.frame':    46 obs. of  2 variables:
##  $ CAT   : chr  "100" "1" "0" "11" ...
##  $ stdres: num  -1.6831 0.0313 0.2469 -0.4576 0.1973 ...
ggplot(data = df, aes(x = CAT, y = stdres)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Category") +
  ylab("Standardized residuals")
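# The same six groups can be formed without the manual 100/10/1 encoding
# (a sketch using base R's interaction(); df2 is a hypothetical name).
df2 <- data.frame(CAT = interaction(factor(P130$E), factor(P130$M), sep = "_"),
                  stdres = std_red)
ggplot(data = df2, aes(x = CAT, y = stdres)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("E_M category") +
  ylab("Standardized residuals")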

# The plot above shows a strong relation between education level and managerial status that is not additive in nature.
# create two interaction columns, E_1 * M_0 and E_2 * M_0 (a formula-based equivalent is sketched after the new model's summary)
salaryDF <- salaryDF %>% mutate(E_1.M_0 = E_1 * M_0)
salaryDF <- salaryDF %>% mutate(E_2.M_0 = E_2 * M_0)
# now fit a new model that includes the interaction terms
salary_new_ml <- lm(S ~ X + M_0 + E_1 + E_2 + E_1.M_0 + E_2.M_0, data = salaryDF)
summary(salary_new_ml)
## 
## Call:
## lm(formula = S ~ X + M_0 + E_1 + E_2 + E_1.M_0 + E_2.M_0, data = salaryDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -928.13  -46.21   24.33   65.88  204.89 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18250.846     73.902  246.96   <2e-16 ***
## X             496.987      5.566   89.28   <2e-16 ***
## M_0         -7047.412    102.589  -68.69   <2e-16 ***
## E_1         -4796.783    100.118  -47.91   <2e-16 ***
## E_2          1487.410     90.263   16.48   <2e-16 ***
## E_1.M_0      3066.035    149.330   20.53   <2e-16 ***
## E_2.M_0     -1836.488    131.167  -14.00   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 173.8 on 39 degrees of freedom
## Multiple R-squared:  0.9988, Adjusted R-squared:  0.9986 
## F-statistic:  5517 on 6 and 39 DF,  p-value: < 2.2e-16
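# Equivalent interaction model (sketch): lm() can generate the same interaction
# terms directly from a factor formula; the fit is a reparameterization of the
# dummy-coded model above. salary_int_ml is a hypothetical name.
salary_int_ml <- lm(S ~ X + relevel(factor(E), ref = "3") *
                      relevel(factor(M), ref = "1"), data = P130)
summary(salary_int_ml)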
summary(salary_ml)
## 
## Call:
## lm(formula = S ~ X + M_0 + E_1 + E_2, data = salaryDF)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1884.60  -653.60    22.23   844.85  1716.47 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 17915.34     352.84  50.774  < 2e-16 ***
## X             546.18      30.52  17.896  < 2e-16 ***
## M_0         -6883.53     313.92 -21.928  < 2e-16 ***
## E_1         -2996.21     411.75  -7.277 6.72e-09 ***
## E_2           147.82     387.66   0.381    0.705    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1027 on 41 degrees of freedom
## Multiple R-squared:  0.9568, Adjusted R-squared:  0.9525 
## F-statistic: 226.8 on 4 and 41 DF,  p-value: < 2.2e-16
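# The two nested models can also be compared formally with a partial F-test
# (sketch): the added interaction terms should be jointly significant.
anova(salary_ml, salary_new_ml)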
# With the interaction terms, the residual standard error drops from 1027 to about 174.
# To assess whether the linear model is reliable, we need to check for (1) linearity, (2) nearly normal residuals, and (3) constant variability.
# linearity was already checked with the scatterplot; now check residuals vs. fitted values
ggplot(data = salary_new_ml, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

ggplot(data = salary_new_ml, aes(x = .resid)) +  geom_histogram(binwidth = 25) +  xlab("Residuals")

# or a normal probability plot of the residuals (QQ, quantile-quantile plot); note that aes() uses sample instead of x here
ggplot(data = salary_new_ml, aes(sample = .resid)) +
  stat_qq()

skewness(resid(salary_new_ml))
## [1] -4.102518
#################################################################
# The plots above show a single data point that may be an outlier; removing it should further improve the R-squared (see the sketch below).
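# Sketch of a follow-up: flag the observation with the largest absolute
# standardized residual and refit without it (the row index and the exact gain
# in R-squared are not verified here).
std_res_new <- rstandard(salary_new_ml)
suspect <- which.max(abs(std_res_new))
salary_refit_ml <- lm(S ~ X + M_0 + E_1 + E_2 + E_1.M_0 + E_2.M_0,
                      data = salaryDF[-suspect, ])
summary(salary_refit_ml)$r.squared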