1. Load Library, Set WD and Datasets

Load Library

library(knitr)
library(dplyr)
library(corrplot)

Set WD

# NOTE(review): absolute, machine-specific path; this line only works on the
# author's computer. Consider a project-relative path instead.
setwd("C:/Users/chenk/OneDrive/Documents/Spring 2020/PMAP 4041/Classroom sets/WK 7")

Datasets

# Restore the objects saved in OPM94.RData; presumably this supplies the
# `opm94` data frame used below (TODO confirm).
# NOTE(review): the path is absolute, so it does not depend on the working
# directory and will only resolve on the author's computer.
load("C:/Users/chenk/OneDrive/Documents/Spring 2020/PMAP 4041/Datasets/Class4set/OPM94.RData")

Check for structure

names(opm94)
##  [1] "x"        "sal"      "grade"    "patco"    "major"    "age"     
##  [7] "male"     "vet"      "handvet"  "hand"     "yos"      "edyrs"   
## [13] "promo"    "exit"     "supmgr"   "race"     "minority" "grade4"  
## [19] "promo01"  "supmgr01" "male01"   "exit01"
str(opm94)
## 'data.frame':    1000 obs. of  22 variables:
##  $ x       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ sal     : int  26045 37651 64926 18588 19573 28648 27805 16560 40440 24285 ...
##  $ grade   : int  7 9 14 4 3 9 7 3 11 6 ...
##  $ patco   : Factor w/ 5 levels "Administrative",..: 1 4 4 2 2 4 5 2 1 2 ...
##  $ major   : Factor w/ 23 levels "     ","AGRIC",..: 16 11 10 1 1 11 1 1 1 6 ...
##  $ age     : int  52 34 37 26 51 44 50 37 59 57 ...
##  $ male    : Factor w/ 2 levels "female","male": 1 1 1 1 1 1 1 1 1 1 ...
##  $ vet     : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 2 1 ...
##  $ handvet : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ hand    : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
##  $ yos     : int  6 4 3 6 14 1 7 5 13 6 ...
##  $ edyrs   : int  16 16 16 12 12 16 14 12 12 14 ...
##  $ promo   : Factor w/ 2 levels "no","yes": 2 1 1 1 NA 1 1 1 1 1 ...
##  $ exit    : Factor w/ 2 levels "no","yes": 1 1 1 1 2 1 1 1 1 1 ...
##  $ supmgr  : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ race    : Factor w/ 5 levels "American Indian",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ minority: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ grade4  : Factor w/ 4 levels "grades 1 to 4",..: 3 4 2 1 1 4 3 1 4 3 ...
##  $ promo01 : num  1 0 0 0 NA 0 0 0 0 0 ...
##  $ supmgr01: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ male01  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ exit01  : num  0 0 0 0 1 0 0 0 0 0 ...


2. Creating a New Variable

To see how changing the units of measurement affects the regression coefficient and the correlation coefficient, we create a new variable (age_months) that measures age in months instead of years.

# Derive age in months from age in years (12 months per year).
opm94 <- opm94 %>%
  mutate(age_months = age * 12)


3. Correlation Matrix

# Pairwise correlations between the numeric/dummy variables, rounded to two
# decimals. "pairwise.complete.obs" uses, for each pair of columns, every row
# where both values are present. Computed once and stored in M so that the
# printed table and the corrplot() call below use exactly the same matrix.
M <- opm94 %>%
  select(sal, grade, edyrs, yos, age, age_months, male01, minority) %>%
  cor(use = "pairwise.complete.obs") %>%
  round(2)

M
##              sal grade edyrs   yos   age age_months male01 minority
## sal         1.00  0.91  0.59  0.40  0.29       0.29   0.36    -0.23
## grade       0.91  1.00  0.61  0.31  0.19       0.19   0.35    -0.23
## edyrs       0.59  0.61  1.00  0.01  0.08       0.08   0.31    -0.15
## yos         0.40  0.31  0.01  1.00  0.62       0.62   0.08    -0.13
## age         0.29  0.19  0.08  0.62  1.00       1.00   0.09    -0.15
## age_months  0.29  0.19  0.08  0.62  1.00       1.00   0.09    -0.15
## male01      0.36  0.35  0.31  0.08  0.09       0.09   1.00    -0.12
## minority   -0.23 -0.23 -0.15 -0.13 -0.15      -0.15  -0.12     1.00
corrplot(M, method = "number")


4. Regression with Numeric Explanatory Variable

# Response variable = salary (sal); explanatory variable = grade.
lm(sal ~ grade, data = opm94) %>% summary()
## 
## Call:
## lm(formula = sal ~ grade, data = opm94)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -12775  -4778   -505   3413  45197 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5132.8      698.5  -7.348 4.19e-13 ***
## grade         4779.0       68.6  69.662  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7292 on 993 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.8301, Adjusted R-squared:   0.83 
## F-statistic:  4853 on 1 and 993 DF,  p-value: < 2.2e-16
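# grade is numeric: each additional grade (+1) is associated with a 4779.0 higher expected salary.
# Expected Salary = -5132.8 + 4779.0 * grade
# Intercept: at grade = 0 the expected salary is -5132.8 (not a meaningful value, since a salary cannot be negative).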
lm(grade ~ yos, data = opm94) %>% summary()
## 
## Call:
## lm(formula = grade ~ yos, data = opm94)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -8.252 -2.833  0.527  2.684  6.539 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.87967    0.19747   39.90   <2e-16 ***
## yos          0.11629    0.01144   10.17   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.21 on 998 degrees of freedom
## Multiple R-squared:  0.09387,    Adjusted R-squared:  0.09296 
## F-statistic: 103.4 on 1 and 998 DF,  p-value: < 2.2e-16
# yos (years of service) is numeric: each additional year of service (+1) is associated with a 0.11629 higher expected grade.
# Expected Grade = 7.87967 + 0.11629 * yos
# Intercept: at yos = 0 the expected grade is 7.87967.
lm(grade ~ edyrs, data = opm94) %>% summary()
## 
## Call:
## lm(formula = grade ~ edyrs, data = opm94)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.0775 -2.0775 -0.0775  1.9225  7.5345 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -3.37071    0.54503  -6.184 9.08e-10 ***
## edyrs        0.90301    0.03748  24.095  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.681 on 998 degrees of freedom
## Multiple R-squared:  0.3678, Adjusted R-squared:  0.3671 
## F-statistic: 580.6 on 1 and 998 DF,  p-value: < 2.2e-16
# edyrs is numeric: each additional year of education (+1) is associated with a 0.90301 higher expected grade.
# Expected Grade = -3.37071 + 0.90301 * edyrs
# Intercept: at edyrs = 0 the expected grade is -3.37071.
lm(grade ~ age, data = opm94) %>% summary()
## 
## Call:
## lm(formula = grade ~ age, data = opm94)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.7443 -2.9733  0.9045  2.7595  6.2099 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.92001    0.46168  14.989  < 2e-16 ***
## age          0.06107    0.01024   5.965  3.4e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.313 on 998 degrees of freedom
## Multiple R-squared:  0.03442,    Adjusted R-squared:  0.03346 
## F-statistic: 35.58 on 1 and 998 DF,  p-value: 3.395e-09
# age is numeric: each additional year of age (+1) is associated with a 0.06107 higher expected grade.
# Expected Grade = 6.92001 + 0.06107 * age
# Intercept: at age = 0 the expected grade is 6.92001.
lm(yos ~ age, data = opm94) %>% summary()
## 
## Call:
## lm(formula = yos ~ age, data = opm94)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2467  -4.3889   0.2288   4.9875  16.6804 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.85485    0.96979  -9.131   <2e-16 ***
## age          0.53883    0.02151  25.056   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.96 on 998 degrees of freedom
## Multiple R-squared:  0.3861, Adjusted R-squared:  0.3855 
## F-statistic: 627.8 on 1 and 998 DF,  p-value: < 2.2e-16
# age is numeric: each additional year of age (+1) is associated with 0.53883 more expected years of service.
# Expected YOS = -8.85485 + 0.53883 * age
# Intercept: at age = 0 the expected YOS is -8.85485.
lm(yos ~ age_months, data = opm94) %>% summary()
## 
## Call:
## lm(formula = yos ~ age_months, data = opm94)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2467  -4.3889   0.2288   4.9875  16.6804 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8.854846   0.969789  -9.131   <2e-16 ***
## age_months   0.044902   0.001792  25.056   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.96 on 998 degrees of freedom
## Multiple R-squared:  0.3861, Adjusted R-squared:  0.3855 
## F-statistic: 627.8 on 1 and 998 DF,  p-value: < 2.2e-16
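# age_months is numeric: each additional month of age (+1) is associated with 0.044902 more expected years of service.
# Expected YOS = -8.854846 + 0.044902 * age_months
# Intercept: at age_months = 0 the expected YOS is -8.854846 (same intercept as the model in years).
# Changing the unit from years to months divides the slope by 12 (0.53883 / 12 = 0.044902); the t value, R-squared and correlation are unchanged.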


5. Regression with Dummy Explanatory Variable

lm(sal ~ male01, data = opm94) %>% summary()
## 
## Call:
## lm(formula = sal ~ male01, data = opm94)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31945 -11537  -3092   9591  71883 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34222.8      749.9   45.64   <2e-16 ***
## male01       12776.6     1046.3   12.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16500 on 993 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1305, Adjusted R-squared:  0.1297 
## F-statistic: 149.1 on 1 and 993 DF,  p-value: < 2.2e-16
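# male01 is a dummy variable (1 = male, 0 = female): the coefficient is the difference in expected salary between males and females.
# Expected Salary = 34222.8 + 12776.6 * male01
# Intercept: when male01 = 0 (female) the expected salary is 34222.8; for males (male01 = 1) it is 34222.8 + 12776.6 = 46999.4.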

Create a dummy variable ‘female01’ (female = 1, male = 0)

# female01 is the reverse coding of male01: 1 where male01 == 0, otherwise 0.
opm94 <- opm94 %>%
  mutate(female01 = if_else(male01 == 0, 1, 0))

lm(sal ~ female01, data = opm94) %>% summary()
## 
## Call:
## lm(formula = sal ~ female01, data = opm94)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -31945 -11537  -3092   9591  71883 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  46999.4      729.8   64.40   <2e-16 ***
## female01    -12776.6     1046.3  -12.21   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 16500 on 993 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.1305, Adjusted R-squared:  0.1297 
## F-statistic: 149.1 on 1 and 993 DF,  p-value: < 2.2e-16
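# female01 is a dummy variable (1 = female, 0 = male): the coefficient is the difference in expected salary between females and males.
# Expected Salary = 46999.4 - 12776.6 * female01
# Intercept: when female01 = 0 (male) the expected salary is 46999.4; for females (female01 = 1) it is 46999.4 - 12776.6 = 34222.8.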

Mean Salaries with Males/Females

# Average salary within each level of `male`, ignoring missing salaries.
opm94 %>%
  group_by(male) %>%
  summarise(Mean_Salary = mean(sal, na.rm = TRUE))
## # A tibble: 2 x 2
##   male   Mean_Salary
##   <fct>        <dbl>
## 1 female      34223.
## 2 male        46999.