library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the OPM 1994 data; the code below expects this to provide `opm94`.
# NOTE(review): setwd() with a machine-specific absolute path makes the script
# non-portable. It is kept so the working directory seen by any later code is
# unchanged; prefer a project-relative path when this is next reorganised.
setwd("C:/Users/ramin/Desktop/2020 winter/Data Analysis/Computer Assignment 5/Dataset")
load(file = "OPM94.RData")

# Education expressed in months: years * 12. The previous `edyrs / 12` shrank
# the values instead of converting them, so the column did not hold months.
# Correlations are unaffected by the rescaling, but the regression output
# recorded further down for `grade ~ edyrs_months` predates this fix and must
# be regenerated (the slope becomes 1/12 of the edyrs slope, not 12 times it).
opm94$edyrs_months <- opm94$edyrs * 12

# Pairwise correlation matrix of the analysis variables, rounded to 2 decimals.
# "pairwise.complete.obs" uses every row available for each pair of columns.
opm94 %>%
  select(sal, grade, edyrs, edyrs_months, yos, age, male01, minority) %>%
  cor(use = "pairwise.complete.obs") %>%
  round(2)
## sal grade edyrs edyrs_months yos age male01 minority
## sal 1.00 0.91 0.59 0.59 0.40 0.29 0.36 -0.23
## grade 0.91 1.00 0.61 0.61 0.31 0.19 0.35 -0.23
## edyrs 0.59 0.61 1.00 1.00 0.01 0.08 0.31 -0.15
## edyrs_months 0.59 0.61 1.00 1.00 0.01 0.08 0.31 -0.15
## yos 0.40 0.31 0.01 0.01 1.00 0.62 0.08 -0.13
## age 0.29 0.19 0.08 0.08 0.62 1.00 0.09 -0.15
## male01 0.36 0.35 0.31 0.31 0.08 0.09 1.00 -0.12
## minority -0.23 -0.23 -0.15 -0.15 -0.13 -0.15 -0.12 1.00
# Simple regression of salary on grade; print the model summary.
summary(lm(sal ~ grade, data = opm94))
##
## Call:
## lm(formula = sal ~ grade, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12775 -4778 -505 3413 45197
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5132.8 698.5 -7.348 4.19e-13 ***
## grade 4779.0 68.6 69.662 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7292 on 993 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.8301, Adjusted R-squared: 0.83
## F-statistic: 4853 on 1 and 993 DF, p-value: < 2.2e-16
# Simple regression of grade on years of service (yos); print the summary.
summary(lm(grade ~ yos, data = opm94))
##
## Call:
## lm(formula = grade ~ yos, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.252 -2.833 0.527 2.684 6.539
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.87967 0.19747 39.90 <2e-16 ***
## yos 0.11629 0.01144 10.17 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.21 on 998 degrees of freedom
## Multiple R-squared: 0.09387, Adjusted R-squared: 0.09296
## F-statistic: 103.4 on 1 and 998 DF, p-value: < 2.2e-16
# Simple regression of grade on years of education (edyrs); print the summary.
summary(lm(grade ~ edyrs, data = opm94))
##
## Call:
## lm(formula = grade ~ edyrs, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0775 -2.0775 -0.0775 1.9225 7.5345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.37071 0.54503 -6.184 9.08e-10 ***
## edyrs 0.90301 0.03748 24.095 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.681 on 998 degrees of freedom
## Multiple R-squared: 0.3678, Adjusted R-squared: 0.3671
## F-statistic: 580.6 on 1 and 998 DF, p-value: < 2.2e-16
# Simple regression of years of service on age; print the summary.
summary(lm(yos ~ age, data = opm94))
##
## Call:
## lm(formula = yos ~ age, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.2467 -4.3889 0.2288 4.9875 16.6804
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8.85485 0.96979 -9.131 <2e-16 ***
## age 0.53883 0.02151 25.056 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.96 on 998 degrees of freedom
## Multiple R-squared: 0.3861, Adjusted R-squared: 0.3855
## F-statistic: 627.8 on 1 and 998 DF, p-value: < 2.2e-16
Questions
4a. For each regression, briefly explain the meaning of the y-intercept and the regression coefficient.
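For each regression, the y-intercept is the predicted value of the dependent variable when the independent variable equals zero. The regression coefficient is the expected change in the dependent variable for a one-unit increase in the independent variable (for example, in sal ~ grade, each additional grade is associated with about $4,779 more salary).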
4b. Find the expected salary for someone in 16th grade
(16*4779)-5132.8 = 71331.2
4c. Find the expected grade for someone with 5 years of service
(.11629*5)+7.87967 = 8.46112
4d. Find the expected grade for someone with 12 years of education
(.90301*12)-3.37071 = 7.46541
# Same model as grade ~ edyrs, but with the rescaled education variable, to
# show how a change of units affects the slope.
summary(lm(grade ~ edyrs_months, data = opm94))
##
## Call:
## lm(formula = grade ~ edyrs_months, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.0775 -2.0775 -0.0775 1.9225 7.5345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.3707 0.5450 -6.184 9.08e-10 ***
## edyrs_months 10.8362 0.4497 24.095 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.681 on 998 degrees of freedom
## Multiple R-squared: 0.3678, Adjusted R-squared: 0.3671
## F-statistic: 580.6 on 1 and 998 DF, p-value: < 2.2e-16
4e. Why is the regression coefficient different from the coefficient on edyrs? How are they the same?
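They are different because edyrs is measured in years while edyrs_months is the same variable rescaled by a factor of 12, so the two coefficients differ by exactly that factor. They are the same in every other respect: the intercept, t value, p-value, residuals, and R-squared are identical, because it is the same variable expressed in different units.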
# Dummy for non-veterans: 1 when vet is "no", 0 when vet is "yes".
# `vet` is a factor with levels "no"/"yes", so the previous test `vet == 0`
# was never TRUE and set nonvet to 0 on every row. That is why the recorded
# output below shows an NA slope for nonvet and a single nonvet group; that
# output predates this fix and must be regenerated.
opm94 <- opm94 %>%
  mutate(nonvet = if_else(vet == "no", 1, 0))
# Regression of salary on veteran status (factor vet); print the summary.
summary(lm(sal ~ vet, data = opm94))
##
## Call:
## lm(formula = sal ~ vet, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -30056 -14230 -3155 10464 72731
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 39439.7 636.2 61.995 < 2e-16 ***
## vetyes 5669.8 1306.3 4.341 1.57e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17530 on 993 degrees of freedom
## (5 observations deleted due to missingness)
## Multiple R-squared: 0.01862, Adjusted R-squared: 0.01763
## F-statistic: 18.84 on 1 and 993 DF, p-value: 1.567e-05
# Regression of salary on the nonvet dummy; print the summary.
summary(lm(sal ~ nonvet, data = opm94))
##
## Call:
## lm(formula = sal ~ nonvet, data = opm94)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25730 -14959 -2655 10828 75745
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40784.5 560.6 72.75 <2e-16 ***
## nonvet NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17680 on 994 degrees of freedom
## (5 observations deleted due to missingness)
# Mean salary within each level of vet, ignoring missing salaries.
summarise(group_by(opm94, vet), Mean_Salary = mean(sal, na.rm = TRUE))
## # A tibble: 2 x 2
## vet Mean_Salary
## <fct> <dbl>
## 1 no 39440.
## 2 yes 45110.
# Mean salary within each value of nonvet, ignoring missing salaries.
summarise(group_by(opm94, nonvet), Mean_Salary = mean(sal, na.rm = TRUE))
## # A tibble: 1 x 2
## nonvet Mean_Salary
## <dbl> <dbl>
## 1 0 40784.
QUESTIONS
5a. Find the mean grades of veterans and nonveterans from the two regression outputs
# Mean grade within each level of vet, then within each value of nonvet,
# ignoring missing grades.
summarise(group_by(opm94, vet), Mean_grade = mean(grade, na.rm = TRUE))
summarise(group_by(opm94, nonvet), Mean_grade = mean(grade, na.rm = TRUE))
5b. Interpret the Y-intercepts. Why do they differ?
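The y-intercept is the predicted salary when the independent variable equals zero, i.e. the mean salary of the reference group. In sal ~ vet the intercept (39439.7) is the mean salary of non-veterans (vet = "no"). In sal ~ nonvet the intercept is the mean salary of the cases with nonvet = 0; in the recorded output nonvet is 0 for every case, so that intercept (40784.5) is simply the overall mean salary. They differ because each intercept describes a different reference group.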
5c. Interpret the regression coefficients. Why do they differ?
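The regression coefficient on a dummy variable is the difference in mean salary between the group coded 1 and the reference group. In sal ~ vet the coefficient on vetyes (5669.8) means veterans earn about $5,670 more on average than non-veterans (45110 vs. 39440). The coefficient on nonvet should be the same difference with the opposite sign, because nonvet reverses the coding; in the recorded output it is NA because nonvet was constant (0 for every case), so no slope could be estimated.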