The packages required for this assignment are listed below
##DATA
Telework_aov <- aov(weekly_earnings~as.factor(telecommute), data=Telework)
summary(Telework_aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(telecommute) 1 1.451e+08 145093488 346.5 <2e-16 ***
## Residuals 5540 2.320e+09 418730
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(Telework_aov)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = weekly_earnings ~ as.factor(telecommute), data = Telework)
##
## $`as.factor(telecommute)`
## diff lwr upr p adj
## 2-1 -350.7614 -387.7015 -313.8213 0
# Telecommuting seems to have an impact on earnings. As we can see from the model, telecommuting is significant with over 95% confidence level.
Telework %>%
ggplot(aes(x=as.factor(telecommute),y=weekly_earnings))+
geom_boxplot()
#From the boxplot, we can tell that people who telework make on average $346 much more a week than those # that do not.
# This is a naive model because we are assuming just one independent variable (Teleworking) to be the #only factor influencing weekly earnings.
# From the model, we can see that eleworking only helps explain only 5% of the variation in income, # meaning that there are other possible factors that if evaluated, would explain income better.
# Lastly, this is a naive model because we have not studied the interaction between telecommuting and #other variables to see how that interaction would help explain the model better.
Telework_aov1 <- aov(weekly_earnings~as.factor(telecommute)* as.factor(hourly_non_hourly), data=Telework)
summary(Telework_aov1)
## Df Sum Sq
## as.factor(telecommute) 1 1.451e+08
## as.factor(hourly_non_hourly) 1 3.755e+08
## as.factor(telecommute):as.factor(hourly_non_hourly) 1 9.009e+06
## Residuals 5538 1.935e+09
## Mean Sq F value
## as.factor(telecommute) 145093488 415.20
## as.factor(hourly_non_hourly) 375496285 1074.53
## as.factor(telecommute):as.factor(hourly_non_hourly) 9009218 25.78
## Residuals 349451
## Pr(>F)
## as.factor(telecommute) < 2e-16 ***
## as.factor(hourly_non_hourly) < 2e-16 ***
## as.factor(telecommute):as.factor(hourly_non_hourly) 3.95e-07 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
TukeyHSD(Telework_aov1)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = weekly_earnings ~ as.factor(telecommute) * as.factor(hourly_non_hourly), data = Telework)
##
## $`as.factor(telecommute)`
## diff lwr upr p adj
## 2-1 -350.7614 -384.5076 -317.0153 0
##
## $`as.factor(hourly_non_hourly)`
## diff lwr upr p adj
## 2-1 512.5514 481.0693 544.0335 0
##
## $`as.factor(telecommute):as.factor(hourly_non_hourly)`
## diff lwr upr p adj
## 2:1-1:1 -125.7199 -191.1042 -60.33548 4.8e-06
## 1:2-1:1 662.9527 587.9357 737.96978 0.0e+00
## 2:2-1:1 357.5900 286.4375 428.74251 0.0e+00
## 1:2-2:1 788.6726 732.0702 845.27496 0.0e+00
## 2:2-2:1 483.3099 431.9392 534.68055 0.0e+00
## 2:2-1:2 -305.3627 -368.5402 -242.18525 0.0e+00
# Using the hourly and non hourly variable, we can help establish the difference between salaried and hourly contractors that are assigned a project and work from home. Adding this variable makes the model fit better as it helps explain the relationship better. On avaerage, a telecommuter earns $ 415 but if they are paid hourly, they earn about $ 659 more.
# Both these variables are significant with p-values of less than .5 and higher R^2 than the initial model.Though improved, the model is still naive as it does not consider the interactions between other variables.
Telework %>%
ggplot(aes(x=as.factor(hourly_non_hourly), y=weekly_earnings, color=as.factor(telecommute)))+
geom_boxplot()
# From the plot, we can see that, on average, people that telework and are paid hourly make more than
# people that telework and are salaried.
### Testing the difference between model in question 1 and model in question 2
anova(Telework_aov, Telework_aov1, test = "Chisq")
## Analysis of Variance Table
##
## Model 1: weekly_earnings ~ as.factor(telecommute)
## Model 2: weekly_earnings ~ as.factor(telecommute) * as.factor(hourly_non_hourly)
## Res.Df RSS Df Sum of Sq Pr(>Chi)
## 1 5540 2319766548
## 2 5538 1935261045 2 384505502 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model two has a better p-value and explains the relationship better.
### Simple Linear Regression Model
Telework_lm1<- lm(weekly_earnings~hours_worked, data=Telework)
summary(Telework_lm1)
##
## Call:
## lm(formula = weekly_earnings ~ hours_worked, data = Telework)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1758.0 -411.2 -185.1 230.4 2796.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 66.0433 28.5766 2.311 0.0209 *
## hours_worked 22.5887 0.7072 31.943 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 613 on 5540 degrees of freedom
## Multiple R-squared: 0.1555, Adjusted R-squared: 0.1554
## F-statistic: 1020 on 1 and 5540 DF, p-value: < 2.2e-16
#General Equation
#Yi=bo +biXi
# Estimated Equation
#Y(Weekly Earnings)= 66.0433 + 22.58X (hours worked)
# This is a naive model because only 15% of the variance in Y can be explained by the independent variable chosen variable.
# The T value also cannot tell us if the estimated coefficients are very different from zero or not.
# The 85% or the remaining variance could be exoplained by a combination of other variables such as age, sex etc.
#To better model this, we may have to use log transformations or use use polynomial factors .
#Once we change the hours worked to 3 degree polynomial, wwe get a better model. With a simple lm, we have an adjusted Rsquared of .15 and an RMSE of 613.When we use polynomial factor of three degrees, we get an adjusted square of .1793 and a lower RMSE of 603.
Telework_lm2 <- lm(weekly_earnings~poly(hours_worked,3, raw=TRUE), data=Telework)
summary(Telework_lm2)
##
## Call:
## lm(formula = weekly_earnings ~ poly(hours_worked, 3, raw = TRUE),
## data = Telework)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1715.8 -389.5 -180.8 225.5 2453.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 681.502993 59.449501 11.464 < 2e-16
## poly(hours_worked, 3, raw = TRUE)1 -40.718388 5.031704 -8.092 7.13e-16
## poly(hours_worked, 3, raw = TRUE)2 1.695403 0.133351 12.714 < 2e-16
## poly(hours_worked, 3, raw = TRUE)3 -0.012749 0.001048 -12.164 < 2e-16
##
## (Intercept) ***
## poly(hours_worked, 3, raw = TRUE)1 ***
## poly(hours_worked, 3, raw = TRUE)2 ***
## poly(hours_worked, 3, raw = TRUE)3 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 604.2 on 5538 degrees of freedom
## Multiple R-squared: 0.1797, Adjusted R-squared: 0.1793
## F-statistic: 404.5 on 3 and 5538 DF, p-value: < 2.2e-16
Telework %>%
ggplot(aes(x=hours_worked, y=weekly_earnings))+
geom_point()+
stat_smooth(method="lm", se=TRUE, fill=NA,
formula= y ~x + I(x*x)+ I(x*x*x),colour="red")
### Testing to see which model fits better
anova(Telework_lm1, Telework_lm2)
## Analysis of Variance Table
##
## Model 1: weekly_earnings ~ hours_worked
## Model 2: weekly_earnings ~ poly(hours_worked, 3, raw = TRUE)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 5540 2081492559
## 2 5538 2021830057 2 59662502 81.711 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Telework_lm3<- lm(weekly_earnings~age, data=Telework)
summary(Telework_lm3)
##
## Call:
## lm(formula = weekly_earnings ~ age, data = Telework)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1245.3 -445.3 -178.1 284.7 2133.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 548.9457 28.2350 19.44 <2e-16 ***
## age 9.1941 0.6306 14.58 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 654.6 on 5540 degrees of freedom
## Multiple R-squared: 0.03696, Adjusted R-squared: 0.03678
## F-statistic: 212.6 on 1 and 5540 DF, p-value: < 2.2e-16
# (b)# Yi= bo + biXi
#Estimated equation
# Weekly Earnings(Y)=548.9457 (intercept)+ 9.19(slope)X
# This is a naive model because apart from age, the gender of the person, or the level of education could be affecting earnings. For instance, someone whith a amasters will on average earn a lot more money than someone with a bachelors. This model only assumes that someone's age,regardless of age, education among many other factors, affects their income.
Telework_lm4 <- lm(weekly_earnings~age+I(age*age),data=Telework)
crPlots(Telework_lm4)
# There apprears to be a non-linear relationship between age and earnings. Also, looking at the graphs we can see that there are outliers hence we may need to log some of the varables.
### Adding other variables to the model
Telework_lm5<- lm(weekly_earnings~age + as.factor(telecommute) + (hours_worked) + as.factor(full_or_part_time), data=Telework)
summary(Telework_lm5)
##
## Call:
## lm(formula = weekly_earnings ~ age + as.factor(telecommute) +
## (hours_worked) + as.factor(full_or_part_time), data = Telework)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1488.6 -377.8 -129.2 215.9 2584.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 203.5636 44.4963 4.575 4.87e-06 ***
## age 9.0632 0.5547 16.338 < 2e-16 ***
## as.factor(telecommute)2 -291.0720 16.8522 -17.272 < 2e-16 ***
## hours_worked 15.5611 0.8119 19.166 < 2e-16 ***
## as.factor(full_or_part_time)2 -337.2492 26.6865 -12.637 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 575.2 on 5537 degrees of freedom
## Multiple R-squared: 0.2567, Adjusted R-squared: 0.2562
## F-statistic: 478.1 on 4 and 5537 DF, p-value: < 2.2e-16
# (Y)=Bo + B(telecommute) +B(sex) + B(age) + B(full_or_part_time)
#Y= 203.56 + 9.06(B1) + -291.07(B1) + 15.56(B2) + -337.24(B3)
cor(Telework[, c("age", "telecommute","hours_worked", "full_or_part_time", "weekly_earnings")], method = "spearman")
## age telecommute hours_worked full_or_part_time
## age 1.00000000 0.01094764 0.02472655 -0.04784105
## telecommute 0.01094764 1.00000000 -0.13114746 0.08838726
## hours_worked 0.02472655 -0.13114746 1.00000000 -0.57766801
## full_or_part_time -0.04784105 0.08838726 -0.57766801 1.00000000
## weekly_earnings 0.20890993 -0.23731017 0.48703347 -0.47663195
## weekly_earnings
## age 0.2089099
## telecommute -0.2373102
## hours_worked 0.4870335
## full_or_part_time -0.4766320
## weekly_earnings 1.0000000
# Non of the variables are corellated. I ran the colinearity test above and non of them have more than 70% collinearity factor.
# Hypothetical assumption for someone aged 30, works 40hrs a week and telecommutes.