require(ggthemes)
library(tidyverse)
library(magrittr)
library(tidyr)
library(dplyr)
library(lubridate)
library(ggplot2)
library(fpp2)
library(forecast)
library(ggpubr)
library(boot)
library(plotly)
-I extracted some data from the Wagner Manufacturing of labor hours, Machine hours vs. Overhead Expenses:
## Week Overhead LaborHrs MachineHrs
## 1 1 53421 695 239
## 2 2 44167 531 173
## 3 3 60085 839 329
## 4 4 55181 379 335
## 5 5 44484 602 152
## 6 6 68154 807 365
fig <- plot_ly(data = df, x = ~LaborHrs, y = ~Overhead,type='scatter',mode='markers',
marker = list(size = 10,
color = 'rgba(255, 182, 193, .9)',
line = list(color = 'rgba(152, 0, 0, .8)',
width = 2)))
fig <- fig %>% layout(title = 'Fig1a: Scatter Plot of Overhead Expenses as function of Labor hours',
yaxis = list(zeroline = FALSE),
xaxis = list(zeroline = FALSE))
fig
fig1 <- plot_ly(data = df, x = ~MachineHrs, y = ~Overhead,type='scatter',mode='markers',
marker = list(size = 10,
color = 'rgba(255, 182, 193, .9)',
line = list(color = 'rgba(152, 0, 0, .8)',
width = 2)))
fig1 <- fig1 %>% layout(title = 'Fig1b: Scatter Plot of Overhead Expenses as function of Machine hours',
yaxis = list(zeroline = FALSE),
xaxis = list(zeroline = FALSE))
fig1
Observations 1:
# test each predictor variables independently and together to see if it improves the model
model1 <- lm(Overhead ~ LaborHrs, data = df)
summary(model1)
##
## Call:
## lm(formula = Overhead ~ LaborHrs, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15411.5 -4781.6 -137.6 3937.1 12006.2
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30764.379 2888.347 10.651 1.84e-14 ***
## LaborHrs 32.745 3.838 8.532 2.53e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6418 on 50 degrees of freedom
## Multiple R-squared: 0.5928, Adjusted R-squared: 0.5847
## F-statistic: 72.79 on 1 and 50 DF, p-value: 2.531e-11
model2 <- lm(Overhead ~ MachineHrs, data = df)
summary(model2)
##
## Call:
## lm(formula = Overhead ~ MachineHrs, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17955.4 -5639.3 -183.7 4503.3 14263.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 30990.44 3684.79 8.410 3.88e-11 ***
## MachineHrs 95.85 14.61 6.559 2.94e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7374 on 50 degrees of freedom
## Multiple R-squared: 0.4624, Adjusted R-squared: 0.4517
## F-statistic: 43.01 on 1 and 50 DF, p-value: 2.944e-08
Observations 2:
model3 <- lm(Overhead ~ MachineHrs+LaborHrs, data = df)
summary(model3)
##
## Call:
## lm(formula = Overhead ~ MachineHrs + LaborHrs, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7603 -2574 642 1900 8611
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14630.255 1977.808 7.397 1.61e-09 ***
## MachineHrs 79.105 6.537 12.102 2.48e-16 ***
## LaborHrs 28.516 1.972 14.457 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3246 on 49 degrees of freedom
## Multiple R-squared: 0.8979, Adjusted R-squared: 0.8937
## F-statistic: 215.5 on 2 and 49 DF, p-value: < 2.2e-16
Observations 3:
# Compute the analysis of variance of labor hours (predictor variable #1)
res.aov <- aov(Overhead~ LaborHrs, data = df)
# Summary of the analysis
summary(res.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## LaborHrs 1 2.999e+09 2.999e+09 72.79 2.53e-11 ***
## Residuals 50 2.060e+09 4.119e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances
plot(res.aov, 1)
# 2. Normality
plot(res.aov, 2)
# Compute the analysis of variance of Machine hours (predictor variable #2)
res1.aov <- aov(Overhead~ MachineHrs, data = df)
# Summary of the analysis
summary(res1.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## MachineHrs 1 2.339e+09 2.339e+09 43.01 2.94e-08 ***
## Residuals 50 2.719e+09 5.438e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances
plot(res1.aov, 1)
# 2. Normality
plot(res1.aov, 2)
Observations 4:
df1 <- df %>% mutate(laborhrssquare = LaborHrs**2)
head(df1)
## Week Overhead LaborHrs MachineHrs laborhrssquare
## 1 1 53421 695 239 483025
## 2 2 44167 531 173 281961
## 3 3 60085 839 329 703921
## 4 4 55181 379 335 143641
## 5 5 44484 602 152 362404
## 6 6 68154 807 365 651249
model4 <- lm(Overhead ~ laborhrssquare , data = df1)
summary(model4)
##
## Call:
## lm(formula = Overhead ~ laborhrssquare, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15452 -3814 1025 3808 13106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.178e+04 1.819e+03 22.965 < 2e-16 ***
## laborhrssquare 2.194e-02 2.764e-03 7.938 2.07e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6690 on 50 degrees of freedom
## Multiple R-squared: 0.5576, Adjusted R-squared: 0.5487
## F-statistic: 63.01 on 1 and 50 DF, p-value: 2.074e-10
# Compute the analysis of variance of Labor hours square (predictor variable #3)
res2.aov <- aov(Overhead~ laborhrssquare, data = df1)
# Summary of the analysis
summary(res2.aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## laborhrssquare 1 2.820e+09 2.820e+09 63.01 2.07e-10 ***
## Residuals 50 2.238e+09 4.476e+07
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 1. Homogeneity of variances
plot(res2.aov, 1)
# 2. Normality
plot(res2.aov, 2)
Observations 5:
It seems the added quadratic term was just as suficient as the original “unpowered” labor hour predictor variable. Although all the statistical diagnostic remained good, the R-square did not improved but actually went back to the original level of the labor hours.
Therefore, adding this extra quadratic term, did not improved was not needed.