02.27.2021 Cooper P: HW3

All problems: Applied Linear Statistical Models 5E

library(tidyverse)
library(GGally)
library(magrittr)

Problem 1 (1.19) Grade Point Average

Observations = Students

load("G:/My Drive/homework/Cooper P/gpa_spring2021.RData")
names(data) <- c("GPA", "ACT")

data %>% ggpairs() # + ggtitle("EDA")

data %>% mutate(ACT = factor(ACT)) %>% ggpairs(cardinality_threshold = 21)

Part (a)

\(\hat{GPA} = \beta_0 + \beta_1 * ACT\) where \(\beta_0 = 1.91264\) & \(\beta_1 = 0.04826\)

model.fit <- 
  data %>%
  lm(GPA ~ ACT, data = .) %>%
  coef()
(b0 <- model.fit[1])

## (Intercept) 
##    1.912642

(b1 <- model.fit[2])

##        ACT 
## 0.04825715

Part (b)

data %>%
  ggplot(aes(ACT, GPA)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red") +
  # geom_abline(intercept = b0, slope = b1, col = "red") +
  ggtitle("Model Fit")

Part (c)

yhat <- function(x){
  b0 + b1*x
}

yhat(30)

## (Intercept) 
##    3.360356

data %>% filter(ACT == 30)

##     GPA ACT
## 1 3.765  30
## 2 3.498  30
## 3 3.318  30
## 4 3.987  30
## 5 3.828  30
## 6 2.650  30
## 7 3.071  30

Part (d)

Every one point increase in ACT (X) results in 0.04825 increase in GPA (Y)

Problem 2 (1.20) Copier Maintenance

Observations = repair calls

load("G:/My Drive/homework/Cooper P/copier_spring2021.RData")
names(data) <- c("Minutes", "Copiers")

data %>% ggpairs()

data %>% mutate(Copiers = factor(Copiers)) %>% ggpairs()

Part (a)

\(\hat{Minutes} = \beta_0 + \beta_1 * Copiers\) where \(\beta_0 = -0.885\) & \(\beta_1 = 15.056\)

model.fit <-
  data %>%
  lm(Minutes ~ Copiers, data = .) %>%
  coef()

(b0 <- model.fit[1])

## (Intercept) 
##   -0.885248

(b1 <- model.fit[2])

##  Copiers 
## 15.05581

Part (b)

data %>%
  ggplot(aes(Copiers, Minutes)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red") +
  ggtitle("Model Fit")

Part (c)

Mathematically, \(\beta_0 = -0.885\) means that number of minutes accrued when zero copiers are serviced on a call is -0.885.

Part (d)

yhat <- function(x){
  b0 + b1*x
}

yhat(5)

## (Intercept) 
##     74.3938

data %>% filter(Copiers == 5)

##   Minutes Copiers
## 1      79       5
## 2      71       5
## 3      93       5
## 4      76       5
## 5      74       5
## 6      80       5
## 7      55       5
## 8      76       5

Problem 6 (6.10) Grocery Retailer

Observations = Weeks

load("G:/My Drive/homework/Cooper P/grocery_spring2021.RData")
names(data) <- c("Hours", "Cases", "Labor", "Holiday")

data %>% ggpairs()

data %>% mutate(Holiday = factor(Holiday)) %>% ggpairs()

Part (a)

\(\hat{Hours} = \beta_0 + \beta_1 * Cases + \beta_2 * Labor + \beta_3 * Holiday\) where \(\beta_0 = 4269.3, \beta_1 = 0.0004434, \beta_2 = -12.28, \beta_3 = 545.8\)

model.fit1 <-
  data %>%
  lm(Hours ~ ., data = .)

(b0 <- model.fit1$coefficients[1])

## (Intercept) 
##    4269.313

(b1 <- model.fit1$coefficients[2])

##        Cases 
## 0.0004434404

(b2 <- model.fit1$coefficients[3])

##     Labor 
## -12.28001

(b3 <- model.fit1$coefficients[4])

##  Holiday 
## 545.7521

Part (b)

model.fit1$residuals %>%
  as_tibble() %>%
  ggplot(aes(value)) +
  geom_boxplot() +
  ggtitle("Model Residuals") +
  xlab("Hours") +
  geom_vline(xintercept = mean(model.fit1$residuals), col = "red") +
  theme(axis.text.y = element_blank())

Part (ci) Without interaction term

data %>%
  mutate(Residuals = model.fit1$residuals) %>%
  pivot_longer(!Residuals, names_to = "Variable") %>%
  ggplot(aes(value, Residuals)) +
  geom_point() +
  facet_wrap(vars(Variable), scales = "free") +
  ggtitle("Residuals (Hours) against Variables w/o Interaction Term")

data %>%
  ggplot(aes(sample = Hours)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Y = Hours")

model.fit1$residuals %>%
  as_tibble() %>%
  ggplot(aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Residuals")

Part (cii) With interaction term

model.fit2 <-
  data %>%
  lm(Hours ~ . + Cases*Labor, data = .)
model.fit2 %>% coefficients()

##   (Intercept)         Cases         Labor       Holiday   Cases:Labor 
##  4.953637e+03 -1.801342e-03 -1.018092e+02  5.382781e+02  2.934599e-04

data %>%
  mutate(Residuals = model.fit2$residuals,
         CasesXLabor = Cases * Labor) %>%
  pivot_longer(!Residuals, names_to = "Variable") %>%
  ggplot(aes(value, Residuals)) +
  geom_point() +
  facet_wrap(vars(Variable), scales = "free") +
  ggtitle("Residuals (Hours) against Variables with Interaction Term")

data %>%
  ggplot(aes(sample = Hours)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Y = Hours")

model.fit2$residuals %>%
  as_tibble() %>%
  ggplot(aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Residuals")

Part (di) Without interaction term

model.fit1$residuals %>%
  as_tibble() %>%
  ggplot(aes(1:length(value), value)) +
  geom_path() +
  geom_point() +
  ggtitle("Residual Time Plot with Interaction Term") +
  xlab("Observation") +
  ylab("Hours")

Part (dii) With interaction term

model.fit2$residuals %>%
  as_tibble() %>%
  ggplot(aes(1:length(value), value)) +
  geom_path() +
  geom_point() +
  ggtitle("Residual Time Plot w/o Interaction Term") +
  xlab("Observation") +
  ylab("Hours")

Part (ei) Without Interaction Term

Brown-Forsythe test
small abs result allows us to approximate with t-test
\(H_0:=\) error variance constant vs \(H_A:=\) error variance not constant

d1 <-
  model.fit1$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_head(prop = 0.5) %>% 
  pull()

d2 <-
  model.fit1$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_tail(prop = 0.5) %>% 
  pull()

t.test(d1, d2, conf.level = 0.99)

## 
##  Welch Two Sample t-test
## 
## data:  d1 and d2
## t = 0.0030244, df = 28.749, p-value = 0.9976
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -124.6107  124.8843
## sample estimates:
##   mean of x   mean of y 
##  0.06839659 -0.06839659

Part (eii) With interaction term

d1 <-
  model.fit2$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_head(prop = 0.5) %>% 
  pull()

d2 <-
  model.fit2$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_tail(prop = 0.5) %>% 
  pull()

t.test(d1, d2, conf.level = 0.99)

## 
##  Welch Two Sample t-test
## 
## data:  d1 and d2
## t = 0.33022, df = 28.817, p-value = 0.7436
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -109.4381  139.2143
## sample estimates:
## mean of x mean of y 
##  7.444046 -7.444046

Problem 7 (6.15) Patient Satisfaction

Observations = patients

load("G:/My Drive/homework/Cooper P/patient_satisfaction_spring2021.RData")
data %<>% mutate(Y = as.integer(Y))
names(data) <- c("Satsifaction", "Years", "Severity", "Anxiety")

data %>% ggpairs()

data %>%
  mutate(across(.cols = c(Severity, Anxiety), as_factor)) %>%
  ggpairs(cardinality_threshold = 19)

Part (a)

stem(data$Years)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   2 | 23
##   2 | 58899999
##   3 | 012233344
##   3 | 66678
##   4 | 0012233344
##   4 | 557779
##   5 | 0233
##   5 | 55

stem(data$Severity)

## 
##   The decimal point is at the |
## 
##   40 | 0
##   42 | 00
##   44 | 0
##   46 | 00000
##   48 | 0000000000
##   50 | 000000000000
##   52 | 000000
##   54 | 0000
##   56 | 00
##   58 | 0
##   60 | 0
##   62 | 0

stem(data$Anxiety)

## 
##   The decimal point is 1 digit(s) to the left of the |
## 
##   18 | 000000
##   20 | 00000000
##   22 | 0000000000000
##   24 | 00000000000
##   26 | 0000
##   28 | 0000

Part (ci) Without factorization

model.fit <-
  data %>%
  lm(Satsifaction ~ . , data = .)
model.fit %>% coefficients()

## (Intercept)       Years    Severity     Anxiety 
## 162.4755302  -1.2469821  -0.4068475 -14.7226010

Part (cii) With factorization

model.fit <-
  data %>%
  lm(Satsifaction ~ Years + factor(Severity) + factor(Anxiety) , data = .)
model.fit %>% summary()

## 
## Call:
## lm(formula = Satsifaction ~ Years + factor(Severity) + factor(Anxiety), 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.515  -3.037   0.000   2.546  19.515 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        116.8849    15.3677   7.606 1.06e-06 ***
## Years               -1.3428     0.4093  -3.280  0.00471 ** 
## factor(Severity)42  21.8498    22.2704   0.981  0.34114    
## factor(Severity)43  11.7141    17.2986   0.677  0.50797    
## factor(Severity)44   8.1708    18.6907   0.437  0.66784    
## factor(Severity)46  15.1996    15.8797   0.957  0.35272    
## factor(Severity)47  17.1642    22.0041   0.780  0.44675    
## factor(Severity)48   5.6000    17.8165   0.314  0.75734    
## factor(Severity)49   8.9712    16.6548   0.539  0.59754    
## factor(Severity)50   5.5530    17.9986   0.309  0.76167    
## factor(Severity)51   4.1218    17.4826   0.236  0.81660    
## factor(Severity)52   1.2756    18.5829   0.069  0.94612    
## factor(Severity)53  10.3135    18.7859   0.549  0.59059    
## factor(Severity)54   2.1029    22.0030   0.096  0.92505    
## factor(Severity)55   0.1726    21.1631   0.008  0.99359    
## factor(Severity)56   1.3487    22.1835   0.061  0.95227    
## factor(Severity)57  -8.7153    21.1153  -0.413  0.68527    
## factor(Severity)58   9.3887    27.1702   0.346  0.73418    
## factor(Severity)60   4.0814    22.3724   0.182  0.85754    
## factor(Severity)62  -0.8687    28.4216  -0.031  0.97600    
## factor(Anxiety)1.9  -5.1427    15.7641  -0.326  0.74848    
## factor(Anxiety)2   -18.4216    12.4645  -1.478  0.15884    
## factor(Anxiety)2.1  -9.1843    12.4906  -0.735  0.47280    
## factor(Anxiety)2.2 -11.0303    11.8703  -0.929  0.36658    
## factor(Anxiety)2.3 -13.5553    11.5388  -1.175  0.25728    
## factor(Anxiety)2.4 -10.8536    12.3251  -0.881  0.39156    
## factor(Anxiety)2.5  -7.9204    13.7940  -0.574  0.57382    
## factor(Anxiety)2.6 -23.7873    12.3686  -1.923  0.07244 .  
## factor(Anxiety)2.7 -15.6082    16.7536  -0.932  0.36537    
## factor(Anxiety)2.8       NA         NA      NA       NA    
## factor(Anxiety)2.9 -17.1895    19.0423  -0.903  0.38008    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.15 on 16 degrees of freedom
## Multiple R-squared:  0.8352, Adjusted R-squared:  0.5364 
## F-statistic: 2.795 on 29 and 16 DF,  p-value: 0.01701

02.27.2021 Cooper P: HW3

Nathan at GrandValleyTutor.com

2/26/2021

Problem 1 (1.19) Grade Point Average

Part (a)

Part (b)

Part (c)

Part (d)

Problem 2 (1.20) Copier Maintenance

Part (a)

Part (b)

Part (c)

Part (d)

Problem 6 (6.10) Grocery Retailer

Part (a)

Part (b)

Part (ci) Without interaction term

Part (cii) With interaction term

Part (di) Without interaction term

Part (dii) With interaction term

Part (ei) Without Interaction Term

Part (eii) With interaction term

Problem 7 (6.15) Patient Satisfaction

Part (a)

Part (ci) Without factorization

Part (cii) With factorization