All problems: Applied Linear Statistical Models 5E

library(tidyverse)
library(GGally)
library(magrittr)

Problem 1 (1.19) Grade Point Average

Observations = Students

load("G:/My Drive/homework/Cooper P/gpa_spring2021.RData")
names(data) <- c("GPA", "ACT")

data %>% ggpairs() # + ggtitle("EDA")

data %>% mutate(ACT = factor(ACT)) %>% ggpairs(cardinality_threshold = 21)

Part (a)

\(\hat{GPA} = \beta_0 + \beta_1 * ACT\) where \(\beta_0 = 1.91264\) & \(\beta_1 = 0.04826\)

model.fit <- 
  data %>%
  lm(GPA ~ ACT, data = .) %>%
  coef()
(b0 <- model.fit[1])
## (Intercept) 
##    1.912642
(b1 <- model.fit[2])
##        ACT 
## 0.04825715

Part (b)

data %>%
  ggplot(aes(ACT, GPA)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red") +
  # geom_abline(intercept = b0, slope = b1, col = "red") +
  ggtitle("Model Fit")

Part (c)

yhat <- function(x){
  b0 + b1*x
}

yhat(30)
## (Intercept) 
##    3.360356
data %>% filter(ACT == 30) 
##     GPA ACT
## 1 3.765  30
## 2 3.498  30
## 3 3.318  30
## 4 3.987  30
## 5 3.828  30
## 6 2.650  30
## 7 3.071  30

Part (d)

Every one point increase in ACT (X) results in 0.04825 increase in GPA (Y)

Problem 2 (1.20) Copier Maintenance

Observations = repair calls

load("G:/My Drive/homework/Cooper P/copier_spring2021.RData")
names(data) <- c("Minutes", "Copiers")

data %>% ggpairs()

data %>% mutate(Copiers = factor(Copiers)) %>% ggpairs()

Part (a)

\(\hat{Minutes} = \beta_0 + \beta_1 * Copiers\) where \(\beta_0 = -0.885\) & \(\beta_1 = 15.056\)

model.fit <-
  data %>%
  lm(Minutes ~ Copiers, data = .) %>%
  coef()

(b0 <- model.fit[1])
## (Intercept) 
##   -0.885248
(b1 <- model.fit[2])
##  Copiers 
## 15.05581

Part (b)

data %>%
  ggplot(aes(Copiers, Minutes)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red") +
  ggtitle("Model Fit")

Part (c)

Mathematically, \(\beta_0 = -0.885\) means that number of minutes accrued when zero copiers are serviced on a call is -0.885.

Part (d)

yhat <- function(x){
  b0 + b1*x
}

yhat(5)
## (Intercept) 
##     74.3938
data %>% filter(Copiers == 5) 
##   Minutes Copiers
## 1      79       5
## 2      71       5
## 3      93       5
## 4      76       5
## 5      74       5
## 6      80       5
## 7      55       5
## 8      76       5

Problem 6 (6.10) Grocery Retailer

Observations = Weeks

load("G:/My Drive/homework/Cooper P/grocery_spring2021.RData")
names(data) <- c("Hours", "Cases", "Labor", "Holiday")

data %>% ggpairs()

data %>% mutate(Holiday = factor(Holiday)) %>% ggpairs()

Part (a)

\(\hat{Hours} = \beta_0 + \beta_1 * Cases + \beta_2 * Labor + \beta_3 * Holiday\) where \(\beta_0 = 4269.3, \beta_1 = 0.0004434, \beta_2 = -12.28, \beta_3 = 545.8\)

model.fit1 <-
  data %>%
  lm(Hours ~ ., data = .)

(b0 <- model.fit1$coefficients[1])
## (Intercept) 
##    4269.313
(b1 <- model.fit1$coefficients[2])
##        Cases 
## 0.0004434404
(b2 <- model.fit1$coefficients[3])
##     Labor 
## -12.28001
(b3 <- model.fit1$coefficients[4])
##  Holiday 
## 545.7521

Part (b)

model.fit1$residuals %>%
  as_tibble() %>%
  ggplot(aes(value)) +
  geom_boxplot() +
  ggtitle("Model Residuals") +
  xlab("Hours") +
  geom_vline(xintercept = mean(model.fit1$residuals), col = "red") +
  theme(axis.text.y = element_blank())

Part (ci) Without interaction term

data %>%
  mutate(Residuals = model.fit1$residuals) %>%
  pivot_longer(!Residuals, names_to = "Variable") %>%
  ggplot(aes(value, Residuals)) +
  geom_point() +
  facet_wrap(vars(Variable), scales = "free") +
  ggtitle("Residuals (Hours) against Variables w/o Interaction Term")

data %>%
  ggplot(aes(sample = Hours)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Y = Hours")

model.fit1$residuals %>%
  as_tibble() %>%
  ggplot(aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Residuals")

Part (cii) With interaction term

model.fit2 <-
  data %>%
  lm(Hours ~ . + Cases*Labor, data = .)
model.fit2 %>% coefficients()
##   (Intercept)         Cases         Labor       Holiday   Cases:Labor 
##  4.953637e+03 -1.801342e-03 -1.018092e+02  5.382781e+02  2.934599e-04
data %>%
  mutate(Residuals = model.fit2$residuals,
         CasesXLabor = Cases * Labor) %>%
  pivot_longer(!Residuals, names_to = "Variable") %>%
  ggplot(aes(value, Residuals)) +
  geom_point() +
  facet_wrap(vars(Variable), scales = "free") +
  ggtitle("Residuals (Hours) against Variables with Interaction Term")

data %>%
  ggplot(aes(sample = Hours)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Y = Hours")

model.fit2$residuals %>%
  as_tibble() %>%
  ggplot(aes(sample = value)) +
  geom_qq() +
  geom_qq_line() +
  ggtitle("QQ Plot of Residuals")

Part (di) Without interaction term

model.fit1$residuals %>%
  as_tibble() %>%
  ggplot(aes(1:length(value), value)) +
  geom_path() +
  geom_point() +
  ggtitle("Residual Time Plot with Interaction Term") +
  xlab("Observation") +
  ylab("Hours")

Part (dii) With interaction term

model.fit2$residuals %>%
  as_tibble() %>%
  ggplot(aes(1:length(value), value)) +
  geom_path() +
  geom_point() +
  ggtitle("Residual Time Plot w/o Interaction Term") +
  xlab("Observation") +
  ylab("Hours")

Part (ei) Without Interaction Term

Brown-Forsythe test
small abs result allows us to approximate with t-test
\(H_0:=\) error variance constant vs \(H_A:=\) error variance not constant

d1 <-
  model.fit1$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_head(prop = 0.5) %>% 
  pull()

d2 <-
  model.fit1$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_tail(prop = 0.5) %>% 
  pull()

t.test(d1, d2, conf.level = 0.99)
## 
##  Welch Two Sample t-test
## 
## data:  d1 and d2
## t = 0.0030244, df = 28.749, p-value = 0.9976
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -124.6107  124.8843
## sample estimates:
##   mean of x   mean of y 
##  0.06839659 -0.06839659

Part (eii) With interaction term

d1 <-
  model.fit2$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_head(prop = 0.5) %>% 
  pull()

d2 <-
  model.fit2$residuals %>%
  as_tibble() %>%
  arrange(value %>% abs()) %>%
  slice_tail(prop = 0.5) %>% 
  pull()

t.test(d1, d2, conf.level = 0.99)
## 
##  Welch Two Sample t-test
## 
## data:  d1 and d2
## t = 0.33022, df = 28.817, p-value = 0.7436
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
##  -109.4381  139.2143
## sample estimates:
## mean of x mean of y 
##  7.444046 -7.444046

Problem 7 (6.15) Patient Satisfaction

Observations = patients

load("G:/My Drive/homework/Cooper P/patient_satisfaction_spring2021.RData")
data %<>% mutate(Y = as.integer(Y))
names(data) <- c("Satsifaction", "Years", "Severity", "Anxiety")

data %>% ggpairs()

data %>%
  mutate(across(.cols = c(Severity, Anxiety), as_factor)) %>%
  ggpairs(cardinality_threshold = 19)

Part (a)

stem(data$Years)
## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   2 | 23
##   2 | 58899999
##   3 | 012233344
##   3 | 66678
##   4 | 0012233344
##   4 | 557779
##   5 | 0233
##   5 | 55
stem(data$Severity)
## 
##   The decimal point is at the |
## 
##   40 | 0
##   42 | 00
##   44 | 0
##   46 | 00000
##   48 | 0000000000
##   50 | 000000000000
##   52 | 000000
##   54 | 0000
##   56 | 00
##   58 | 0
##   60 | 0
##   62 | 0
stem(data$Anxiety)
## 
##   The decimal point is 1 digit(s) to the left of the |
## 
##   18 | 000000
##   20 | 00000000
##   22 | 0000000000000
##   24 | 00000000000
##   26 | 0000
##   28 | 0000

Part (ci) Without factorization

model.fit <-
  data %>%
  lm(Satsifaction ~ . , data = .)
model.fit %>% coefficients()
## (Intercept)       Years    Severity     Anxiety 
## 162.4755302  -1.2469821  -0.4068475 -14.7226010

Part (cii) With factorization

model.fit <-
  data %>%
  lm(Satsifaction ~ Years + factor(Severity) + factor(Anxiety) , data = .)
model.fit %>% summary()
## 
## Call:
## lm(formula = Satsifaction ~ Years + factor(Severity) + factor(Anxiety), 
##     data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.515  -3.037   0.000   2.546  19.515 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        116.8849    15.3677   7.606 1.06e-06 ***
## Years               -1.3428     0.4093  -3.280  0.00471 ** 
## factor(Severity)42  21.8498    22.2704   0.981  0.34114    
## factor(Severity)43  11.7141    17.2986   0.677  0.50797    
## factor(Severity)44   8.1708    18.6907   0.437  0.66784    
## factor(Severity)46  15.1996    15.8797   0.957  0.35272    
## factor(Severity)47  17.1642    22.0041   0.780  0.44675    
## factor(Severity)48   5.6000    17.8165   0.314  0.75734    
## factor(Severity)49   8.9712    16.6548   0.539  0.59754    
## factor(Severity)50   5.5530    17.9986   0.309  0.76167    
## factor(Severity)51   4.1218    17.4826   0.236  0.81660    
## factor(Severity)52   1.2756    18.5829   0.069  0.94612    
## factor(Severity)53  10.3135    18.7859   0.549  0.59059    
## factor(Severity)54   2.1029    22.0030   0.096  0.92505    
## factor(Severity)55   0.1726    21.1631   0.008  0.99359    
## factor(Severity)56   1.3487    22.1835   0.061  0.95227    
## factor(Severity)57  -8.7153    21.1153  -0.413  0.68527    
## factor(Severity)58   9.3887    27.1702   0.346  0.73418    
## factor(Severity)60   4.0814    22.3724   0.182  0.85754    
## factor(Severity)62  -0.8687    28.4216  -0.031  0.97600    
## factor(Anxiety)1.9  -5.1427    15.7641  -0.326  0.74848    
## factor(Anxiety)2   -18.4216    12.4645  -1.478  0.15884    
## factor(Anxiety)2.1  -9.1843    12.4906  -0.735  0.47280    
## factor(Anxiety)2.2 -11.0303    11.8703  -0.929  0.36658    
## factor(Anxiety)2.3 -13.5553    11.5388  -1.175  0.25728    
## factor(Anxiety)2.4 -10.8536    12.3251  -0.881  0.39156    
## factor(Anxiety)2.5  -7.9204    13.7940  -0.574  0.57382    
## factor(Anxiety)2.6 -23.7873    12.3686  -1.923  0.07244 .  
## factor(Anxiety)2.7 -15.6082    16.7536  -0.932  0.36537    
## factor(Anxiety)2.8       NA         NA      NA       NA    
## factor(Anxiety)2.9 -17.1895    19.0423  -0.903  0.38008    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.15 on 16 degrees of freedom
## Multiple R-squared:  0.8352, Adjusted R-squared:  0.5364 
## F-statistic: 2.795 on 29 and 16 DF,  p-value: 0.01701