All problems: Applied Linear Statistical Models 5E
library(tidyverse)
library(GGally)
library(magrittr)
Observations = Students
load("G:/My Drive/homework/Cooper P/gpa_spring2021.RData")
names(data) <- c("GPA", "ACT")
data %>% ggpairs() # + ggtitle("EDA")
data %>% mutate(ACT = factor(ACT)) %>% ggpairs(cardinality_threshold = 21)
\(\hat{GPA} = \beta_0 + \beta_1 * ACT\) where \(\beta_0 = 1.91264\) & \(\beta_1 = 0.04826\)
model.fit <-
data %>%
lm(GPA ~ ACT, data = .) %>%
coef()
(b0 <- model.fit[1])
## (Intercept)
## 1.912642
(b1 <- model.fit[2])
## ACT
## 0.04825715
data %>%
ggplot(aes(ACT, GPA)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red") +
# geom_abline(intercept = b0, slope = b1, col = "red") +
ggtitle("Model Fit")
yhat <- function(x){
b0 + b1*x
}
yhat(30)
## (Intercept)
## 3.360356
data %>% filter(ACT == 30)
## GPA ACT
## 1 3.765 30
## 2 3.498 30
## 3 3.318 30
## 4 3.987 30
## 5 3.828 30
## 6 2.650 30
## 7 3.071 30
Every one point increase in ACT (X) results in 0.04825 increase in GPA (Y)
Observations = repair calls
load("G:/My Drive/homework/Cooper P/copier_spring2021.RData")
names(data) <- c("Minutes", "Copiers")
data %>% ggpairs()
data %>% mutate(Copiers = factor(Copiers)) %>% ggpairs()
\(\hat{Minutes} = \beta_0 + \beta_1 * Copiers\) where \(\beta_0 = -0.885\) & \(\beta_1 = 15.056\)
model.fit <-
data %>%
lm(Minutes ~ Copiers, data = .) %>%
coef()
(b0 <- model.fit[1])
## (Intercept)
## -0.885248
(b1 <- model.fit[2])
## Copiers
## 15.05581
data %>%
ggplot(aes(Copiers, Minutes)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ x, se = FALSE, col = "red") +
ggtitle("Model Fit")
Mathematically, \(\beta_0 = -0.885\) means that number of minutes accrued when zero copiers are serviced on a call is -0.885.
yhat <- function(x){
b0 + b1*x
}
yhat(5)
## (Intercept)
## 74.3938
data %>% filter(Copiers == 5)
## Minutes Copiers
## 1 79 5
## 2 71 5
## 3 93 5
## 4 76 5
## 5 74 5
## 6 80 5
## 7 55 5
## 8 76 5
Observations = Weeks
load("G:/My Drive/homework/Cooper P/grocery_spring2021.RData")
names(data) <- c("Hours", "Cases", "Labor", "Holiday")
data %>% ggpairs()
data %>% mutate(Holiday = factor(Holiday)) %>% ggpairs()
\(\hat{Hours} = \beta_0 + \beta_1 * Cases + \beta_2 * Labor + \beta_3 * Holiday\) where \(\beta_0 = 4269.3, \beta_1 = 0.0004434, \beta_2 = -12.28, \beta_3 = 545.8\)
model.fit1 <-
data %>%
lm(Hours ~ ., data = .)
(b0 <- model.fit1$coefficients[1])
## (Intercept)
## 4269.313
(b1 <- model.fit1$coefficients[2])
## Cases
## 0.0004434404
(b2 <- model.fit1$coefficients[3])
## Labor
## -12.28001
(b3 <- model.fit1$coefficients[4])
## Holiday
## 545.7521
model.fit1$residuals %>%
as_tibble() %>%
ggplot(aes(value)) +
geom_boxplot() +
ggtitle("Model Residuals") +
xlab("Hours") +
geom_vline(xintercept = mean(model.fit1$residuals), col = "red") +
theme(axis.text.y = element_blank())
data %>%
mutate(Residuals = model.fit1$residuals) %>%
pivot_longer(!Residuals, names_to = "Variable") %>%
ggplot(aes(value, Residuals)) +
geom_point() +
facet_wrap(vars(Variable), scales = "free") +
ggtitle("Residuals (Hours) against Variables w/o Interaction Term")
data %>%
ggplot(aes(sample = Hours)) +
geom_qq() +
geom_qq_line() +
ggtitle("QQ Plot of Y = Hours")
model.fit1$residuals %>%
as_tibble() %>%
ggplot(aes(sample = value)) +
geom_qq() +
geom_qq_line() +
ggtitle("QQ Plot of Residuals")
model.fit2 <-
data %>%
lm(Hours ~ . + Cases*Labor, data = .)
model.fit2 %>% coefficients()
## (Intercept) Cases Labor Holiday Cases:Labor
## 4.953637e+03 -1.801342e-03 -1.018092e+02 5.382781e+02 2.934599e-04
data %>%
mutate(Residuals = model.fit2$residuals,
CasesXLabor = Cases * Labor) %>%
pivot_longer(!Residuals, names_to = "Variable") %>%
ggplot(aes(value, Residuals)) +
geom_point() +
facet_wrap(vars(Variable), scales = "free") +
ggtitle("Residuals (Hours) against Variables with Interaction Term")
data %>%
ggplot(aes(sample = Hours)) +
geom_qq() +
geom_qq_line() +
ggtitle("QQ Plot of Y = Hours")
model.fit2$residuals %>%
as_tibble() %>%
ggplot(aes(sample = value)) +
geom_qq() +
geom_qq_line() +
ggtitle("QQ Plot of Residuals")
model.fit1$residuals %>%
as_tibble() %>%
ggplot(aes(1:length(value), value)) +
geom_path() +
geom_point() +
ggtitle("Residual Time Plot with Interaction Term") +
xlab("Observation") +
ylab("Hours")
model.fit2$residuals %>%
as_tibble() %>%
ggplot(aes(1:length(value), value)) +
geom_path() +
geom_point() +
ggtitle("Residual Time Plot w/o Interaction Term") +
xlab("Observation") +
ylab("Hours")
Brown-Forsythe test
small abs result allows us to approximate with t-test
\(H_0:=\) error variance constant vs \(H_A:=\) error variance not constant
d1 <-
model.fit1$residuals %>%
as_tibble() %>%
arrange(value %>% abs()) %>%
slice_head(prop = 0.5) %>%
pull()
d2 <-
model.fit1$residuals %>%
as_tibble() %>%
arrange(value %>% abs()) %>%
slice_tail(prop = 0.5) %>%
pull()
t.test(d1, d2, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: d1 and d2
## t = 0.0030244, df = 28.749, p-value = 0.9976
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## -124.6107 124.8843
## sample estimates:
## mean of x mean of y
## 0.06839659 -0.06839659
d1 <-
model.fit2$residuals %>%
as_tibble() %>%
arrange(value %>% abs()) %>%
slice_head(prop = 0.5) %>%
pull()
d2 <-
model.fit2$residuals %>%
as_tibble() %>%
arrange(value %>% abs()) %>%
slice_tail(prop = 0.5) %>%
pull()
t.test(d1, d2, conf.level = 0.99)
##
## Welch Two Sample t-test
##
## data: d1 and d2
## t = 0.33022, df = 28.817, p-value = 0.7436
## alternative hypothesis: true difference in means is not equal to 0
## 99 percent confidence interval:
## -109.4381 139.2143
## sample estimates:
## mean of x mean of y
## 7.444046 -7.444046
Observations = patients
load("G:/My Drive/homework/Cooper P/patient_satisfaction_spring2021.RData")
data %<>% mutate(Y = as.integer(Y))
names(data) <- c("Satsifaction", "Years", "Severity", "Anxiety")
data %>% ggpairs()
data %>%
mutate(across(.cols = c(Severity, Anxiety), as_factor)) %>%
ggpairs(cardinality_threshold = 19)
stem(data$Years)
##
## The decimal point is 1 digit(s) to the right of the |
##
## 2 | 23
## 2 | 58899999
## 3 | 012233344
## 3 | 66678
## 4 | 0012233344
## 4 | 557779
## 5 | 0233
## 5 | 55
stem(data$Severity)
##
## The decimal point is at the |
##
## 40 | 0
## 42 | 00
## 44 | 0
## 46 | 00000
## 48 | 0000000000
## 50 | 000000000000
## 52 | 000000
## 54 | 0000
## 56 | 00
## 58 | 0
## 60 | 0
## 62 | 0
stem(data$Anxiety)
##
## The decimal point is 1 digit(s) to the left of the |
##
## 18 | 000000
## 20 | 00000000
## 22 | 0000000000000
## 24 | 00000000000
## 26 | 0000
## 28 | 0000
model.fit <-
data %>%
lm(Satsifaction ~ . , data = .)
model.fit %>% coefficients()
## (Intercept) Years Severity Anxiety
## 162.4755302 -1.2469821 -0.4068475 -14.7226010
model.fit <-
data %>%
lm(Satsifaction ~ Years + factor(Severity) + factor(Anxiety) , data = .)
model.fit %>% summary()
##
## Call:
## lm(formula = Satsifaction ~ Years + factor(Severity) + factor(Anxiety),
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -19.515 -3.037 0.000 2.546 19.515
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 116.8849 15.3677 7.606 1.06e-06 ***
## Years -1.3428 0.4093 -3.280 0.00471 **
## factor(Severity)42 21.8498 22.2704 0.981 0.34114
## factor(Severity)43 11.7141 17.2986 0.677 0.50797
## factor(Severity)44 8.1708 18.6907 0.437 0.66784
## factor(Severity)46 15.1996 15.8797 0.957 0.35272
## factor(Severity)47 17.1642 22.0041 0.780 0.44675
## factor(Severity)48 5.6000 17.8165 0.314 0.75734
## factor(Severity)49 8.9712 16.6548 0.539 0.59754
## factor(Severity)50 5.5530 17.9986 0.309 0.76167
## factor(Severity)51 4.1218 17.4826 0.236 0.81660
## factor(Severity)52 1.2756 18.5829 0.069 0.94612
## factor(Severity)53 10.3135 18.7859 0.549 0.59059
## factor(Severity)54 2.1029 22.0030 0.096 0.92505
## factor(Severity)55 0.1726 21.1631 0.008 0.99359
## factor(Severity)56 1.3487 22.1835 0.061 0.95227
## factor(Severity)57 -8.7153 21.1153 -0.413 0.68527
## factor(Severity)58 9.3887 27.1702 0.346 0.73418
## factor(Severity)60 4.0814 22.3724 0.182 0.85754
## factor(Severity)62 -0.8687 28.4216 -0.031 0.97600
## factor(Anxiety)1.9 -5.1427 15.7641 -0.326 0.74848
## factor(Anxiety)2 -18.4216 12.4645 -1.478 0.15884
## factor(Anxiety)2.1 -9.1843 12.4906 -0.735 0.47280
## factor(Anxiety)2.2 -11.0303 11.8703 -0.929 0.36658
## factor(Anxiety)2.3 -13.5553 11.5388 -1.175 0.25728
## factor(Anxiety)2.4 -10.8536 12.3251 -0.881 0.39156
## factor(Anxiety)2.5 -7.9204 13.7940 -0.574 0.57382
## factor(Anxiety)2.6 -23.7873 12.3686 -1.923 0.07244 .
## factor(Anxiety)2.7 -15.6082 16.7536 -0.932 0.36537
## factor(Anxiety)2.8 NA NA NA NA
## factor(Anxiety)2.9 -17.1895 19.0423 -0.903 0.38008
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.15 on 16 degrees of freedom
## Multiple R-squared: 0.8352, Adjusted R-squared: 0.5364
## F-statistic: 2.795 on 29 and 16 DF, p-value: 0.01701