Step 1: Load the Data
# Replace with your actual path if needed
senic <- read_csv("senic.csv")
## Rows: 114 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): id, length, age, infection, cultur, xray, bed, affiliat, region, p...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(senic)
## spc_tbl_ [114 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ id : num [1:114] 1 2 3 4 5 6 7 8 9 10 ...
## $ length : num [1:114] 7.13 8.82 8.34 8.95 11.2 ...
## $ age : num [1:114] 55.7 58.2 56.9 53.7 56.5 50.9 57.8 45.7 48.2 56.3 ...
## $ infection: num [1:114] 4.1 1.6 2.7 5.6 5.7 5.1 4.6 5.4 4.3 6.3 ...
## $ cultur : num [1:114] 9 3.8 8.1 18.9 34.5 21.9 16.7 60.5 24.4 29.6 ...
## $ xray : num [1:114] 39.6 51.7 74 122.8 88.9 ...
## $ bed : num [1:114] 279 80 107 147 180 150 186 640 182 85 ...
## $ affiliat : num [1:114] 2 2 2 2 2 2 2 1 2 2 ...
## $ region : num [1:114] 4 2 3 4 1 2 3 2 3 1 ...
## $ patient : num [1:114] 207 51 82 53 134 147 151 399 130 59 ...
## $ nurse : num [1:114] 241 52 54 148 151 106 129 360 118 66 ...
## $ facility : num [1:114] 60 40 20 40 40 40 40 60 40 40 ...
## - attr(*, "spec")=
## .. cols(
## .. id = col_double(),
## .. length = col_double(),
## .. age = col_double(),
## .. infection = col_double(),
## .. cultur = col_double(),
## .. xray = col_double(),
## .. bed = col_double(),
## .. affiliat = col_double(),
## .. region = col_double(),
## .. patient = col_double(),
## .. nurse = col_double(),
## .. facility = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
Step 2: Fit Initial Simple Linear Regression (SLR)
model <- lm(length ~ infection, data = senic)
summary(model)
##
## Call:
## lm(formula = length ~ infection, data = senic)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.0587 -0.7776 -0.1487 0.7159 8.2805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.3368 0.5213 12.156 < 2e-16 ***
## infection 0.7604 0.1144 6.645 1.18e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.624 on 111 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.2846, Adjusted R-squared: 0.2781
## F-statistic: 44.15 on 1 and 111 DF, p-value: 1.177e-09
Step 3: Check Regression Assumptions
Linearity
ggplot(senic, aes(x = infection, y = length)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
theme_minimal() +
labs(title = "Length of Stay vs Infection Risk")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Normality of Residuals
hist(residuals(model), main = "Histogram of Residuals", xlab = "Residuals")

qqnorm(residuals(model))
qqline(residuals(model), col = "red")

Homoscedasticity
plot(model, which = 1)

bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 4.8996, df = 1, p-value = 0.02686