Step 1: Load the Data

# Replace with your actual path if needed
senic <- read_csv("senic.csv")
## Rows: 114 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (12): id, length, age, infection, cultur, xray, bed, affiliat, region, p...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
str(senic)
## spc_tbl_ [114 × 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ id       : num [1:114] 1 2 3 4 5 6 7 8 9 10 ...
##  $ length   : num [1:114] 7.13 8.82 8.34 8.95 11.2 ...
##  $ age      : num [1:114] 55.7 58.2 56.9 53.7 56.5 50.9 57.8 45.7 48.2 56.3 ...
##  $ infection: num [1:114] 4.1 1.6 2.7 5.6 5.7 5.1 4.6 5.4 4.3 6.3 ...
##  $ cultur   : num [1:114] 9 3.8 8.1 18.9 34.5 21.9 16.7 60.5 24.4 29.6 ...
##  $ xray     : num [1:114] 39.6 51.7 74 122.8 88.9 ...
##  $ bed      : num [1:114] 279 80 107 147 180 150 186 640 182 85 ...
##  $ affiliat : num [1:114] 2 2 2 2 2 2 2 1 2 2 ...
##  $ region   : num [1:114] 4 2 3 4 1 2 3 2 3 1 ...
##  $ patient  : num [1:114] 207 51 82 53 134 147 151 399 130 59 ...
##  $ nurse    : num [1:114] 241 52 54 148 151 106 129 360 118 66 ...
##  $ facility : num [1:114] 60 40 20 40 40 40 40 60 40 40 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   id = col_double(),
##   ..   length = col_double(),
##   ..   age = col_double(),
##   ..   infection = col_double(),
##   ..   cultur = col_double(),
##   ..   xray = col_double(),
##   ..   bed = col_double(),
##   ..   affiliat = col_double(),
##   ..   region = col_double(),
##   ..   patient = col_double(),
##   ..   nurse = col_double(),
##   ..   facility = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>

Step 2: Fit Initial Simple Linear Regression (SLR)

model <- lm(length ~ infection, data = senic)
summary(model)
## 
## Call:
## lm(formula = length ~ infection, data = senic)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.0587 -0.7776 -0.1487  0.7159  8.2805 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.3368     0.5213  12.156  < 2e-16 ***
## infection     0.7604     0.1144   6.645 1.18e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.624 on 111 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.2846, Adjusted R-squared:  0.2781 
## F-statistic: 44.15 on 1 and 111 DF,  p-value: 1.177e-09

Step 3: Check Regression Assumptions

Linearity

ggplot(senic, aes(x = infection, y = length)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  theme_minimal() +
  labs(title = "Length of Stay vs Infection Risk")
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Normality of Residuals

hist(residuals(model), main = "Histogram of Residuals", xlab = "Residuals")

qqnorm(residuals(model))
qqline(residuals(model), col = "red")

Homoscedasticity

plot(model, which = 1)

bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 4.8996, df = 1, p-value = 0.02686

Step 4: Apply Log Transformation

senic$log_length <- log(senic$length)
model_log <- lm(log_length ~ infection, data = senic)
summary(model_log)
## 
## Call:
## lm(formula = log_length ~ infection, data = senic)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.35860 -0.06961 -0.00373  0.07897  0.56691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.93250    0.04794  40.310  < 2e-16 ***
## infection    0.07293    0.01053   6.929 2.92e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1494 on 111 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.302,  Adjusted R-squared:  0.2957 
## F-statistic: 48.02 on 1 and 111 DF,  p-value: 2.918e-10

Step 5: Recheck Assumptions (Transformed Model)

plot(model_log, which = 1)

hist(residuals(model_log), main = "Histogram of Residuals (Log Model)", xlab = "Residuals")

qqnorm(residuals(model_log))
qqline(residuals(model_log), col = "blue")

bptest(model_log)
## 
##  studentized Breusch-Pagan test
## 
## data:  model_log
## BP = 3.5992, df = 1, p-value = 0.05781