data <- read.csv("neonati.csv")
# Standardize column names
df <- data %>%
rename(MaternalAge = Anni.madre,
NumPregnancies = N.gravidanze,
Smoking = Fumatrici,
GestWeeks = Gestazione,
Weight_g = Peso,
Length_mm = Lunghezza,
HeadDiameter_mm = Cranio,
BirthType = Tipo.parto,
Hospital = Ospedale,
Sex = Sesso)
# Convert types
df <- df %>%
mutate(Smoking = as.factor(if_else(as.numeric(as.character(Smoking))==1, 1, 0)),
BirthType = as.factor(BirthType),
Hospital = as.factor(Hospital),
Sex = as.factor(Sex),
Preterm = if_else(GestWeeks < 37, 1, 0))
# Quick look
glimpse(df)
## Rows: 2,500
## Columns: 11
## $ MaternalAge <int> 26, 21, 34, 28, 20, 32, 26, 25, 22, 23, 29, 21, 36, 24…
## $ NumPregnancies <int> 0, 2, 3, 1, 0, 0, 1, 0, 1, 0, 2, 2, 5, 0, 3, 2, 2, 3, …
## $ Smoking <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GestWeeks <int> 42, 39, 38, 41, 38, 40, 39, 40, 40, 41, 38, 40, 38, 40…
## $ Weight_g <int> 3380, 3150, 3640, 3690, 3700, 3200, 3100, 3580, 3670, …
## $ Length_mm <int> 490, 490, 500, 515, 480, 495, 480, 510, 500, 510, 480,…
## $ HeadDiameter_mm <int> 325, 345, 375, 365, 335, 340, 345, 349, 335, 362, 330,…
## $ BirthType <fct> Nat, Nat, Nat, Nat, Nat, Nat, Nat, Nat, Ces, Ces, Ces,…
## $ Hospital <fct> osp3, osp1, osp2, osp2, osp3, osp2, osp3, osp1, osp2, …
## $ Sex <fct> M, F, M, M, F, F, F, M, F, F, M, F, F, F, M, M, M, M, …
## $ Preterm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
Missing values & outliers
missing_summary <- map_df(df, ~sum(is.na(.x))) %>% pivot_longer(everything(), names_to='var', values_to='n_missing')
print(missing_summary)
## # A tibble: 11 × 2
## var n_missing
## <chr> <int>
## 1 MaternalAge 0
## 2 NumPregnancies 0
## 3 Smoking 0
## 4 GestWeeks 0
## 5 Weight_g 0
## 6 Length_mm 0
## 7 HeadDiameter_mm 0
## 8 BirthType 0
## 9 Hospital 0
## 10 Sex 0
## 11 Preterm 0
# Boxplot directly in R
par(mar=c(5,5,2,2)) # optional, for margins
boxplot(df$Weight_g,
main = "Birth weight (g) - boxplot",
ylab = "Weight (g)")
Exploratory analysis
# Distributions
hist(df$Weight_g, main='Histogram of birth weight (g)',
xlab='Weight (g)', breaks=10, col="lightblue")
# Weight vs Gestational weeks
plot(df$GestWeeks, df$Weight_g, xlab='Gestational weeks', ylab='Weight (g)', main='Weight vs Gestational Weeks')
lines(lowess(df$GestWeeks, df$Weight_g), col='blue')
# Mean weight by Smoking
boxplot(Weight_g ~ Smoking, data=df, xlab='Smoking (0=No,1=Yes)', ylab='Weight (g)', main='Weight by Maternal Smoking')
Hypothesis tests
Hypothesis 1: Some hospitals perform more cesarean sections
chisq.test(table(df$BirthType, df$Hospital))
##
## Pearson's Chi-squared test
##
## data: table(df$BirthType, df$Hospital)
## X-squared = 1.0972, df = 2, p-value = 0.5778
Hypothesis 2: The mean weight and length of this sample are significantly equal to those of the population
(I have not included this test in the code, because we don’t have the population mean values for weight and length.)
Hypothesis 3: Anthropometric measurements are significantly different between the two sexes
t.test(Weight_g ~ Sex, data=df)
##
## Welch Two Sample t-test
##
## data: Weight_g by Sex
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M
## 3161.132 3408.215
Feature Engineering
df <- df %>% mutate(GestWeeks2 = GestWeeks^2,
Gest_c = scale(GestWeeks, center=TRUE, scale=FALSE)[,1],
MaternalAge_c = scale(MaternalAge, center=TRUE, scale=FALSE)[,1])
Full Regression Model
# Full linear model
full_mod <- lm(Weight_g ~ MaternalAge + NumPregnancies + Smoking + GestWeeks +
Length_mm + HeadDiameter_mm + BirthType + Hospital + Sex,
data = df)
# Summary of full model
summary(full_mod)
##
## Call:
## lm(formula = Weight_g ~ MaternalAge + NumPregnancies + Smoking +
## GestWeeks + Length_mm + HeadDiameter_mm + BirthType + Hospital +
## Sex, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1124.40 -181.66 -14.42 160.91 2611.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6738.4762 141.3087 -47.686 < 2e-16 ***
## MaternalAge 0.8921 1.1323 0.788 0.4308
## NumPregnancies 11.2665 4.6608 2.417 0.0157 *
## Smoking1 -30.1631 27.5386 -1.095 0.2735
## GestWeeks 32.5696 3.8187 8.529 < 2e-16 ***
## Length_mm 10.2945 0.3007 34.236 < 2e-16 ***
## HeadDiameter_mm 10.4707 0.4260 24.578 < 2e-16 ***
## BirthTypeNat 29.5254 12.0844 2.443 0.0146 *
## Hospitalosp2 -11.2095 13.4379 -0.834 0.4043
## Hospitalosp3 28.0958 13.4957 2.082 0.0375 *
## SexM 77.5409 11.1776 6.937 5.08e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 2489 degrees of freedom
## Multiple R-squared: 0.7289, Adjusted R-squared: 0.7278
## F-statistic: 669.2 on 10 and 2489 DF, p-value: < 2.2e-16
# Multicollinearity
vif_vals <- vif(full_mod)
vif_vals
## GVIF Df GVIF^(1/(2*Df))
## MaternalAge 1.187454 1 1.089704
## NumPregnancies 1.186428 1 1.089233
## Smoking 1.007392 1 1.003689
## GestWeeks 1.695810 1 1.302233
## Length_mm 2.085755 1 1.444214
## HeadDiameter_mm 1.630796 1 1.277026
## BirthType 1.004242 1 1.002119
## Hospital 1.004071 2 1.001016
## Sex 1.040643 1 1.020119
# Stepwise selection by AIC
step_mod <- stepAIC(full_mod, direction = "both", trace = FALSE)
# Summary of stepwise model
summary(step_mod)
##
## Call:
## lm(formula = Weight_g ~ NumPregnancies + GestWeeks + Length_mm +
## HeadDiameter_mm + BirthType + Hospital + Sex, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1113.18 -181.16 -16.58 161.01 2620.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6707.4293 135.9438 -49.340 < 2e-16 ***
## NumPregnancies 12.3619 4.3325 2.853 0.00436 **
## GestWeeks 31.9909 3.7896 8.442 < 2e-16 ***
## Length_mm 10.3086 0.3004 34.316 < 2e-16 ***
## HeadDiameter_mm 10.4922 0.4254 24.661 < 2e-16 ***
## BirthTypeNat 29.2803 12.0817 2.424 0.01544 *
## Hospitalosp2 -11.0227 13.4363 -0.820 0.41209
## Hospitalosp3 28.6408 13.4886 2.123 0.03382 *
## SexM 77.4412 11.1756 6.930 5.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 2491 degrees of freedom
## Multiple R-squared: 0.7287, Adjusted R-squared: 0.7278
## F-statistic: 836.3 on 8 and 2491 DF, p-value: < 2.2e-16
# Final formula chosen
formula(step_mod)
## Weight_g ~ NumPregnancies + GestWeeks + Length_mm + HeadDiameter_mm +
## BirthType + Hospital + Sex
Diagnostics
## Residual diagnostic plots
par(mfrow = c(2,2))
plot(step_mod)
par(mfrow = c(1,1))
## Breusch-Pagan test (heteroskedasticity)
bptest(step_mod)
##
## studentized Breusch-Pagan test
##
## data: step_mod
## BP = 91.768, df = 8, p-value < 2.2e-16
## Durbin-Watson test (autocorrelation)
dwtest(step_mod)
##
## Durbin-Watson test
##
## data: step_mod
## DW = 1.9527, p-value = 0.1184
## alternative hypothesis: true autocorrelation is greater than 0
## Cook's distance (influential points)
cd <- cooks.distance(step_mod)
which(cd > (4/length(cd))) # potentially influential obs
## 13 34 119 130 134 140 146 155 161 220 274 295 310 329 375 377
## 13 34 119 130 134 140 146 155 161 220 274 295 310 329 375 377
## 390 455 472 478 516 582 615 616 632 633 648 656 657 684 791 805
## 390 455 472 478 516 582 615 616 632 633 648 656 657 684 791 805
## 828 890 908 928 950 1008 1014 1036 1130 1132 1137 1181 1188 1192 1194 1212
## 828 890 908 928 950 1008 1014 1036 1130 1132 1137 1181 1188 1192 1194 1212
## 1230 1253 1267 1268 1273 1287 1293 1306 1317 1341 1395 1399 1402 1426 1429 1433
## 1230 1253 1267 1268 1273 1287 1293 1306 1317 1341 1395 1399 1402 1426 1429 1433
## 1450 1472 1499 1505 1541 1551 1553 1556 1588 1593 1619 1635 1639 1694 1702 1712
## 1450 1472 1499 1505 1541 1551 1553 1556 1588 1593 1619 1635 1639 1694 1702 1712
## 1718 1780 1838 1856 1868 1893 1915 1920 1926 1937 1962 1963 2023 2040 2076 2086
## 1718 1780 1838 1856 1868 1893 1915 1920 1926 1937 1962 1963 2023 2040 2076 2086
## 2114 2115 2120 2123 2135 2175 2195 2204 2219 2221 2225 2287 2315 2317 2392 2421
## 2114 2115 2120 2123 2135 2175 2195 2204 2219 2221 2225 2287 2315 2317 2392 2421
## 2422 2437 2452 2471
## 2422 2437 2452 2471
head(sort(cd, decreasing = TRUE), 10)
## 1551 310 155 1780 928 1429 2452
## 0.55575274 0.04956384 0.02273094 0.02149897 0.02063530 0.02039252 0.01714674
## 2437 2115 2175
## 0.01504329 0.01398786 0.01264418
# Robust standard errors
coeftest(step_mod, vcov = vcovHC(step_mod, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6707.42933 167.40680 -40.0666 < 2.2e-16 ***
## NumPregnancies 12.36191 4.59376 2.6910 0.007171 **
## GestWeeks 31.99091 5.25796 6.0843 1.351e-09 ***
## Length_mm 10.30864 0.73116 14.0990 < 2.2e-16 ***
## HeadDiameter_mm 10.49219 0.79846 13.1405 < 2.2e-16 ***
## BirthTypeNat 29.28027 11.79771 2.4819 0.013135 *
## Hospitalosp2 -11.02268 13.32715 -0.8271 0.408269
## Hospitalosp3 28.64080 13.68793 2.0924 0.036503 *
## SexM 77.44116 11.06327 6.9998 3.282e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Cross-validated performance
set.seed(123)
train_ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
caret_lm <- train(form = formula(step_mod),
data = df,
method = "lm",
trControl = train_ctrl,
na.action = na.omit,
metric = "RMSE")
caret_lm
## Linear Regression
##
## 2500 samples
## 7 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2250, 2249, 2251, 2250, 2250, 2250, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 274.2745 0.7270545 210.7495
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
caret_lm$results
LASSO regression
# LASSO regression
x <- model.matrix(Weight_g ~ MaternalAge + NumPregnancies + Smoking + GestWeeks +
Length_mm + HeadDiameter_mm + BirthType + Hospital + Sex,
data = df)[,-1] # remove intercept
y <- df$Weight_g
set.seed(123)
cv_glmnet <- cv.glmnet(x, y, alpha = 1, nfolds = 10, standardize = TRUE)
# Plot CV curve
plot(cv_glmnet)
# Best lambda
best_lambda <- cv_glmnet$lambda.min
best_lambda
## [1] 1.084374
# Coefficients at best lambda
coef(cv_glmnet, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## lambda.min
## (Intercept) -6699.4059812
## MaternalAge 0.7239497
## NumPregnancies 10.5841920
## Smoking1 -24.2726181
## GestWeeks 32.0583084
## Length_mm 10.2816549
## HeadDiameter_mm 10.4556582
## BirthTypeNat 27.0156402
## Hospitalosp2 -9.6479641
## Hospitalosp3 26.8403780
## SexM 75.8799010
LASSO regression was performed with cross-validation, yielding an optimal penalty parameter of λ = 1.08. The model retained all predictors, with the largest effects observed for gestational weeks, neonatal length, head diameter, sex, and birth type, while maternal age, smoking, and hospital effects were shrunk toward zero but not eliminated. These findings confirm the robustness of the main predictors identified in the stepwise model, while suggesting that additional variables may contribute marginally to birth weight prediction.
Model Comparison and Validation
To evaluate which approach provides the most reliable predictive model for newborn weight, we compared three alternatives:
Full Multiple Linear Regression (OLS) – included all predictors.
Stepwise Regression (AIC-based) – retained only significant variables.
LASSO Regression (λ = 1.08) – applied shrinkage and automatic variable selection.
Model Performance
Full OLS: R² = 0.729, RMSE ≈ 274 g.
Stepwise: R² = 0.729, RMSE ≈ 274 g (slightly more parsimonious, dropped non-significant predictors).
LASSO: Coefficients shrunk toward zero, but all variables retained at optimal λ. Performance remained close to stepwise, confirming robustness of predictors.
Validation
Using 10-fold cross-validation repeated 3 times:
Stepwise model achieved RMSE = 274.3 g and R² = 0.727.
LASSO model showed similar performance, indicating that penalization did not reduce predictive accuracy but helped confirm the most important variables.
# Use stepwise final model
final_mod <- lm(Weight_g ~ NumPregnancies + GestWeeks + Length_mm +
HeadDiameter_mm + BirthType + Hospital + Sex,
data = df)
# Example: mother in 3rd pregnancy, 39 weeks, average length/head, natural birth, hospital 1, female baby
new_data <- data.frame(
NumPregnancies = 3,
GestWeeks = 39,
Length_mm = mean(df$Length_mm, na.rm = TRUE),
HeadDiameter_mm = mean(df$HeadDiameter_mm, na.rm = TRUE),
BirthType = factor("Nat", levels = levels(df$BirthType)),
Hospital = factor("osp1", levels = levels(df$Hospital)),
Sex = factor("F", levels = levels(df$Sex))
)
predicted_weight <- predict(final_mod, newdata = new_data, interval = "prediction")
print(predicted_weight)
## fit lwr upr
## 1 3273.833 2735.93 3811.737
# Effect of gestational weeks on predicted weight
ggplot(df, aes(x = GestWeeks, y = Weight_g)) +
geom_point(alpha = 0.3, color = "grey") +
geom_smooth(method = "lm", formula = y ~ x, color = "blue", se = TRUE) +
labs(title = "Effect of Gestational Weeks on Birth Weight",
x = "Gestational Weeks", y = "Birth Weight (g)") +
theme_minimal()
# Effect of maternal smoking on birth weight
ggplot(df, aes(x = Smoking, y = Weight_g, fill = Smoking)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("lightblue", "tomato")) +
labs(title = "Impact of Maternal Smoking on Birth Weight",
x = "Smoking Status (0 = No, 1 = Yes)", y = "Birth Weight (g)") +
theme_minimal()
# Interaction: Gestational Weeks by Smoking
ggplot(df, aes(x = GestWeeks, y = Weight_g, color = Smoking)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Gestational Weeks and Smoking Interaction",
x = "Gestational Weeks", y = "Birth Weight (g)") +
theme_minimal()