data <- read.csv("neonati.csv")
In the first phase, we will explore the variables through descriptive analysis to understand their distribution and identify any outliers or anomalies.
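The code in this section relies on several packages; they were presumably loaded in a setup chunk earlier in the report, but a minimal sketch of that setup is reproduced here so the chunks below are self-contained (MASS is loaded before the tidyverse so that dplyr::select() is not masked).
# Packages assumed by the code in this section (sketch of a setup chunk)
library(MASS)        # stepAIC()
library(car)         # vif()
library(lmtest)      # bptest(), dwtest(), coeftest()
library(sandwich)    # vcovHC()
library(caret)       # train(), trainControl()
library(glmnet)      # cv.glmnet()
library(knitr)       # kable()
library(kableExtra)  # kable_styling()
library(tidyverse)   # dplyr, tidyr, purrr, ggplot2 (loaded last so its verbs are not masked)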
# First, we standardize the column names
df <- data %>%
rename(MaternalAge = Anni.madre,
NumPregnancies = N.gravidanze,
Smoking = Fumatrici,
GestWeeks = Gestazione,
Weight_g = Peso,
Length_mm = Lunghezza,
HeadDiameter_mm = Cranio,
BirthType = Tipo.parto,
Hospital = Ospedale,
Sex = Sesso)
# Convert the categorical variables to factors and create a preterm indicator
df <- df %>%
mutate(Smoking = as.factor(if_else(as.numeric(as.character(Smoking))==1, 1, 0)),
BirthType = as.factor(BirthType),
Hospital = as.factor(Hospital),
Sex = as.factor(Sex),
Preterm = if_else(GestWeeks < 37, 1, 0))
glimpse(df) # shows the structure of the data
## Rows: 2,500
## Columns: 11
## $ MaternalAge <int> 26, 21, 34, 28, 20, 32, 26, 25, 22, 23, 29, 21, 36, 24…
## $ NumPregnancies <int> 0, 2, 3, 1, 0, 0, 1, 0, 1, 0, 2, 2, 5, 0, 3, 2, 2, 3, …
## $ Smoking <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GestWeeks <int> 42, 39, 38, 41, 38, 40, 39, 40, 40, 41, 38, 40, 38, 40…
## $ Weight_g <int> 3380, 3150, 3640, 3690, 3700, 3200, 3100, 3580, 3670, …
## $ Length_mm <int> 490, 490, 500, 515, 480, 495, 480, 510, 500, 510, 480,…
## $ HeadDiameter_mm <int> 325, 345, 375, 365, 335, 340, 345, 349, 335, 362, 330,…
## $ BirthType <fct> Nat, Nat, Nat, Nat, Nat, Nat, Nat, Nat, Ces, Ces, Ces,…
## $ Hospital <fct> osp3, osp1, osp2, osp2, osp3, osp2, osp3, osp1, osp2, …
## $ Sex <fct> M, F, M, M, F, F, F, M, F, F, M, F, F, F, M, M, M, M, …
## $ Preterm <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …
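As a numeric complement to glimpse(), a quick summary of the continuous variables can be printed; this is a small sketch that was not part of the original chunk.
# Descriptive summary of the numeric variables (sketch)
summary(df[, c("MaternalAge", "NumPregnancies", "GestWeeks",
               "Weight_g", "Length_mm", "HeadDiameter_mm")])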
Missing values & outliers
In this section we check for missing values and for outliers, using a boxplot of birth weight.
# checking missing values
missing_summary <- map_df(df, ~ sum(is.na(.x))) %>%
  pivot_longer(everything(), names_to = "var", values_to = "n_missing")
missing_summary %>%
kable("html", caption = "Summary of Missing Values") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
| var | n_missing |
|---|---|
| MaternalAge | 0 |
| NumPregnancies | 0 |
| Smoking | 0 |
| GestWeeks | 0 |
| Weight_g | 0 |
| Length_mm | 0 |
| HeadDiameter_mm | 0 |
| BirthType | 0 |
| Hospital | 0 |
| Sex | 0 |
| Preterm | 0 |
# Boxplot
par(mar=c(5,5,2,2))
boxplot(df$Weight_g,
main = "Birth weight (g) - boxplot",
ylab = "Weight (g)")
From this analysis we see that there are no missing values and no clearly anomalous outliers in the boxplot of birth weight.
Exploratory analysis
# Distributions
hist(df$Weight_g, main='Histogram of birth weight (g)',
xlab='Weight (g)', breaks=10, col="lightblue")
# Weight vs Gestational weeks
plot(df$GestWeeks, df$Weight_g, xlab='Gestational weeks', ylab='Weight (g)', main='Weight vs Gestational Weeks')
lines(lowess(df$GestWeeks, df$Weight_g), col='blue')
# Mean weight by Smoking
boxplot(Weight_g ~ Smoking, data=df, xlab='Smoking (0=No,1=Yes)', ylab='Weight (g)', main='Weight by Maternal Smoking')
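A grouped summary (a small sketch, not part of the original chunks) makes the comparison in the boxplot concrete:
# Mean and standard deviation of birth weight by smoking status (sketch)
df %>%
  group_by(Smoking) %>%
  summarise(n = n(),
            mean_weight = mean(Weight_g),
            sd_weight = sd(Weight_g))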
Hypothesis tests
Hypothesis 1: Some hospitals perform significantly more cesarean sections than others
chisq.test(table(df$BirthType, df$Hospital))
##
## Pearson's Chi-squared test
##
## data: table(df$BirthType, df$Hospital)
## X-squared = 1.0972, df = 2, p-value = 0.5778
With a p-value of 0.578, there is no evidence that the proportion of cesarean sections differs across the three hospitals.
Hypothesis 2: The mean weight and length of this sample do not differ significantly from those of the population
(This test is not run here because we do not have population mean values for weight and length; a sketch of how it could be performed is shown below.)
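If reference values were available, the comparison could be carried out with one-sample t-tests; the sketch below is left commented out because the population means (pop_mean_weight, pop_mean_length) are placeholders, not values from this dataset.
# Sketch only: replace the placeholder population means with published reference values
# pop_mean_weight <- 3300   # hypothetical population mean weight (g)
# pop_mean_length <- 500    # hypothetical population mean length (mm)
# t.test(df$Weight_g,  mu = pop_mean_weight)
# t.test(df$Length_mm, mu = pop_mean_length)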
Hypothesis 3: Anthropometric measurements are significantly different between the two sexes
t.test(Weight_g ~ Sex, data=df)
##
## Welch Two Sample t-test
##
## data: Weight_g by Sex
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M
## 3161.132 3408.215
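The Welch test indicates a clear difference in weight between the sexes (boys are about 247 g heavier on average). Since Hypothesis 3 refers to anthropometric measurements in general, the same test can be applied to length and head diameter as well; this is a sketch whose output is not shown here.
# Welch t-tests for the remaining anthropometric measures (sketch)
t.test(Length_mm ~ Sex, data = df)
t.test(HeadDiameter_mm ~ Sex, data = df)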
Feature Engineering
df <- df %>% mutate(GestWeeks2 = GestWeeks^2,
Gest_c = scale(GestWeeks, center=TRUE, scale=FALSE)[,1],
MaternalAge_c = scale(MaternalAge, center=TRUE, scale=FALSE)[,1])
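The squared and centered terms created here are not used in the models that follow. As a sketch of how they could be used, a quadratic specification of the gestation effect can be compared with the linear one:
# Sketch: test for curvature in the gestation effect using the centered term
mod_lin  <- lm(Weight_g ~ Gest_c, data = df)
mod_quad <- lm(Weight_g ~ Gest_c + I(Gest_c^2), data = df)
anova(mod_lin, mod_quad)   # F-test for the added quadratic term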
Full Regression Model
# Full linear model
full_mod <- lm(Weight_g ~ MaternalAge + NumPregnancies + Smoking + GestWeeks +
Length_mm + HeadDiameter_mm + BirthType + Hospital + Sex,
data = df)
# Summary of full model
summary(full_mod)
##
## Call:
## lm(formula = Weight_g ~ MaternalAge + NumPregnancies + Smoking +
## GestWeeks + Length_mm + HeadDiameter_mm + BirthType + Hospital +
## Sex, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1124.40 -181.66 -14.42 160.91 2611.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6738.4762 141.3087 -47.686 < 2e-16 ***
## MaternalAge 0.8921 1.1323 0.788 0.4308
## NumPregnancies 11.2665 4.6608 2.417 0.0157 *
## Smoking1 -30.1631 27.5386 -1.095 0.2735
## GestWeeks 32.5696 3.8187 8.529 < 2e-16 ***
## Length_mm 10.2945 0.3007 34.236 < 2e-16 ***
## HeadDiameter_mm 10.4707 0.4260 24.578 < 2e-16 ***
## BirthTypeNat 29.5254 12.0844 2.443 0.0146 *
## Hospitalosp2 -11.2095 13.4379 -0.834 0.4043
## Hospitalosp3 28.0958 13.4957 2.082 0.0375 *
## SexM 77.5409 11.1776 6.937 5.08e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 2489 degrees of freedom
## Multiple R-squared: 0.7289, Adjusted R-squared: 0.7278
## F-statistic: 669.2 on 10 and 2489 DF, p-value: < 2.2e-16
# Multicollinearity
vif_vals <- vif(full_mod)
vif_vals
## GVIF Df GVIF^(1/(2*Df))
## MaternalAge 1.187454 1 1.089704
## NumPregnancies 1.186428 1 1.089233
## Smoking 1.007392 1 1.003689
## GestWeeks 1.695810 1 1.302233
## Length_mm 2.085755 1 1.444214
## HeadDiameter_mm 1.630796 1 1.277026
## BirthType 1.004242 1 1.002119
## Hospital 1.004071 2 1.001016
## Sex 1.040643 1 1.020119
# Stepwise selection by AIC
step_mod <- stepAIC(full_mod, direction = "both", trace = FALSE)
summary(step_mod)
##
## Call:
## lm(formula = Weight_g ~ NumPregnancies + GestWeeks + Length_mm +
## HeadDiameter_mm + BirthType + Hospital + Sex, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1113.18 -181.16 -16.58 161.01 2620.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6707.4293 135.9438 -49.340 < 2e-16 ***
## NumPregnancies 12.3619 4.3325 2.853 0.00436 **
## GestWeeks 31.9909 3.7896 8.442 < 2e-16 ***
## Length_mm 10.3086 0.3004 34.316 < 2e-16 ***
## HeadDiameter_mm 10.4922 0.4254 24.661 < 2e-16 ***
## BirthTypeNat 29.2803 12.0817 2.424 0.01544 *
## Hospitalosp2 -11.0227 13.4363 -0.820 0.41209
## Hospitalosp3 28.6408 13.4886 2.123 0.03382 *
## SexM 77.4412 11.1756 6.930 5.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 2491 degrees of freedom
## Multiple R-squared: 0.7287, Adjusted R-squared: 0.7278
## F-statistic: 836.3 on 8 and 2491 DF, p-value: < 2.2e-16
# Final formula chosen
formula(step_mod)
## Weight_g ~ NumPregnancies + GestWeeks + Length_mm + HeadDiameter_mm +
## BirthType + Hospital + Sex
Diagnostics
# Residual diagnostic plots
par(mfrow = c(2,2))
plot(step_mod)
par(mfrow = c(1,1))
# Breusch-Pagan test
bptest(step_mod)
##
## studentized Breusch-Pagan test
##
## data: step_mod
## BP = 91.768, df = 8, p-value < 2.2e-16
# Durbin-Watson test
dwtest(step_mod)
##
## Durbin-Watson test
##
## data: step_mod
## DW = 1.9527, p-value = 0.1184
## alternative hypothesis: true autocorrelation is greater than 0
# Cook's distance
cd <- cooks.distance(step_mod)
which(cd > (4/length(cd)))
## 13 34 119 130 134 140 146 155 161 220 274 295 310 329 375 377
## 13 34 119 130 134 140 146 155 161 220 274 295 310 329 375 377
## 390 455 472 478 516 582 615 616 632 633 648 656 657 684 791 805
## 390 455 472 478 516 582 615 616 632 633 648 656 657 684 791 805
## 828 890 908 928 950 1008 1014 1036 1130 1132 1137 1181 1188 1192 1194 1212
## 828 890 908 928 950 1008 1014 1036 1130 1132 1137 1181 1188 1192 1194 1212
## 1230 1253 1267 1268 1273 1287 1293 1306 1317 1341 1395 1399 1402 1426 1429 1433
## 1230 1253 1267 1268 1273 1287 1293 1306 1317 1341 1395 1399 1402 1426 1429 1433
## 1450 1472 1499 1505 1541 1551 1553 1556 1588 1593 1619 1635 1639 1694 1702 1712
## 1450 1472 1499 1505 1541 1551 1553 1556 1588 1593 1619 1635 1639 1694 1702 1712
## 1718 1780 1838 1856 1868 1893 1915 1920 1926 1937 1962 1963 2023 2040 2076 2086
## 1718 1780 1838 1856 1868 1893 1915 1920 1926 1937 1962 1963 2023 2040 2076 2086
## 2114 2115 2120 2123 2135 2175 2195 2204 2219 2221 2225 2287 2315 2317 2392 2421
## 2114 2115 2120 2123 2135 2175 2195 2204 2219 2221 2225 2287 2315 2317 2392 2421
## 2422 2437 2452 2471
## 2422 2437 2452 2471
head(sort(cd, decreasing = TRUE), 10)
## 1551 310 155 1780 928 1429 2452
## 0.55575274 0.04956384 0.02273094 0.02149897 0.02063530 0.02039252 0.01714674
## 2437 2115 2175
## 0.01504329 0.01398786 0.01264418
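Observation 1551 has a Cook's distance (0.556) an order of magnitude larger than any other point. As a sensitivity sketch that was not part of the original analysis, the stepwise model can be refit without it to check whether the coefficients change materially:
# Sensitivity check (sketch): refit the stepwise model without the most influential observation
mod_no_1551 <- update(step_mod, data = df[-1551, ])
round(cbind(all_rows = coef(step_mod), without_1551 = coef(mod_no_1551)), 2)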
Because the Breusch-Pagan test rejects homoscedasticity, we recompute the standard errors of the stepwise model with a heteroskedasticity-consistent (HC3) estimator.
# Robust (HC3) standard errors
coeftest(step_mod, vcov = vcovHC(step_mod, type = "HC3"))
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6707.42933 167.40680 -40.0666 < 2.2e-16 ***
## NumPregnancies 12.36191 4.59376 2.6910 0.007171 **
## GestWeeks 31.99091 5.25796 6.0843 1.351e-09 ***
## Length_mm 10.30864 0.73116 14.0990 < 2.2e-16 ***
## HeadDiameter_mm 10.49219 0.79846 13.1405 < 2.2e-16 ***
## BirthTypeNat 29.28027 11.79771 2.4819 0.013135 *
## Hospitalosp2 -11.02268 13.32715 -0.8271 0.408269
## Hospitalosp3 28.64080 13.68793 2.0924 0.036503 *
## SexM 77.44116 11.06327 6.9998 3.282e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Cross-validated performance
set.seed(123)
train_ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
caret_lm <- train(form = formula(step_mod),
data = df,
method = "lm",
trControl = train_ctrl,
na.action = na.omit,
metric = "RMSE")
caret_lm
## Linear Regression
##
## 2500 samples
## 7 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2250, 2249, 2251, 2250, 2250, 2250, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 274.2745 0.7270545 210.7495
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
caret_lm$results
LASSO regression
# LASSO regression
x <- model.matrix(Weight_g ~ MaternalAge + NumPregnancies + Smoking + GestWeeks +
Length_mm + HeadDiameter_mm + BirthType + Hospital + Sex,
data = df)[,-1]
y <- df$Weight_g
set.seed(123)
cv_glmnet <- cv.glmnet(x, y, alpha = 1, nfolds = 10, standardize = TRUE)
# Plot CV curve
plot(cv_glmnet)
# Best lambda
best_lambda <- cv_glmnet$lambda.min
best_lambda
## [1] 1.084374
# Coefficients at best lambda
coef(cv_glmnet, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
## lambda.min
## (Intercept) -6699.4059812
## MaternalAge 0.7239497
## NumPregnancies 10.5841920
## Smoking1 -24.2726181
## GestWeeks 32.0583084
## Length_mm 10.2816549
## HeadDiameter_mm 10.4556582
## BirthTypeNat 27.0156402
## Hospitalosp2 -9.6479641
## Hospitalosp3 26.8403780
## SexM 75.8799010
LASSO regression was performed with cross-validation, yielding an optimal penalty parameter of λ = 1.08. The model retained all predictors, with the largest effects observed for gestational weeks, neonatal length, head diameter, sex, and birth type, while maternal age, smoking, and hospital effects were shrunk toward zero but not eliminated. These findings confirm the robustness of the main predictors identified in the stepwise model, while suggesting that additional variables may contribute marginally to birth weight prediction.
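For comparison, the more heavily penalized lambda.1se solution can also be inspected (a sketch, not run in the original analysis); it is often sparser than the lambda.min fit and would show which predictors survive a stronger penalty.
# Coefficients at lambda.1se, the largest lambda within one standard error of the minimum CV error (sketch)
cv_glmnet$lambda.1se
coef(cv_glmnet, s = "lambda.1se")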
Model Comparison and Validation
To evaluate which approach provides the most reliable predictive model for newborn weight, we compared three alternatives:
Full Multiple Linear Regression (OLS) – included all predictors.
Stepwise Regression (AIC-based) – dropped maternal age and smoking, keeping the predictors that minimize AIC.
LASSO Regression (λ = 1.08) – applied shrinkage and automatic variable selection.
Model Performance
Full OLS: R² = 0.729, RMSE ≈ 274 g.
Stepwise: R² = 0.729, RMSE ≈ 274 g (slightly more parsimonious, dropped non-significant predictors).
LASSO: Coefficients shrunk toward zero, but all variables retained at optimal λ. Performance remained close to stepwise, confirming robustness of predictors.
Validation
Using 10-fold cross-validation repeated 3 times:
Stepwise model achieved RMSE = 274.3 g and R² = 0.727.
LASSO model showed similar performance, indicating that penalization did not reduce predictive accuracy but helped confirm the most important variables.
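The LASSO's cross-validated error can be read directly from the cv.glmnet object; the sketch below converts the mean squared error at lambda.min into an RMSE on the same scale as the caret estimate for the stepwise model.
# Cross-validated RMSE of the LASSO at lambda.min (sketch; cv.glmnet reports MSE for gaussian fits)
sqrt(cv_glmnet$cvm[cv_glmnet$lambda == cv_glmnet$lambda.min])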
# stepwise final model
final_mod <- lm(Weight_g ~ NumPregnancies + GestWeeks + Length_mm +
HeadDiameter_mm + BirthType + Hospital + Sex,
data = df)
new_data <- data.frame(
NumPregnancies = 3,
GestWeeks = 39,
Length_mm = mean(df$Length_mm, na.rm = TRUE),
HeadDiameter_mm = mean(df$HeadDiameter_mm, na.rm = TRUE),
BirthType = factor("Nat", levels = levels(df$BirthType)),
Hospital = factor("osp1", levels = levels(df$Hospital)),
Sex = factor("F", levels = levels(df$Sex))
)
predicted_weight <- predict(final_mod, newdata = new_data, interval = "prediction")
print(predicted_weight)
## fit lwr upr
## 1 3273.833 2735.93 3811.737
# Effect of gestational weeks on predicted weight
ggplot(df, aes(x = GestWeeks, y = Weight_g)) +
geom_point(alpha = 0.3, color = "grey") +
geom_smooth(method = "lm", formula = y ~ x, color = "blue", se = TRUE) +
labs(title = "Effect of Gestational Weeks on Birth Weight",
x = "Gestational Weeks", y = "Birth Weight (g)") +
theme_minimal()
# Effect of maternal smoking on birth weight
ggplot(df, aes(x = Smoking, y = Weight_g, fill = Smoking)) +
geom_boxplot(alpha = 0.7) +
scale_fill_manual(values = c("lightblue", "tomato")) +
labs(title = "Impact of Maternal Smoking on Birth Weight",
x = "Smoking Status (0 = No, 1 = Yes)", y = "Birth Weight (g)") +
theme_minimal()
# Interaction: Gestational Weeks by Smoking
ggplot(df, aes(x = GestWeeks, y = Weight_g, color = Smoking)) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Gestational Weeks and Smoking Interaction",
x = "Gestational Weeks", y = "Birth Weight (g)") +
theme_minimal()