data <- read.csv("neonati.csv")

2. Analysis and Modelling

# Standardize column names
df <- data %>%
rename(MaternalAge = Anni.madre,
NumPregnancies = N.gravidanze,
Smoking = Fumatrici,
GestWeeks = Gestazione,
Weight_g = Peso,
Length_mm = Lunghezza,
HeadDiameter_mm = Cranio,
BirthType = Tipo.parto,
Hospital = Ospedale,
Sex = Sesso)


# Convert types
df <- df %>%
mutate(Smoking = as.factor(if_else(as.numeric(as.character(Smoking))==1, 1, 0)),
BirthType = as.factor(BirthType),
Hospital = as.factor(Hospital),
Sex = as.factor(Sex),
Preterm = if_else(GestWeeks < 37, 1, 0))


# Quick look
glimpse(df)
## Rows: 2,500
## Columns: 11
## $ MaternalAge     <int> 26, 21, 34, 28, 20, 32, 26, 25, 22, 23, 29, 21, 36, 24…
## $ NumPregnancies  <int> 0, 2, 3, 1, 0, 0, 1, 0, 1, 0, 2, 2, 5, 0, 3, 2, 2, 3, …
## $ Smoking         <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ GestWeeks       <int> 42, 39, 38, 41, 38, 40, 39, 40, 40, 41, 38, 40, 38, 40…
## $ Weight_g        <int> 3380, 3150, 3640, 3690, 3700, 3200, 3100, 3580, 3670, …
## $ Length_mm       <int> 490, 490, 500, 515, 480, 495, 480, 510, 500, 510, 480,…
## $ HeadDiameter_mm <int> 325, 345, 375, 365, 335, 340, 345, 349, 335, 362, 330,…
## $ BirthType       <fct> Nat, Nat, Nat, Nat, Nat, Nat, Nat, Nat, Ces, Ces, Ces,…
## $ Hospital        <fct> osp3, osp1, osp2, osp2, osp3, osp2, osp3, osp1, osp2, …
## $ Sex             <fct> M, F, M, M, F, F, F, M, F, F, M, F, F, F, M, M, M, M, …
## $ Preterm         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, …

Missing values & outliers

missing_summary <- map_df(df, ~sum(is.na(.x))) %>% pivot_longer(everything(), names_to='var', values_to='n_missing')
print(missing_summary)
## # A tibble: 11 × 2
##    var             n_missing
##    <chr>               <int>
##  1 MaternalAge             0
##  2 NumPregnancies          0
##  3 Smoking                 0
##  4 GestWeeks               0
##  5 Weight_g                0
##  6 Length_mm               0
##  7 HeadDiameter_mm         0
##  8 BirthType               0
##  9 Hospital                0
## 10 Sex                     0
## 11 Preterm                 0
# Boxplot directly in R
par(mar=c(5,5,2,2))  # optional, for margins
boxplot(df$Weight_g,
        main = "Birth weight (g) - boxplot",
        ylab = "Weight (g)")

Exploratory analysis

# Distributions
hist(df$Weight_g, main='Histogram of birth weight (g)', 
     xlab='Weight (g)', breaks=10, col="lightblue")

# Weight vs Gestational weeks
plot(df$GestWeeks, df$Weight_g, xlab='Gestational weeks', ylab='Weight (g)', main='Weight vs Gestational Weeks')
lines(lowess(df$GestWeeks, df$Weight_g), col='blue')

# Mean weight by Smoking
boxplot(Weight_g ~ Smoking, data=df, xlab='Smoking (0=No,1=Yes)', ylab='Weight (g)', main='Weight by Maternal Smoking')

Hypothesis tests

Hypothesis 1: Some hospitals perform more cesarean sections

chisq.test(table(df$BirthType, df$Hospital))
## 
##  Pearson's Chi-squared test
## 
## data:  table(df$BirthType, df$Hospital)
## X-squared = 1.0972, df = 2, p-value = 0.5778

Hypothesis 2: The mean weight and length of this sample are significantly equal to those of the population

(I have not included this test in the code, because we don’t have the population mean values for weight and length.)

Hypothesis 3: Anthropometric measurements are significantly different between the two sexes

t.test(Weight_g ~ Sex, data=df)
## 
##  Welch Two Sample t-test
## 
## data:  Weight_g by Sex
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M 
##        3161.132        3408.215

Feature Engineering

df <- df %>% mutate(GestWeeks2 = GestWeeks^2,
Gest_c = scale(GestWeeks, center=TRUE, scale=FALSE)[,1],
MaternalAge_c = scale(MaternalAge, center=TRUE, scale=FALSE)[,1])

Full Regression Model

# Full linear model
full_mod <- lm(Weight_g ~ MaternalAge + NumPregnancies + Smoking + GestWeeks +
                 Length_mm + HeadDiameter_mm + BirthType + Hospital + Sex,
               data = df)

# Summary of full model
summary(full_mod)
## 
## Call:
## lm(formula = Weight_g ~ MaternalAge + NumPregnancies + Smoking + 
##     GestWeeks + Length_mm + HeadDiameter_mm + BirthType + Hospital + 
##     Sex, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1124.40  -181.66   -14.42   160.91  2611.89 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6738.4762   141.3087 -47.686  < 2e-16 ***
## MaternalAge         0.8921     1.1323   0.788   0.4308    
## NumPregnancies     11.2665     4.6608   2.417   0.0157 *  
## Smoking1          -30.1631    27.5386  -1.095   0.2735    
## GestWeeks          32.5696     3.8187   8.529  < 2e-16 ***
## Length_mm          10.2945     0.3007  34.236  < 2e-16 ***
## HeadDiameter_mm    10.4707     0.4260  24.578  < 2e-16 ***
## BirthTypeNat       29.5254    12.0844   2.443   0.0146 *  
## Hospitalosp2      -11.2095    13.4379  -0.834   0.4043    
## Hospitalosp3       28.0958    13.4957   2.082   0.0375 *  
## SexM               77.5409    11.1776   6.937 5.08e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.9 on 2489 degrees of freedom
## Multiple R-squared:  0.7289, Adjusted R-squared:  0.7278 
## F-statistic: 669.2 on 10 and 2489 DF,  p-value: < 2.2e-16
# Multicollinearity

vif_vals <- vif(full_mod)
vif_vals
##                     GVIF Df GVIF^(1/(2*Df))
## MaternalAge     1.187454  1        1.089704
## NumPregnancies  1.186428  1        1.089233
## Smoking         1.007392  1        1.003689
## GestWeeks       1.695810  1        1.302233
## Length_mm       2.085755  1        1.444214
## HeadDiameter_mm 1.630796  1        1.277026
## BirthType       1.004242  1        1.002119
## Hospital        1.004071  2        1.001016
## Sex             1.040643  1        1.020119
# Stepwise selection by AIC

step_mod <- stepAIC(full_mod, direction = "both", trace = FALSE)

# Summary of stepwise model
summary(step_mod)
## 
## Call:
## lm(formula = Weight_g ~ NumPregnancies + GestWeeks + Length_mm + 
##     HeadDiameter_mm + BirthType + Hospital + Sex, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1113.18  -181.16   -16.58   161.01  2620.19 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -6707.4293   135.9438 -49.340  < 2e-16 ***
## NumPregnancies     12.3619     4.3325   2.853  0.00436 ** 
## GestWeeks          31.9909     3.7896   8.442  < 2e-16 ***
## Length_mm          10.3086     0.3004  34.316  < 2e-16 ***
## HeadDiameter_mm    10.4922     0.4254  24.661  < 2e-16 ***
## BirthTypeNat       29.2803    12.0817   2.424  0.01544 *  
## Hospitalosp2      -11.0227    13.4363  -0.820  0.41209    
## Hospitalosp3       28.6408    13.4886   2.123  0.03382 *  
## SexM               77.4412    11.1756   6.930 5.36e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.9 on 2491 degrees of freedom
## Multiple R-squared:  0.7287, Adjusted R-squared:  0.7278 
## F-statistic: 836.3 on 8 and 2491 DF,  p-value: < 2.2e-16
# Final formula chosen
formula(step_mod)
## Weight_g ~ NumPregnancies + GestWeeks + Length_mm + HeadDiameter_mm + 
##     BirthType + Hospital + Sex

Diagnostics

##  Residual diagnostic plots
par(mfrow = c(2,2))
plot(step_mod)

par(mfrow = c(1,1))

##  Breusch-Pagan test (heteroskedasticity)
bptest(step_mod)
## 
##  studentized Breusch-Pagan test
## 
## data:  step_mod
## BP = 91.768, df = 8, p-value < 2.2e-16
##  Durbin-Watson test (autocorrelation)
dwtest(step_mod)
## 
##  Durbin-Watson test
## 
## data:  step_mod
## DW = 1.9527, p-value = 0.1184
## alternative hypothesis: true autocorrelation is greater than 0
##  Cook's distance (influential points)
cd <- cooks.distance(step_mod)
which(cd > (4/length(cd)))  # potentially influential obs
##   13   34  119  130  134  140  146  155  161  220  274  295  310  329  375  377 
##   13   34  119  130  134  140  146  155  161  220  274  295  310  329  375  377 
##  390  455  472  478  516  582  615  616  632  633  648  656  657  684  791  805 
##  390  455  472  478  516  582  615  616  632  633  648  656  657  684  791  805 
##  828  890  908  928  950 1008 1014 1036 1130 1132 1137 1181 1188 1192 1194 1212 
##  828  890  908  928  950 1008 1014 1036 1130 1132 1137 1181 1188 1192 1194 1212 
## 1230 1253 1267 1268 1273 1287 1293 1306 1317 1341 1395 1399 1402 1426 1429 1433 
## 1230 1253 1267 1268 1273 1287 1293 1306 1317 1341 1395 1399 1402 1426 1429 1433 
## 1450 1472 1499 1505 1541 1551 1553 1556 1588 1593 1619 1635 1639 1694 1702 1712 
## 1450 1472 1499 1505 1541 1551 1553 1556 1588 1593 1619 1635 1639 1694 1702 1712 
## 1718 1780 1838 1856 1868 1893 1915 1920 1926 1937 1962 1963 2023 2040 2076 2086 
## 1718 1780 1838 1856 1868 1893 1915 1920 1926 1937 1962 1963 2023 2040 2076 2086 
## 2114 2115 2120 2123 2135 2175 2195 2204 2219 2221 2225 2287 2315 2317 2392 2421 
## 2114 2115 2120 2123 2135 2175 2195 2204 2219 2221 2225 2287 2315 2317 2392 2421 
## 2422 2437 2452 2471 
## 2422 2437 2452 2471
head(sort(cd, decreasing = TRUE), 10)
##       1551        310        155       1780        928       1429       2452 
## 0.55575274 0.04956384 0.02273094 0.02149897 0.02063530 0.02039252 0.01714674 
##       2437       2115       2175 
## 0.01504329 0.01398786 0.01264418
# Robust standard errors
coeftest(step_mod, vcov = vcovHC(step_mod, type = "HC3"))
## 
## t test of coefficients:
## 
##                    Estimate  Std. Error  t value  Pr(>|t|)    
## (Intercept)     -6707.42933   167.40680 -40.0666 < 2.2e-16 ***
## NumPregnancies     12.36191     4.59376   2.6910  0.007171 ** 
## GestWeeks          31.99091     5.25796   6.0843 1.351e-09 ***
## Length_mm          10.30864     0.73116  14.0990 < 2.2e-16 ***
## HeadDiameter_mm    10.49219     0.79846  13.1405 < 2.2e-16 ***
## BirthTypeNat       29.28027    11.79771   2.4819  0.013135 *  
## Hospitalosp2      -11.02268    13.32715  -0.8271  0.408269    
## Hospitalosp3       28.64080    13.68793   2.0924  0.036503 *  
## SexM               77.44116    11.06327   6.9998 3.282e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Cross-validated performance
set.seed(123)
train_ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 3)

caret_lm <- train(form = formula(step_mod),
                  data = df,
                  method = "lm",
                  trControl = train_ctrl,
                  na.action = na.omit,
                  metric = "RMSE")

caret_lm
## Linear Regression 
## 
## 2500 samples
##    7 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 2250, 2249, 2251, 2250, 2250, 2250, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   274.2745  0.7270545  210.7495
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
caret_lm$results

LASSO regression

# LASSO regression

x <- model.matrix(Weight_g ~ MaternalAge + NumPregnancies + Smoking + GestWeeks +
                    Length_mm + HeadDiameter_mm + BirthType + Hospital + Sex,
                  data = df)[,-1]  # remove intercept
y <- df$Weight_g

set.seed(123)
cv_glmnet <- cv.glmnet(x, y, alpha = 1, nfolds = 10, standardize = TRUE)

# Plot CV curve
plot(cv_glmnet)

# Best lambda
best_lambda <- cv_glmnet$lambda.min
best_lambda
## [1] 1.084374
# Coefficients at best lambda
coef(cv_glmnet, s = "lambda.min")
## 11 x 1 sparse Matrix of class "dgCMatrix"
##                    lambda.min
## (Intercept)     -6699.4059812
## MaternalAge         0.7239497
## NumPregnancies     10.5841920
## Smoking1          -24.2726181
## GestWeeks          32.0583084
## Length_mm          10.2816549
## HeadDiameter_mm    10.4556582
## BirthTypeNat       27.0156402
## Hospitalosp2       -9.6479641
## Hospitalosp3       26.8403780
## SexM               75.8799010

LASSO regression was performed with cross-validation, yielding an optimal penalty parameter of λ = 1.08. The model retained all predictors, with the largest effects observed for gestational weeks, neonatal length, head diameter, sex, and birth type, while maternal age, smoking, and hospital effects were shrunk toward zero but not eliminated. These findings confirm the robustness of the main predictors identified in the stepwise model, while suggesting that additional variables may contribute marginally to birth weight prediction.

Model Comparison and Validation

To evaluate which approach provides the most reliable predictive model for newborn weight, we compared three alternatives:

Model Performance

Validation

Using 10-fold cross-validation repeated 3 times:

3. Predictions with Final Model

# Use stepwise final model
final_mod <- lm(Weight_g ~ NumPregnancies + GestWeeks + Length_mm + 
                  HeadDiameter_mm + BirthType + Hospital + Sex, 
                data = df)

# Example: mother in 3rd pregnancy, 39 weeks, average length/head, natural birth, hospital 1, female baby
new_data <- data.frame(
  NumPregnancies = 3,
  GestWeeks = 39,
  Length_mm = mean(df$Length_mm, na.rm = TRUE),
  HeadDiameter_mm = mean(df$HeadDiameter_mm, na.rm = TRUE),
  BirthType = factor("Nat", levels = levels(df$BirthType)),
  Hospital = factor("osp1", levels = levels(df$Hospital)),
  Sex = factor("F", levels = levels(df$Sex))
)

predicted_weight <- predict(final_mod, newdata = new_data, interval = "prediction")
print(predicted_weight)
##        fit     lwr      upr
## 1 3273.833 2735.93 3811.737

4. Visualizations

# Effect of gestational weeks on predicted weight
ggplot(df, aes(x = GestWeeks, y = Weight_g)) +
  geom_point(alpha = 0.3, color = "grey") +
  geom_smooth(method = "lm", formula = y ~ x, color = "blue", se = TRUE) +
  labs(title = "Effect of Gestational Weeks on Birth Weight",
       x = "Gestational Weeks", y = "Birth Weight (g)") +
  theme_minimal()

# Effect of maternal smoking on birth weight
ggplot(df, aes(x = Smoking, y = Weight_g, fill = Smoking)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("lightblue", "tomato")) +
  labs(title = "Impact of Maternal Smoking on Birth Weight",
       x = "Smoking Status (0 = No, 1 = Yes)", y = "Birth Weight (g)") +
  theme_minimal()

# Interaction: Gestational Weeks by Smoking
ggplot(df, aes(x = GestWeeks, y = Weight_g, color = Smoking)) +
  geom_point(alpha = 0.4) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Gestational Weeks and Smoking Interaction",
       x = "Gestational Weeks", y = "Birth Weight (g)") +
  theme_minimal()