Prediction of neonatal weight

DESCRIPTIVE ANALYTICS

summary(df)

##    Anni.madre     N.gravidanze       Fumatrici        Gestazione   
##  Min.   : 0.00   Min.   : 0.0000   Min.   :0.0000   Min.   :25.00  
##  1st Qu.:25.00   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:38.00  
##  Median :28.00   Median : 1.0000   Median :0.0000   Median :39.00  
##  Mean   :28.16   Mean   : 0.9812   Mean   :0.0416   Mean   :38.98  
##  3rd Qu.:32.00   3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:40.00  
##  Max.   :46.00   Max.   :12.0000   Max.   :1.0000   Max.   :43.00  
##       Peso        Lunghezza         Cranio    Tipo.parto Ospedale   Sesso   
##  Min.   : 830   Min.   :310.0   Min.   :235   Ces: 728   osp1:816   F:1256  
##  1st Qu.:2990   1st Qu.:480.0   1st Qu.:330   Nat:1772   osp2:849   M:1244  
##  Median :3300   Median :500.0   Median :340              osp3:835           
##  Mean   :3284   Mean   :494.7   Mean   :340                                 
##  3rd Qu.:3620   3rd Qu.:510.0   3rd Qu.:350                                 
##  Max.   :4930   Max.   :565.0   Max.   :390

results <- data.frame(
  Variables_categories = c(
    paste("Fumatrici:", names(table(Fumatrici))),
    paste("Tipo parto:", names(table(Tipo.parto)))
  ),
  Percentage = c(
    (table(Fumatrici) / length(Fumatrici)) * 100,
    (table(Tipo.parto) / length(Tipo.parto)) * 100
  )
)

results$Percentage <- paste0(round(results$Percentage, 1), "%")

knitr::kable(results, row.names = FALSE)

Variables_categories	Percentage
Fumatrici: 0	95.8%
Fumatrici: 1	4.2%
Tipo parto: Ces	29.1%
Tipo parto: Nat	70.9%

bp_osp = barplot(table(Ospedale),
        col = "lightblue",
        main = "Number of births per hospital",
        xlab = "Ospedale",
        ylim = c(0, max(table(Ospedale)) * 1.15))
text(x = bp_osp,
     y = table(Ospedale),
     labels = table(Ospedale),
     pos = 3,
     cex = 0.7)
box()

bp_ngr = barplot(table(N.gravidanze),
        col = "lightgrey",
        main = "Barplot of N.gravidanze",
        xlab = "N.gravidanze",
        ylim = c(0, 1200))
text(x = bp_ngr[1:4],
     y = table(N.gravidanze)[1:4],
     labels = table(N.gravidanze)[1:4],
     pos = 3,
     cex = 0.7)
box()

freq = cumsum(prop.table(table(N.gravidanze)))*100
x = as.numeric(names(freq))
y = as.numeric(freq)
plot(as.numeric(names(freq)),
           freq,
           type = "b",
           pch = 19,
           col = "blue",
           xlab = "N.gravidanze",
           ylab = "Cumulative frequency (%)",
           main = "Cumulative percentage of N.gravidanze")
grid(col = "lightgray", lty = "dotted")
text(x[1:4],
     y[1:4],
     labels = paste0(round(y[1:4], 1), "%"),
     pos = 3,
     cex = 0.7,
     col = "black")

From the first analysis we can see few things:

- The dimension of the dataset is 2500 observations x 10 columns

- Most of the women are non-smokers (95.8%) and have had a natural birth (70.9%)

- All three hospitals have roughly the same absolute frequency

- Most of the women had 3 or less than 3 pregnancies, specifically 96.2% of them

par(mfrow = c(1, 2))
plot(density(Anni.madre),
     main = "Density curve Anni.madre",
     col = "blue",
     lwd = 2)
boxplot(Anni.madre,
        main = "Boxplot Anni.madre",
        col = "blue")

par(mfrow = c(1, 1))

bp_gest = barplot(table(Gestazione),
        main = "Barplot of Gestazione",
        col = "green",
        ylim = c(0, 900))
text(x = bp_gest[13:17],
     y = table(Gestazione)[13:17],
     labels = table(Gestazione)[13:17],
     pos = 3,
     cex = 0.7,
     col = "black")

skewness(Gestazione)

## [1] -2.065313

par(mfrow = c(1, 2))
plot(density(Lunghezza),
        main = "Density curve of Lunghezza",
        col = "orange")
boxplot(Lunghezza, horizontal = T,
        main = "Boxplot of Lunghezza",
        col = "orange")

skewness(Lunghezza)

## [1] -1.514699

plot(density(Cranio),
        main = "Density curve of Cranio",
        col = "red")
boxplot(Cranio, horizontal = T,
        main = "Boxplot of Cranio",
        col = "red")

par(mfrow = c(1, 1))
skewness(Cranio)

## [1] -0.7850527

par(mfrow = c(1, 2))
hist(Peso, probability = T, ylim = c(0, max(density((Peso))$y) * 1.1),
     main = "Histogram and density curve of Peso")
lines(density(Peso), col = "violet", lwd = 2)
boxplot(Peso, horizontal = T,
        main = "Boxplot of Peso",
        col = "violet")

par(mfrow = c(1, 1))
skewness(Peso)

## [1] -0.6470308

All the plots above show negative skewness; for some variables is more pronounced, for others less

Anni.madre shows values for 0 and 1 meaning they had a child at that age; that is impossible and may be some sort of error (like a typing error)

Gestazione shows most of the values between 37 and 41 weeks, that is consistent with the 9-months pregnancy

To bear in mind is the fact that the variabile Peso (that will be our target variable) has a slight negative skewness and could influence the models

HYPOTHESIS TESTS

tab.cong = table(Ospedale, Tipo.parto)
ggpubr::ggballoonplot(data = as.data.frame(tab.cong), fill="blue") +
  labs(title = "Baloonplot Ospedale-Tipo.parto",
       x = "Ospedale",
       y = "Tipo.parto")

test1 = chisq.test(Ospedale, Tipo.parto)
data.frame(
  X_squared = round(test1$statistic, 3),
  df = test1$parameter,
  p_value = round(test1$p.value, 3)
) |>
  knitr::kable() |>
  kableExtra::kable_styling(full_width = FALSE)

	X_squared	df	p_value
X-squared	1.097	2	0.578

plot(density(rchisq(1000000, 2)), xlim=c(0, 8),
     main = "Density curve for a sample of 1Mln values generated by a Chi-Square distribution",
     cex.main = 0.9)
abline(v=qchisq(0.95, 2), col=2)
points(1.0972, 0, cex=3, col=4, pch=20)

From the baloonplot seems that more caesarean sections are being performed at osp2

The Chisq test (H0 independence between variables) shows a p-value > 0.05, so we can’t reject the null hypotesis. Ospedale and Tipo.parto seem to be independent

The same results can be seen from the density curve plot where the test statistics is deep inside the acceptance region

test2a = t.test(Peso, mu = 3300, conf.level = 0.95, alternative = "two.sided")
test2b = t.test(Lunghezza, mu = 500, conf.level = 0.95, alternative = "two.sided")
df_test2 = data.frame(
  Variables_name = c("Peso", "Lunghezza"),
  t = c(
    round(as.numeric(test2a$statistic), 3),
    round(as.numeric(test2b$statistic), 3)
  ),
  df = c(
    as.numeric(test2a$parameter),
    as.numeric(test2b$parameter)
  ),
  Estimate = c(
    round(as.numeric(test2a$estimate), 3),
    round(as.numeric(test2b$estimate), 3)
  ),
  p_value = c(
    round(test2a$p.value, 3),
    round(test2b$p.value, 3)
  )
)
df_test2 |>
  knitr::kable(
    caption = "T tests for equality of means"
  ) |>
  kableExtra::kable_styling(full_width = FALSE)

T tests for equality of means
Variables_name	t	df	Estimate	p_value
Peso	-1.516	2499	3284.081	0.13
Lunghezza	-10.084	2499	494.692	0.00

3300 for Peso and 500 for Lunghezza are the population means found in literature

From the T test (H0 sample mean = population mean) we can’t reject the null hypotesis for Peso, meaning the sample mean is equal to 3300, and we reject the null hypotesis for Lunghezza, meaning the sample mean is not equal to 500

datas = df[c("Peso", "Lunghezza", "Cranio")]
variables = c("Peso", "Lunghezza", "Cranio")

shapiro_results = do.call(
  rbind,
  lapply(variables, function(v) {
    test = shapiro.test(datas[[v]])

    data.frame(
      Variabile = v,
      W = round(as.numeric(test$statistic), 3),
      p_value = round(test$p.value, 3)
    )
  })
)

shapiro_results |>
  knitr::kable(
    caption = "Shapiro-Wilk normality tests"
  ) |>
  kableExtra::kable_styling(full_width = FALSE)

Shapiro-Wilk normality tests
Variabile	W	p_value
Peso	0.971	0
Lunghezza	0.909	0
Cranio	0.964	0

wilcox_results = do.call(
  rbind,
  lapply(variables, function(v) {
    test <- wilcox.test(
      as.formula(paste(v, "~ Sesso")),
      data = df
    )

    data.frame(
      Variabile = v,
      W = round(as.numeric(test$statistic), 3),
      p_value = round(test$p.value, 3)
    )
  })
)

wilcox_results |>
  knitr::kable(
    caption = "Wilcoxon rank-sum tests by sex"
  ) |>
  kableExtra::kable_styling(full_width = FALSE)

Wilcoxon rank-sum tests by sex
Variabile	W	p_value
Peso	538640.5	0
Lunghezza	594454.5	0
Cranio	641637.5	0

par(mfrow = c(1, 3))
boxplot(Cranio ~ Sesso,
        main = "Boxplot of Cranio by Sesso",
        col = "lightblue")
boxplot(Lunghezza ~ Sesso,
        main = "Boxplot of Lunghezza by Sesso",
        col = "lightgrey")
boxplot(Peso ~ Sesso,
        main = "Boxplot of Peso by Sesso",
        col = "lightgreen")

par(mfrow = c(1, 1))

After rejecting the null hypotesis for Shapiro-Wilk test (H0 datas came from a population with normal distribution), I decided to use the Wilcoxon rank-sum test for non-parametric tests (H0 distribution of the anthropometric measurement is the same for males and females). P-values < 0.05 for Peso, Lunghezza and Cranio show that males and females have different distributions for every variables

REGRESSION MODELS

panel.cor = function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  par(usr = c(0, 1, 0, 1))
  r = abs(cor(x, y))
  txt = format(c(r, 0.123456789), digits = digits)[1]
  txt = paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor = 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor * r)
}

pairs(df, upper.panel = panel.smooth, lower.panel = panel.cor,
      main = "Scatterplot and Correlation Matrix")

From the above matrix we can see both graphically and numerically the relationship between variables

Considering Peso as our target variable, the higher relationships are with Lunghezza, Cranio and Gestazione (all seem to be positive)

options(kableExtra.html.bsTable = TRUE)

model_report = function(model, title) {

  cat("\n\n##", title, "\n\n")

  coef_tab = as.data.frame(summary(model)$coefficients)

  colnames(coef_tab) = c(
    "Estimate",
    "Std_Error",
    "t_value",
    "p_value"
  )

  print(knitr::kable(coef_tab, digits = 3))

  stats_tab = data.frame(
    Statistic = c("R_squared",
                  "Adj_R_squared",
                  "Residual_SE",
                  "F_statistic"),
    Value = c(
      summary(model)$r.squared,
      summary(model)$adj.r.squared,
      summary(model)$sigma,
      summary(model)$fstatistic[1]
    )
  )

  print(knitr::kable(stats_tab, digits = 3, row.names = FALSE))
}

models <- list(

  "MODEL 1" = lm(Peso ~ ., data = df),

  "MODEL 2" = lm(Peso ~ . -Anni.madre -Fumatrici -Ospedale, data = df),

  "MODEL 3" = update(lm(Peso ~ . -Anni.madre -Fumatrici -Ospedale, data = df),
                        ~ . + I(Gestazione^2)),

  "MODEL 4" = update(lm(Peso ~ . -Anni.madre -Fumatrici -Ospedale, data = df),
                        ~ . - I(Gestazione^2) - Tipo.parto),

  "MODEL 5" = lm(Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
                     Sesso + Lunghezza:Gestazione, data = df)
)

for (i in names(models)) {
  model_report(models[[i]], i)
}

## 
## 
## ## MODEL 1 
## 
## 
## 
## |              |  Estimate| Std_Error| t_value| p_value|
## |:-------------|---------:|---------:|-------:|-------:|
## |(Intercept)   | -6738.476|   141.309| -47.686|   0.000|
## |Anni.madre    |     0.892|     1.132|   0.788|   0.431|
## |N.gravidanze  |    11.267|     4.661|   2.417|   0.016|
## |Fumatrici     |   -30.163|    27.539|  -1.095|   0.273|
## |Gestazione    |    32.570|     3.819|   8.529|   0.000|
## |Lunghezza     |    10.295|     0.301|  34.236|   0.000|
## |Cranio        |    10.471|     0.426|  24.578|   0.000|
## |Tipo.partoNat |    29.525|    12.084|   2.443|   0.015|
## |Ospedaleosp2  |   -11.210|    13.438|  -0.834|   0.404|
## |Ospedaleosp3  |    28.096|    13.496|   2.082|   0.037|
## |SessoM        |    77.541|    11.178|   6.937|   0.000|
## 
## 
## |Statistic     |   Value|
## |:-------------|-------:|
## |R_squared     |   0.729|
## |Adj_R_squared |   0.728|
## |Residual_SE   | 273.925|
## |F_statistic   | 669.188|
## 
## 
## ## MODEL 2 
## 
## 
## 
## |              |  Estimate| Std_Error| t_value| p_value|
## |:-------------|---------:|---------:|-------:|-------:|
## |(Intercept)   | -6707.297|   135.991| -49.322|   0.000|
## |N.gravidanze  |    12.756|     4.337|   2.941|   0.003|
## |Gestazione    |    32.271|     3.794|   8.506|   0.000|
## |Lunghezza     |    10.286|     0.301|  34.207|   0.000|
## |Cranio        |    10.506|     0.426|  24.659|   0.000|
## |Tipo.partoNat |    30.034|    12.097|   2.483|   0.013|
## |SessoM        |    77.929|    11.191|   6.964|   0.000|
## 
## 
## |Statistic     |    Value|
## |:-------------|--------:|
## |R_squared     |    0.728|
## |Adj_R_squared |    0.727|
## |Residual_SE   |  274.320|
## |F_statistic   | 1110.250|
## 
## 
## ## MODEL 3 
## 
## 
## 
## |                |  Estimate| Std_Error| t_value| p_value|
## |:---------------|---------:|---------:|-------:|-------:|
## |(Intercept)     | -4713.385|   897.646|  -5.251|   0.000|
## |N.gravidanze    |    12.841|     4.333|   2.963|   0.003|
## |Gestazione      |   -79.020|    49.670|  -1.591|   0.112|
## |Lunghezza       |    10.389|     0.304|  34.185|   0.000|
## |Cranio          |    10.601|     0.428|  24.780|   0.000|
## |Tipo.partoNat   |    29.566|    12.089|   2.446|   0.015|
## |SessoM          |    75.767|    11.223|   6.751|   0.000|
## |I(Gestazione^2) |     1.486|     0.661|   2.247|   0.025|
## 
## 
## |Statistic     |   Value|
## |:-------------|-------:|
## |R_squared     |   0.728|
## |Adj_R_squared |   0.727|
## |Residual_SE   | 274.097|
## |F_statistic   | 953.910|
## 
## 
## ## MODEL 4 
## 
## 
## 
## |             |  Estimate| Std_Error| t_value| p_value|
## |:------------|---------:|---------:|-------:|-------:|
## |(Intercept)  | -6681.144|   135.723| -49.226|   0.000|
## |N.gravidanze |    12.475|     4.340|   2.875|   0.004|
## |Gestazione   |    32.332|     3.798|   8.513|   0.000|
## |Lunghezza    |    10.249|     0.301|  34.090|   0.000|
## |Cranio       |    10.540|     0.426|  24.728|   0.000|
## |SessoM       |    77.993|    11.202|   6.962|   0.000|
## 
## 
## |Statistic     |    Value|
## |:-------------|--------:|
## |R_squared     |    0.727|
## |Adj_R_squared |    0.726|
## |Residual_SE   |  274.604|
## |F_statistic   | 1328.316|
## 
## 
## ## MODEL 5 
## 
## 
## 
## |                     |  Estimate| Std_Error| t_value| p_value|
## |:--------------------|---------:|---------:|-------:|-------:|
## |(Intercept)          | -1990.630|   920.217|  -2.163|   0.031|
## |N.gravidanze         |    13.044|     4.319|   3.020|   0.003|
## |Gestazione           |   -93.965|    24.799|  -3.789|   0.000|
## |Lunghezza            |    -0.081|     2.027|  -0.040|   0.968|
## |Cranio               |    10.760|     0.426|  25.245|   0.000|
## |SessoM               |    72.280|    11.200|   6.454|   0.000|
## |Gestazione:Lunghezza |     0.273|     0.053|   5.153|   0.000|
## 
## 
## |Statistic     |    Value|
## |:-------------|--------:|
## |R_squared     |    0.730|
## |Adj_R_squared |    0.729|
## |Residual_SE   |  273.208|
## |F_statistic   | 1122.697|

I decided to investigate five multiple linear regression models. For everyone the Adjusted R squared seems to be more or less the same

Starting with the mod1, where I decided to bring all the variables, I have removed the non-significant variables in the mod2

In mod3 I assumed a non-linear relationship between Peso and Gestazione

Then mod4 was the one with the fewest significant variables. In mod5 I investigated the interaction between Gestazione and Lunghezza

SELECTION OF THE OPTIMAL MODEL

table_ic = data.frame(
  Model = c("mod1", "mod2", "mod3", "mod4", "mod5"),
  AIC = c(AIC(mod1), AIC(mod2), AIC(mod3), AIC(mod4), AIC(mod5)),
  BIC = c(BIC(mod1), BIC(mod2), BIC(mod3), BIC(mod4), BIC(mod5))
)

knitr::kable(table_ic, digits = 3,
  caption = "Comparison between AIC and BIC models") |>
  kableExtra::kable_styling(full_width = FALSE)

Comparison between AIC and BIC models
Model	AIC	BIC
mod1	35171.95	35241.84
mod2	35175.16	35221.75
mod3	35172.10	35224.51
mod4	35179.33	35220.10
mod5	35154.84	35201.44

From AIC and BIC method of comparison, the models with the fewest value is mod5. I decided to prefer the BIC method and decided to compare mod4 and mod5

anova(mod5, mod4)

## Analysis of Variance Table
## 
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso + 
##     Lunghezza:Gestazione
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
##   Res.Df       RSS Df Sum of Sq      F    Pr(>F)    
## 1   2493 186083565                                  
## 2   2494 188065546 -1  -1981981 26.553 2.765e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

The p-value < 0.05, so there is a significant increase in the explained variance when the interaction term is added. Given that mod5 seems to be the best one

vif4 <- data.frame(
  Model = "mod4",
  Predictor = names(vif(mod4)),
  VIF = as.numeric(vif(mod4))
)

vif_mod5 = vif(mod5, type = "predictor")

vif5 = data.frame(
  Model = "mod5",
  Predictor = rownames(vif_mod5),
  VIF = vif_mod5[, "GVIF^(1/(2*Df))"]
)

table_vif = rbind(vif4, vif5)

knitr::kable(table_vif, digits = 3,
             caption = "VIF values for mod4 and mod5") |>
  kableExtra::kable_styling(full_width = FALSE)

VIF values for mod4 and mod5
Model	Predictor	VIF
mod4	N.gravidanze	1.023
mod4	Gestazione	1.669
mod4	Lunghezza	2.075
mod4	Cranio	1.624
mod4	Sesso	1.040
mod5	N.gravidanze	1.012
mod5	Gestazione	1.092
mod5	Lunghezza	1.092
mod5	Cranio	1.281
mod5	Sesso	1.025

With VIF I checked whether there is multicollinearity in your model (whether some explanatory variables are highly correlated with one another)

For mod4 all VIF are below 5 so there is not multicollinearity

For mod5 I used the type = “predictor” and all the GVIF are below 5

n = nrow(df)
stepwise.mod <- MASS::stepAIC(mod1, direction = "both", k = log(n))

## Start:  AIC=28139.32
## Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza + 
##     Cranio + Tipo.parto + Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Anni.madre    1     46578 186809099 28132
## - Fumatrici     1     90019 186852540 28133
## - Ospedale      2    685979 187448501 28133
## - N.gravidanze  1    438452 187200974 28137
## - Tipo.parto    1    447929 187210450 28138
## <none>                      186762521 28139
## - Sesso         1   3611021 190373542 28179
## - Gestazione    1   5458403 192220925 28204
## - Cranio        1  45326172 232088693 28675
## - Lunghezza     1  87951062 274713583 29096
## 
## Step:  AIC=28132.12
## Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio + 
##     Tipo.parto + Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Fumatrici     1     90897 186899996 28126
## - Ospedale      2    692738 187501837 28126
## - Tipo.parto    1    448222 187257321 28130
## <none>                      186809099 28132
## - N.gravidanze  1    633756 187442855 28133
## + Anni.madre    1     46578 186762521 28139
## - Sesso         1   3618736 190427835 28172
## - Gestazione    1   5412879 192221978 28196
## - Cranio        1  45588236 232397335 28670
## - Lunghezza     1  87950050 274759149 29089
## 
## Step:  AIC=28125.51
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto + 
##     Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Ospedale      2    701680 187601677 28119
## - Tipo.parto    1    440684 187340680 28124
## <none>                      186899996 28126
## - N.gravidanze  1    610840 187510837 28126
## + Fumatrici     1     90897 186809099 28132
## + Anni.madre    1     47456 186852540 28133
## - Sesso         1   3602797 190502794 28165
## - Gestazione    1   5346781 192246777 28188
## - Cranio        1  45632149 232532146 28664
## - Lunghezza     1  88355030 275255027 29086
## 
## Step:  AIC=28119.23
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto + 
##     Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Tipo.parto    1    463870 188065546 28118
## <none>                      187601677 28119
## - N.gravidanze  1    651066 188252743 28120
## + Ospedale      2    701680 186899996 28126
## + Fumatrici     1     99840 187501837 28126
## + Anni.madre    1     54392 187547285 28126
## - Sesso         1   3649259 191250936 28160
## - Gestazione    1   5444109 193045786 28183
## - Cranio        1  45758101 233359778 28657
## - Lunghezza     1  88054432 275656108 29074
## 
## Step:  AIC=28117.58
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## <none>                      188065546 28118
## - N.gravidanze  1    623141 188688687 28118
## + Tipo.parto    1    463870 187601677 28119
## + Ospedale      2    724866 187340680 28124
## + Fumatrici     1     91892 187973654 28124
## + Anni.madre    1     54816 188010731 28125
## - Sesso         1   3655292 191720838 28158
## - Gestazione    1   5464853 193530399 28181
## - Cranio        1  46108583 234174130 28658
## - Lunghezza     1  87632762 275698308 29066

coef_tab = as.data.frame(summary(stepwise.mod)$coefficients)

colnames(coef_tab) = c("Estimate", "Std_Error", "t_value", "p_value")

knitr::kable(coef_tab, digits = 3,
             caption = "Stepwise Model Coefficients") |>
  kableExtra::kable_styling(full_width = FALSE)

Stepwise Model Coefficients
	Estimate	Std_Error	t_value	p_value
(Intercept)	-6681.144	135.723	-49.226	0.000
N.gravidanze	12.475	4.340	2.875	0.004
Gestazione	32.332	3.798	8.513	0.000
Lunghezza	10.249	0.301	34.090	0.000
Cranio	10.540	0.426	24.728	0.000
SessoM	77.993	11.202	6.962	0.000

stats_tab = data.frame(
  Statistic = c("R_squared", "Adj_R_squared", "Residual_SE"),
  Value = c(
    summary(stepwise.mod)$r.squared,
    summary(stepwise.mod)$adj.r.squared,
    summary(stepwise.mod)$sigma
  )
)

knitr::kable(stats_tab, digits = 3,
             caption = "Stepwise Model Fit Statistics") |>
  kableExtra::kable_styling(full_width = FALSE)

Stepwise Model Fit Statistics
Statistic	Value
R_squared	0.727
Adj_R_squared	0.726
Residual_SE	274.604

I also used the stepAIC method (with k = log(n), so utilizing the BIC) to see which is the method chosen by the automatic procedure

It seems the mod4 as the best method chosen by the procedure

I also decided to chose the mod4 as the best method because it uses simple variables and all of them are significant (so I decided not to introduce the interaction in the chosen model)

Below the interpretation of the model’s coefficients:

- An increase of N.gravidanze results in an increase of 12.475 grams of Peso

- An increase of Gestazione results in an increase of 32.332 grams of Peso

- An increase of Lunghezza results in an increase of 10.249 grams of Peso

- An increase of Cranio results in an increase of 10.540 grams of Peso

- For Sesso variable, all other factors being equal, a male weighs 77.993 grams more than a female

MODEL QUALITY ANALYSIS

The Adjusted R squared is more or less 0.73, therefore, the model explains 73% of the observed variability in Peso

RMSE is equal to the Residual Standard Error (~275 grams), which means that, on average, the model’s predictions are wrong by around 275 grams (corresponding to approximately 8% of the observed mean Peso -> 275/mean(Peso))

par(mfrow=c(2,2))
plot(mod4)

par(mfrow=c(1,1))

From the residual analysis we can see four graphs:

- The firs shows a curved pattern, so it appears that some of the information has not been properly captured by the regressors and has been reflected in the residuals

- In the second graph, the data points appear to lie on the line with the exception of the upper curve, which does not seem to fit well

- In the third graph a curvature is noticeable, although not too excessive

- In the fourth outliers are visible; in particular, observation 1551 is approaching the warning threshold (set at 1 for Cook’s distance)

lev = hatvalues(mod4)
plot(lev, main = "Leverage values (hat values) mod4",
     ylab = "Leverage", xlab = "Observation index")
p = sum(lev)
soglia = 2 * p/n
abline(h=soglia, col=2)

lev[lev > soglia]

##          13          15          34          67          89          96 
## 0.005630918 0.007050422 0.006744143 0.005891590 0.012816470 0.005351586 
##         101         106         131         134         151         155 
## 0.007526751 0.014479871 0.007229339 0.007552819 0.010883937 0.007207682 
##         161         189         190         204         205         206 
## 0.020335483 0.004893297 0.005366557 0.014490604 0.005351634 0.009476652 
##         220         294         305         310         312         315 
## 0.007393997 0.005912765 0.005442061 0.028812123 0.013169272 0.005385800 
##         378         440         442         445         486         492 
## 0.015934061 0.005404736 0.007723662 0.007509382 0.005164446 0.008274018 
##         497         516         582         587         592         614 
## 0.005166306 0.013079851 0.011665555 0.008412325 0.006384116 0.005299262 
##         638         656         657         684         697         702 
## 0.006688287 0.005927777 0.005322685 0.008818987 0.005863826 0.005202259 
##         729         748         750         757         765         805 
## 0.005023115 0.008565543 0.006942097 0.008145491 0.006070298 0.014356657 
##         828         893         895         913         928         946 
## 0.007179817 0.005075205 0.005295896 0.005571144 0.022742332 0.006909044 
##         947         956         985        1008        1014        1049 
## 0.008409465 0.007784123 0.007039416 0.005343037 0.008470133 0.004956169 
##        1067        1091        1106        1130        1166        1181 
## 0.008465430 0.008933360 0.005967317 0.031728597 0.005513559 0.005677676 
##        1188        1200        1219        1238        1248        1273 
## 0.006477203 0.005492370 0.030694311 0.005908078 0.014622914 0.007085831 
##        1291        1293        1311        1321        1325        1356 
## 0.006117497 0.006073639 0.009625908 0.009293111 0.004857169 0.005303442 
##        1357        1385        1395        1400        1402        1411 
## 0.006965051 0.012636943 0.005126697 0.005925069 0.004811441 0.008048184 
##        1420        1428        1429        1450        1505        1551 
## 0.005155654 0.008192811 0.021757172 0.015104831 0.013330439 0.048769569 
##        1553        1556        1573        1593        1606        1610 
## 0.008504889 0.005919673 0.005047204 0.005623758 0.005001812 0.008722184 
##        1617        1619        1628        1686        1693        1701 
## 0.004866796 0.015067498 0.005069731 0.009349313 0.005077858 0.010842957 
##        1712        1718        1727        1735        1780        1781 
## 0.006992084 0.006958857 0.013300523 0.004884846 0.025538678 0.016832361 
##        1809        1827        1868        1892        1962        1967 
## 0.008707504 0.006065698 0.005205637 0.005332812 0.005540442 0.005337356 
##        1977        2037        2040        2046        2086        2089 
## 0.006927281 0.004889127 0.011494872 0.005471670 0.013193090 0.006293550 
##        2098        2114        2115        2120        2140        2146 
## 0.005094455 0.013316875 0.011772090 0.018659995 0.006244232 0.005802168 
##        2148        2149        2157        2175        2200        2215 
## 0.007926839 0.013583436 0.005907225 0.032527273 0.011670024 0.004892265 
##        2216        2220        2221        2224        2225        2244 
## 0.008117864 0.005414040 0.021628717 0.005838076 0.005591261 0.006929217 
##        2257        2307        2317        2318        2337        2359 
## 0.006170254 0.013965608 0.007673614 0.004831118 0.005230450 0.010067364 
##        2408        2422        2436        2437        2452        2458 
## 0.009696691 0.021532808 0.004986522 0.023943328 0.023838489 0.008506087 
##        2471        2478 
## 0.020903740 0.005775173

It seems there are several levearge points (values that lie far apart from the other observations in the space of the predictors)

plot(rstudent(mod4), main = "Studentized residuals mod4",
     ylab = "Studentized residuals",
     xlab = "Index")
abline(h=c(-2,2), col=2)

outlierTest(mod4)

##       rstudent unadjusted p-value Bonferroni p
## 1551 10.051908         2.4906e-23   6.2265e-20
## 155   5.027798         5.3138e-07   1.3285e-03
## 1306  4.827238         1.4681e-06   3.6702e-03

From the graph seems there are several outlier points (in response variable) but with the outlierTest the Bonferroni correction is applied to this test, only 3 otuliers points are identified

cook = cooks.distance(mod4)
plot(cook, main = "Cook's distance mod4", ylab = "Studentized residuals", xlab = "Index", pch=19)
max_i <- which.max(cook)
points(max_i, cook[max_i],
       col = "red", pch = 19, cex = 1.5)
text(max_i, cook[max_i],
     labels = max_i,
     pos = 1,
     cex = 0.8)

knitr::kable(
  df[1551, , drop = FALSE],
  caption = "Observation 1551"
) |>
  kableExtra::kable_styling(full_width = FALSE)

Observation 1551
	Anni.madre	N.gravidanze	Fumatrici	Gestazione	Peso	Lunghezza	Cranio	Tipo.parto	Ospedale	Sesso
1551	35	1	0	38	4370	315	374	Nat	osp3	F

From Cook’s distance plot there is one observation near the warning threshold and is the observation 1551; this observation could have an impact on the regression estimates

bptest(mod4)

## 
##  studentized Breusch-Pagan test
## 
## data:  mod4
## BP = 90.253, df = 5, p-value < 2.2e-16

dwtest(mod4)

## 
##  Durbin-Watson test
## 
## data:  mod4
## DW = 1.9535, p-value = 0.1224
## alternative hypothesis: true autocorrelation is greater than 0

shapiro.test(residuals(mod4))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(mod4)
## W = 0.97408, p-value < 2.2e-16

plot(density(residuals(mod4)), main = "Density curve on residuals")

For Breusch-Pagan test the H0 hypothesis of homoscedasticity (non-constant variance) is rejected

For Durbin-Watson test the H0 hypothesis of no autocorrelation in the residuals is not rejected (the residuals are not autocorrelated)

For Shapiro-Wilk test the H0 null hypothesis of normality of the residuals is rejected (non-normal residuals). From the density curve of residuals we can see an elongated right tail

RESULTS

The model demonstrates good explanatory and predictive power, accounting for approximately 73% of the variability in Peso and featuring coefficients that are all highly significant

The residual diagnostics indicate some deviations from the ideal assumptions, with mild heteroscedasticity confirmed by the Breusch-Pagan test, though this is partly accentuated by the large sample size (n = 2500)

The residual plots reveal slight non-linearity and increased dispersion for high values of the response variable, along with a heavier right tail in the QQ-plot and some influential outliers

Overall, the model is generally adequate, with well-centred residuals and no autocorrelation, but with some issues relating to heteroscedasticity and non-normal tails

The main weaknesses concern the presence of a few influential points, slight non-linearity and deviations from the normality assumptions in the tails

FORECAST

df_predict = data.frame(
  N.gravidanze = 3,
  Gestazione = 39,
  Lunghezza = mean(df$Lunghezza, na.rm = TRUE),
  Cranio = mean(df$Cranio, na.rm = TRUE),
  Sesso = "F"
)
pred = predict(mod4, newdata = df_predict)
pred[[1]]

## [1] 3271.09

Using the mod4 model, when the N.gravidanze are 3, weeks of Gestazione are 39, Sesso is F (considering for Lunghezza and Cranio the sample mean) we have a prediction on Peso of 3271.09

VISUALIZATIONS

ggplot(df) +
       geom_point(aes(x = Gestazione, y = Peso, color = Sesso), position = "jitter") +
       geom_smooth(aes(x = Gestazione, y = Peso, color = Sesso),
                   method = "lm", se = FALSE)  +
  labs(title = "Peso vs Gestazione by Sesso",
       x = "Gestazione",
       y = "Peso",
       color = "Sesso")

The plot Peso and Gestazione by Sesso show that for both male and female an increase in weeks of Gestazione means an increase of Peso (the slope of the line seems to be the same for male and female). The difference is that, for the same week of Gestazione, males have a Peso higher than females, specifically 77.993 grams more

ggplot(df) +
  geom_point(aes(x = Gestazione, y = Peso, color = as.factor(Fumatrici)), position = "jitter") +
  geom_smooth(aes(x = Gestazione, y = Peso, color = as.factor(Fumatrici)),
              method = "lm", se = FALSE)  +
  labs(title = "Peso vs Gestazione by Fumatrici",
       x = "Gestazione",
       y = "Peso",
       color = "Fumatrici")

boxplot(Peso ~ Fumatrici, main = "Boxplot of Peso conditional on the Fumatrici variable")

Given that datas for smokers are much lower than datas of non-smokers, the plot Peso and Gestazione by Fumatrici show quite the same slope for both males and females, meaning that whether or not you smoke has no effect on your son’s or daughter’s weight

Prediction of neonatal weight

Tommaso Scortecci

2026-06-18

DESCRIPTIVE ANALYTICS

From the first analysis we can see few things:

- The dimension of the dataset is 2500 observations x 10 columns

- Most of the women are non-smokers (95.8%) and have had a natural birth (70.9%)

- All three hospitals have roughly the same absolute frequency

- Most of the women had 3 or less than 3 pregnancies, specifically 96.2% of them

All the plots above show negative skewness; for some variables is more pronounced, for others less

Anni.madre shows values for 0 and 1 meaning they had a child at that age; that is impossible and may be some sort of error (like a typing error)

Gestazione shows most of the values between 37 and 41 weeks, that is consistent with the 9-months pregnancy

To bear in mind is the fact that the variabile Peso (that will be our target variable) has a slight negative skewness and could influence the models

HYPOTHESIS TESTS

From the baloonplot seems that more caesarean sections are being performed at osp2

The Chisq test (H0 independence between variables) shows a p-value > 0.05, so we can’t reject the null hypotesis. Ospedale and Tipo.parto seem to be independent

The same results can be seen from the density curve plot where the test statistics is deep inside the acceptance region

3300 for Peso and 500 for Lunghezza are the population means found in literature

From the T test (H0 sample mean = population mean) we can’t reject the null hypotesis for Peso, meaning the sample mean is equal to 3300, and we reject the null hypotesis for Lunghezza, meaning the sample mean is not equal to 500

REGRESSION MODELS

From the above matrix we can see both graphically and numerically the relationship between variables

Considering Peso as our target variable, the higher relationships are with Lunghezza, Cranio and Gestazione (all seem to be positive)

I decided to investigate five multiple linear regression models. For everyone the Adjusted R squared seems to be more or less the same

Starting with the mod1, where I decided to bring all the variables, I have removed the non-significant variables in the mod2

In mod3 I assumed a non-linear relationship between Peso and Gestazione

Then mod4 was the one with the fewest significant variables. In mod5 I investigated the interaction between Gestazione and Lunghezza

SELECTION OF THE OPTIMAL MODEL

From AIC and BIC method of comparison, the models with the fewest value is mod5. I decided to prefer the BIC method and decided to compare mod4 and mod5

The p-value < 0.05, so there is a significant increase in the explained variance when the interaction term is added. Given that mod5 seems to be the best one

With VIF I checked whether there is multicollinearity in your model (whether some explanatory variables are highly correlated with one another)

For mod4 all VIF are below 5 so there is not multicollinearity

For mod5 I used the type = “predictor” and all the GVIF are below 5

I also used the stepAIC method (with k = log(n), so utilizing the BIC) to see which is the method chosen by the automatic procedure

It seems the mod4 as the best method chosen by the procedure

I also decided to chose the mod4 as the best method because it uses simple variables and all of them are significant (so I decided not to introduce the interaction in the chosen model)

Below the interpretation of the model’s coefficients:

- An increase of N.gravidanze results in an increase of 12.475 grams of Peso

- An increase of Gestazione results in an increase of 32.332 grams of Peso

- An increase of Lunghezza results in an increase of 10.249 grams of Peso

- An increase of Cranio results in an increase of 10.540 grams of Peso

- For Sesso variable, all other factors being equal, a male weighs 77.993 grams more than a female

MODEL QUALITY ANALYSIS

The Adjusted R squared is more or less 0.73, therefore, the model explains 73% of the observed variability in Peso

RMSE is equal to the Residual Standard Error (~275 grams), which means that, on average, the model’s predictions are wrong by around 275 grams (corresponding to approximately 8% of the observed mean Peso -> 275/mean(Peso))

From the residual analysis we can see four graphs:

- The firs shows a curved pattern, so it appears that some of the information has not been properly captured by the regressors and has been reflected in the residuals

- In the second graph, the data points appear to lie on the line with the exception of the upper curve, which does not seem to fit well

- In the third graph a curvature is noticeable, although not too excessive

- In the fourth outliers are visible; in particular, observation 1551 is approaching the warning threshold (set at 1 for Cook’s distance)

It seems there are several levearge points (values that lie far apart from the other observations in the space of the predictors)

From the graph seems there are several outlier points (in response variable) but with the outlierTest the Bonferroni correction is applied to this test, only 3 otuliers points are identified

From Cook’s distance plot there is one observation near the warning threshold and is the observation 1551; this observation could have an impact on the regression estimates

For Breusch-Pagan test the H0 hypothesis of homoscedasticity (non-constant variance) is rejected

For Durbin-Watson test the H0 hypothesis of no autocorrelation in the residuals is not rejected (the residuals are not autocorrelated)

For Shapiro-Wilk test the H0 null hypothesis of normality of the residuals is rejected (non-normal residuals). From the density curve of residuals we can see an elongated right tail

RESULTS

The model demonstrates good explanatory and predictive power, accounting for approximately 73% of the variability in Peso and featuring coefficients that are all highly significant

The residual diagnostics indicate some deviations from the ideal assumptions, with mild heteroscedasticity confirmed by the Breusch-Pagan test, though this is partly accentuated by the large sample size (n = 2500)

The residual plots reveal slight non-linearity and increased dispersion for high values of the response variable, along with a heavier right tail in the QQ-plot and some influential outliers

Overall, the model is generally adequate, with well-centred residuals and no autocorrelation, but with some issues relating to heteroscedasticity and non-normal tails

The main weaknesses concern the presence of a few influential points, slight non-linearity and deviations from the normality assumptions in the tails

FORECAST

Using the mod4 model, when the N.gravidanze are 3, weeks of Gestazione are 39, Sesso is F (considering for Lunghezza and Cranio the sample mean) we have a prediction on Peso of 3271.09

VISUALIZATIONS

Given that datas for smokers are much lower than datas of non-smokers, the plot Peso and Gestazione by Fumatrici show quite the same slope for both males and females, meaning that whether or not you smoke has no effect on your son’s or daughter’s weight

Actually the boxplot of Peso conditional on Fumatrici variable does not show much difference between the two graphs