ANALISYS AND MODELING

# PRELIMINARY ANALYSIS
# Load packages
library(ggplot2)
library(dplyr)
library(readr)

# Import dataset
data <- read_csv("neonati.csv")

#Rename variables
colnames(data)[colnames(data) == "Anni.madre"] <- "età_madre"
colnames(data)[colnames(data) == "N.gravidanze"] <- "numero_gravidanze"
colnames(data)[colnames(data) == "Fumatrici"] <- "madre_fumatrice"
colnames(data)[colnames(data) == "Gestazione"] <- "durata_gravidanza"
colnames(data)[colnames(data) == "Peso"] <- "peso_neonato"
colnames(data)[colnames(data) == "Lunghezza"] <- "lunghezza_neonato"
colnames(data)[colnames(data) == "Cranio"] <- "diametro_cranio"
colnames(data)[colnames(data) == "Tipo.parto"] <- "tipo_parto"
colnames(data)[colnames(data) == "Ospedale"] <- "ospedale"
colnames(data)[colnames(data) == "Sesso"] <- "sesso_neonato"

attach(data) # to use dataset variables in a directical way

# EDA (Explorative Data Analysis) with ggplot2
newborn_boxplot <- ggplot(data, aes(y = peso_neonato)) +
  geom_boxplot(fill = 'gray90', color = "black", width = 0.2) +  
  labs(title = "Distribution of Newborn Weight", y = "Newborn Weight (g)") +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),  
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

# Print the plot
newborn_boxplot

This boxplot shows the distribution of newborn weights in the dataset.

# Mapping of Italian variable names to English labels
label_mapping_1 <- c(
  "tipo_parto" = "Type of Delivery",
  "sesso_neonato" = "Newborn Sex",
  "ospedale" = "Hospital",
  "madre_fumatrice" = "Mother Smokes"
)

# Define a function that generates the boxplot
plot_boxplot <- function(x_var_1) {
  if (x_var_1 == "madre_fumatrice") {
    ggplot(data, aes(x = factor(madre_fumatrice), y = peso_neonato)) +
      geom_boxplot(fill = 'gray90', color = "black") +
      labs(title = paste("Newborn Weight by", label_mapping_1[x_var_1]), 
           x = label_mapping_1[x_var_1], 
           y = "Newborn Weight (g)") +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5), 
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        legend.position = "none"  
      )
  } else {
    ggplot(data, aes_string(x = x_var_1, y = "peso_neonato")) +
      geom_boxplot(fill = 'gray90', color = "black") +
      labs(title = paste("Newborn Weight by", label_mapping_1[x_var_1]), 
           x = label_mapping_1[x_var_1], 
           y = "Newborn Weight (g)") +
      theme_minimal() +
      theme(
        plot.title = element_text(hjust = 0.5),   
        axis.title = element_text(size = 12),
        axis.text = element_text(size = 10),
        legend.position = "none"
      )
  }
}

# List of variables for the x-axis
x_vars_1 <- c("tipo_parto", "sesso_neonato", "ospedale", "madre_fumatrice")

# Iterate over the x_vars list and apply the function to each
for (x_var_1 in x_vars_1) {
  print(plot_boxplot(x_var_1))  
}

These boxplots visually show the distribution of newborn weights for different types of delivery; the comparation of how newborn weights are distributed across various hospitals; the difference between the distribution of newborn weights between male and female newborns and the distribution of newborn weights for two groups based on the smoking status of the mother.

# Create a mapping of Italian variable names to English labels
label_mapping_2 <- c(
  "età_madre" = "Mother's Age",
  "numero_gravidanze" = "Number of Pregnancies",
  "durata_gravidanza" = "Pregnancy Duration",
  "lunghezza_neonato" = "Newborn Length",
  "diametro_cranio" = "Head Diameter"
)

# Define a function that generates scatter plots
plot_scatter_1 <- function(x_var_2) {
  ggplot(data, aes_string(x = x_var_2, y = "peso_neonato")) +
    geom_point(alpha = 0.6, color = "blue") +
    geom_smooth(method = "lm", color = "red", linetype = "dashed") +  
    labs(title = paste("Newborn Weight vs.", label_mapping_2[x_var_2]), 
         x = label_mapping_2[x_var_2], 
         y = "Newborn Weight (g)") +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5),  
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 10)
    )
}

# List of variables for the x-axis (specified variables)
x_vars_2 <- c("età_madre", "numero_gravidanze", "durata_gravidanza", 
              "lunghezza_neonato", "diametro_cranio")

# Iterate over the x_vars list and apply the function to each for scatter plots only
for (x_var_2 in x_vars_2) {
  print(plot_scatter_1(x_var_2))  
}

We have different scatterplots here: the relationship between the mother’s age and the newborn’s birth weight; between the number of pregnancies the mother has had and the newborn’s birth weight; between the duration of the pregnancy (in weeks) and the newborn’s birth weight; between the newborn’s length at birth and the newborn’s weight and between the newborn’s head diameter and the newborn’s birth weight. These scatter plots provide insights into how various maternal and newborn characteristics (mother’s age, number of pregnancies, pregnancy duration, newborn length, and head diameter) are related to the birth weight of the newborn. The linear regression line helps to visualize the trend (positive or negative) in each case, indicating whether there’s a correlation between the variables.

# Select only numeric variables in the dataset
numeric_data <- data[sapply(data, is.numeric)]

# Calculate the correlation matrix
cor_matrix <- cor(numeric_data, use = "complete.obs")

# load the corrplot package
library(corrplot)

# Create a heatmap of the correlation matrix
corrplot(cor_matrix, method = "circle", type = "upper", 
         col = colorRampPalette(c("blue", "white", "red"))(200),
         title = "Correlation Matrix", 
         mar = c(0, 0, 2, 0))

The correlation matrix shows the pairwise relationships between all numeric variables in the dataset. Each circle in the heatmap represents the correlation between two variables: - Positive correlation: A red circle indicates a positive relationship (when one variable increases, the other tends to increase as well). - Negative correlation: A blue circle indicates a negative relationship (when one variable increases, the other tends to decrease). - No or weak correlation: A white circle indicates little to no correlation between the variables.

Stronger correlations: Large red or blue circles indicate a stronger correlation (close to +1 or -1).
Weaker correlations: Small circles near white indicate weak or no correlation (close to 0).
Symmetry: The matrix is symmetric, meaning that the correlation between variable A and variable B is the same as the correlation between variable B and variable A.

#1-In some hospitals, more cesarean sections are done

# Create a contingency table of the two categorical variables: 'ospedale' (hospital) and 'tipo_parto' (birth type)
table_hospital_birth_type <- table(ospedale, tipo_parto)

# Perform the Chi-square test on the contingency table to check if there's an association between hospital and birth type
chi_square_test <- chisq.test(table_hospital_birth_type)

# Display the results of the Chi-square test
chi_square_test

## 
##  Pearson's Chi-squared test
## 
## data:  table_hospital_birth_type
## X-squared = 1.0972, df = 2, p-value = 0.5778

This type of test is used in contingency tables to assess the hypothesis of independence between qualitative, quantitative variables in classes or discrete quantitative variables. It can also be used for nominal-scale variables, but it tells us nothing about the direction of the association (in this sense, it is better to employ other indices, such as Kendall’s correlation index, directly on the raw data). We have three keys to interpret: - Chi-square statistic: A larger value indicates a greater difference between observed and expected frequencies; - p-value: If the p-value is less than 0.05, you reject the null hypothesis and conclude that there is a significant relationship between the hospital and the type of birth (i.e., some hospitals perform more cesarean births); - Degrees of freedom (df): It indicates the number of categories considered in the test; The p-value, in this case, is particularly high (about 0.58): it means that we cannot reject the null hypothesis, and it is concluded that there are no hospitals in this sample in which more cesarean births are performed than in others, according to the initial hypothesis system. There is no difference in the proportions of cesarean births between hospitals (i.e., the type of birth is independent of the hospital).

#2- The average weight and length of this sample of infants are significantly the same as those of the population

# Population means
population_mean_newborn_weight <- 3300  # Population mean newborn weight in grams
population_mean_newborn_length <- 500    # Population mean newborn length in mm

# Newborn weight test t
t_test_mean_weight <- t.test(peso_neonato, mu = population_mean_newborn_weight)

# Newborn length test t
t_test_mean_length <- t.test(lunghezza_neonato, mu = population_mean_newborn_length)

# Results print
t_test_mean_weight

## 
##  One Sample t-test
## 
## data:  peso_neonato
## t = -1.516, df = 2499, p-value = 0.1296
## alternative hypothesis: true mean is not equal to 3300
## 95 percent confidence interval:
##  3263.490 3304.672
## sample estimates:
## mean of x 
##  3284.081

t_test_mean_length

## 
##  One Sample t-test
## 
## data:  lunghezza_neonato
## t = -10.084, df = 2499, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 500
## 95 percent confidence interval:
##  493.6598 495.7242
## sample estimates:
## mean of x 
##   494.692

The t-test for a single sample is used to determine if the mean of a sample is significantly different from a known or hypothesized population mean. In this case, there’s a comparation between the mean weight and the mean length of newborns in the sample with the ones of the population averages (3300 grams for weight and 500 mm for length). We have different types of parameters to evaluate: - The t-value is calculated as the difference between the sample mean and the population mean, divided by the standard error of the sample. It tells you how far the sample mean is from the population mean in terms of standard deviations. The degrees of freedom (df) is used to determine the appropriate distribution for the test statistic. The p-value tells you the probability of observing a sample mean at least as extreme as the one you obtained, assuming the null hypothesis is true (i.e., the sample mean equals the population mean). If the p-value is less than 0.05 (commonly used threshold), you reject the null hypothesis, indicating that the sample mean is significantly different from the population mean. The confidence interval for the mean difference tells you the range in which the true population mean is likely to fall. A 95% confidence interval means that if you repeated the study many times, 95% of the intervals would contain the true population mean. If the population mean lies outside the confidence interval, you can conclude that the sample mean is significantly different from the population mean. The t-test for a single sample tests the null hypothesis that the mean of the sample is equal to the population mean. Specifically, you’re testing: - Null hypothesis (H₀): The sample mean of the newborn weight (or length) is equal to the population mean (3300 grams for weight, 500 mm for length). - Alternative hypothesis (H₁): The sample mean of the newborn weight (or length) is not equal to the population mean.

For t_test_weight, it’s possible to conclude that there is no significant difference between the sample mean newborn weight and the population one (p-value is about 0.13). So we can accept the null hypothesis.
For t_test_length, we can say that there is a significant difference between the two means due to a low p-value (2.2e-16). Here, we have to reject the null hypothesis.

#3- Anthropometric measurements are significantly different between the two sexes

# Conditional boxplots
# Mapping of Italian variable names to English labels
label <- c(
  "peso_neonato" = "Newborn Weight",
  "lunghezza_neonato" = "Newborn Length",
  "diametro_cranio" = "Head Circumference"
)

# List of variables to plot (in Italian)
variables_to_plot <- c("peso_neonato", "lunghezza_neonato", "diametro_cranio")

# Iterate over each variable and create the boxplot
for (var in variables_to_plot) {
  boxplot_formula <- as.formula(paste(var, "~ sesso_neonato"))
  boxplot(boxplot_formula, 
          main = paste(label[var], "by Newborn Sex"), 
          xlab = "Newborn Sex", 
          ylab = paste(label[var]))
}

# Perform independent t-tests
t_test_sex_weight <- t.test(peso_neonato ~ sesso_neonato)
t_test_sex_length <- t.test(lunghezza_neonato ~ sesso_neonato)
t_test_sex_skull <- t.test(diametro_cranio ~ sesso_neonato)

# Display the results
t_test_sex_weight

## 
##  Welch Two Sample t-test
## 
## data:  peso_neonato by sesso_neonato
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M 
##        3161.132        3408.215

t_test_sex_length

## 
##  Welch Two Sample t-test
## 
## data:  lunghezza_neonato by sesso_neonato
## t = -9.582, df = 2459.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -11.929470  -7.876273
## sample estimates:
## mean in group F mean in group M 
##        489.7643        499.6672

t_test_sex_skull

## 
##  Welch Two Sample t-test
## 
## data:  diametro_cranio by sesso_neonato
## t = -7.4102, df = 2491.4, p-value = 1.718e-13
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -6.089912 -3.541270
## sample estimates:
## mean in group F mean in group M 
##        337.6330        342.4486

For t_test_sex_weight, we reject the null hypothesis, then it can be stated, with a significance level of 1% (thus with a confidence level of 99%), that the weight of the newborn turns out to be significantly different for both sexes. Therefore, we accept the alternative hypothesis that the difference in the averages between group F and group M is not 0. It’s possible to reject the null hypotesis for t_test_sex_length and for t_test_sex_skull (p-value are very low), too and this can be stated with a significance level of 1% (thus with a confidence level of 99%). So, there is a significant difference in newborn length and newborn skull diameter for both sexes.

# CREATION OF THE REGRESSION MODEL
summary(data)

##    età_madre     numero_gravidanze madre_fumatrice  durata_gravidanza
##  Min.   : 0.00   Min.   : 0.0000   Min.   :0.0000   Min.   :25.00    
##  1st Qu.:25.00   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:38.00    
##  Median :28.00   Median : 1.0000   Median :0.0000   Median :39.00    
##  Mean   :28.16   Mean   : 0.9812   Mean   :0.0416   Mean   :38.98    
##  3rd Qu.:32.00   3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:40.00    
##  Max.   :46.00   Max.   :12.0000   Max.   :1.0000   Max.   :43.00    
##   peso_neonato  lunghezza_neonato diametro_cranio  tipo_parto       
##  Min.   : 830   Min.   :310.0     Min.   :235     Length:2500       
##  1st Qu.:2990   1st Qu.:480.0     1st Qu.:330     Class :character  
##  Median :3300   Median :500.0     Median :340     Mode  :character  
##  Mean   :3284   Mean   :494.7     Mean   :340                       
##  3rd Qu.:3620   3rd Qu.:510.0     3rd Qu.:350                       
##  Max.   :4930   Max.   :565.0     Max.   :390                       
##    ospedale         sesso_neonato     
##  Length:2500        Length:2500       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
##

n <- nrow(data)
moments::skewness(peso_neonato) #negative skewness

## [1] -0.6470308

moments::kurtosis(peso_neonato)-3 #leptokurtosis

## [1] 2.031532

shapiro.test(peso_neonato) #no Gaussian distribution

## 
##  Shapiro-Wilk normality test
## 
## data:  peso_neonato
## W = 0.97066, p-value < 2.2e-16

options(scipen=0)
full_mod <- lm(peso_neonato ~., data = data)
summary(full_mod)

## 
## Call:
## lm(formula = peso_neonato ~ ., data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1124.40  -181.66   -14.42   160.91  2611.89 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6738.4762   141.3087 -47.686  < 2e-16 ***
## età_madre             0.8921     1.1323   0.788   0.4308    
## numero_gravidanze    11.2665     4.6608   2.417   0.0157 *  
## madre_fumatrice     -30.1631    27.5386  -1.095   0.2735    
## durata_gravidanza    32.5696     3.8187   8.529  < 2e-16 ***
## lunghezza_neonato    10.2945     0.3007  34.236  < 2e-16 ***
## diametro_cranio      10.4707     0.4260  24.578  < 2e-16 ***
## tipo_partoNat        29.5254    12.0844   2.443   0.0146 *  
## ospedaleosp2        -11.2095    13.4379  -0.834   0.4043    
## ospedaleosp3         28.0958    13.4957   2.082   0.0375 *  
## sesso_neonatoM       77.5409    11.1776   6.937 5.08e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.9 on 2489 degrees of freedom
## Multiple R-squared:  0.7289, Adjusted R-squared:  0.7278 
## F-statistic: 669.2 on 10 and 2489 DF,  p-value: < 2.2e-16

Here, we have a linear regression model, in which we included all the explainatory variables, in order to predict the response variable (peso_neonato). We have several parameters in the summary output. These parameters provide insights into the model’s performance, coefficients, and statistical significance. Here’s a breakdown of the key components of the summary: - In Residuals -> Min, 1Q (first quartile), Median, 3Q (third quartile), Max: these represent the distribution of the residuals (the differences between the observed and predicted values). Ideally, residuals should be centered around 0 with a symmetric distribution. Large deviations could suggest model misfit; - Residual standard error -> This is the standard deviation of the residuals, which indicates the typical size of the error in your predictions. A smaller value is better; - Coefficients -> this section shows the estimated coefficients (betas) for each independent variable, along with their standard errors, t-values, and p-values; - Multiple R-squared -> this statistic shows the proportion of variance in the response variable (peso_neonato) that is explained by the independent variables in the model. It ranges from 0 to 1, with higher values indicating a better fit; - Adjusted R-squared -> this version of R-squared adjusts for the number of predictors in the model, providing a more accurate measure of model performance when comparing models with different numbers of predictors; - F-statistic -> it tests the overall significance of the model. It compares the model with no predictors (intercept-only model) against the model you’ve fitted. The associated p-value indicates whether the independent variables, as a whole, significantly predict the dependent variable (peso_neonato). A small p-value suggests that the model is statistically significant.

Let’s talk about the interpretation of the coefficients estimates in this model: età_madre, madre_fumatrice and ospedale show parameters that are not particularly significant, given their high p-values; while, numero_gravidanze, durata_gravidanza, lunghezza_neonato, diametro_cranio and sesso_neonato show parameters that are particularly significant, in order to predict the dipendent variable peso_neonato: - numero_gravidanze -> it is statistically significative with an alpha = 5% (but not with an alpha = 1%): as pregnancy increases, infant weight increases, on average, by 11.27 grams, ceteris paribus (holding steady the marginal effects of the other explanatory variables); - durata_gravidanza <- it is statistically significative with an alpha = 1%: as one week of gestation increases, the infant’s weight increases, on average, by about 32.57 grams, ceteris paribus; - lunghezza_neonato <- it is statistically significative with an alpha = 1%: as the infant’s length increases by one millimeter, the weight of the infant itself increases, on average, by about 10.29 grams, ceteris paribus; - diametro_cranio <- it is statistically significative with an alpha = 1%: as the infant’s skull diameter increases by one millimeter, the weight of the infant itself increases, on average, by about 10.47 grams, ceteris paribus; - tipo_parto <- it is statistically significative with an alpha = 5% (but not with an alpha = 1%): this parameter can be interpreted as the difference in average weight between infants who are born by natural childbirth versus those who are born by cesarean section: those born by natural childbirth will have, on average, a weight approximately 29.53 grams greater than those born by cesarean section, ceteris paribus; - ospedale <- it is statistically significative with an alpha = 5% (but not with an alpha = 1%), but only this category (ospedaleosp3), taking ospedaleosp1 as the baseline: infants born in hospital 3 have, on average, a 28.1 grams greater weight than those born in hospital 1, ceteris paribus; - sesso_neonato <- it is statistically significative with an alpha = 1%: male infants have, on average, 77.54 grams more weight than female infants, ceteris paribus. R^2 and adjusted R^2 (this is more relevant to evaluate the real contribution of a new variable in the model) are about 0.73: they are good values but a better model can be sought in order to increase the proportion of variance explained of the dependent variable by the explanatory variables.

shapiro.test(peso_neonato)

## 
##  Shapiro-Wilk normality test
## 
## data:  peso_neonato
## W = 0.97066, p-value < 2.2e-16

Through the rejection of the basic hypothesis of the Shapiro-Wilk normality test, it can be seen that this variable does not distribute normally, and this may suggest that the linear regression model may not be the best fit for these data. Need to investigate.

# SELECTION OF THE OPTIMAL MODEL
mod1 <- update(full_mod,~.-età_madre-madre_fumatrice-ospedale)
summary(mod1)

## 
## Call:
## lm(formula = peso_neonato ~ numero_gravidanze + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + tipo_parto + sesso_neonato, 
##     data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1129.31  -181.70   -16.31   161.07  2638.85 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6707.2971   135.9911 -49.322  < 2e-16 ***
## numero_gravidanze    12.7558     4.3366   2.941   0.0033 ** 
## durata_gravidanza    32.2713     3.7941   8.506  < 2e-16 ***
## lunghezza_neonato    10.2864     0.3007  34.207  < 2e-16 ***
## diametro_cranio      10.5057     0.4260  24.659  < 2e-16 ***
## tipo_partoNat        30.0342    12.0969   2.483   0.0131 *  
## sesso_neonatoM       77.9285    11.1905   6.964 4.22e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.3 on 2493 degrees of freedom
## Multiple R-squared:  0.7277, Adjusted R-squared:  0.727 
## F-statistic:  1110 on 6 and 2493 DF,  p-value: < 2.2e-16

In this model, we removed all those explanatory variables that were found to be not statistically significant in the full model: now the p-values of numero_gravidanze and tipo_parto have become lower, improving the significance of the parameters referring to these variables. Note, however, that despite the improvement in the significance of certain parameters, the adjusted R^2 and R^2 turn out to be even lower, but only slightly, than those presented in the full model. Let us explore other types of models by adding interaction effects and/or nonlinear effects.

mod2 <- update(mod1,~.+I(durata_gravidanza^2))
summary(mod2)

## 
## Call:
## lm(formula = peso_neonato ~ numero_gravidanze + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + tipo_parto + sesso_neonato + 
##     I(durata_gravidanza^2), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1127.83  -180.56   -16.62   162.80  2661.28 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -4713.3849   897.6462  -5.251 1.64e-07 ***
## numero_gravidanze         12.8408     4.3333   2.963  0.00307 ** 
## durata_gravidanza        -79.0197    49.6700  -1.591  0.11176    
## lunghezza_neonato         10.3888     0.3039  34.185  < 2e-16 ***
## diametro_cranio           10.6005     0.4278  24.780  < 2e-16 ***
## tipo_partoNat             29.5657    12.0889   2.446  0.01453 *  
## sesso_neonatoM            75.7674    11.2228   6.751 1.82e-11 ***
## I(durata_gravidanza^2)     1.4857     0.6612   2.247  0.02472 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.1 on 2492 degrees of freedom
## Multiple R-squared:  0.7282, Adjusted R-squared:  0.7275 
## F-statistic: 953.9 on 7 and 2492 DF,  p-value: < 2.2e-16

In this new model, having introduced the quadratic term I(durata_gravidanza^2), given the nonlinear relationship between this variable and peso_neonato, seems to have had only the effect of worsening the significance of durata_gravidanza, where in contrast the significance of the other parameters appears to be fairly unchanged. It was decided to include such a term because of the curvature in the relationship between peso_neonato and durata_gravidanza but, at equal significance, it is better to prefer more parsimonious models (since even that adjusted R^2 and R^2 remained about the same).

options(scipen = 999)
mod3 <- update(mod1,~.+lunghezza_neonato*diametro_cranio)
summary(mod3)

## 
## Call:
## lm(formula = peso_neonato ~ numero_gravidanze + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + tipo_parto + sesso_neonato + 
##     lunghezza_neonato:diametro_cranio, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1131.58  -179.13   -14.04   162.30  2862.49 
## 
## Coefficients:
##                                       Estimate   Std. Error t value
## (Intercept)                       -1887.920116  1017.455360  -1.856
## numero_gravidanze                    13.209788     4.318793   3.059
## durata_gravidanza                    37.966985     3.961134   9.585
## lunghezza_neonato                    -0.135367     2.201113  -0.061
## diametro_cranio                      -4.598135     3.188657  -1.442
## tipo_partoNat                        28.547927    12.048292   2.369
## sesso_neonatoM                       73.267796    11.184446   6.551
## lunghezza_neonato:diametro_cranio     0.031180     0.006524   4.779
##                                               Pr(>|t|)    
## (Intercept)                                    0.06364 .  
## numero_gravidanze                              0.00225 ** 
## durata_gravidanza                 < 0.0000000000000002 ***
## lunghezza_neonato                              0.95097    
## diametro_cranio                                0.14942    
## tipo_partoNat                                  0.01789 *  
## sesso_neonatoM                         0.0000000000693 ***
## lunghezza_neonato:diametro_cranio      0.0000018622221 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 273.1 on 2492 degrees of freedom
## Multiple R-squared:  0.7301, Adjusted R-squared:  0.7294 
## F-statistic: 963.2 on 7 and 2492 DF,  p-value: < 0.00000000000000022

Here, it was decided to include an interaction term between lunghezza_neonato and diametro_cranio (two anthropometric measures closely related to peso_neonato): in this case, the introduction of such a term had the effect of worsening the significance of the parameters of lunghezza_neonato and diametro_cranio taken individually. This suggests that the introduction of an interaction term turns out to be unnecessary.

options(scipen = 0)
mod4 <- update(mod1,~.-tipo_parto)
summary(mod4)

## 
## Call:
## lm(formula = peso_neonato ~ numero_gravidanze + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + sesso_neonato, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.44  -180.81   -15.58   163.64  2639.72 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6681.1445   135.7229 -49.226  < 2e-16 ***
## numero_gravidanze    12.4750     4.3396   2.875  0.00408 ** 
## durata_gravidanza    32.3321     3.7980   8.513  < 2e-16 ***
## lunghezza_neonato    10.2486     0.3006  34.090  < 2e-16 ***
## diametro_cranio      10.5402     0.4262  24.728  < 2e-16 ***
## sesso_neonatoM       77.9927    11.2021   6.962 4.26e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.6 on 2494 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1328 on 5 and 2494 DF,  p-value: < 2.2e-16

Here, it was decided to eliminate the variable tipo_parto, which had a significant parameter only at a significance level of 5% (but not 1%): the p-values of the model parameters tended to remain the same (except for some slight increase). The proportions of variance explained are slightly lower than those presented in mod1, but they do not change much.

BIC(mod1, mod2, mod3, mod4)

AIC(mod1, mod2, mod3, mod4)

stepwise_mod <- MASS::stepAIC(full_mod,
                              direction = "both",
                              k=log(n))

## Start:  AIC=28139.32
## peso_neonato ~ età_madre + numero_gravidanze + madre_fumatrice + 
##     durata_gravidanza + lunghezza_neonato + diametro_cranio + 
##     tipo_parto + ospedale + sesso_neonato
## 
##                     Df Sum of Sq       RSS   AIC
## - età_madre          1     46578 186809099 28132
## - madre_fumatrice    1     90019 186852540 28133
## - ospedale           2    685979 187448501 28133
## - numero_gravidanze  1    438452 187200974 28137
## - tipo_parto         1    447929 187210450 28138
## <none>                           186762521 28139
## - sesso_neonato      1   3611021 190373542 28179
## - durata_gravidanza  1   5458403 192220925 28204
## - diametro_cranio    1  45326172 232088693 28675
## - lunghezza_neonato  1  87951062 274713583 29096
## 
## Step:  AIC=28132.12
## peso_neonato ~ numero_gravidanze + madre_fumatrice + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + tipo_parto + ospedale + 
##     sesso_neonato
## 
##                     Df Sum of Sq       RSS   AIC
## - madre_fumatrice    1     90897 186899996 28126
## - ospedale           2    692738 187501837 28126
## - tipo_parto         1    448222 187257321 28130
## <none>                           186809099 28132
## - numero_gravidanze  1    633756 187442855 28133
## + età_madre          1     46578 186762521 28139
## - sesso_neonato      1   3618736 190427835 28172
## - durata_gravidanza  1   5412879 192221978 28196
## - diametro_cranio    1  45588236 232397335 28670
## - lunghezza_neonato  1  87950050 274759149 29089
## 
## Step:  AIC=28125.51
## peso_neonato ~ numero_gravidanze + durata_gravidanza + lunghezza_neonato + 
##     diametro_cranio + tipo_parto + ospedale + sesso_neonato
## 
##                     Df Sum of Sq       RSS   AIC
## - ospedale           2    701680 187601677 28119
## - tipo_parto         1    440684 187340680 28124
## <none>                           186899996 28126
## - numero_gravidanze  1    610840 187510837 28126
## + madre_fumatrice    1     90897 186809099 28132
## + età_madre          1     47456 186852540 28133
## - sesso_neonato      1   3602797 190502794 28165
## - durata_gravidanza  1   5346781 192246777 28188
## - diametro_cranio    1  45632149 232532146 28664
## - lunghezza_neonato  1  88355030 275255027 29086
## 
## Step:  AIC=28119.23
## peso_neonato ~ numero_gravidanze + durata_gravidanza + lunghezza_neonato + 
##     diametro_cranio + tipo_parto + sesso_neonato
## 
##                     Df Sum of Sq       RSS   AIC
## - tipo_parto         1    463870 188065546 28118
## <none>                           187601677 28119
## - numero_gravidanze  1    651066 188252743 28120
## + ospedale           2    701680 186899996 28126
## + madre_fumatrice    1     99840 187501837 28126
## + età_madre          1     54392 187547285 28126
## - sesso_neonato      1   3649259 191250936 28160
## - durata_gravidanza  1   5444109 193045786 28183
## - diametro_cranio    1  45758101 233359778 28657
## - lunghezza_neonato  1  88054432 275656108 29074
## 
## Step:  AIC=28117.58
## peso_neonato ~ numero_gravidanze + durata_gravidanza + lunghezza_neonato + 
##     diametro_cranio + sesso_neonato
## 
##                     Df Sum of Sq       RSS   AIC
## <none>                           188065546 28118
## - numero_gravidanze  1    623141 188688687 28118
## + tipo_parto         1    463870 187601677 28119
## + ospedale           2    724866 187340680 28124
## + madre_fumatrice    1     91892 187973654 28124
## + età_madre          1     54816 188010731 28125
## - sesso_neonato      1   3655292 191720838 28158
## - durata_gravidanza  1   5464853 193530399 28181
## - diametro_cranio    1  46108583 234174130 28658
## - lunghezza_neonato  1  87632762 275698308 29066

summary(stepwise_mod)

## 
## Call:
## lm(formula = peso_neonato ~ numero_gravidanze + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + sesso_neonato, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.44  -180.81   -15.58   163.64  2639.72 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6681.1445   135.7229 -49.226  < 2e-16 ***
## numero_gravidanze    12.4750     4.3396   2.875  0.00408 ** 
## durata_gravidanza    32.3321     3.7980   8.513  < 2e-16 ***
## lunghezza_neonato    10.2486     0.3006  34.090  < 2e-16 ***
## diametro_cranio      10.5402     0.4262  24.728  < 2e-16 ***
## sesso_neonatoM       77.9927    11.2021   6.962 4.26e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.6 on 2494 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1328 on 5 and 2494 DF,  p-value: < 2.2e-16

BIC(mod4, stepwise_mod)

car::vif(mod4) #no multicollinearity between explainatory variables (VIF < 5)

## numero_gravidanze durata_gravidanza lunghezza_neonato   diametro_cranio 
##          1.023475          1.669189          2.074689          1.624465 
##     sesso_neonato 
##          1.040054

The stepwise method (stepAIC) shows that mod4 turns out to be the best model (but AIC and BIC show that mod3 is the best model), and the one that best combines significance of the parameters in the model and parsimony, with the same proportion of variance explained compared to the other models (although mod1 has a slightly greater adjusted R^2 than mod4).

# MODEL QUALITY ANALYSIS
#It is also necessary to investigate the erratic part of the model: assess
#whether the assumptions of normality, homoschedasticity, and uncorrelation 
#are met.
library(lmtest)

shapiro.test(residuals(mod4))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(mod4)
## W = 0.97408, p-value < 2.2e-16

#we reject the null hypothesis that the model residuals distribute according 
#to a normal distribution

bptest(mod4)

## 
##  studentized Breusch-Pagan test
## 
## data:  mod4
## BP = 90.253, df = 5, p-value < 2.2e-16

#we reject the null hypothesis of homoschedasticity (constant variance) of 
#the residuals

dwtest(mod4)

## 
##  Durbin-Watson test
## 
## data:  mod4
## DW = 1.9535, p-value = 0.1224
## alternative hypothesis: true autocorrelation is greater than 0

#on the other hand, the null hypothesis of no autocorrelation between residuals #is not rejected

#Basically, already the response variable does not distribute according to a 
#normal distribution
shapiro.test(peso_neonato)

## 
##  Shapiro-Wilk normality test
## 
## data:  peso_neonato
## W = 0.97066, p-value < 2.2e-16

moments::skewness(peso_neonato)

## [1] -0.6470308

plot(density(peso_neonato))

#It is necessary to find another type of model that fits the data better
glm_mod <- glm(peso_neonato ~ numero_gravidanze + durata_gravidanza + lunghezza_neonato + diametro_cranio + sesso_neonato, 
               data = data, family = gaussian(link = "identity"))
summary(glm_mod)

## 
## Call:
## glm(formula = peso_neonato ~ numero_gravidanze + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + sesso_neonato, family = gaussian(link = "identity"), 
##     data = data)
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6681.1445   135.7229 -49.226  < 2e-16 ***
## numero_gravidanze    12.4750     4.3396   2.875  0.00408 ** 
## durata_gravidanza    32.3321     3.7980   8.513  < 2e-16 ***
## lunghezza_neonato    10.2486     0.3006  34.090  < 2e-16 ***
## diametro_cranio      10.5402     0.4262  24.728  < 2e-16 ***
## sesso_neonatoM       77.9927    11.2021   6.962 4.26e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 75407.2)
## 
##     Null deviance: 688888542  on 2499  degrees of freedom
## Residual deviance: 188065546  on 2494  degrees of freedom
## AIC: 35179
## 
## Number of Fisher Scoring iterations: 2

library(sandwich)
coeftest(glm_mod, vcov = vcovHC(glm_mod, type = "HC3"))

## 
## z test of coefficients:
## 
##                      Estimate  Std. Error  z value  Pr(>|z|)    
## (Intercept)       -6681.14450   167.67708 -39.8453 < 2.2e-16 ***
## numero_gravidanze    12.47497     4.63383   2.6921  0.007099 ** 
## durata_gravidanza    32.33206     5.26581   6.1400 8.252e-10 ***
## lunghezza_neonato    10.24857     0.73587  13.9271 < 2.2e-16 ***
## diametro_cranio      10.54020     0.80312  13.1241 < 2.2e-16 ***
## sesso_neonatoM       77.99271    11.08633   7.0350 1.992e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

If the residuals show heteroschedasticity (nonconstant variance), the standard errors of the LM or GLM model without correction are underestimated or overestimated, leading to misinterpretations. The use of vcovHC() corrects this problem, making inferences more reliable. Although the homoschedasticity test (e.g., Breusch-Pagan) indicated a violation, the structure of the LM model was already robust enough not to be affected by this violation. Heteroschedasticity primarily impacts the standard errors and thus the significance of the coefficients. If the p-values do not change after correction, it means that the bias was minimal. The original LM model found with stepwise was already optimal in terms of variable selection and data explanation. As for the violation of the normality assumption of the residuals, it can be seen that the distribution of mod4 turns out to be approximately normal, has a slight negative skewness (about -0.65) and a leptokurtic shape (about 2.03).

#Returning to the optimal model found via stepwise, which coincides with mod4:
summary(mod4)

## 
## Call:
## lm(formula = peso_neonato ~ numero_gravidanze + durata_gravidanza + 
##     lunghezza_neonato + diametro_cranio + sesso_neonato, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.44  -180.81   -15.58   163.64  2639.72 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -6681.1445   135.7229 -49.226  < 2e-16 ***
## numero_gravidanze    12.4750     4.3396   2.875  0.00408 ** 
## durata_gravidanza    32.3321     3.7980   8.513  < 2e-16 ***
## lunghezza_neonato    10.2486     0.3006  34.090  < 2e-16 ***
## diametro_cranio      10.5402     0.4262  24.728  < 2e-16 ***
## sesso_neonatoM       77.9927    11.2021   6.962 4.26e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.6 on 2494 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1328 on 5 and 2494 DF,  p-value: < 2.2e-16

In this case, we have an R^2 of 0.727 and an adjusted R^2 of 0.7265 : the model exhibits a good degree of fit to the data, and is at values that we also find for other models (although less optimal than this one).

#Now, we move on to investigate graphically the residuals of mod4:
par(mfrow=c(2,2))
plot(mod4)

- The graph of residuals vs fitted values allows us to assess whether there are trends in the distribution of the residuals themselves or whether there is a constant variability: the dots should be scattered randomly around the mean of 0. A slightly curved pattern seems to present itself, and it appears that the information was not filtered well by the regressors and that this was spilled over to the residuals (note the rejection of Breusch-Pagan Test, but we saw that the bias was minimal, given the correction made for the variance); - The Q-Q plot of standardized residuals allows us to check whether the errors are normally distributed: in this case, it seems that the dots concentrate well, except for some dots in the tails (despite of the rejection of the previously Shapiro Test); - The scale-location plot is used to determine whether the distribution of residuals is constant over the entire range of expected values and is useful in detecting outliers. This graph should not display patterns in the points but only in a horizontal line around a value of y that will indicate a constant variance: there is a fairly sharp concentration of points; - The graph residuals vs. leverage (standardized residuals vs. leverage points) is intended to highlight whether there are any high leverage data that might affect the model estimate. In the same graph, we find Cook’s distance, which is another measure of the importance of each observation to the regression. Cook’s distance is defined as a standardized measure of the distance between the OLS estimates and the same vector obtained by removal of the i-th observation. Small distances mean that removal of that observation has little effect on the regression results. Distances greater than 1 are suspicious and suggest the presence of a possible high-leverage or poor model: In this case, observation 1551 is between the thresholds of 0.5 (warning) and 1 (alarm), thus potentially influential.

#Cook's distance
cook <- cooks.distance(mod4)
plot(cook)

max(cook)

## [1] 0.8300965

We can see the observation 1551 in this graph.

FORECASTS AND RESULTS

prediction <- predict(mod4, new_data = data.frame(numero_gravidanze = 3, 
                                                 durata_gravidanza = 39, 
                                                 sesso_neonato = 'F'))

prediction is the predicted value generated by the best model (mod4) for a mother with 3 pregnancies, who will give birth at 39 weeks of gestation, and has a female baby. prediction will contain a single prediction based on the input characteristics (3 pregnancies, 39 weeks of gestation, female baby). If the model was trained to predict, for example, the weight of the newborn, then prediction will return the predicted weight of the newborn for this combination of features.

VISUALIZATIONS

# Predict the baby's weight based on the model (mod4)
data$pred_peso_neonato <- round(predict(mod4, data), 2)
attach(data)

# Create a mapping of Italian variable names to English labels for x-axis variables
label_mapping_3 <- c(
  "numero_gravidanze" = "Number of Pregnancies",
  "durata_gravidanza" = "Pregnancy Duration",
  "lunghezza_neonato" = "Newborn Length",
  "diametro_cranio" = "Head Diameter"
)

# Define a function that generates scatter plots for a given x variable
plot_scatter_2 <- function(x_var_3) {
  ggplot(data, aes_string(x = x_var_3, y = "pred_peso_neonato")) +
    geom_point(alpha = 0.3, color = "blue", size = 1) +  
    geom_smooth(method = "lm", se = FALSE, color = "red", linewidth = 1) +  
    labs(
      title = paste("Impact of", label_mapping_3[x_var_3], "on Predicted Newborn Weight"),
      x = label_mapping_3[x_var_3],
      y = "Predicted Newborn Weight (grams)"
    ) +
    theme_minimal() +
    theme(
      plot.title = element_text(hjust = 0.5),
      axis.title = element_text(size = 12),
      axis.text = element_text(size = 10)
    )
}

# List of variables for the x-axis
x_vars_3 <- c("numero_gravidanze", "durata_gravidanza", "lunghezza_neonato", 
            "diametro_cranio")

# Iterate over the list of x-axis variables and print the plots
for (x_var_3 in x_vars_3) {
  print(plot_scatter_2(x_var_3))
}

Here, we have various scatter plots that examine the relationships between predicted newborn weight (pred_peso_neonato) and four different predictor variables: Number of Pregnancies (numero_gravidanze), Pregnancy Duration (durata_gravidanza), Newborn Length (lunghezza_neonato), and Head Diameter (diametro_cranio).

# Box plot showing the relationship between the baby's sex and predicted baby weight
pred_boxplot <- ggplot(data, aes(x = sesso_neonato, y = pred_peso_neonato)) +
  geom_boxplot(fill = 'gray90') +  
  labs(
    title = "Impact of Newborn's Sex on Predicted Newborn Weight",
    x = "Newborn's Sex",
    y = "Predicted Newborn Weight (grams)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

pred_boxplot

This boxplot shows the relationship between newborn sex (sesso_neonato) and predicted newborn weight (pred_peso_neonato).

library(plotly)

# Create a mapping of Italian variable names to English labels for x and y axes
label_mapping_4 <- c(
  "numero_gravidanze" = "Number of Pregnancies",
  "durata_gravidanza" = "Pregnancy Duration (weeks)",
  "lunghezza_neonato" = "Newborn Length (mm)",
  "diametro_cranio" = "Head Diameter (mm)"
)

# Create a mapping of variable combinations for x and y
variable_combinations <- list(
  c("numero_gravidanze", "durata_gravidanza", "Impact of Number of Pregnancies and Pregnancy Duration on Predicted Newborn Weight"),
  c("lunghezza_neonato", "diametro_cranio", "Impact of Newborn Length and Cranial Diameter on Predicted Newborn Weight")
)

# Define a function to generate 3D scatter plots
plot_3d_scatter_1 <- function(x_var, y_var, title) {
  plot_ly(data, 
          x = ~get(x_var),  
          y = ~get(y_var),  
          z = ~pred_peso_neonato,  
          type = 'scatter3d',      
          mode = 'markers',        
          marker = list(
            size = 5,              
            color = ~pred_peso_neonato,  
            colorscale = 'Viridis',  
            showscale = TRUE        
          ),
          text = ~paste(x_var, ": ", get(x_var),   
                        "<br>", y_var, ": ", get(y_var), 
                        "<br>pred_peso_neonato: ", pred_peso_neonato),  
          hoverinfo = 'text'  
  ) %>%
    layout(
      title = title,  
      scene = list(
        xaxis = list(title = label_mapping_4[x_var]),
        yaxis = list(title = label_mapping_4[y_var]),
        zaxis = list(title = 'Predicted Newborn Weight (grams)'),  
        bgcolor = 'rgba(240, 240, 240, 0.95)',  
        gridcolor = 'rgba(200, 200, 200, 0.5)'  
      ),
      plot_bgcolor = 'rgba(255, 255, 255, 0.95)'  
    )
}

# Iterate over the variable combinations and generate each plot separately
plotly_1 <- plot_3d_scatter_1(variable_combinations[[1]][1], variable_combinations[[1]][2], variable_combinations[[1]][3])
plotly_2 <- plot_3d_scatter_1(variable_combinations[[2]][1], variable_combinations[[2]][2], variable_combinations[[2]][3])

# Show the results
plotly_1

plotly_2

First plot: Shows the relationship between number of pregnancies and pregnancy duration, and how they impact the predicted newborn weight;
Second plot: Displays the relationship between newborn length and cranial diameter, and their effect on predicted newborn weight.

Statistica_Inferenziale_Project

Lorenzo Canobbio

2025-03-05

ANALISYS AND MODELING

FORECASTS AND RESULTS

VISUALIZATIONS