Statistical Model for Predicting Newborn Weight

1. Dateset Structure

#import dataset
setwd("D:/Users/Anton/Desktop/ProfessionAI/03-Statistica inferenziale/Progetto")
data=read.csv("neonati.csv",
              sep=",",
              stringsAsFactors = T)

# Convert Smokers binary column into a factor variable with 2 levels
data$Fumatrici <- factor(data$Fumatrici, levels = c(0, 1), labels = c("NO", "SI"))

# Caricamento delle librerie necessarie
library(knitr)
library(kableExtra)

# First row df
df = data.frame(
  Anni.madre = data[1:5,1],
  N.gravidanze = data[1:5,2],
  Fumatrici = data[1:5,3],
  Gestazione = data[1:5,4],
  Peso = data[1:5,5],
  Lunghezza = data[1:5,6],
  Cranio = data[1:5,7],
  Tipo.parto = data[1:5,8],
  Ospedale = data[1:5,9],
  Sesso=data[1:5,10]
)

kable(df, format = "html", row.names = T,align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

	Anni.madre	N.gravidanze	Fumatrici	Gestazione	Peso	Lunghezza	Cranio	Tipo.parto	Ospedale	Sesso
1	26	0	NO	42	3380	490	325	Nat	osp3	M
2	21	2	NO	39	3150	490	345	Nat	osp1	F
3	34	3	NO	38	3640	500	375	Nat	osp2	M
4	28	1	NO	41	3690	515	365	Nat	osp2	M
5	20	0	NO	38	3700	480	335	Nat	osp3	F

The dataset contains the following variables:

Anni madre: mother’s age (Quantitative discrete variable) (Qualitative variable on a nominal scale)
N.gravidanze: number of pregnancies (Quantitative discrete variable)
Fumatrici: smokers (Qualitative variable, coded with numbers 1 Yes and 0 No)
Gestazione: total number pregancie week (Quantitative discrete variable)
Peso: neonatal weight expressed in grams (Quantitative discrete variable)
Lunghezza: neonatal length expressed in millimeters (Quantitative continuous variable)
Cranio: neonatal skull diameter expressed in millimeters (Quantitative discrete variable)
Tipo.parto: type of birth (Qualitative variable)
Ospedale: hospital where the birth occurred (Qualitative variable)
Sesso: sex (Qualitative variable)

2. Preliminary data analysis

Summarize the data

n=nrow(data)

#df numeric
df_num=data[,sapply(data,is.numeric)]
summary_dfnum = summary(df_num)

#df factor
df_factor=data[,sapply(data,is.factor)]
summary_dffact = summary(df_factor)
summary_dffact[is.na(summary_dffact)] ="" 


kable(summary_dfnum, format = "html",align = "c", caption = "Numeric Summary Table") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Numeric Summary Table
Anni.madre	N.gravidanze	Gestazione	Peso	Lunghezza	Cranio
Min. : 0.00	Min. : 0.0000	Min. :25.00	Min. : 830	Min. :310.0	Min. :235
1st Qu.:25.00	1st Qu.: 0.0000	1st Qu.:38.00	1st Qu.:2990	1st Qu.:480.0	1st Qu.:330
Median :28.00	Median : 1.0000	Median :39.00	Median :3300	Median :500.0	Median :340
Mean :28.16	Mean : 0.9812	Mean :38.98	Mean :3284	Mean :494.7	Mean :340
3rd Qu.:32.00	3rd Qu.: 1.0000	3rd Qu.:40.00	3rd Qu.:3620	3rd Qu.:510.0	3rd Qu.:350
Max. :46.00	Max. :12.0000	Max. :43.00	Max. :4930	Max. :565.0	Max. :390

kable(summary_dffact, format = "html",align = "c", caption = "Factor Summary Table") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Factor Summary Table
Fumatrici	Tipo.parto	Ospedale	Sesso
NO:2396	Ces: 728	osp1:816	F:1256
SI: 104	Nat:1772	osp2:849	M:1244
		osp3:835

Looking at the numeric summary table, immediately jumps out the minimum value of the variable Anni.madre. It is for sure a data entry error.

Filtering the data will show if it is an isolated value or if there are other anomalous observation.

#Load library
library(dplyr)
#Filter anni.madre <15
data_filt=data %>% filter(data$Anni.madre<15)
#sort by anni.madre
data_filt=data_filt[order(data_filt$Anni.madre), ]

kable(data_filt$Anni.madre, format = "html", col.names = "Anni.madre", align="c")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Anni.madre
0
1
13
14
14

The are two year values (0 and 1) that for sure are not possible. This observation has to be removed.

#Filter anni.madre >10
data=data %>% filter(data$Anni.madre>10)

2.1 Hospital and birth type relationship

In order to investigate if there are hospitals in which are preferred cesarean birth, it is possibile to perform a Chi-squared test.

attach(data)
library(kableExtra)
library(knitr)

#Contingency table
contingency_birthtype=table(Ospedale, Tipo.parto)

# Chi-squared Test
chi_sq_test_birthtype <- chisq.test(contingency_birthtype)

# Results dataframe
results_birthtype = data.frame(
  Statistic = chi_sq_test_birthtype$statistic,
  Degrees_of_Freedom = chi_sq_test_birthtype$parameter,
  P_Value = chi_sq_test_birthtype$p.value
)

kable(results_birthtype, format = "html", align="c", caption = "Birthtype Chi-squared Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Birthtype Chi-squared Test Result
	Statistic	Degrees_of_Freedom	P_Value
X-squared	1.082984	2	0.5818793

chisq_2grades = rchisq(1000000, 2)

library(ggplot2)
ggplot(data.frame(x = density(chisq_2grades)$x, y = density(chisq_2grades)$y),
       aes(x = x, y = y)) +
  geom_line() +
  geom_vline(xintercept = qchisq(0.99, 2), color = "red") +
  geom_point(aes(x = chi_sq_test_birthtype$statistic, y = 0), color = "blue", size = 3) +
  labs(x = "Value", y = "Density", title = "Chi-square distribution (2 df) with test statistic") +
  scale_y_continuous(breaks = seq(0, 0.5, by = 0.1), limits = c(0, 0.5)) +
  scale_x_continuous(breaks=seq(0,30, by=5), limits=c(-1,30))+
  theme_minimal()+
    theme(plot.title = element_text(hjust = 0.5, size = 14),  # Center Title
        axis.title.x = element_text(size = 12),  # axis name size
        axis.title.y = element_text(size = 12))

The test returns a very high p-value, therefore the null hypothesis is not rejected. There aren’t hospital in which cesarean birth is more frequent. Also graphically it is possible to notice that the test statistic is very far from the rejection zone.

2.2 Mean weight and length

Evaluate whether the average weight and length of the sample are significantly equal to those of the population. From a quick search on google,it is found that \(\mu_{weight} = 3300\;g\) and \(\mu_{length} = 500\;mm\). To evaluate the hypothesis it is possible to perform a t-test.

#population mean weight
mu_weigth=3300
#population mean length
mu_length=500

#weigth t.test
weigth_test=t.test(Peso,
                   mu=mu_weigth,
                   conf.level=0.95,
                   alternative="two.sided")

# Weigth result dataframe
weigth_test_results = data.frame(
  Statistic = weigth_test$statistic,
  Degrees_of_Freedom = weigth_test$parameter,
  P_Value = weigth_test$p.value,
  `95%_CI_Lower` = weigth_test$conf.int[1],
  `95%_CI_Upper` = weigth_test$conf.int[2],
  Mean = weigth_test$estimate
)

kable(weigth_test_results, format = "html", align="c", caption = "Weight T Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Weight T Test Result
	Statistic	Degrees_of_Freedom	P_Value	X95._CI_Lower	X95._CI_Upper	Mean
t	-1.505011	2497	0.1324476	3263.577	3304.791	3284.184

#length t.test
length_test=t.test(Lunghezza,
                   mu=mu_length,
                   conf.level=0.95,
                   alternative="two.sided")

# Weigth result dataframe
length_test_results = data.frame(
  Statistic = length_test$statistic,
  Degrees_of_Freedom = length_test$parameter,
  P_Value = formatC(length_test$p.value, format = "e", digits = 2),
  `95%_CI_Lower` = length_test$conf.int[1],
  `95%_CI_Upper` = length_test$conf.int[2],
  Mean = length_test$estimate
)

kable(length_test_results, format = "html",align="c", caption = "Length T Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Length T Test Result
	Statistic	Degrees_of_Freedom	P_Value	X95._CI_Lower	X95._CI_Upper	Mean
t	-10.06903	2497	2.10e-23	493.6628	495.7287	494.6958

weight: Since the p-value is greater than 0.05, we cannot reject the null hypothesis. This means that we do not have sufficient evidence to conclude that the average birth weight in the sample is significantly different from 3300 grams. In addition, the confidence interval includes the value 3300, which further reinforces the idea that there is no significant difference between the sample mean and the hypothesized population mean.
length: Since the p-value is extremely low, we can reject the null hypothesis. This means that we have sufficient evidence to conclude that the average birth length in the sample is significantly different from 500 millimeters. The confidence interval does not include the value 500, which further strengthens the evidence against the null hypothesis. The average length is significantly lower than the population mean.

2.3 Difference in anthropometric measurements of the two sexes

To evaluate any difference in anthropometric measurements between the two sexes it is possible to perform a statistical test. First, we have to check if the variables weight, length and skull diameter respect the hypotheses of normal distribution and equality of variance between groups. To verify that, we can perform a Shapiro-Wilk test (normality) and a Levene test (variance equality).

library(car)
library(knitr)
library(kableExtra)

# Normality Test
shapiro_weight_M=shapiro.test(data$Peso[data$Sesso == "M"])
shapiro_weight_F=shapiro.test(data$Peso[data$Sesso == "F"])

shapiro_length_M=shapiro.test(data$Lunghezza[data$Sesso == "M"])
shapiro_length_F=shapiro.test(data$Lunghezza[data$Sesso == "F"])

shapiro_skull_M=shapiro.test(data$Cranio[data$Sesso == "M"])
shapiro_skull_F=shapiro.test(data$Cranio[data$Sesso == "F"])

# Variance Equality Test
levene_weight=leveneTest(Peso ~ Sesso, data = data)
levene_length=leveneTest(Lunghezza ~ Sesso, data = data)
levene_skull=leveneTest(Cranio ~ Sesso, data = data)

# normality dataframe
df_normality=data.frame(
  Variable = c("weight", "length","skull diameter"),
  Male = c(shapiro_weight_M$p.value, shapiro_length_M$p.value, shapiro_skull_M$p.value),
  Female =c(shapiro_weight_F$p.value, shapiro_length_F$p.value, shapiro_skull_F$p.value))

df_normality$Male = formatC(df_normality$Male, format = "e", digits = 2)
df_normality$Female = formatC(df_normality$Female, format = "e", digits = 2)
  
# levene dataframe
df_levene=data.frame(
  Variable = c("weight", "length","skull diameter"),
  P_Value = c(levene_weight$`Pr(>F)`[1], levene_length$`Pr(>F)`[1], levene_skull$`Pr(>F)`[1]))

# Shapiro test table
kable(df_normality, format = "html", align="c",caption = "Shapiro-Wilk Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "5cm") %>%
  column_spec(3, width = "5cm")

Shapiro-Wilk Test
Variable	Male	Female
weight	2.23e-16	2.35e-17
length	4.22e-25	6.87e-28
skull diameter	2.90e-15	4.29e-19

# Levene test table
kable(df_levene, format = "html", align="c",caption = "Levene Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "5cm")

Levene Test
Variable	P_Value
weight	0.3646375
length	0.0011640
skull diameter	0.2714821

All the variables, in both groups, show a p value extremely lower than 0.05 that means the hypothesis of normal distribution is not respected. To compare the averages of anthropometric measurements between the two sexes, it is more appropriate to use a non-parametric test, such as the Wilcoxon Test.

# Wilcoxon test for weigth
wilcox_weight=wilcox.test(Peso ~ Sesso, data = data)

# Wilcoxon test for length
wilcox_length=wilcox.test(Lunghezza ~ Sesso, data = data)

# Wilcoxon test for skull diameter
wilcox_skull=wilcox.test(Cranio ~ Sesso, data = data)

# Wilcoxon dataframe
df_wilcox=data.frame(
  Variable = c("weight", "length","skull diameter"),
  P_Value = c(wilcox_weight$p.value, wilcox_length$p.value, wilcox_skull$p.value))

df_wilcox$P_Value = formatC(df_wilcox$P_Value, format = "e", digits = 2)

# Wilcoxon test table
kable(df_wilcox, format = "html", align="c",caption = "Wilcoxon Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "5cm")

Wilcoxon Test
Variable	P_Value
weight	2.91e-41
length	2.66e-25
skull diameter	7.14e-15

The result of the Wilcoxon test for all variables (weight, length and skull diameter) indicates that there is a significant difference between the sexes (p-value extremely lower than 0.05). The observed difference in weight, length and skull diameter between males and females is statistically significant.

3. Regression Model

Analyze the relationship between variables

First of all, evaluate the normality of the response variable, as any non-normality of the response variable could affect the residuals.

library(moments)
library(knitr)
library(kableExtra)

# shape index dataframe
df_shape=data.frame(
  Index = c("Skewness", "Kurtosis"),
  Value = c(round(skewness(Peso),2), round((kurtosis(Peso)-3),2)))
  
# shape index table
kable(df_shape, format = "html", align="c",caption = "Weight Shape Index") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "3cm")

Weight Shape Index
Index	Value
Skewness	-0.65
Kurtosis	2.03

The variable weight shows a negative asymmetry (\(\gamma_{1weight}<0\)), and is leptokurtic (\(\gamma_{2weight}>0\)).

# weight test  Shapiro-Wilk
shapiro_test_peso <- shapiro.test(Peso)

# Creare un data frame con i risultati del test di Shapiro-Wilk
shapiro_peso_df <- data.frame(
  Statistic = shapiro_test_peso$statistic,
  P_Value = formatC(shapiro_test_peso$p.value, format = "e", digits = 2),
  Test = "Shapiro-Wilk",
  Data = "Peso"
)

kable(shapiro_peso_df, format = "html", align="c",caption = "Weight Shapiro-Wilk Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Weight Shapiro-Wilk Test Result
	Statistic	P_Value	Test	Data
W	0.9706841	3.38e-22	Shapiro-Wilk	Peso

With an extremely low p value, the null hypothesis is rejected. The weight variable is not normal. This non-normality of the response variable could have repercussions on the residuals of the model. This will be verified later during the residue analysis phase.

Before creating the model, it is important to investigate the relationship between the available variables.

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor)
}
pairs(data, lower.panel = panel.cor, upper.panel = panel.smooth)

length: The linear correlation coefficient between weight and length is very high and positive (0.80), therefore weight is very positively correlated to length. The scatterplot also shows the presence of an elongated point cloud with a positive slope.
skull diameter: Skull diameter also has a positive and high linear correlation coefficient. The scatterplot appears to show a non-linearity for the lower values of this variable. Not to be understimated the presence of correlation coefficients of 0.60 between skull diameter and length. It is not a very high value, but in any case it is worth keeping an eye on and subsequently checking that there are no multicollinearity problems.
gestation: gestation has a relatively high and positive correlation coefficient with weight. The scatterplot shows the presence of a non linear effect at the lower value of this variable. To keep attention to the correlation coefficient between this variable and length (0.62).
sex: : with a coefficient of 0.24, the sex variable shows an effect on the response variable. As seen previously with the Wilcoxon test, there are significant differences in weight between the two sexes.

The remaining variables (type of birth, hospital, smokers, number of pregnancies and mother years), since their coefficients are extremely low, do not show correlation with the response variable.

In the first instance it is possible to create a model containing all the variables present in the dataset and subsequently update the model by removing non-significant variables or adding any non-linear or interaction effects between variables.

# Formatting Pr(>|t|) value function
format_p_value = function(p_value) {
  if (p_value >= 1e-4) {
    return(format(round(p_value, 4), nsmall = 4))
  } else if (p_value <= 2e-16) {
    return("< 2e-16")
  } else {
    return(format(p_value, digits = 3, scientific = TRUE))
  }
}

# generates table from model function
generate_model_tables = function(model) {
  model_name = deparse(substitute(model))
  
  # mod summury
  summary_model = summary(model)
  
  # summary data frame
  coefficients_table = as.data.frame(summary_model$coefficients)
  
  # Round Estimate, Std. Error, t value
  coefficients_table[, 1:3] = round(coefficients_table[, 1:3], 2)
  
  # Significativity Pr(>|t|) value
  significance_codes = symnum(coefficients_table[, 4], corr = FALSE, na = FALSE,
                               cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                               symbols = c('***', '**', '\\*', ".", " "))

  
  # Formatting Pr(>|t|) value
  coefficients_table[, 4] = sapply(coefficients_table[, 4], format_p_value)
  
  # Adding significativity codes
  coefficients_table$Signif = significance_codes
  
  # Additional info table
  additional_info = data.frame(
    Description = c("Signif. codes:", "Residual standard error:", "Multiple R-squared:", "Adjusted R-squared:", "F-statistic:", "p-value:"),
    Value = c("0 '\\***'  0.001 '\\**'  0.01 '\\*'  0.05 '.'  0.1 ' '  1", 
              paste(round(summary_model$sigma, 0), "on", summary_model$df[2], "degrees of freedom"),
              round(summary_model$r.squared, 4),
              round(summary_model$adj.r.squared, 4),
              paste(round(summary_model$fstatistic[1], 1), "on", summary_model$fstatistic[2], "and",       summary_model$fstatistic[3], "DF"),
              "< 2.2e-16")
  )
  
  # Return table
  return(list(coefficients_table = coefficients_table,
              additional_info = additional_info,
              model_name = model_name))
}

mod1=lm(Peso ~ Anni.madre+N.gravidanze+Fumatrici+Gestazione+
          Lunghezza+Cranio+Tipo.parto+Ospedale+
          Sesso,data=data)

result_mod1 = generate_model_tables(mod1)

Coefficient Table mod1
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	-6735.80	141.48	-47.61	< 2e-16	***
Anni.madre	0.80	1.15	0.70	0.4845
N.gravidanze	11.38	4.67	2.44	0.0148	*
FumatriciSI	-30.27	27.55	-1.10	0.2719
Gestazione	32.58	3.82	8.53	< 2e-16	***
Lunghezza	10.29	0.30	34.21	< 2e-16	***
Cranio	10.47	0.43	24.57	< 2e-16	***
Tipo.partoNat	29.63	12.09	2.45	0.0143	*
Ospedaleosp2	-11.09	13.45	-0.82	0.4096
Ospedaleosp3	28.25	13.51	2.09	0.0366	*
SessoM	77.57	11.19	6.93	5.18e-12	***

Additional Info Table mod1
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	274 on 2487 degrees of freedom
Multiple R-squared:	0.7289
Adjusted R-squared:	0.7278
F-statistic:	668.7 on 10 and 2487 DF
p-value:	< 2.2e-16

The model that includes all available variables shows a relatively high Adjusted R-squared (0.73).

Mother’s age and smokers seem to be not significant. Hospitals number 3 seems to be lightly significant, however its unlikely that the hospitals directly affect the newborn weight.

As next step, remove mother’s age and hospitals. Let’s keep the variable smokers as it is known in scientific literature that maternal smoking affects the weight of the newborn directly and indirectly favoring premature birth.

mod2=update(mod1, ~.-(Anni.madre+Ospedale))

result_mod2 = generate_model_tables(mod2)

Coefficient Table mod2
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	-6708.81	136.06	-49.31	< 2e-16	***
N.gravidanze	12.99	4.34	2.99	0.0028	**
FumatriciSI	-31.88	27.58	-1.16	0.2478
Gestazione	32.60	3.80	8.57	< 2e-16	***
Lunghezza	10.27	0.30	34.10	< 2e-16	***
Cranio	10.50	0.43	24.64	< 2e-16	***
Tipo.partoNat	30.42	12.10	2.51	0.0120	*
SessoM	78.10	11.20	6.97	3.94e-12	***

Additional Info Table mod2
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	274 on 2490 degrees of freedom
Multiple R-squared:	0.7278
Adjusted R-squared:	0.7271
F-statistic:	951.3 on 7 and 2490 DF
p-value:	< 2.2e-16

Removing 2 variables, the Adjusted R-squared only changed on the fourth decimal digit.The minimum reduction of the Adjusted R-squared is an acceptable compromise considering that the model has been simplified. In addition, the variables Number of pregnancies shows a greater significance. The type of birth continues to show less relevance than the other variables. Removing this variable will show how it affect the model.

mod3=update(mod2, ~.-Tipo.parto)

result_mod3 = generate_model_tables(mod3)

Coefficient Table mod3
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	-6682.26	135.80	-49.21	< 2e-16	***
N.gravidanze	12.70	4.35	2.92	0.0035	**
FumatriciSI	-30.57	27.60	-1.11	0.2682
Gestazione	32.64	3.81	8.57	< 2e-16	***
Lunghezza	10.23	0.30	33.98	< 2e-16	***
Cranio	10.54	0.43	24.71	< 2e-16	***
SessoM	78.16	11.21	6.97	4.01e-12	***

Additional Info Table mod3
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	275 on 2491 degrees of freedom
Multiple R-squared:	0.7271
Adjusted R-squared:	0.7265
F-statistic:	1106.4 on 6 and 2491 DF
p-value:	< 2.2e-16

By removing the type of birth variable, the reduction in Adjusted R-squared is minimal.

During the analysis of the relationship between the variables, a correlation coefficient between length and diameter of the skull of 0.60 and a correlation coefficient between length and gestation of 0.62 emerged. Before further modifying the model, we verify that there are no multicollinearity problems.

vif_mod3 = round(vif(mod3),3)

kable(vif_mod3, format = "html", col.names = c("Variable", "VIF"), align="c",caption = "VIF Table mod3")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

VIF Table mod3
Variable	VIF
N.gravidanze	1.026
Fumatrici	1.007
Gestazione	1.676
Lunghezza	2.080
Cranio	1.625
Sesso	1.040

All VIF values are less than 5, which indicates that there are no significant multicollinearity issues in the model 3. In particular, the highest value is 2.080 for the variable Length , which is still considered acceptable.

From the analysis of the correlations between variables a non-linearity of the skull diameter and gestation came out. Let’s add these effects in two separate steps.

mod4a=update(mod3, ~.+I(Gestazione^2))

result_mod4a = generate_model_tables(mod4a)

Coefficient Table mod4a
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	-4665.63	898.78	-5.19	2.26e-07	***
N.gravidanze	12.78	4.34	2.94	0.0033	**
FumatriciSI	-29.35	27.59	-1.06	0.2874
Gestazione	-79.95	49.75	-1.61	0.1082
Lunghezza	10.34	0.30	33.96	< 2e-16	***
Cranio	10.63	0.43	24.83	< 2e-16	***
SessoM	75.95	11.24	6.75	1.78e-11	***
I(Gestazione^2)	1.50	0.66	2.27	0.0233	*

Additional Info Table mod4a
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	274 on 2490 degrees of freedom
Multiple R-squared:	0.7277
Adjusted R-squared:	0.7269
F-statistic:	950.7 on 7 and 2490 DF
p-value:	< 2.2e-16

The addition of the non linear effect of the gestation variable breaks the model causing the linear effect of gestation to become non significant.

mod4b=update(mod3, ~.+I(Cranio^2))

result_mod4b = generate_model_tables(mod4b)

Coefficient Table mod4b
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	56.88	1152.12	0.05	0.9606
N.gravidanze	12.98	4.32	3.00	0.0027	**
FumatriciSI	-26.69	27.43	-0.97	0.3306
Gestazione	39.11	3.94	9.93	< 2e-16	***
Lunghezza	10.47	0.30	34.69	< 2e-16	***
Cranio	-31.63	7.17	-4.41	1.08e-05	***
SessoM	73.28	11.17	6.56	6.45e-11	***
I(Cranio^2)	0.06	0.01	5.89	4.39e-09	***

Additional Info Table mod4b
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	273 on 2490 degrees of freedom
Multiple R-squared:	0.7309
Adjusted R-squared:	0.7301
F-statistic:	966.1 on 7 and 2490 DF
p-value:	< 2.2e-16

The addition of the non linear effect of the skull diameter variable results in a minimal increase in the Adjusted R-squared. It is not worthwhile to complicate the model in the face of a negligible increase in explained variability.

In scientific literature it is well known that maternal smoking indirectly impacts the weight of the newborn, favoring premature birth. Let’s try to introduce the combined effect of the smoking and gestation variables into the model.

mod4c=update(mod3, ~.+(Fumatrici*Gestazione))

result_mod4c = generate_model_tables(mod4c)

Coefficient Table mod4c
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	-6699.69	136.73	-49.00	< 2e-16	***
N.gravidanze	12.75	4.35	2.93	0.0034	**
FumatriciSI	795.70	757.53	1.05	0.2936
Gestazione	33.20	3.84	8.64	< 2e-16	***
Lunghezza	10.23	0.30	33.96	< 2e-16	***
Cranio	10.53	0.43	24.69	< 2e-16	***
SessoM	78.74	11.22	7.02	2.94e-12	***
FumatriciSI:Gestazione	-21.05	19.28	-1.09	0.2752

Additional Info Table mod4c
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	275 on 2490 degrees of freedom
Multiple R-squared:	0.7273
Adjusted R-squared:	0.7265
F-statistic:	948.6 on 7 and 2490 DF
p-value:	< 2.2e-16

Also in this case, maternal smoking appears not to be significant and does not bring any benefit to the model. The adjusted R-square is the same as that of model 3.

There appears to be no evidence to support the thesis that maternal smoking positively impacts the model. Let’s remove this variable to understand how the model changes.

mod5=update(mod3, ~.-Fumatrici)

result_mod5 = generate_model_tables(mod5)

Coefficient Table mod5
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	-6681.73	135.80	-49.20	< 2e-16	***
N.gravidanze	12.46	4.34	2.87	0.0042	**
Gestazione	32.38	3.80	8.52	< 2e-16	***
Lunghezza	10.25	0.30	34.06	< 2e-16	***
Cranio	10.54	0.43	24.72	< 2e-16	***
SessoM	77.98	11.21	6.96	4.47e-12	***

Additional Info Table mod5
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	275 on 2492 degrees of freedom
Multiple R-squared:	0.727
Adjusted R-squared:	0.7265
F-statistic:	1327.3 on 5 and 2492 DF
p-value:	< 2.2e-16

The adjusted R-square is the same as that of model 3. With the starting data available, it can be concluded that the smoking variable is not significant.

The remaining variables all have high significance. It is concluded that model 5 is the best model.

The model chosen as the best includes the following variables:

Number of pregnancies: a unit increase in this variable will result in an increase in the newborn’s weight of 12.46 grams
Gestation: : a unit increase in this variable will result in an increase in the newborn’s weight of 32.38 grams.
Length: for every increment of one millimeter in the length of the newborn, the weight increases by 10.25 grams
Skull diameter: for every increment of one millimeter in the diameter of the newborn’s skull, the weight increases by 10.54 grams
Sex: a newborn male, on average, weighs 77.98 grams more than a newborn female

To verify the statement that model 5 is the best one, it is possible to use the BIC indicator. In addition it is possible to perform an automatic identification of the best model using the stepwise_AIC procedure and compare the output with the results obtained.

library(MASS)

#automatic model
stepwise_mod=stepAIC(mod1,direction = "both", k=log(n))

# BIC of all models found
bic_output = BIC(mod1, mod2, mod3, mod4a, mod4b, mod4c, mod5,stepwise_mod)

kable(bic_output, format = "html",col.names = c("Df", "BIC"),
      align = "c", caption = "BIC Values for Different Models")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

BIC Values for Different Models
	Df	BIC
mod1	12	35215.45
mod2	9	35201.73
mod3	8	35200.24
mod4a	9	35202.90
mod4b	9	35173.50
mod4c	9	35206.87
mod5	7	35193.65
stepwise_mod	7	35193.65

BIC has to be read as lower is better. In that case the model 4b is the lowest. However, the analysis showed that increasing the complexity of the model 4b does not lead to a significant increase in the explained variability. In addition, the automatic procedure found that the best model has same BIC of the model 5. It can be concluded that the best model is model 5.

4. Residue analysis

After obtaining the model it is necessary to carry out an analysis of the residues to evaluate whether they comply with the following assumptions:

Normal Distribution
Mean equals 0
Homoscedasticity
Incorrelation

par(mfrow=c(2,2)) 
plot(mod5)

The following graphs are shown in the image above:

Residuals vs Fitted: The points should be randomly distributed around the horizontal zero line. The graph shows some patterns at extreme values, indicating potential problems with the assumption of residuals means equals 0.
Q-Q Plot: This plot compares the standardized residuals to the theoretical quantiles of a normal distribution. It is used to test the assumption of normality of the residuals. The points should be on the diagonal line if the residuals are normally distributed. The graph shows some deviations from the line, especially at the start and at the ends, indicating potential non-normality.
Scale-Location: This plot is used to test the homoscedasticity assumption. The dots should be randomly distributed around a horizontal line. The graph shows some patterns, indicating potential problems with homoscedasticity.
Residuals vs Leverage: This graph is used to identify influential observations. Points with high leverage and large residuals are potentially influential. The chart shows one point (1549) crossing the attention line of 0.5.

it is good practice, in addition to graphical analysis, to carry out statistical tests to verify that the model assumptions are respected.

library(lmtest)
# Test
bp_test = bptest(mod5)
dw_test = dwtest(mod5)
shapiro_test = shapiro.test(residuals(mod5))

# Result dataframe
results_residuals = data.frame(
  Test = c("Breusch-Pagan", "Durbin-Watson", "Shapiro-Wilk"),
  Statistic = round(c(bp_test$statistic, dw_test$statistic, shapiro_test$statistic),3),
  `p-value` = c(bp_test$p.value, dw_test$p.value, shapiro_test$p.value)
)

# Formatting P value
results_residuals[, 3] = sapply(results_residuals[, 3], format_p_value)

kable(results_residuals, format = "html", align = "c", caption = "Residual Test Results")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Residual Test Results
	Test	Statistic	p.value
BP	Breusch-Pagan	90.297	< 2e-16
DW	Durbin-Watson	1.953	0.1209
W	Shapiro-Wilk	0.974	< 2e-16

Breusch-Pagan Test: with an extremely low p-value the null hypothesis is rejected. The residuals of the model not satisfy the assumption of homoscedasticity.
Durbin-Watson Test: the p-value of 0.1209 suggests that the null hypothesis is not rejected. The residuals are not affected by autocorrelation.
Shapiro-Wilk Normality Test: with an extremely low p-value the null hypothesis is rejected. The residuals do not follow a normal distribution.

Just because the model didn’t pass the Breusch-Pagan and Shapiro-Wilk tests doesn’t mean it’s not valid. The model has an Adjusted R-square of 0.7265 which indicates that it manages to explain more than 70% of the variability of the response variable. It can be used to make predictions with awareness of its limits.

library(ggplot2)
library(car)

# Leverage
lev = hatvalues(mod5)
p = sum(lev)
soglia_lev = 2 * p / n

# Leverage Plot
leverage_plot = ggplot(data.frame(Index = 1:length(lev), Leverage = lev), aes(x = Index, y = Leverage)) +
  geom_point() +
  geom_hline(yintercept = soglia_lev, color = "red", size = 1.5) +
  ggtitle("Leverage Values") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Outlier
rstudent_vals = rstudent(mod5)

# Outlier plot
outlier_plot = ggplot(data.frame(Index = 1:length(rstudent_vals), RStudent = rstudent_vals), aes(x = Index, y = RStudent)) +
  geom_point() +
  geom_hline(yintercept = c(-2, 2), color = "red", size = 1.5) + 
  ggtitle("Studentized Residuals") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))  

library(gridExtra)
grid.arrange(leverage_plot, outlier_plot, ncol = 2, widths = c(1, 1))

num_leverage=sum(lev>soglia_lev)
num_outlier = sum(abs(rstudent_vals) > 2)

ris_influent <- data.frame(
  Influent = c("Leverage", "Outlier"),
  Total = c(num_leverage, num_outlier)
)

kable(ris_influent, col.names = c("", "Total"),align="c", caption = "Leverage and Outliers total value")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")%>%
                column_spec(1, width = "5cm")%>%
                column_spec(2, width = "2cm")

Leverage and Outliers total value
	Total
Leverage	153
Outlier	101

153 leverage values and 101 outlier values has been identified. From the graphical analysis it is noted that most of these values slightly exceed the threshold. Let’s use another visualization (cook distance) to analyze the leverage values and the outlier values at the same time.

# Cook distance
cook <- cooks.distance(mod5)

# index max value
max_cook_index <- which.max(cook)
max_cook_value <- cook[max_cook_index]


ggplot(data.frame(Index = 1:length(cook), Cook = cook), aes(x = Index, y = Cook)) +
  geom_point() +
  geom_text(aes(label = ifelse(Index == max_cook_index, max_cook_index, '')),
            vjust = -1, hjust = 1) +  
  geom_point(data = data.frame(Index = max_cook_index, Cook = max_cook_value),
             aes(x = Index, y = Cook), color = "red", size = 3) +  
  ggtitle("Cook's Distance") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +  
  ylim(0, 1)

The Cook’s Distance graph shows only one value with a high cook distance. Let’s remove the observation 1549 from the dataset to analyze the effect on the model.

detach(data)
data <- data %>% filter(row_number() != 1549)
attach(data)

mod5_new=lm(Peso ~ N.gravidanze+Gestazione+Lunghezza+Cranio+Sesso,
            data=data)

result_mod5new = generate_model_tables(mod5_new)

Coefficient Table mod5_new
	Estimate	Std. Error	t value	Pr(>\|t\|)	Signif
(Intercept)	-6683.83	133.16	-50.19	< 2e-16	***
N.gravidanze	13.14	4.26	3.09	0.0020	**
Gestazione	29.63	3.74	7.93	3.27e-15	***
Lunghezza	10.89	0.30	36.08	< 2e-16	***
Cranio	9.92	0.42	23.46	< 2e-16	***
SessoM	78.14	10.99	7.11	1.53e-12	***

Additional Info Table mod5_new
Info	Value
Signif. codes:	0 ’*’ 0.001 ’’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error:	269 on 2491 degrees of freedom
Multiple R-squared:	0.7372
Adjusted R-squared:	0.7367
F-statistic:	1397.5 on 5 and 2491 DF
p-value:	< 2.2e-16

Following the removal of observation 1549, the descriptive variables present in the model continue to show all high significance. Furthermore, an increase in the adjusted R-squared is observed.

Statistical tests will help in understanding how the removal of the observation 1549 affects the residuals.

bp_test = bptest(mod5_new)
dw_test = dwtest(mod5_new)
shapiro_test = shapiro.test(residuals(mod5_new))

# Result dataframe
results_residuals = data.frame(
  Test = c("Breusch-Pagan", "Durbin-Watson", "Shapiro-Wilk"),
  Statistic = round(c(bp_test$statistic, dw_test$statistic, shapiro_test$statistic),3),
  `p-value` = c(bp_test$p.value, dw_test$p.value, shapiro_test$p.value)
)

# Formatting P value
results_residuals[, 3] = sapply(results_residuals[, 3], format_p_value)

kable(results_residuals, format = "html", align = "c", caption = "Residual Test Results")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

Residual Test Results
	Test	Statistic	p.value
BP	Breusch-Pagan	11.343	0.0450
DW	Durbin-Watson	1.954	0.1227
W	Shapiro-Wilk	0.989	5.23e-13

All test statistics show an increase. The p-value of the Shapiro-Wilk test continues to be extremely low, rejecting the hypothesis of a normal distribution of the residuals. However, the p-value of the Breusch-Pagan test is borderline (still rejecting the hypothesis of homoscedasticity) at a significance level of 5%.

# Calcola i residui del modello
residuls_mod5new <- residuals(mod5_new)

# Crea il grafico di densità con ggplot
ggplot() +
  geom_density(aes(x = residuls_mod5new, y = ..density.., color = "Model Residuals"), size = 1) +
  geom_density(aes(x = rnorm(1000000, 
                             mean = mean(residuls_mod5new), 
                             sd = sd(residuls_mod5new)), 
                   y = ..density.., color = "Normal Distribution"), size = 1) +
  scale_color_manual(values = c("Model Residuals" = "black", "Normal Distribution" = "red"), 
                     name = "Curve") +
  ggtitle("Density of probability") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = "Value", y = "Density")

From a graphical analysis seems not to be present major differences between a normal distribution and the probability density of the residuals of the final model. Considering the value of the Adjusted R-squared, the results of the statistical tests and the graphical analyses, this model can be considered acceptable.

4. Predictions and Results

Since the model includes 5 descriptive variables, to predict the weight of a female newborn considering a mother in her third pregnancy who will give birth at the 39th week, the values of length and skull diameter are necessary. For these last two variables, assume the average values calculated on the dataset from which the model was derived.

# new_born 
new_born = data.frame( N.gravidanze = 3, 
                       Gestazione = 39, 
                       Lunghezza = mean(data$Lunghezza),
                       Cranio = mean(data$Cranio), 
                       Sesso = "F" ) 

# Prediction
pred_new_born <- predict(mod5_new, newdata = new_born,interval = "confidence") 

res_pred <- data.frame(
  Result = "Predicted Weight in grams",
  Fit = round(pred_new_born[1, "fit"],2),
  Lower = round(pred_new_born[1, "lwr"],2),
  Upper = round(pred_new_born[1, "upr"],2)
)

# Mostra i risultati utilizzando kable
kable(res_pred, col.names = c("", "Fit", "Lower", "Upper"), format = "html", align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")

	Fit	Lower	Upper
Predicted Weight in grams	3271.98	3249.09	3294.87

In scientific literature, the average weight of a newborn is reported as 3300g. This value is very close to the upper limit of the predicted confidence interval. It can be considered that the prediction is valid.

5. Model Graphical Visualization

After obtaining the model it is possible to explore the main relationships between variables with the help of graphs.

#Weight vs Gestation
ggplot(data, aes(x = Gestazione, y = Peso, color = Sesso)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = c("M" = "blue", "F" = "red")) +
  scale_x_continuous(breaks = seq(24, 43, by =2), limits = c(24, 43)) +
  labs(title = "Weight vs Gestation",
       x = "Gestation (week)",
       y = "Weight (g)",
       color=NULL) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

The graph shows a positive trend for both sexes in which weight increases as the number of weeks of gestation increases.

The regression lines for the two sexes appear parallell. The regression line for the female sex is shifted lower. This indicates that for both sexes, the unit increase in the variable gestation leads to the same increase in weight with the difference that on average female newborns weigh less for the same weeks of gestation.

library(scatterplot3d)
library(car)
library(rgl)


colors = c("F" = "red", "M" = "blue")

s3d = scatterplot3d(Lunghezza, Cranio, Peso, color=colors[Sesso], pch=19,
                     xlab="Length (mm)", ylab="Skull diameter (mm)", zlab="Weight (g)",
                     main="Relation between Weight, Length e Skull Diameter")

# Regression planes by sex
regF = lm(Peso ~ Lunghezza + Cranio, data = data[data$Sesso == "F", ])
regM = lm(Peso ~ Lunghezza + Cranio, data = data[data$Sesso == "M", ])

# Adding regression planes to graph
s3d$plane3d(regF, col = "red", lty = "solid")
s3d$plane3d(regM, col = "blue", lty = "dashed")

legend("topright", legend=c("F", "M"), col=c("red", "blue"), pch=19, bty="n")

The graph shows a positive effect of both descriptive variables (length and skull diameter) on the weight of the newborn. The regression plans for the two modes of the sex variable are very similar to each other indicating that the effect of the length and skull diameter is the same for both sexes.