1. Dateset Structure

#import dataset
setwd("D:/Users/Anton/Desktop/ProfessionAI/03-Statistica inferenziale/Progetto")
data=read.csv("neonati.csv",
              sep=",",
              stringsAsFactors = T)

# Convert Smokers binary column into a factor variable with 2 levels
data$Fumatrici <- factor(data$Fumatrici, levels = c(0, 1), labels = c("NO", "SI"))

# Caricamento delle librerie necessarie
library(knitr)
library(kableExtra)

# First row df
df = data.frame(
  Anni.madre = data[1:5,1],
  N.gravidanze = data[1:5,2],
  Fumatrici = data[1:5,3],
  Gestazione = data[1:5,4],
  Peso = data[1:5,5],
  Lunghezza = data[1:5,6],
  Cranio = data[1:5,7],
  Tipo.parto = data[1:5,8],
  Ospedale = data[1:5,9],
  Sesso=data[1:5,10]
)

kable(df, format = "html", row.names = T,align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Anni.madre N.gravidanze Fumatrici Gestazione Peso Lunghezza Cranio Tipo.parto Ospedale Sesso
1 26 0 NO 42 3380 490 325 Nat osp3 M
2 21 2 NO 39 3150 490 345 Nat osp1 F
3 34 3 NO 38 3640 500 375 Nat osp2 M
4 28 1 NO 41 3690 515 365 Nat osp2 M
5 20 0 NO 38 3700 480 335 Nat osp3 F

The dataset contains the following variables:

2. Preliminary data analysis

Summarize the data

n=nrow(data)

#df numeric
df_num=data[,sapply(data,is.numeric)]
summary_dfnum = summary(df_num)

#df factor
df_factor=data[,sapply(data,is.factor)]
summary_dffact = summary(df_factor)
summary_dffact[is.na(summary_dffact)] ="" 


kable(summary_dfnum, format = "html",align = "c", caption = "Numeric Summary Table") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Numeric Summary Table
Anni.madre N.gravidanze Gestazione Peso Lunghezza Cranio
Min. : 0.00 Min. : 0.0000 Min. :25.00 Min. : 830 Min. :310.0 Min. :235
1st Qu.:25.00 1st Qu.: 0.0000 1st Qu.:38.00 1st Qu.:2990 1st Qu.:480.0 1st Qu.:330
Median :28.00 Median : 1.0000 Median :39.00 Median :3300 Median :500.0 Median :340
Mean :28.16 Mean : 0.9812 Mean :38.98 Mean :3284 Mean :494.7 Mean :340
3rd Qu.:32.00 3rd Qu.: 1.0000 3rd Qu.:40.00 3rd Qu.:3620 3rd Qu.:510.0 3rd Qu.:350
Max. :46.00 Max. :12.0000 Max. :43.00 Max. :4930 Max. :565.0 Max. :390
kable(summary_dffact, format = "html",align = "c", caption = "Factor Summary Table") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Factor Summary Table
Fumatrici Tipo.parto Ospedale Sesso
NO:2396 Ces: 728 osp1:816 F:1256
SI: 104 Nat:1772 osp2:849 M:1244
osp3:835

Looking at the numeric summary table, immediately jumps out the minimum value of the variable Anni.madre. It is for sure a data entry error.

Filtering the data will show if it is an isolated value or if there are other anomalous observation.

#Load library
library(dplyr)
#Filter anni.madre <15
data_filt=data %>% filter(data$Anni.madre<15)
#sort by anni.madre
data_filt=data_filt[order(data_filt$Anni.madre), ]

kable(data_filt$Anni.madre, format = "html", col.names = "Anni.madre", align="c")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Anni.madre
0
1
13
14
14

The are two year values (0 and 1) that for sure are not possible. This observation has to be removed.

#Filter anni.madre >10
data=data %>% filter(data$Anni.madre>10)

2.1 Hospital and birth type relationship

In order to investigate if there are hospitals in which are preferred cesarean birth, it is possibile to perform a Chi-squared test.

attach(data)
library(kableExtra)
library(knitr)

#Contingency table
contingency_birthtype=table(Ospedale, Tipo.parto)

# Chi-squared Test
chi_sq_test_birthtype <- chisq.test(contingency_birthtype)

# Results dataframe
results_birthtype = data.frame(
  Statistic = chi_sq_test_birthtype$statistic,
  Degrees_of_Freedom = chi_sq_test_birthtype$parameter,
  P_Value = chi_sq_test_birthtype$p.value
)

kable(results_birthtype, format = "html", align="c", caption = "Birthtype Chi-squared Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Birthtype Chi-squared Test Result
Statistic Degrees_of_Freedom P_Value
X-squared 1.082984 2 0.5818793
chisq_2grades = rchisq(1000000, 2)

library(ggplot2)
ggplot(data.frame(x = density(chisq_2grades)$x, y = density(chisq_2grades)$y),
       aes(x = x, y = y)) +
  geom_line() +
  geom_vline(xintercept = qchisq(0.99, 2), color = "red") +
  geom_point(aes(x = chi_sq_test_birthtype$statistic, y = 0), color = "blue", size = 3) +
  labs(x = "Value", y = "Density", title = "Chi-square distribution (2 df) with test statistic") +
  scale_y_continuous(breaks = seq(0, 0.5, by = 0.1), limits = c(0, 0.5)) +
  scale_x_continuous(breaks=seq(0,30, by=5), limits=c(-1,30))+
  theme_minimal()+
    theme(plot.title = element_text(hjust = 0.5, size = 14),  # Center Title
        axis.title.x = element_text(size = 12),  # axis name size
        axis.title.y = element_text(size = 12)) 

The test returns a very high p-value, therefore the null hypothesis is not rejected. There aren’t hospital in which cesarean birth is more frequent. Also graphically it is possible to notice that the test statistic is very far from the rejection zone.

2.2 Mean weight and length

Evaluate whether the average weight and length of the sample are significantly equal to those of the population. From a quick search on google,it is found that \(\mu_{weight} = 3300\;g\) and \(\mu_{length} = 500\;mm\). To evaluate the hypothesis it is possible to perform a t-test.

#population mean weight
mu_weigth=3300
#population mean length
mu_length=500

#weigth t.test
weigth_test=t.test(Peso,
                   mu=mu_weigth,
                   conf.level=0.95,
                   alternative="two.sided")

# Weigth result dataframe
weigth_test_results = data.frame(
  Statistic = weigth_test$statistic,
  Degrees_of_Freedom = weigth_test$parameter,
  P_Value = weigth_test$p.value,
  `95%_CI_Lower` = weigth_test$conf.int[1],
  `95%_CI_Upper` = weigth_test$conf.int[2],
  Mean = weigth_test$estimate
)

kable(weigth_test_results, format = "html", align="c", caption = "Weight T Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Weight T Test Result
Statistic Degrees_of_Freedom P_Value X95._CI_Lower X95._CI_Upper Mean
t -1.505011 2497 0.1324476 3263.577 3304.791 3284.184
#length t.test
length_test=t.test(Lunghezza,
                   mu=mu_length,
                   conf.level=0.95,
                   alternative="two.sided")

# Weigth result dataframe
length_test_results = data.frame(
  Statistic = length_test$statistic,
  Degrees_of_Freedom = length_test$parameter,
  P_Value = formatC(length_test$p.value, format = "e", digits = 2),
  `95%_CI_Lower` = length_test$conf.int[1],
  `95%_CI_Upper` = length_test$conf.int[2],
  Mean = length_test$estimate
)

kable(length_test_results, format = "html",align="c", caption = "Length T Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Length T Test Result
Statistic Degrees_of_Freedom P_Value X95._CI_Lower X95._CI_Upper Mean
t -10.06903 2497 2.10e-23 493.6628 495.7287 494.6958
  • weight: Since the p-value is greater than 0.05, we cannot reject the null hypothesis. This means that we do not have sufficient evidence to conclude that the average birth weight in the sample is significantly different from 3300 grams. In addition, the confidence interval includes the value 3300, which further reinforces the idea that there is no significant difference between the sample mean and the hypothesized population mean.

  • length: Since the p-value is extremely low, we can reject the null hypothesis. This means that we have sufficient evidence to conclude that the average birth length in the sample is significantly different from 500 millimeters. The confidence interval does not include the value 500, which further strengthens the evidence against the null hypothesis. The average length is significantly lower than the population mean.

2.3 Difference in anthropometric measurements of the two sexes

To evaluate any difference in anthropometric measurements between the two sexes it is possible to perform a statistical test. First, we have to check if the variables weight, length and skull diameter respect the hypotheses of normal distribution and equality of variance between groups. To verify that, we can perform a Shapiro-Wilk test (normality) and a Levene test (variance equality).

library(car)
library(knitr)
library(kableExtra)

# Normality Test
shapiro_weight_M=shapiro.test(data$Peso[data$Sesso == "M"])
shapiro_weight_F=shapiro.test(data$Peso[data$Sesso == "F"])

shapiro_length_M=shapiro.test(data$Lunghezza[data$Sesso == "M"])
shapiro_length_F=shapiro.test(data$Lunghezza[data$Sesso == "F"])

shapiro_skull_M=shapiro.test(data$Cranio[data$Sesso == "M"])
shapiro_skull_F=shapiro.test(data$Cranio[data$Sesso == "F"])

# Variance Equality Test
levene_weight=leveneTest(Peso ~ Sesso, data = data)
levene_length=leveneTest(Lunghezza ~ Sesso, data = data)
levene_skull=leveneTest(Cranio ~ Sesso, data = data)

# normality dataframe
df_normality=data.frame(
  Variable = c("weight", "length","skull diameter"),
  Male = c(shapiro_weight_M$p.value, shapiro_length_M$p.value, shapiro_skull_M$p.value),
  Female =c(shapiro_weight_F$p.value, shapiro_length_F$p.value, shapiro_skull_F$p.value))

df_normality$Male = formatC(df_normality$Male, format = "e", digits = 2)
df_normality$Female = formatC(df_normality$Female, format = "e", digits = 2)
  
# levene dataframe
df_levene=data.frame(
  Variable = c("weight", "length","skull diameter"),
  P_Value = c(levene_weight$`Pr(>F)`[1], levene_length$`Pr(>F)`[1], levene_skull$`Pr(>F)`[1]))

# Shapiro test table
kable(df_normality, format = "html", align="c",caption = "Shapiro-Wilk Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "5cm") %>%
  column_spec(3, width = "5cm")
Shapiro-Wilk Test
Variable Male Female
weight 2.23e-16 2.35e-17
length 4.22e-25 6.87e-28
skull diameter 2.90e-15 4.29e-19
# Levene test table
kable(df_levene, format = "html", align="c",caption = "Levene Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "5cm")
Levene Test
Variable P_Value
weight 0.3646375
length 0.0011640
skull diameter 0.2714821

All the variables, in both groups, show a p value extremely lower than 0.05 that means the hypothesis of normal distribution is not respected. To compare the averages of anthropometric measurements between the two sexes, it is more appropriate to use a non-parametric test, such as the Wilcoxon Test.

# Wilcoxon test for weigth
wilcox_weight=wilcox.test(Peso ~ Sesso, data = data)

# Wilcoxon test for length
wilcox_length=wilcox.test(Lunghezza ~ Sesso, data = data)

# Wilcoxon test for skull diameter
wilcox_skull=wilcox.test(Cranio ~ Sesso, data = data)

# Wilcoxon dataframe
df_wilcox=data.frame(
  Variable = c("weight", "length","skull diameter"),
  P_Value = c(wilcox_weight$p.value, wilcox_length$p.value, wilcox_skull$p.value))

df_wilcox$P_Value = formatC(df_wilcox$P_Value, format = "e", digits = 2)

# Wilcoxon test table
kable(df_wilcox, format = "html", align="c",caption = "Wilcoxon Test") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "5cm")
Wilcoxon Test
Variable P_Value
weight 2.91e-41
length 2.66e-25
skull diameter 7.14e-15

The result of the Wilcoxon test for all variables (weight, length and skull diameter) indicates that there is a significant difference between the sexes (p-value extremely lower than 0.05). The observed difference in weight, length and skull diameter between males and females is statistically significant.

3. Regression Model

Analyze the relationship between variables

First of all, evaluate the normality of the response variable, as any non-normality of the response variable could affect the residuals.

library(moments)
library(knitr)
library(kableExtra)

# shape index dataframe
df_shape=data.frame(
  Index = c("Skewness", "Kurtosis"),
  Value = c(round(skewness(Peso),2), round((kurtosis(Peso)-3),2)))
  
# shape index table
kable(df_shape, format = "html", align="c",caption = "Weight Shape Index") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
  column_spec(1, width = "3cm") %>%
  column_spec(2, width = "3cm")
Weight Shape Index
Index Value
Skewness -0.65
Kurtosis 2.03

The variable weight shows a negative asymmetry (\(\gamma_{1weight}<0\)), and is leptokurtic (\(\gamma_{2weight}>0\)).

# weight test  Shapiro-Wilk
shapiro_test_peso <- shapiro.test(Peso)

# Creare un data frame con i risultati del test di Shapiro-Wilk
shapiro_peso_df <- data.frame(
  Statistic = shapiro_test_peso$statistic,
  P_Value = formatC(shapiro_test_peso$p.value, format = "e", digits = 2),
  Test = "Shapiro-Wilk",
  Data = "Peso"
)

kable(shapiro_peso_df, format = "html", align="c",caption = "Weight Shapiro-Wilk Test Result") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Weight Shapiro-Wilk Test Result
Statistic P_Value Test Data
W 0.9706841 3.38e-22 Shapiro-Wilk Peso

With an extremely low p value, the null hypothesis is rejected. The weight variable is not normal. This non-normality of the response variable could have repercussions on the residuals of the model. This will be verified later during the residue analysis phase.

Before creating the model, it is important to investigate the relationship between the available variables.

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
  par(usr = c(0, 1, 0, 1))
  r <- abs(cor(x, y))
  txt <- format(c(r, 0.123456789), digits = digits)[1]
  txt <- paste0(prefix, txt)
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  text(0.5, 0.5, txt, cex = cex.cor)
}
pairs(data, lower.panel = panel.cor, upper.panel = panel.smooth)

The remaining variables (type of birth, hospital, smokers, number of pregnancies and mother years), since their coefficients are extremely low, do not show correlation with the response variable.

In the first instance it is possible to create a model containing all the variables present in the dataset and subsequently update the model by removing non-significant variables or adding any non-linear or interaction effects between variables.

# Formatting Pr(>|t|) value function
format_p_value = function(p_value) {
  if (p_value >= 1e-4) {
    return(format(round(p_value, 4), nsmall = 4))
  } else if (p_value <= 2e-16) {
    return("< 2e-16")
  } else {
    return(format(p_value, digits = 3, scientific = TRUE))
  }
}

# generates table from model function
generate_model_tables = function(model) {
  model_name = deparse(substitute(model))
  
  # mod summury
  summary_model = summary(model)
  
  # summary data frame
  coefficients_table = as.data.frame(summary_model$coefficients)
  
  # Round Estimate, Std. Error, t value
  coefficients_table[, 1:3] = round(coefficients_table[, 1:3], 2)
  
  # Significativity Pr(>|t|) value
  significance_codes = symnum(coefficients_table[, 4], corr = FALSE, na = FALSE,
                               cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                               symbols = c('***', '**', '\\*', ".", " "))

  
  # Formatting Pr(>|t|) value
  coefficients_table[, 4] = sapply(coefficients_table[, 4], format_p_value)
  
  # Adding significativity codes
  coefficients_table$Signif = significance_codes
  
  # Additional info table
  additional_info = data.frame(
    Description = c("Signif. codes:", "Residual standard error:", "Multiple R-squared:", "Adjusted R-squared:", "F-statistic:", "p-value:"),
    Value = c("0 '\\***'  0.001 '\\**'  0.01 '\\*'  0.05 '.'  0.1 ' '  1", 
              paste(round(summary_model$sigma, 0), "on", summary_model$df[2], "degrees of freedom"),
              round(summary_model$r.squared, 4),
              round(summary_model$adj.r.squared, 4),
              paste(round(summary_model$fstatistic[1], 1), "on", summary_model$fstatistic[2], "and",       summary_model$fstatistic[3], "DF"),
              "< 2.2e-16")
  )
  
  # Return table
  return(list(coefficients_table = coefficients_table,
              additional_info = additional_info,
              model_name = model_name))
}
mod1=lm(Peso ~ Anni.madre+N.gravidanze+Fumatrici+Gestazione+
          Lunghezza+Cranio+Tipo.parto+Ospedale+
          Sesso,data=data)

result_mod1 = generate_model_tables(mod1)
Coefficient Table mod1
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) -6735.80 141.48 -47.61 < 2e-16 ***
Anni.madre 0.80 1.15 0.70 0.4845
N.gravidanze 11.38 4.67 2.44 0.0148 *
FumatriciSI -30.27 27.55 -1.10 0.2719
Gestazione 32.58 3.82 8.53 < 2e-16 ***
Lunghezza 10.29 0.30 34.21 < 2e-16 ***
Cranio 10.47 0.43 24.57 < 2e-16 ***
Tipo.partoNat 29.63 12.09 2.45 0.0143 *
Ospedaleosp2 -11.09 13.45 -0.82 0.4096
Ospedaleosp3 28.25 13.51 2.09 0.0366 *
SessoM 77.57 11.19 6.93 5.18e-12 ***
Additional Info Table mod1
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 274 on 2487 degrees of freedom
Multiple R-squared: 0.7289
Adjusted R-squared: 0.7278
F-statistic: 668.7 on 10 and 2487 DF
p-value: < 2.2e-16

The model that includes all available variables shows a relatively high Adjusted R-squared (0.73).

Mother’s age and smokers seem to be not significant. Hospitals number 3 seems to be lightly significant, however its unlikely that the hospitals directly affect the newborn weight.

As next step, remove mother’s age and hospitals. Let’s keep the variable smokers as it is known in scientific literature that maternal smoking affects the weight of the newborn directly and indirectly favoring premature birth.

mod2=update(mod1, ~.-(Anni.madre+Ospedale))

result_mod2 = generate_model_tables(mod2)
Coefficient Table mod2
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) -6708.81 136.06 -49.31 < 2e-16 ***
N.gravidanze 12.99 4.34 2.99 0.0028 **
FumatriciSI -31.88 27.58 -1.16 0.2478
Gestazione 32.60 3.80 8.57 < 2e-16 ***
Lunghezza 10.27 0.30 34.10 < 2e-16 ***
Cranio 10.50 0.43 24.64 < 2e-16 ***
Tipo.partoNat 30.42 12.10 2.51 0.0120 *
SessoM 78.10 11.20 6.97 3.94e-12 ***
Additional Info Table mod2
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 274 on 2490 degrees of freedom
Multiple R-squared: 0.7278
Adjusted R-squared: 0.7271
F-statistic: 951.3 on 7 and 2490 DF
p-value: < 2.2e-16

Removing 2 variables, the Adjusted R-squared only changed on the fourth decimal digit.The minimum reduction of the Adjusted R-squared is an acceptable compromise considering that the model has been simplified. In addition, the variables Number of pregnancies shows a greater significance. The type of birth continues to show less relevance than the other variables. Removing this variable will show how it affect the model.

mod3=update(mod2, ~.-Tipo.parto)

result_mod3 = generate_model_tables(mod3)
Coefficient Table mod3
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) -6682.26 135.80 -49.21 < 2e-16 ***
N.gravidanze 12.70 4.35 2.92 0.0035 **
FumatriciSI -30.57 27.60 -1.11 0.2682
Gestazione 32.64 3.81 8.57 < 2e-16 ***
Lunghezza 10.23 0.30 33.98 < 2e-16 ***
Cranio 10.54 0.43 24.71 < 2e-16 ***
SessoM 78.16 11.21 6.97 4.01e-12 ***
Additional Info Table mod3
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 275 on 2491 degrees of freedom
Multiple R-squared: 0.7271
Adjusted R-squared: 0.7265
F-statistic: 1106.4 on 6 and 2491 DF
p-value: < 2.2e-16

By removing the type of birth variable, the reduction in Adjusted R-squared is minimal.

During the analysis of the relationship between the variables, a correlation coefficient between length and diameter of the skull of 0.60 and a correlation coefficient between length and gestation of 0.62 emerged. Before further modifying the model, we verify that there are no multicollinearity problems.

vif_mod3 = round(vif(mod3),3)

kable(vif_mod3, format = "html", col.names = c("Variable", "VIF"), align="c",caption = "VIF Table mod3")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
VIF Table mod3
Variable VIF
N.gravidanze 1.026
Fumatrici 1.007
Gestazione 1.676
Lunghezza 2.080
Cranio 1.625
Sesso 1.040

All VIF values are less than 5, which indicates that there are no significant multicollinearity issues in the model 3. In particular, the highest value is 2.080 for the variable Length , which is still considered acceptable.

From the analysis of the correlations between variables a non-linearity of the skull diameter and gestation came out. Let’s add these effects in two separate steps.

mod4a=update(mod3, ~.+I(Gestazione^2))

result_mod4a = generate_model_tables(mod4a)
Coefficient Table mod4a
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) -4665.63 898.78 -5.19 2.26e-07 ***
N.gravidanze 12.78 4.34 2.94 0.0033 **
FumatriciSI -29.35 27.59 -1.06 0.2874
Gestazione -79.95 49.75 -1.61 0.1082
Lunghezza 10.34 0.30 33.96 < 2e-16 ***
Cranio 10.63 0.43 24.83 < 2e-16 ***
SessoM 75.95 11.24 6.75 1.78e-11 ***
I(Gestazione^2) 1.50 0.66 2.27 0.0233 *
Additional Info Table mod4a
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 274 on 2490 degrees of freedom
Multiple R-squared: 0.7277
Adjusted R-squared: 0.7269
F-statistic: 950.7 on 7 and 2490 DF
p-value: < 2.2e-16

The addition of the non linear effect of the gestation variable breaks the model causing the linear effect of gestation to become non significant.

mod4b=update(mod3, ~.+I(Cranio^2))

result_mod4b = generate_model_tables(mod4b)
Coefficient Table mod4b
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) 56.88 1152.12 0.05 0.9606
N.gravidanze 12.98 4.32 3.00 0.0027 **
FumatriciSI -26.69 27.43 -0.97 0.3306
Gestazione 39.11 3.94 9.93 < 2e-16 ***
Lunghezza 10.47 0.30 34.69 < 2e-16 ***
Cranio -31.63 7.17 -4.41 1.08e-05 ***
SessoM 73.28 11.17 6.56 6.45e-11 ***
I(Cranio^2) 0.06 0.01 5.89 4.39e-09 ***
Additional Info Table mod4b
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 273 on 2490 degrees of freedom
Multiple R-squared: 0.7309
Adjusted R-squared: 0.7301
F-statistic: 966.1 on 7 and 2490 DF
p-value: < 2.2e-16

The addition of the non linear effect of the skull diameter variable results in a minimal increase in the Adjusted R-squared. It is not worthwhile to complicate the model in the face of a negligible increase in explained variability.

In scientific literature it is well known that maternal smoking indirectly impacts the weight of the newborn, favoring premature birth. Let’s try to introduce the combined effect of the smoking and gestation variables into the model.

mod4c=update(mod3, ~.+(Fumatrici*Gestazione))

result_mod4c = generate_model_tables(mod4c)
Coefficient Table mod4c
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) -6699.69 136.73 -49.00 < 2e-16 ***
N.gravidanze 12.75 4.35 2.93 0.0034 **
FumatriciSI 795.70 757.53 1.05 0.2936
Gestazione 33.20 3.84 8.64 < 2e-16 ***
Lunghezza 10.23 0.30 33.96 < 2e-16 ***
Cranio 10.53 0.43 24.69 < 2e-16 ***
SessoM 78.74 11.22 7.02 2.94e-12 ***
FumatriciSI:Gestazione -21.05 19.28 -1.09 0.2752
Additional Info Table mod4c
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 275 on 2490 degrees of freedom
Multiple R-squared: 0.7273
Adjusted R-squared: 0.7265
F-statistic: 948.6 on 7 and 2490 DF
p-value: < 2.2e-16

Also in this case, maternal smoking appears not to be significant and does not bring any benefit to the model. The adjusted R-square is the same as that of model 3.

There appears to be no evidence to support the thesis that maternal smoking positively impacts the model. Let’s remove this variable to understand how the model changes.

mod5=update(mod3, ~.-Fumatrici)

result_mod5 = generate_model_tables(mod5)
Coefficient Table mod5
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) -6681.73 135.80 -49.20 < 2e-16 ***
N.gravidanze 12.46 4.34 2.87 0.0042 **
Gestazione 32.38 3.80 8.52 < 2e-16 ***
Lunghezza 10.25 0.30 34.06 < 2e-16 ***
Cranio 10.54 0.43 24.72 < 2e-16 ***
SessoM 77.98 11.21 6.96 4.47e-12 ***
Additional Info Table mod5
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 275 on 2492 degrees of freedom
Multiple R-squared: 0.727
Adjusted R-squared: 0.7265
F-statistic: 1327.3 on 5 and 2492 DF
p-value: < 2.2e-16

The adjusted R-square is the same as that of model 3. With the starting data available, it can be concluded that the smoking variable is not significant.

The remaining variables all have high significance. It is concluded that model 5 is the best model.

The model chosen as the best includes the following variables:

To verify the statement that model 5 is the best one, it is possible to use the BIC indicator. In addition it is possible to perform an automatic identification of the best model using the stepwise_AIC procedure and compare the output with the results obtained.

library(MASS)

#automatic model
stepwise_mod=stepAIC(mod1,direction = "both", k=log(n))
# BIC of all models found
bic_output = BIC(mod1, mod2, mod3, mod4a, mod4b, mod4c, mod5,stepwise_mod)

kable(bic_output, format = "html",col.names = c("Df", "BIC"),
      align = "c", caption = "BIC Values for Different Models")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
BIC Values for Different Models
Df BIC
mod1 12 35215.45
mod2 9 35201.73
mod3 8 35200.24
mod4a 9 35202.90
mod4b 9 35173.50
mod4c 9 35206.87
mod5 7 35193.65
stepwise_mod 7 35193.65

BIC has to be read as lower is better. In that case the model 4b is the lowest. However, the analysis showed that increasing the complexity of the model 4b does not lead to a significant increase in the explained variability. In addition, the automatic procedure found that the best model has same BIC of the model 5. It can be concluded that the best model is model 5.

4. Residue analysis

After obtaining the model it is necessary to carry out an analysis of the residues to evaluate whether they comply with the following assumptions:

par(mfrow=c(2,2)) 
plot(mod5)

The following graphs are shown in the image above:

it is good practice, in addition to graphical analysis, to carry out statistical tests to verify that the model assumptions are respected.

library(lmtest)
# Test
bp_test = bptest(mod5)
dw_test = dwtest(mod5)
shapiro_test = shapiro.test(residuals(mod5))

# Result dataframe
results_residuals = data.frame(
  Test = c("Breusch-Pagan", "Durbin-Watson", "Shapiro-Wilk"),
  Statistic = round(c(bp_test$statistic, dw_test$statistic, shapiro_test$statistic),3),
  `p-value` = c(bp_test$p.value, dw_test$p.value, shapiro_test$p.value)
)

# Formatting P value
results_residuals[, 3] = sapply(results_residuals[, 3], format_p_value)

kable(results_residuals, format = "html", align = "c", caption = "Residual Test Results")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Residual Test Results
Test Statistic p.value
BP Breusch-Pagan 90.297 < 2e-16
DW Durbin-Watson 1.953 0.1209
W Shapiro-Wilk 0.974 < 2e-16

Just because the model didn’t pass the Breusch-Pagan and Shapiro-Wilk tests doesn’t mean it’s not valid. The model has an Adjusted R-square of 0.7265 which indicates that it manages to explain more than 70% of the variability of the response variable. It can be used to make predictions with awareness of its limits.

library(ggplot2)
library(car)

# Leverage
lev = hatvalues(mod5)
p = sum(lev)
soglia_lev = 2 * p / n

# Leverage Plot
leverage_plot = ggplot(data.frame(Index = 1:length(lev), Leverage = lev), aes(x = Index, y = Leverage)) +
  geom_point() +
  geom_hline(yintercept = soglia_lev, color = "red", size = 1.5) +
  ggtitle("Leverage Values") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# Outlier
rstudent_vals = rstudent(mod5)

# Outlier plot
outlier_plot = ggplot(data.frame(Index = 1:length(rstudent_vals), RStudent = rstudent_vals), aes(x = Index, y = RStudent)) +
  geom_point() +
  geom_hline(yintercept = c(-2, 2), color = "red", size = 1.5) + 
  ggtitle("Studentized Residuals") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))  

library(gridExtra)
grid.arrange(leverage_plot, outlier_plot, ncol = 2, widths = c(1, 1))

num_leverage=sum(lev>soglia_lev)
num_outlier = sum(abs(rstudent_vals) > 2)

ris_influent <- data.frame(
  Influent = c("Leverage", "Outlier"),
  Total = c(num_leverage, num_outlier)
)

kable(ris_influent, col.names = c("", "Total"),align="c", caption = "Leverage and Outliers total value")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")%>%
                column_spec(1, width = "5cm")%>%
                column_spec(2, width = "2cm")
Leverage and Outliers total value
Total
Leverage 153
Outlier 101

153 leverage values and 101 outlier values has been identified. From the graphical analysis it is noted that most of these values slightly exceed the threshold. Let’s use another visualization (cook distance) to analyze the leverage values and the outlier values at the same time.

# Cook distance
cook <- cooks.distance(mod5)

# index max value
max_cook_index <- which.max(cook)
max_cook_value <- cook[max_cook_index]


ggplot(data.frame(Index = 1:length(cook), Cook = cook), aes(x = Index, y = Cook)) +
  geom_point() +
  geom_text(aes(label = ifelse(Index == max_cook_index, max_cook_index, '')),
            vjust = -1, hjust = 1) +  
  geom_point(data = data.frame(Index = max_cook_index, Cook = max_cook_value),
             aes(x = Index, y = Cook), color = "red", size = 3) +  
  ggtitle("Cook's Distance") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +  
  ylim(0, 1) 

The Cook’s Distance graph shows only one value with a high cook distance. Let’s remove the observation 1549 from the dataset to analyze the effect on the model.

detach(data)
data <- data %>% filter(row_number() != 1549)
attach(data)

mod5_new=lm(Peso ~ N.gravidanze+Gestazione+Lunghezza+Cranio+Sesso,
            data=data)

result_mod5new = generate_model_tables(mod5_new)
Coefficient Table mod5_new
Estimate Std. Error t value Pr(>|t|) Signif
(Intercept) -6683.83 133.16 -50.19 < 2e-16 ***
N.gravidanze 13.14 4.26 3.09 0.0020 **
Gestazione 29.63 3.74 7.93 3.27e-15 ***
Lunghezza 10.89 0.30 36.08 < 2e-16 ***
Cranio 9.92 0.42 23.46 < 2e-16 ***
SessoM 78.14 10.99 7.11 1.53e-12 ***
Additional Info Table mod5_new
Info Value
Signif. codes: 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1
Residual standard error: 269 on 2491 degrees of freedom
Multiple R-squared: 0.7372
Adjusted R-squared: 0.7367
F-statistic: 1397.5 on 5 and 2491 DF
p-value: < 2.2e-16

Following the removal of observation 1549, the descriptive variables present in the model continue to show all high significance. Furthermore, an increase in the adjusted R-squared is observed.

Statistical tests will help in understanding how the removal of the observation 1549 affects the residuals.

bp_test = bptest(mod5_new)
dw_test = dwtest(mod5_new)
shapiro_test = shapiro.test(residuals(mod5_new))

# Result dataframe
results_residuals = data.frame(
  Test = c("Breusch-Pagan", "Durbin-Watson", "Shapiro-Wilk"),
  Statistic = round(c(bp_test$statistic, dw_test$statistic, shapiro_test$statistic),3),
  `p-value` = c(bp_test$p.value, dw_test$p.value, shapiro_test$p.value)
)

# Formatting P value
results_residuals[, 3] = sapply(results_residuals[, 3], format_p_value)

kable(results_residuals, format = "html", align = "c", caption = "Residual Test Results")%>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Residual Test Results
Test Statistic p.value
BP Breusch-Pagan 11.343 0.0450
DW Durbin-Watson 1.954 0.1227
W Shapiro-Wilk 0.989 5.23e-13

All test statistics show an increase. The p-value of the Shapiro-Wilk test continues to be extremely low, rejecting the hypothesis of a normal distribution of the residuals. However, the p-value of the Breusch-Pagan test is borderline (still rejecting the hypothesis of homoscedasticity) at a significance level of 5%.

# Calcola i residui del modello
residuls_mod5new <- residuals(mod5_new)

# Crea il grafico di densità con ggplot
ggplot() +
  geom_density(aes(x = residuls_mod5new, y = ..density.., color = "Model Residuals"), size = 1) +
  geom_density(aes(x = rnorm(1000000, 
                             mean = mean(residuls_mod5new), 
                             sd = sd(residuls_mod5new)), 
                   y = ..density.., color = "Normal Distribution"), size = 1) +
  scale_color_manual(values = c("Model Residuals" = "black", "Normal Distribution" = "red"), 
                     name = "Curve") +
  ggtitle("Density of probability") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(x = "Value", y = "Density")

From a graphical analysis seems not to be present major differences between a normal distribution and the probability density of the residuals of the final model. Considering the value of the Adjusted R-squared, the results of the statistical tests and the graphical analyses, this model can be considered acceptable.

4. Predictions and Results

Since the model includes 5 descriptive variables, to predict the weight of a female newborn considering a mother in her third pregnancy who will give birth at the 39th week, the values of length and skull diameter are necessary. For these last two variables, assume the average values calculated on the dataset from which the model was derived.

# new_born 
new_born = data.frame( N.gravidanze = 3, 
                       Gestazione = 39, 
                       Lunghezza = mean(data$Lunghezza),
                       Cranio = mean(data$Cranio), 
                       Sesso = "F" ) 

# Prediction
pred_new_born <- predict(mod5_new, newdata = new_born,interval = "confidence") 

res_pred <- data.frame(
  Result = "Predicted Weight in grams",
  Fit = round(pred_new_born[1, "fit"],2),
  Lower = round(pred_new_born[1, "lwr"],2),
  Upper = round(pred_new_born[1, "upr"],2)
)

# Mostra i risultati utilizzando kable
kable(res_pred, col.names = c("", "Fit", "Lower", "Upper"), format = "html", align = "c") %>%
  kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
Fit Lower Upper
Predicted Weight in grams 3271.98 3249.09 3294.87

In scientific literature, the average weight of a newborn is reported as 3300g. This value is very close to the upper limit of the predicted confidence interval. It can be considered that the prediction is valid.

5. Model Graphical Visualization

After obtaining the model it is possible to explore the main relationships between variables with the help of graphs.

#Weight vs Gestation
ggplot(data, aes(x = Gestazione, y = Peso, color = Sesso)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  scale_color_manual(values = c("M" = "blue", "F" = "red")) +
  scale_x_continuous(breaks = seq(24, 43, by =2), limits = c(24, 43)) +
  labs(title = "Weight vs Gestation",
       x = "Gestation (week)",
       y = "Weight (g)",
       color=NULL) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5)) 

The graph shows a positive trend for both sexes in which weight increases as the number of weeks of gestation increases.

The regression lines for the two sexes appear parallell. The regression line for the female sex is shifted lower. This indicates that for both sexes, the unit increase in the variable gestation leads to the same increase in weight with the difference that on average female newborns weigh less for the same weeks of gestation.

library(scatterplot3d)
library(car)
library(rgl)


colors = c("F" = "red", "M" = "blue")

s3d = scatterplot3d(Lunghezza, Cranio, Peso, color=colors[Sesso], pch=19,
                     xlab="Length (mm)", ylab="Skull diameter (mm)", zlab="Weight (g)",
                     main="Relation between Weight, Length e Skull Diameter")

# Regression planes by sex
regF = lm(Peso ~ Lunghezza + Cranio, data = data[data$Sesso == "F", ])
regM = lm(Peso ~ Lunghezza + Cranio, data = data[data$Sesso == "M", ])

# Adding regression planes to graph
s3d$plane3d(regF, col = "red", lty = "solid")
s3d$plane3d(regM, col = "blue", lty = "dashed")

legend("topright", legend=c("F", "M"), col=c("red", "blue"), pch=19, bty="n")

The graph shows a positive effect of both descriptive variables (length and skull diameter) on the weight of the newborn. The regression plans for the two modes of the sex variable are very similar to each other indicating that the effect of the length and skull diameter is the same for both sexes.