Project Statistical Model

1) Data Loading

setwd("C:/Users/tomma/Desktop")

data <- read.csv("neonati.csv", stringsAsFactors = T)

library(ggplot2)
library(tidyr)
suppressPackageStartupMessages(library(dplyr))
library(patchwork)
library(e1071)

2) Dataset

The data come from 3 Hospitals and concern 2500 Newborns. The Dataset contains 10 variables reported below:

Age of the Mother: continuous quantitative variable
Number of sustained Pregnancies: discrete quantitative variable
Smoking Mother (0=NO, YES=1): dummy variable
Number of Weeks of Gestation: continuous quantitative variable
Weight in grams of the Newborn: continuous quantitative variable
Length in mm of the Newborn: continuous quantitative variable
Diameter in mm of the Newborn’s Skull: continuous quantitative variable
Type of Birth: qualitative variable on a nominal scale
Hospital: qualitative variable on a nominal scale
Sex of the Newborn: qualitative variable on a nominal scale

The aim of the study is to evaluate whether it is possible to predict the Weight of the newborn at Birth considering above all the relationship with the Mother’s variables, to understand whether or not these have a significant effect.

attach(data)
summary(data)

##    Anni.madre     N.gravidanze       Fumatrici        Gestazione   
##  Min.   : 0.00   Min.   : 0.0000   Min.   :0.0000   Min.   :25.00  
##  1st Qu.:25.00   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:38.00  
##  Median :28.00   Median : 1.0000   Median :0.0000   Median :39.00  
##  Mean   :28.16   Mean   : 0.9812   Mean   :0.0416   Mean   :38.98  
##  3rd Qu.:32.00   3rd Qu.: 1.0000   3rd Qu.:0.0000   3rd Qu.:40.00  
##  Max.   :46.00   Max.   :12.0000   Max.   :1.0000   Max.   :43.00  
##       Peso        Lunghezza         Cranio    Tipo.parto Ospedale   Sesso   
##  Min.   : 830   Min.   :310.0   Min.   :235   Ces: 728   osp1:816   F:1256  
##  1st Qu.:2990   1st Qu.:480.0   1st Qu.:330   Nat:1772   osp2:849   M:1244  
##  Median :3300   Median :500.0   Median :340              osp3:835           
##  Mean   :3284   Mean   :494.7   Mean   :340                                 
##  3rd Qu.:3620   3rd Qu.:510.0   3rd Qu.:350                                 
##  Max.   :4930   Max.   :565.0   Max.   :390

Here you can see the main data of the variables present in the dataset. However, this is not enough for me. I want an analysis of the statistical indices of the quantitative and qualitative variables.

data <- data[Anni.madre > 10,] #outlier adjustment
n <- nrow(data)

p1 <- ggplot(data, aes(x = Anni.madre)) +
    geom_histogram(binwidth = 1, fill = "lightblue", col = 1, lwd = 0.15) +
    scale_x_continuous(breaks = seq(10, max(data$Anni.madre, na.rm = TRUE), 5)) +
    scale_y_continuous(breaks = seq(0, 200, 25)) +
    labs(x = "Age of the Mother", y = "N. of Mothers") +
    theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

p2 <- ggplot(data, aes(x = factor(N.gravidanze))) +
    geom_bar(fill = "lightblue", col = 1) +
    scale_y_continuous(breaks = seq(200, 1000, 200)) +
    labs(x = "Number of sustained Pregnancies", y = "N. of Mothers") +
    theme_bw()

p3 <- ggplot(data, aes(x = factor(Fumatrici))) +
    geom_bar(fill = "lightblue", col = 1) +
    scale_x_discrete(labels = c("No", "Yes")) +
    labs(x = "Smoking Mother", y = "N. of Mothers") +
    theme_bw()

p4 <- ggplot(data, aes(x = Gestazione)) +
    geom_bar(fill = "lightblue", col = 1) +
    scale_x_continuous(breaks = seq(min(data$Gestazione, na.rm = TRUE), max(data$Gestazione, na.rm = TRUE), 2)) +
    scale_y_continuous(breaks = seq(0, 700, 100)) +
    labs(x = "Number of Weeks of Gestation", y = "N. of Mothers") +
    theme_bw()

p5 <- ggplot(data, aes(x = Peso)) +
    geom_density(fill = "lightblue") +
    geom_vline(xintercept = median(data$Peso, na.rm = TRUE), col = 2) +
    scale_x_continuous(breaks = seq(1000, 5000, 500)) +
    labs(x = "Weight of the Newborn (gr.)", y = "") +
    theme_bw()

(p1 + p2 + p3 + p4 + p5) +
    plot_layout(ncol = 2, guides = "collect") +
    plot_annotation(title = "Dataset Variables", tag_levels = "I", tag_suffix = ")")

p6 <- ggplot()+
    geom_density(aes(x = Lunghezza),
                 fill = "lightblue")+
    geom_vline(xintercept = median(Lunghezza), col = 2)+
    geom_label(aes(x = median(Lunghezza),
                   y = .5 * max(density(Lunghezza)$y),
                   label = median(Lunghezza)))+
    scale_x_continuous(breaks = seq(300,550,50))+
    scale_y_continuous(breaks = NULL)+
    labs(x = "Lenght of the Newborn (mm)",
         y = "")+
    theme_bw()

p7 <- ggplot()+
    geom_density(aes(x = Cranio),
                 fill = "lightblue")+
    geom_vline(xintercept = median(Cranio), col = 2)+
    geom_label(aes(x = median(Cranio),
                   y = .5 * max(density(Cranio)$y),
                   label = median(Cranio)))+
    scale_x_continuous(breaks = seq(250, 375, 25))+
    scale_y_continuous(breaks = NULL)+
    labs(x = "Diameter of the Skull (mm)",
         y = "")+
    theme_bw()

p8 <- ggplot()+
    geom_bar(aes(x = Tipo.parto),
             fill = "lightblue",
             col = 1)+
    scale_x_discrete(labels = c("Ces" = "Cesarean", "Nat" = "Normal"))+
    labs(x = "Type of Birth",
         y = "N.of Births")+
    theme_bw()

p9 <- ggplot()+
    geom_bar(aes(x = Ospedale),
             col = 1,
             fill = "lightblue")+
    scale_x_discrete(labels = c("osp1" = "Hospital1", "osp2" = "Hospital2", "osp3" = "Hospital3"))+
    labs(x = "Hospital",
         y = "N.of Births")+
    theme_bw()

p10 <- ggplot()+
    geom_bar(aes(x = Sesso),
             col = 1,
             fill = c("pink", "deepskyblue"))+
    labs(x = "Sex of the Newborn",
         y = "N. of newborns")+
    theme_bw()

(p6 + p7 + p8 + p9 + p10) +
    plot_layout(ncol = 2, guides = "collect") +
    plot_annotation(title = "Dataset Variables", tag_levels = "I", tag_suffix = ")")

indici.statistici <- function(x){
    
    indici_posizione <- summary(x)
    
    
    indici_dispersione <- c("Intervallo" = max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
                            "IQR" = IQR(x, na.rm = TRUE),
                            "dev.st" = sd(x, na.rm = TRUE),
                            "var" = var(x, na.rm = TRUE),
                            "CV" = sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE) * 100)
    
    
    indici_forma <- c("Asimmetria" = moments::skewness(x, na.rm = TRUE),
                      "Curtosi" = moments::kurtosis(x, na.rm = TRUE) - 3)
    
    
    indici <- c(indici_posizione, indici_dispersione, indici_forma)
    return(indici)
}

qualitative_vars <- c("Fumatrici", "Tipo.parto", "Ospedale", "Sesso")
quantitative_vars <- names(data)[!(names(data) %in% qualitative_vars)]

indici_statistici_df <- data.frame()

for (x in quantitative_vars) {
    
    idx <- indici.statistici(data[[x]])
    
    df_idx <- data.frame(Indice = names(idx), Valore = round(idx, 2), Variabile = x)
    
    indici_statistici_df <- rbind(indici_statistici_df, df_idx)
}

write.csv(indici_statistici_df, "indici_statistici.csv", row.names = FALSE)

freq_tables_list <- list()

for (x in qualitative_vars) {
    
    freq_table <- table(data[[x]])
    
    df_freq <- as.data.frame(freq_table)
    df_freq$Variabile <- x
    
    freq_tables_list[[x]] <- df_freq
}

final_freq_tables <- do.call(rbind, freq_tables_list)

write.csv(final_freq_tables, "frequenze_qualitative.csv", row.names = FALSE)

View(indici_statistici_df)
View(final_freq_tables)

print(indici_statistici_df)

##                 Indice    Valore    Variabile
## Min.              Min.     13.00   Anni.madre
## 1st Qu.        1st Qu.     25.00   Anni.madre
## Median          Median     28.00   Anni.madre
## Mean              Mean     28.19   Anni.madre
## 3rd Qu.        3rd Qu.     32.00   Anni.madre
## Max.              Max.     46.00   Anni.madre
## Intervallo  Intervallo     33.00   Anni.madre
## IQR                IQR      7.00   Anni.madre
## dev.st          dev.st      5.22   Anni.madre
## var                var     27.22   Anni.madre
## CV                  CV     18.51   Anni.madre
## Asimmetria  Asimmetria      0.15   Anni.madre
## Curtosi        Curtosi     -0.11   Anni.madre
## Min.1             Min.      0.00 N.gravidanze
## 1st Qu.1       1st Qu.      0.00 N.gravidanze
## Median1         Median      1.00 N.gravidanze
## Mean1             Mean      0.98 N.gravidanze
## 3rd Qu.1       3rd Qu.      1.00 N.gravidanze
## Max.1             Max.     12.00 N.gravidanze
## Intervallo1 Intervallo     12.00 N.gravidanze
## IQR1               IQR      1.00 N.gravidanze
## dev.st1         dev.st      1.28 N.gravidanze
## var1               var      1.64 N.gravidanze
## CV1                 CV    130.50 N.gravidanze
## Asimmetria1 Asimmetria      2.51 N.gravidanze
## Curtosi1       Curtosi     10.98 N.gravidanze
## Min.2             Min.     25.00   Gestazione
## 1st Qu.2       1st Qu.     38.00   Gestazione
## Median2         Median     39.00   Gestazione
## Mean2             Mean     38.98   Gestazione
## 3rd Qu.2       3rd Qu.     40.00   Gestazione
## Max.2             Max.     43.00   Gestazione
## Intervallo2 Intervallo     18.00   Gestazione
## IQR2               IQR      2.00   Gestazione
## dev.st2         dev.st      1.87   Gestazione
## var2               var      3.49   Gestazione
## CV2                 CV      4.79   Gestazione
## Asimmetria2 Asimmetria     -2.07   Gestazione
## Curtosi2       Curtosi      8.26   Gestazione
## Min.3             Min.    830.00         Peso
## 1st Qu.3       1st Qu.   2990.00         Peso
## Median3         Median   3300.00         Peso
## Mean3             Mean   3284.18         Peso
## 3rd Qu.3       3rd Qu.   3620.00         Peso
## Max.3             Max.   4930.00         Peso
## Intervallo3 Intervallo   4100.00         Peso
## IQR3               IQR    630.00         Peso
## dev.st3         dev.st    525.23         Peso
## var3               var 275865.90         Peso
## CV3                 CV     15.99         Peso
## Asimmetria3 Asimmetria     -0.65         Peso
## Curtosi3       Curtosi      2.03         Peso
## Min.4             Min.    310.00    Lunghezza
## 1st Qu.4       1st Qu.    480.00    Lunghezza
## Median4         Median    500.00    Lunghezza
## Mean4             Mean    494.70    Lunghezza
## 3rd Qu.4       3rd Qu.    510.00    Lunghezza
## Max.4             Max.    565.00    Lunghezza
## Intervallo4 Intervallo    255.00    Lunghezza
## IQR4               IQR     30.00    Lunghezza
## dev.st4         dev.st     26.33    Lunghezza
## var4               var    693.21    Lunghezza
## CV4                 CV      5.32    Lunghezza
## Asimmetria4 Asimmetria     -1.51    Lunghezza
## Curtosi4       Curtosi      6.48    Lunghezza
## Min.5             Min.    235.00       Cranio
## 1st Qu.5       1st Qu.    330.00       Cranio
## Median5         Median    340.00       Cranio
## Mean5             Mean    340.03       Cranio
## 3rd Qu.5       3rd Qu.    350.00       Cranio
## Max.5             Max.    390.00       Cranio
## Intervallo5 Intervallo    155.00       Cranio
## IQR5               IQR     20.00       Cranio
## dev.st5         dev.st     16.43       Cranio
## var5               var    269.93       Cranio
## CV5                 CV      4.83       Cranio
## Asimmetria5 Asimmetria     -0.79       Cranio
## Curtosi5       Curtosi      2.94       Cranio

print(final_freq_tables)

##              Var1 Freq  Variabile
## Fumatrici.1     0 2394  Fumatrici
## Fumatrici.2     1  104  Fumatrici
## Tipo.parto.1  Ces  728 Tipo.parto
## Tipo.parto.2  Nat 1770 Tipo.parto
## Ospedale.1   osp1  816   Ospedale
## Ospedale.2   osp2  848   Ospedale
## Ospedale.3   osp3  834   Ospedale
## Sesso.1         F 1255      Sesso
## Sesso.2         M 1243      Sesso

Here the graphs related to each variable are shown. We can say that the most of the women examined are between 20 and 30 years old. Most of the woman have no previous pregnancies. 95% of the women examined are non-smokers. The number of weeks of gestation ranges from 25 to 43 with an average of about 39 weeks. The babies born have an average weight of about 3.3kg with a maximum of 4.9kg and minimum of 0.830kg.

3) Hypothesis

On average, the Weight and Height in the newborn population in Italy are 3.3kg and 50cm respectively. (source: https://www.ospedalebambinogesu.it/da-0-a-30-giorni-come-si-presenta-e-come-cresce-80012/#:~:text=In%20media%20il%20peso%20nascita,pari%20mediamente%20a%2050%20centimetri)

t.test(Peso, mu = 3300, conf.level = 0.95, alternative = "two.sided")

## 
##  One Sample t-test
## 
## data:  Peso
## t = -1.516, df = 2499, p-value = 0.1296
## alternative hypothesis: true mean is not equal to 3300
## 95 percent confidence interval:
##  3263.490 3304.672
## sample estimates:
## mean of x 
##  3284.081

t.test(Lunghezza, mu = 500, conf.level = 0.95, alternative = "two.sided")

## 
##  One Sample t-test
## 
## data:  Lunghezza
## t = -10.084, df = 2499, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 500
## 95 percent confidence interval:
##  493.6598 495.7242
## sample estimates:
## mean of x 
##   494.692

In the first case (newborn weight) the p-value is 0.13, so for a significance level of α = 0.05 I do not reject the null hypothesis. I conclude that the length of this sample of newborns is not significantly different from that of the population. In the second case (newborn length) the p-value is very small (of the order of 10−16), so I reject the null hypothesis. I conclude that the weight of this sample of newborns is significantly different from that of the population.

t.test(Peso ~ Sesso)

## 
##  Welch Two Sample t-test
## 
## data:  Peso by Sesso
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M 
##        3161.132        3408.215

t.test(Lunghezza ~ Sesso)

## 
##  Welch Two Sample t-test
## 
## data:  Lunghezza by Sesso
## t = -9.582, df = 2459.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -11.929470  -7.876273
## sample estimates:
## mean in group F mean in group M 
##        489.7643        499.6672

t.test(Cranio ~ Sesso)

## 
##  Welch Two Sample t-test
## 
## data:  Cranio by Sesso
## t = -7.4102, df = 2491.4, p-value = 1.718e-13
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -6.089912 -3.541270
## sample estimates:
## mean in group F mean in group M 
##        337.6330        342.4486

chisq.test(table(Tipo.parto, Sesso))

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  table(Tipo.parto, Sesso)
## X-squared = 0.039147, df = 1, p-value = 0.8432

I performed independent samples t-tests to verify significant differences between the two sexes, accompanying the numerical tests with boxplots to have a graphical confirmation. The differences in weight, length, skull diameter are all significant between the two sexes (always considering a significance level of 0.05). On the other hand, the type of birth (natural or cesarean) is independent of the sex of the newborn. In this case, since the variable Tipo.parto is qualitative, I calculated the frequency table between the variables involved and perform a chi-square test to test the hypothesis of independence.

tipo_parto_per_ospedale <- table(Ospedale, Tipo.parto)
prop.test(tipo_parto_per_ospedale)

## 
##  3-sample test for equality of proportions without continuity correction
## 
## data:  tipo_parto_per_ospedale
## X-squared = 1.0972, df = 2, p-value = 0.5778
## alternative hypothesis: two.sided
## sample estimates:
##    prop 1    prop 2    prop 3 
## 0.2965686 0.2991755 0.2778443

chisq.test(tipo_parto_per_ospedale)

## 
##  Pearson's Chi-squared test
## 
## data:  tipo_parto_per_ospedale
## X-squared = 1.0972, df = 2, p-value = 0.5778

I tested another hypothesis: there were rumors that some hospitals perform more cesarean sections, so I took steps to verify it. Since I had to test a proportion, I used prop.test() on the frequency table between the variables involved (I should point out, however, that the chi-square test gives the exact same result).The p-value is very high, so I do not reject the null hypothesis. I conclude that there are no significant differences in the type of birth depending on the hospital.

4) Analysis

moments::skewness(Peso)

## [1] -0.6470308

moments::kurtosis(Peso) - 3

## [1] 2.031532

shapiro.test(Peso)

## 
##  Shapiro-Wilk normality test
## 
## data:  Peso
## W = 0.97066, p-value < 2.2e-16

First of all, I checked that the response variable (Weight) is approximately normal, by looking at the shape indices and performing a Shapiro-Wilk test. I checked this in advance because any deviations from normality of the response variable often also affect the residuals. I noticed that the response variable (Weight) does not follow a normal distribution. In fact, the Shapiro-Wilk test clearly rejects the null hypothesis of normality, and we can note that the distribution is particularly leptokurtic (pointed), with a kurtosis value of 2.03. I know that in this case it would be appropriate to use a GLM, but I try with a linear regression model anyway.

panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
    
    r <- abs(cor(x, y))
    
    txt <- format(c(r, 0.123456789), digits = digits)[1]
    txt <- paste0(prefix, txt)  

     
    if (missing(cex.cor)) cex.cor <- 0.8 / strwidth(txt)
    
    
    text(0.5, 0.5, txt, cex = cex.cor * r)
}


data_quantitative <- data %>% select(all_of(quantitative_vars))


pairs(data_quantitative, upper.panel = panel.smooth, lower.panel = panel.cor)

boxplot(Peso ~ Sesso)

t.test(Peso ~ Sesso)

## 
##  Welch Two Sample t-test
## 
## data:  Peso by Sesso
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##  -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M 
##        3161.132        3408.215

boxplot(Peso ~ Fumatrici)

t.test(Peso ~ Fumatrici)

## 
##  Welch Two Sample t-test
## 
## data:  Peso by Fumatrici
## t = 1.034, df = 114.1, p-value = 0.3033
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -45.61354 145.22674
## sample estimates:
## mean in group 0 mean in group 1 
##        3286.153        3236.346

boxplot(Peso ~ Tipo.parto)

t.test(Peso ~ Tipo.parto)

## 
##  Welch Two Sample t-test
## 
## data:  Peso by Tipo.parto
## t = -0.12968, df = 1493, p-value = 0.8968
## alternative hypothesis: true difference in means between group Ces and group Nat is not equal to 0
## 95 percent confidence interval:
##  -46.27992  40.54037
## sample estimates:
## mean in group Ces mean in group Nat 
##          3282.047          3284.916

boxplot(Peso ~ Ospedale)

pairwise.t.test(Peso, Ospedale, paired = F, pool.sd = T, p.adjust.method = "bonferroni")

## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  Peso and Ospedale 
## 
##      osp1 osp2
## osp2 1.00 -   
## osp3 0.33 0.33
## 
## P value adjustment method: bonferroni

I investigated the two-by-two relationships between quantitative variables, both numerically (correlation matrix) and graphically. At first glance, length, skull, gestation appear significant. I investigated the relationships between the qualitative variables and the response variable. Only Sex is significant (instead the variable Smokers - which indicates whether a mother is a smoker or not - is not significant, unlike what one might expect, with a p-value of 0.30).

mod1 <- lm(Peso ~ . , data = data)
summary(mod1)

## 
## Call:
## lm(formula = Peso ~ ., data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1123.26  -181.53   -14.45   161.05  2611.89 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6735.7960   141.4790 -47.610  < 2e-16 ***
## Anni.madre        0.8018     1.1467   0.699   0.4845    
## N.gravidanze     11.3812     4.6686   2.438   0.0148 *  
## Fumatrici       -30.2741    27.5492  -1.099   0.2719    
## Gestazione       32.5773     3.8208   8.526  < 2e-16 ***
## Lunghezza        10.2922     0.3009  34.207  < 2e-16 ***
## Cranio           10.4722     0.4263  24.567  < 2e-16 ***
## Tipo.partoNat    29.6335    12.0905   2.451   0.0143 *  
## Ospedaleosp2    -11.0912    13.4471  -0.825   0.4096    
## Ospedaleosp3     28.2495    13.5054   2.092   0.0366 *  
## SessoM           77.5723    11.1865   6.934 5.18e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274 on 2487 degrees of freedom
## Multiple R-squared:  0.7289, Adjusted R-squared:  0.7278 
## F-statistic: 668.7 on 10 and 2487 DF,  p-value: < 2.2e-16

BIC(mod1)

## [1] 35215.45

The model has R2adj = 0.72, which is a reasonable value. This first model includes 10 variables (9 actually, but for the Hospital factor two are created because it has 3 modes). Some have very high p-values, so they can most likely be removed from the model to streamline it without losing a significant amount of explained variance.

stepwise_mod <- MASS::stepAIC(mod1, direction = "both", k = log(n))

## Start:  AIC=28118.61
## Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza + 
##     Cranio + Tipo.parto + Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Anni.madre    1     36710 186779904 28111
## - Fumatrici     1     90677 186833870 28112
## - Ospedale      2    687555 187430749 28112
## - N.gravidanze  1    446244 187189438 28117
## - Tipo.parto    1    451073 187194266 28117
## <none>                      186743194 28119
## - Sesso         1   3610705 190353899 28159
## - Gestazione    1   5458852 192202046 28183
## - Cranio        1  45318506 232061700 28654
## - Lunghezza     1  87861708 274604902 29074
## 
## Step:  AIC=28111.28
## Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio + 
##     Tipo.parto + Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Fumatrici     1     91599 186871503 28105
## - Ospedale      2    693914 187473818 28105
## - Tipo.parto    1    452049 187231953 28110
## <none>                      186779904 28111
## - N.gravidanze  1    631082 187410986 28112
## + Anni.madre    1     36710 186743194 28119
## - Sesso         1   3617809 190397713 28151
## - Gestazione    1   5424800 192204704 28175
## - Cranio        1  45569477 232349381 28649
## - Lunghezza     1  87852027 274631931 29066
## 
## Step:  AIC=28104.68
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto + 
##     Ospedale + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Ospedale      2    702925 187574428 28098
## - Tipo.parto    1    444404 187315907 28103
## <none>                      186871503 28105
## - N.gravidanze  1    608136 187479640 28105
## + Fumatrici     1     91599 186779904 28111
## + Anni.madre    1     37633 186833870 28112
## - Sesso         1   3601860 190473363 28145
## - Gestazione    1   5358199 192229702 28168
## - Cranio        1  45613331 232484834 28642
## - Lunghezza     1  88259386 275130889 29063
## 
## Step:  AIC=28098.41
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto + 
##     Sesso
## 
##                Df Sum of Sq       RSS   AIC
## - Tipo.parto    1    467626 188042054 28097
## <none>                      187574428 28098
## - N.gravidanze  1    648873 188223301 28099
## + Ospedale      2    702925 186871503 28105
## + Fumatrici     1    100610 187473818 28105
## + Anni.madre    1     44184 187530244 28106
## - Sesso         1   3644818 191219246 28139
## - Gestazione    1   5457887 193032315 28162
## - Cranio        1  45747094 233321522 28636
## - Lunghezza     1  87955701 275530129 29051
## 
## Step:  AIC=28096.81
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## 
##                Df Sum of Sq       RSS   AIC
## <none>                      188042054 28097
## - N.gravidanze  1    621053 188663107 28097
## + Tipo.parto    1    467626 187574428 28098
## + Ospedale      2    726146 187315907 28103
## + Fumatrici     1     92548 187949505 28103
## + Anni.madre    1     45366 187996688 28104
## - Sesso         1   3650790 191692844 28137
## - Gestazione    1   5477493 193519547 28161
## - Cranio        1  46098547 234140601 28637
## - Lunghezza     1  87532691 275574744 29044

summary(stepwise_mod)

## 
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + 
##     Sesso, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.37  -180.98   -15.57   163.69  2639.09 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6681.7251   135.8036 -49.201  < 2e-16 ***
## N.gravidanze    12.4554     4.3416   2.869  0.00415 ** 
## Gestazione      32.3827     3.8008   8.520  < 2e-16 ***
## Lunghezza       10.2455     0.3008  34.059  < 2e-16 ***
## Cranio          10.5410     0.4265  24.717  < 2e-16 ***
## SessoM          77.9807    11.2111   6.956 4.47e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1327 on 5 and 2492 DF,  p-value: < 2.2e-16

mod2 <- update(mod1, ~.-Anni.madre)
summary(mod2)

## 
## Call:
## lm(formula = Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + 
##     Cranio + Tipo.parto + Ospedale + Sesso, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1113.82  -180.30   -16.22   160.66  2616.32 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6708.6189   136.0211 -49.320  < 2e-16 ***
## N.gravidanze     12.5833     4.3400   2.899  0.00377 ** 
## Fumatrici       -30.4268    27.5455  -1.105  0.26944    
## Gestazione       32.2996     3.7997   8.501  < 2e-16 ***
## Lunghezza        10.2916     0.3008  34.209  < 2e-16 ***
## Cranio           10.4874     0.4257  24.638  < 2e-16 ***
## Tipo.partoNat    29.6654    12.0892   2.454  0.01420 *  
## Ospedaleosp2    -10.9509    13.4442  -0.815  0.41541    
## Ospedaleosp3     28.5171    13.4986   2.113  0.03474 *  
## SessoM           77.6452    11.1849   6.942 4.91e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274 on 2488 degrees of freedom
## Multiple R-squared:  0.7288, Adjusted R-squared:  0.7279 
## F-statistic: 743.1 on 9 and 2488 DF,  p-value: < 2.2e-16

anova(mod2, mod1)

## Analysis of Variance Table
## 
## Model 1: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio + 
##     Tipo.parto + Ospedale + Sesso
## Model 2: Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza + 
##     Cranio + Tipo.parto + Ospedale + Sesso
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1   2488 186779904                           
## 2   2487 186743194  1     36710 0.4889 0.4845

BIC(mod2, mod1)

##      df      BIC
## mod2 11 35208.12
## mod1 12 35215.45

car::vif(mod2)

##                  GVIF Df GVIF^(1/(2*Df))
## N.gravidanze 1.027985  1        1.013896
## Fumatrici    1.007363  1        1.003675
## Gestazione   1.677351  1        1.295126
## Lunghezza    2.086862  1        1.444597
## Cranio       1.626789  1        1.275456
## Tipo.parto   1.004213  1        1.002104
## Ospedale     1.003460  2        1.000864
## Sesso        1.040653  1        1.020124

mod3 <- update(mod2, ~.-Ospedale)
summary(mod3)

## 
## Call:
## lm(formula = Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + 
##     Cranio + Tipo.parto + Sesso, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1129.88  -181.36   -16.24   160.63  2634.62 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6708.8092   136.0640 -49.306  < 2e-16 ***
## N.gravidanze     12.9927     4.3439   2.991  0.00281 ** 
## Fumatrici       -31.8823    27.5803  -1.156  0.24780    
## Gestazione       32.5970     3.8039   8.569  < 2e-16 ***
## Lunghezza        10.2684     0.3011  34.098  < 2e-16 ***
## Cranio           10.5015     0.4262  24.637  < 2e-16 ***
## Tipo.partoNat    30.4244    12.1041   2.514  0.01201 *  
## SessoM           78.1031    11.1998   6.974 3.94e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.4 on 2490 degrees of freedom
## Multiple R-squared:  0.7278, Adjusted R-squared:  0.7271 
## F-statistic: 951.3 on 7 and 2490 DF,  p-value: < 2.2e-16

BIC(mod3, mod2)

##      df      BIC
## mod3  9 35201.73
## mod2 11 35208.12

anova(mod3, mod2)

## Analysis of Variance Table
## 
## Model 1: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio + 
##     Tipo.parto + Sesso
## Model 2: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio + 
##     Tipo.parto + Ospedale + Sesso
##   Res.Df       RSS Df Sum of Sq      F   Pr(>F)   
## 1   2490 187473818                                
## 2   2488 186779904  2    693914 4.6216 0.009921 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mod4 <- update(mod3, ~.-Fumatrici)
summary(mod4)

## 
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + 
##     Tipo.parto + Sesso, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1129.14  -181.97   -16.26   160.95  2638.18 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -6708.0171   136.0715 -49.298  < 2e-16 ***
## N.gravidanze     12.7356     4.3385   2.935  0.00336 ** 
## Gestazione       32.3253     3.7969   8.514  < 2e-16 ***
## Lunghezza        10.2833     0.3009  34.177  < 2e-16 ***
## Cranio           10.5063     0.4263  24.648  < 2e-16 ***
## Tipo.partoNat    30.1601    12.1027   2.492  0.01277 *  
## SessoM           77.9171    11.1994   6.957 4.42e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.4 on 2491 degrees of freedom
## Multiple R-squared:  0.7277, Adjusted R-squared:  0.727 
## F-statistic:  1109 on 6 and 2491 DF,  p-value: < 2.2e-16

BIC(mod4, mod3)

##      df      BIC
## mod4  8 35195.25
## mod3  9 35201.73

anova(mod4, mod3)

## Analysis of Variance Table
## 
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto + 
##     Sesso
## Model 2: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio + 
##     Tipo.parto + Sesso
##   Res.Df       RSS Df Sum of Sq      F Pr(>F)
## 1   2491 187574428                           
## 2   2490 187473818  1    100610 1.3363 0.2478

mod5 <- update(mod4, ~.-Tipo.parto)
summary(mod5)

## 
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + 
##     Sesso, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.37  -180.98   -15.57   163.69  2639.09 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6681.7251   135.8036 -49.201  < 2e-16 ***
## N.gravidanze    12.4554     4.3416   2.869  0.00415 ** 
## Gestazione      32.3827     3.8008   8.520  < 2e-16 ***
## Lunghezza       10.2455     0.3008  34.059  < 2e-16 ***
## Cranio          10.5410     0.4265  24.717  < 2e-16 ***
## SessoM          77.9807    11.2111   6.956 4.47e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1327 on 5 and 2492 DF,  p-value: < 2.2e-16

BIC(mod4, mod5)

##      df      BIC
## mod4  8 35195.25
## mod5  7 35193.65

anova(mod4, mod5)

## Analysis of Variance Table
## 
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto + 
##     Sesso
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
##   Res.Df       RSS Df Sum of Sq      F  Pr(>F)  
## 1   2491 187574428                              
## 2   2492 188042054 -1   -467626 6.2101 0.01277 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

mod6 <- update(mod5, ~.-N.gravidanze)
summary(mod6)

## 
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Sesso, 
##     data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1138.1  -184.4   -17.4   163.3  2626.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -6651.6732   135.5952 -49.055  < 2e-16 ***
## Gestazione     31.3262     3.7884   8.269  < 2e-16 ***
## Lunghezza      10.2024     0.3009  33.909  < 2e-16 ***
## Cranio         10.6706     0.4247  25.126  < 2e-16 ***
## SessoM         79.1027    11.2205   7.050 2.31e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 275.1 on 2493 degrees of freedom
## Multiple R-squared:  0.7261, Adjusted R-squared:  0.7257 
## F-statistic:  1652 on 4 and 2493 DF,  p-value: < 2.2e-16

BIC(mod5, mod6)

##      df      BIC
## mod5  7 35193.65
## mod6  6 35194.06

anova(mod5, mod6)

## Analysis of Variance Table
## 
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ Gestazione + Lunghezza + Cranio + Sesso
##   Res.Df       RSS Df Sum of Sq      F   Pr(>F)   
## 1   2492 188042054                                
## 2   2493 188663107 -1   -621053 8.2304 0.004154 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

car::vif(mod5)

## N.gravidanze   Gestazione    Lunghezza       Cranio        Sesso 
##     1.023462     1.669779     2.075747     1.624568     1.040184

mod10 <- update(mod5, ~.+I(Gestazione^2))
summary(mod10)

## 
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + 
##     Sesso + I(Gestazione^2), data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1144.0  -181.5   -12.9   165.8  2661.9 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -4646.7158   898.6322  -5.171 2.52e-07 ***
## N.gravidanze       12.5489     4.3381   2.893  0.00385 ** 
## Gestazione        -81.2309    49.7402  -1.633  0.10257    
## Lunghezza          10.3502     0.3040  34.045  < 2e-16 ***
## Cranio             10.6376     0.4282  24.843  < 2e-16 ***
## SessoM             75.7563    11.2435   6.738 1.99e-11 ***
## I(Gestazione^2)     1.5168     0.6621   2.291  0.02206 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.5 on 2491 degrees of freedom
## Multiple R-squared:  0.7276, Adjusted R-squared:  0.7269 
## F-statistic:  1109 on 6 and 2491 DF,  p-value: < 2.2e-16

BIC(mod10, mod5)

##       df      BIC
## mod10  8 35196.21
## mod5   7 35193.65

anova(mod10, mod5)

## Analysis of Variance Table
## 
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso + 
##     I(Gestazione^2)
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
##   Res.Df       RSS Df Sum of Sq      F  Pr(>F)  
## 1   2491 187646731                              
## 2   2492 188042054 -1   -395323 5.2479 0.02206 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Before doing manual tests, I looked at what I got from the function that implements the automatic stepwise procedure - using the BIC criterion (preferable to the AIC as it does not overestimate overparameterized models). I noticed that that model includes the variables N.pregnancies, Gestation, Length, Skull, Sex. Then I tried to proceed manually step by step. In the model with all the variables I noticed that Years.mother has a p-value of 0.43. Removing it, R2adj remains unchanged, the BIC has decreased and the ANOVA test did not indicate a significant difference in explained variance (p-value of 0.43). So I removed it without hesitation. The variable Hospital has two p-values (because it is qualitative with 3 modalities) of 0.40 and 0.03. Removing it, R2adj went down not even a single precentage point, BIC went down and the ANOVA test showed a significant difference (p-value of 0.009). But for such a small decrease in R2adj for removing two variables, I chose not to keep it. The variable Smokers has a p-value of 0.25. Removing it, R2adj went down only 0.1%, BIC went down and the ANOVA test showed a significant difference (p-value of 0.01). But for such a low decrease in R2adj in the face of removing a variable, I chose not to keep it. The variable in the model with the highest p-value was N.pregnancies with a p-value of 0.004, I tried to remove it. It turned out that R2 adj decreased by not even one percentage point, but the BIC for the first time increased. The ANOVA test indicated a significant difference from the previous model (p-value of 0.004). Given this, I chose to keep the variable in the model and stop there, because all the other variables have p-values of the order of 10−12 or less. I checked that there are no problems of multicollinearity (all VIFs are less than 5 so no problem). From the graphs created previously I noted a possible quadratic effect between Gestation and Y. Trying to insert it into the model, I saw that R2 adj only increased by 0.04%, the p-value of Gestation was 0.02 (but that of Gestation went from 0.3% to 11%), the BIC increases and the ANOVA indicated a significant difference between the two models. Even if graphically it seems to me that the quadratic effect could be relevant, the numerical tests did not indicate a significant improvement, so I chose not to add it to the model. I chose not to consider interactions between the variables, because thinking about it mentally I do not imagine any relationship that could be significant between the variables in the final model (excluding the anthropometric control variables).

par(mfrow = c(2,2))
plot(mod5)

shapiro.test(residuals(mod5))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(mod5)
## W = 0.97414, p-value < 2.2e-16

plot(density(residuals(mod5)))
ggplot()+
    geom_density(aes(x = residuals(mod5)))+
    geom_vline(xintercept = mean(residuals(mod5)), col = 2)+
    theme_bw()+
    labs(x = "Residues", y = "")+
    theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())

I checked, both graphically and numerically, whether the model met all the assumptions about the residuals (normality with mean 0, homoscedasticity, independence). In the first graph (which relates the estimates obtained by the model and the respective residuals), most of the points are a random cloud of points around a mean of 0, which would mean everything is normal. The only visible problem is that the left tail deviates upwards (i.e., the model underestimates the weight predictions of light newborns). In the second graph (which compares the quantiles of Weight with the quantiles of the normal) most of the points in the graph align along the line y = x, indicating that the residuals align along a normal. Again, you can see that the tails have a different pattern than most of the points. In the third graph we see, similarly to the first, mainly a random cloud of points around a value of y (this would indicate a constant variance, respecting the hypothesis of homoscedasticity). Here too, however, the left tail presents the problem of deviating a little upwards. In the fourth graph (outliers) only one value exceeds the warning threshold (Cook’s distance > 0.5) and none exceeds the alarm threshold of 1, so graphically no problems with outliers are shown. The Shapiro-Wilk test clearly rejects the normality of the residuals, but in the Q-Q plot we had predicted a reasonable possibility of normality. It can be seen that the distribution of the residuals resembles a normal one; it is mainly symmetric (with the right tail being particularly long), but it seems decidedly leptokurtic. In fact, the kurtosis, calculated with moments::kurtosis(residuals(mod5)) - 3, is 4.16, quite high.

lmtest::dwtest(mod5)

## 
##  Durbin-Watson test
## 
## data:  mod5
## DW = 1.9532, p-value = 0.1209
## alternative hypothesis: true autocorrelation is greater than 0

lmtest::bptest(mod5)

## 
##  studentized Breusch-Pagan test
## 
## data:  mod5
## BP = 90.297, df = 5, p-value < 2.2e-16

cook <- cooks.distance(mod5)
cook[cook > 0.5]

##      1551 
## 0.8297645

car::outlierTest(mod5)

##       rstudent unadjusted p-value Bonferroni p
## 1551 10.046230         2.6345e-23   6.5810e-20
## 155   5.025345         5.3818e-07   1.3444e-03
## 1306  4.824963         1.4848e-06   3.7092e-03

lev <- hatvalues(mod5)
p <- sum(lev)
soglia <- 2 * p/n
plot(lev)
abline(h=soglia,col=2,lwd=2)

lev[lev > soglia]; length(lev[lev > soglia])

##          13          15          34          67          89          96 
## 0.005632888 0.007056483 0.006748974 0.005896189 0.012817563 0.005351869 
##         101         106         131         134         151         155 
## 0.007528094 0.014487904 0.007237755 0.007553514 0.010889886 0.007209736 
##         161         189         190         204         205         206 
## 0.020341133 0.004894598 0.005366831 0.014494112 0.005351828 0.009482503 
##         220         294         305         310         312         315 
## 0.007403130 0.005914307 0.005445189 0.028815300 0.013173723 0.005386836 
##         378         440         442         445         486         492 
## 0.015942080 0.005405733 0.007725708 0.007511227 0.005165714 0.008274392 
##         497         516         582         587         592         614 
## 0.005167522 0.013080707 0.011667393 0.008415872 0.006385013 0.005300091 
##         638         656         657         684         697         702 
## 0.006693153 0.005934494 0.005323517 0.008825948 0.005864920 0.005203163 
##         729         748         750         757         765         805 
## 0.005024448 0.008567254 0.006943755 0.008146487 0.006076412 0.014358658 
##         828         893         895         913         928         946 
## 0.007180179 0.005076266 0.005297643 0.005574016 0.022745493 0.006910965 
##         947         956         985        1008        1014        1049 
## 0.008409518 0.007791530 0.007040035 0.005343994 0.008474086 0.004956561 
##        1067        1091        1106        1130        1166        1181 
## 0.008467035 0.008940030 0.005967511 0.031739177 0.005513581 0.005678043 
##        1188        1200        1219        1238        1248        1273 
## 0.006481533 0.005493009 0.030697778 0.005912496 0.014631359 0.007089553 
##        1291        1293        1311        1321        1325        1356 
## 0.006118589 0.006074423 0.009626391 0.009295311 0.004857340 0.005306940 
##        1357        1385        1395        1400        1402        1411 
## 0.006967713 0.012641828 0.005129491 0.005932524 0.004816704 0.008049539 
##        1420        1428        1429        1450        1505        1551 
## 0.005156670 0.008195421 0.021758961 0.015106684 0.013334256 0.048802841 
##        1553        1556        1573        1593        1606        1610 
## 0.008507417 0.005923458 0.005049368 0.005624961 0.005009312 0.008725821 
##        1617        1619        1628        1686        1693        1701 
## 0.004869997 0.015069038 0.005070717 0.009356578 0.005079185 0.010846383 
##        1712        1718        1727        1735        1780        1781 
## 0.006993461 0.006961226 0.013303625 0.004886226 0.025544997 0.016833696 
##        1809        1827        1868        1892        1962        1967 
## 0.008711082 0.006068655 0.005206137 0.005333985 0.005541287 0.005339716 
##        1977        2037        2040        2046        2086        2089 
## 0.006928794 0.004890003 0.011504028 0.005471894 0.013194769 0.006293791 
##        2098        2114        2115        2120        2140        2146 
## 0.005097113 0.013318873 0.011779730 0.018667270 0.006244737 0.005804705 
##        2148        2149        2157        2175        2200        2215 
## 0.007930323 0.013589469 0.005910248 0.032531736 0.011679031 0.004894044 
##        2216        2220        2221        2224        2225        2244 
## 0.008120291 0.005415586 0.021633907 0.005841119 0.005593576 0.006929530 
##        2257        2307        2317        2318        2337        2359 
## 0.006171101 0.013972901 0.007677230 0.004834368 0.005230839 0.010068259 
##        2408        2422        2436        2437        2452        2458 
## 0.009702475 0.021536269 0.004986609 0.023951342 0.023848222 0.008509142 
##        2471        2478 
## 0.020905930 0.005777111

## [1] 152

The only value previously identified with Cook’s distance > 0.5 is observation 1551. Examining it, I see that Length=315 and Weight=4370. Comparing the weight with that of babies of similar length, I get extremely different numbers (around 1000) so it was probably a typing error. Since no other value exceeds the warning threshold, and no value reaches the alarm threshold, there is no problem with outliers. However, for completeness I examine separately extreme values in the response variable and in the regressor space (outlier values and leverage). The outlier Test function of the package “car” returns 3 outliers (observations 1551, 155 and 1306). Instead, the manual calculation of leverage values returns 152 values (6% of the total of 2500), which correspond to all observations that are far from the others in the regressor space.

newdata_nat <- data.frame(N.gravidanze=2, Gestazione=39, Lunghezza=mean(Lunghezza), Cranio=mean(Cranio), Tipo.parto=factor("Nat"), Sesso=factor("F"))
newdata_ces <- data.frame(N.gravidanze=2, Gestazione=39, Lunghezza=mean(Lunghezza), Cranio=mean(Cranio), Tipo.parto=factor("Ces"), Sesso=factor("F"))
pred1 <- predict(mod5, newdata = newdata_nat)
pred2 <- predict(mod5, newdata = newdata_ces)
prediction = 1 / n * (pred1 * dim(data[Tipo.parto == "Nat",])[1]
                      + pred2 * dim(data[Tipo.parto == "Ces",])[1])

print(paste("Weighted Forecast is:", prediction))

## [1] "Weighted Forecast is: 3261.29732763148"

Overall, the model’s R2 adj is 0.72, meaning that 72% of the variability of the Weight is explained by the model. There are no alarming outliers. The residuals have a mean of 0 and are not autocorrelated. In light of this, even if there is heteroskedasticity and the Shapiro-Wilk test rejects normality, the model seems good enough to explain the variability of the response variable. I make a prediction for the weight of a newborn baby: the mother is in her third pregnancy (so N. pregnancies will be 2, i.e. the previous ones) and will give birth in the 39th week. No measurements from the ultrasound are available. To make a prediction in R I have to pass a data frame with all the variables, even those that I don’t have. For the quantitative ones (Length, Skull) I can simply use the average of them, while for Birth Type I choose to consider both modalities and make the weighted average. The result is a weight of 3261g, perfectly reasonable.

4) Visualizations

summary(mod5)

## 
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + 
##     Sesso, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1149.37  -180.98   -15.57   163.69  2639.09 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6681.7251   135.8036 -49.201  < 2e-16 ***
## N.gravidanze    12.4554     4.3416   2.869  0.00415 ** 
## Gestazione      32.3827     3.8008   8.520  < 2e-16 ***
## Lunghezza       10.2455     0.3008  34.059  < 2e-16 ***
## Cranio          10.5410     0.4265  24.717  < 2e-16 ***
## SessoM          77.9807    11.2111   6.956 4.47e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared:  0.727,  Adjusted R-squared:  0.7265 
## F-statistic:  1327 on 5 and 2492 DF,  p-value: < 2.2e-16

p1 <- ggplot(data, aes(x = Gestazione, y = Peso, col = Sesso)) +
    geom_point(alpha = 0.5, position = "jitter") +   
    geom_smooth(method = "lm", se = FALSE, lwd = 1.5) +  
    labs(x = "Weeks of Gestation", y = "Weight of the Newborn (gr.)", title = "Link between Gestation and Weight") +
    theme_bw() +  
    theme(plot.title = element_text(size = 14),  
          plot.margin = margin(1, 1, 2, 1))  

p2 <- ggplot(data, aes(x = as.factor(N.gravidanze), y = Peso)) +
    geom_boxplot() +
    geom_smooth(aes(x = as.numeric(N.gravidanze), y = Peso), method = "lm", se = FALSE, lwd = 1.25) +
    labs(x = "N. of sustained pregnancies", y = "Weight of the Newborn (gr)", title = "Weight based on Pregnancy") +
    theme_bw() +
    theme(plot.title = element_text(size = 14), 
          plot.margin = margin(1, 1, 2, 1))


p1 + p2

## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'