setwd("C:/Users/tomma/Desktop")
data <- read.csv("neonati.csv", stringsAsFactors = T)
library(ggplot2)
library(tidyr)
suppressPackageStartupMessages(library(dplyr))
library(patchwork)
library(e1071)
The data come from 3 Hospitals and concern 2500 Newborns. The Dataset contains 10 variables reported below:
Age of the Mother: continuous quantitative variable
Number of sustained Pregnancies: discrete quantitative variable
Smoking Mother (0=NO, YES=1): dummy variable
Number of Weeks of Gestation: continuous quantitative variable
Weight in grams of the Newborn: continuous quantitative variable
Length in mm of the Newborn: continuous quantitative variable
Diameter in mm of the Newborn’s Skull: continuous quantitative variable
Type of Birth: qualitative variable on a nominal scale
Hospital: qualitative variable on a nominal scale
Sex of the Newborn: qualitative variable on a nominal scale
The aim of the study is to evaluate whether it is possible to predict the Weight of the newborn at Birth considering above all the relationship with the Mother’s variables, to understand whether or not these have a significant effect.
attach(data)
summary(data)
## Anni.madre N.gravidanze Fumatrici Gestazione
## Min. : 0.00 Min. : 0.0000 Min. :0.0000 Min. :25.00
## 1st Qu.:25.00 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:38.00
## Median :28.00 Median : 1.0000 Median :0.0000 Median :39.00
## Mean :28.16 Mean : 0.9812 Mean :0.0416 Mean :38.98
## 3rd Qu.:32.00 3rd Qu.: 1.0000 3rd Qu.:0.0000 3rd Qu.:40.00
## Max. :46.00 Max. :12.0000 Max. :1.0000 Max. :43.00
## Peso Lunghezza Cranio Tipo.parto Ospedale Sesso
## Min. : 830 Min. :310.0 Min. :235 Ces: 728 osp1:816 F:1256
## 1st Qu.:2990 1st Qu.:480.0 1st Qu.:330 Nat:1772 osp2:849 M:1244
## Median :3300 Median :500.0 Median :340 osp3:835
## Mean :3284 Mean :494.7 Mean :340
## 3rd Qu.:3620 3rd Qu.:510.0 3rd Qu.:350
## Max. :4930 Max. :565.0 Max. :390
Here you can see the main data of the variables present in the dataset. However, this is not enough for me. I want an analysis of the statistical indices of the quantitative and qualitative variables.
data <- data[Anni.madre > 10,] #outlier adjustment
n <- nrow(data)
p1 <- ggplot(data, aes(x = Anni.madre)) +
geom_histogram(binwidth = 1, fill = "lightblue", col = 1, lwd = 0.15) +
scale_x_continuous(breaks = seq(10, max(data$Anni.madre, na.rm = TRUE), 5)) +
scale_y_continuous(breaks = seq(0, 200, 25)) +
labs(x = "Age of the Mother", y = "N. of Mothers") +
theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1))
p2 <- ggplot(data, aes(x = factor(N.gravidanze))) +
geom_bar(fill = "lightblue", col = 1) +
scale_y_continuous(breaks = seq(200, 1000, 200)) +
labs(x = "Number of sustained Pregnancies", y = "N. of Mothers") +
theme_bw()
p3 <- ggplot(data, aes(x = factor(Fumatrici))) +
geom_bar(fill = "lightblue", col = 1) +
scale_x_discrete(labels = c("No", "Yes")) +
labs(x = "Smoking Mother", y = "N. of Mothers") +
theme_bw()
p4 <- ggplot(data, aes(x = Gestazione)) +
geom_bar(fill = "lightblue", col = 1) +
scale_x_continuous(breaks = seq(min(data$Gestazione, na.rm = TRUE), max(data$Gestazione, na.rm = TRUE), 2)) +
scale_y_continuous(breaks = seq(0, 700, 100)) +
labs(x = "Number of Weeks of Gestation", y = "N. of Mothers") +
theme_bw()
p5 <- ggplot(data, aes(x = Peso)) +
geom_density(fill = "lightblue") +
geom_vline(xintercept = median(data$Peso, na.rm = TRUE), col = 2) +
scale_x_continuous(breaks = seq(1000, 5000, 500)) +
labs(x = "Weight of the Newborn (gr.)", y = "") +
theme_bw()
(p1 + p2 + p3 + p4 + p5) +
plot_layout(ncol = 2, guides = "collect") +
plot_annotation(title = "Dataset Variables", tag_levels = "I", tag_suffix = ")")
p6 <- ggplot()+
geom_density(aes(x = Lunghezza),
fill = "lightblue")+
geom_vline(xintercept = median(Lunghezza), col = 2)+
geom_label(aes(x = median(Lunghezza),
y = .5 * max(density(Lunghezza)$y),
label = median(Lunghezza)))+
scale_x_continuous(breaks = seq(300,550,50))+
scale_y_continuous(breaks = NULL)+
labs(x = "Lenght of the Newborn (mm)",
y = "")+
theme_bw()
p7 <- ggplot()+
geom_density(aes(x = Cranio),
fill = "lightblue")+
geom_vline(xintercept = median(Cranio), col = 2)+
geom_label(aes(x = median(Cranio),
y = .5 * max(density(Cranio)$y),
label = median(Cranio)))+
scale_x_continuous(breaks = seq(250, 375, 25))+
scale_y_continuous(breaks = NULL)+
labs(x = "Diameter of the Skull (mm)",
y = "")+
theme_bw()
p8 <- ggplot()+
geom_bar(aes(x = Tipo.parto),
fill = "lightblue",
col = 1)+
scale_x_discrete(labels = c("Ces" = "Cesarean", "Nat" = "Normal"))+
labs(x = "Type of Birth",
y = "N.of Births")+
theme_bw()
p9 <- ggplot()+
geom_bar(aes(x = Ospedale),
col = 1,
fill = "lightblue")+
scale_x_discrete(labels = c("osp1" = "Hospital1", "osp2" = "Hospital2", "osp3" = "Hospital3"))+
labs(x = "Hospital",
y = "N.of Births")+
theme_bw()
p10 <- ggplot()+
geom_bar(aes(x = Sesso),
col = 1,
fill = c("pink", "deepskyblue"))+
labs(x = "Sex of the Newborn",
y = "N. of newborns")+
theme_bw()
(p6 + p7 + p8 + p9 + p10) +
plot_layout(ncol = 2, guides = "collect") +
plot_annotation(title = "Dataset Variables", tag_levels = "I", tag_suffix = ")")
indici.statistici <- function(x){
indici_posizione <- summary(x)
indici_dispersione <- c("Intervallo" = max(x, na.rm = TRUE) - min(x, na.rm = TRUE),
"IQR" = IQR(x, na.rm = TRUE),
"dev.st" = sd(x, na.rm = TRUE),
"var" = var(x, na.rm = TRUE),
"CV" = sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE) * 100)
indici_forma <- c("Asimmetria" = moments::skewness(x, na.rm = TRUE),
"Curtosi" = moments::kurtosis(x, na.rm = TRUE) - 3)
indici <- c(indici_posizione, indici_dispersione, indici_forma)
return(indici)
}
qualitative_vars <- c("Fumatrici", "Tipo.parto", "Ospedale", "Sesso")
quantitative_vars <- names(data)[!(names(data) %in% qualitative_vars)]
indici_statistici_df <- data.frame()
for (x in quantitative_vars) {
idx <- indici.statistici(data[[x]])
df_idx <- data.frame(Indice = names(idx), Valore = round(idx, 2), Variabile = x)
indici_statistici_df <- rbind(indici_statistici_df, df_idx)
}
write.csv(indici_statistici_df, "indici_statistici.csv", row.names = FALSE)
freq_tables_list <- list()
for (x in qualitative_vars) {
freq_table <- table(data[[x]])
df_freq <- as.data.frame(freq_table)
df_freq$Variabile <- x
freq_tables_list[[x]] <- df_freq
}
final_freq_tables <- do.call(rbind, freq_tables_list)
write.csv(final_freq_tables, "frequenze_qualitative.csv", row.names = FALSE)
View(indici_statistici_df)
View(final_freq_tables)
print(indici_statistici_df)
## Indice Valore Variabile
## Min. Min. 13.00 Anni.madre
## 1st Qu. 1st Qu. 25.00 Anni.madre
## Median Median 28.00 Anni.madre
## Mean Mean 28.19 Anni.madre
## 3rd Qu. 3rd Qu. 32.00 Anni.madre
## Max. Max. 46.00 Anni.madre
## Intervallo Intervallo 33.00 Anni.madre
## IQR IQR 7.00 Anni.madre
## dev.st dev.st 5.22 Anni.madre
## var var 27.22 Anni.madre
## CV CV 18.51 Anni.madre
## Asimmetria Asimmetria 0.15 Anni.madre
## Curtosi Curtosi -0.11 Anni.madre
## Min.1 Min. 0.00 N.gravidanze
## 1st Qu.1 1st Qu. 0.00 N.gravidanze
## Median1 Median 1.00 N.gravidanze
## Mean1 Mean 0.98 N.gravidanze
## 3rd Qu.1 3rd Qu. 1.00 N.gravidanze
## Max.1 Max. 12.00 N.gravidanze
## Intervallo1 Intervallo 12.00 N.gravidanze
## IQR1 IQR 1.00 N.gravidanze
## dev.st1 dev.st 1.28 N.gravidanze
## var1 var 1.64 N.gravidanze
## CV1 CV 130.50 N.gravidanze
## Asimmetria1 Asimmetria 2.51 N.gravidanze
## Curtosi1 Curtosi 10.98 N.gravidanze
## Min.2 Min. 25.00 Gestazione
## 1st Qu.2 1st Qu. 38.00 Gestazione
## Median2 Median 39.00 Gestazione
## Mean2 Mean 38.98 Gestazione
## 3rd Qu.2 3rd Qu. 40.00 Gestazione
## Max.2 Max. 43.00 Gestazione
## Intervallo2 Intervallo 18.00 Gestazione
## IQR2 IQR 2.00 Gestazione
## dev.st2 dev.st 1.87 Gestazione
## var2 var 3.49 Gestazione
## CV2 CV 4.79 Gestazione
## Asimmetria2 Asimmetria -2.07 Gestazione
## Curtosi2 Curtosi 8.26 Gestazione
## Min.3 Min. 830.00 Peso
## 1st Qu.3 1st Qu. 2990.00 Peso
## Median3 Median 3300.00 Peso
## Mean3 Mean 3284.18 Peso
## 3rd Qu.3 3rd Qu. 3620.00 Peso
## Max.3 Max. 4930.00 Peso
## Intervallo3 Intervallo 4100.00 Peso
## IQR3 IQR 630.00 Peso
## dev.st3 dev.st 525.23 Peso
## var3 var 275865.90 Peso
## CV3 CV 15.99 Peso
## Asimmetria3 Asimmetria -0.65 Peso
## Curtosi3 Curtosi 2.03 Peso
## Min.4 Min. 310.00 Lunghezza
## 1st Qu.4 1st Qu. 480.00 Lunghezza
## Median4 Median 500.00 Lunghezza
## Mean4 Mean 494.70 Lunghezza
## 3rd Qu.4 3rd Qu. 510.00 Lunghezza
## Max.4 Max. 565.00 Lunghezza
## Intervallo4 Intervallo 255.00 Lunghezza
## IQR4 IQR 30.00 Lunghezza
## dev.st4 dev.st 26.33 Lunghezza
## var4 var 693.21 Lunghezza
## CV4 CV 5.32 Lunghezza
## Asimmetria4 Asimmetria -1.51 Lunghezza
## Curtosi4 Curtosi 6.48 Lunghezza
## Min.5 Min. 235.00 Cranio
## 1st Qu.5 1st Qu. 330.00 Cranio
## Median5 Median 340.00 Cranio
## Mean5 Mean 340.03 Cranio
## 3rd Qu.5 3rd Qu. 350.00 Cranio
## Max.5 Max. 390.00 Cranio
## Intervallo5 Intervallo 155.00 Cranio
## IQR5 IQR 20.00 Cranio
## dev.st5 dev.st 16.43 Cranio
## var5 var 269.93 Cranio
## CV5 CV 4.83 Cranio
## Asimmetria5 Asimmetria -0.79 Cranio
## Curtosi5 Curtosi 2.94 Cranio
print(final_freq_tables)
## Var1 Freq Variabile
## Fumatrici.1 0 2394 Fumatrici
## Fumatrici.2 1 104 Fumatrici
## Tipo.parto.1 Ces 728 Tipo.parto
## Tipo.parto.2 Nat 1770 Tipo.parto
## Ospedale.1 osp1 816 Ospedale
## Ospedale.2 osp2 848 Ospedale
## Ospedale.3 osp3 834 Ospedale
## Sesso.1 F 1255 Sesso
## Sesso.2 M 1243 Sesso
Here the graphs related to each variable are shown. We can say that the most of the women examined are between 20 and 30 years old. Most of the woman have no previous pregnancies. 95% of the women examined are non-smokers. The number of weeks of gestation ranges from 25 to 43 with an average of about 39 weeks. The babies born have an average weight of about 3.3kg with a maximum of 4.9kg and minimum of 0.830kg.
On average, the Weight and Height in the newborn population in Italy are 3.3kg and 50cm respectively. (source: https://www.ospedalebambinogesu.it/da-0-a-30-giorni-come-si-presenta-e-come-cresce-80012/#:~:text=In%20media%20il%20peso%20nascita,pari%20mediamente%20a%2050%20centimetri)
t.test(Peso, mu = 3300, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: Peso
## t = -1.516, df = 2499, p-value = 0.1296
## alternative hypothesis: true mean is not equal to 3300
## 95 percent confidence interval:
## 3263.490 3304.672
## sample estimates:
## mean of x
## 3284.081
t.test(Lunghezza, mu = 500, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: Lunghezza
## t = -10.084, df = 2499, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 500
## 95 percent confidence interval:
## 493.6598 495.7242
## sample estimates:
## mean of x
## 494.692
In the first case (newborn weight) the p-value is 0.13, so for a significance level of α = 0.05 I do not reject the null hypothesis. I conclude that the length of this sample of newborns is not significantly different from that of the population. In the second case (newborn length) the p-value is very small (of the order of 10−16), so I reject the null hypothesis. I conclude that the weight of this sample of newborns is significantly different from that of the population.
t.test(Peso ~ Sesso)
##
## Welch Two Sample t-test
##
## data: Peso by Sesso
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M
## 3161.132 3408.215
t.test(Lunghezza ~ Sesso)
##
## Welch Two Sample t-test
##
## data: Lunghezza by Sesso
## t = -9.582, df = 2459.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -11.929470 -7.876273
## sample estimates:
## mean in group F mean in group M
## 489.7643 499.6672
t.test(Cranio ~ Sesso)
##
## Welch Two Sample t-test
##
## data: Cranio by Sesso
## t = -7.4102, df = 2491.4, p-value = 1.718e-13
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -6.089912 -3.541270
## sample estimates:
## mean in group F mean in group M
## 337.6330 342.4486
chisq.test(table(Tipo.parto, Sesso))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(Tipo.parto, Sesso)
## X-squared = 0.039147, df = 1, p-value = 0.8432
I performed independent samples t-tests to verify significant differences between the two sexes, accompanying the numerical tests with boxplots to have a graphical confirmation. The differences in weight, length, skull diameter are all significant between the two sexes (always considering a significance level of 0.05). On the other hand, the type of birth (natural or cesarean) is independent of the sex of the newborn. In this case, since the variable Tipo.parto is qualitative, I calculated the frequency table between the variables involved and perform a chi-square test to test the hypothesis of independence.
tipo_parto_per_ospedale <- table(Ospedale, Tipo.parto)
prop.test(tipo_parto_per_ospedale)
##
## 3-sample test for equality of proportions without continuity correction
##
## data: tipo_parto_per_ospedale
## X-squared = 1.0972, df = 2, p-value = 0.5778
## alternative hypothesis: two.sided
## sample estimates:
## prop 1 prop 2 prop 3
## 0.2965686 0.2991755 0.2778443
chisq.test(tipo_parto_per_ospedale)
##
## Pearson's Chi-squared test
##
## data: tipo_parto_per_ospedale
## X-squared = 1.0972, df = 2, p-value = 0.5778
I tested another hypothesis: there were rumors that some hospitals perform more cesarean sections, so I took steps to verify it. Since I had to test a proportion, I used prop.test() on the frequency table between the variables involved (I should point out, however, that the chi-square test gives the exact same result).The p-value is very high, so I do not reject the null hypothesis. I conclude that there are no significant differences in the type of birth depending on the hospital.
moments::skewness(Peso)
## [1] -0.6470308
moments::kurtosis(Peso) - 3
## [1] 2.031532
shapiro.test(Peso)
##
## Shapiro-Wilk normality test
##
## data: Peso
## W = 0.97066, p-value < 2.2e-16
First of all, I checked that the response variable (Weight) is approximately normal, by looking at the shape indices and performing a Shapiro-Wilk test. I checked this in advance because any deviations from normality of the response variable often also affect the residuals. I noticed that the response variable (Weight) does not follow a normal distribution. In fact, the Shapiro-Wilk test clearly rejects the null hypothesis of normality, and we can note that the distribution is particularly leptokurtic (pointed), with a kurtosis value of 2.03. I know that in this case it would be appropriate to use a GLM, but I try with a linear regression model anyway.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...) {
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if (missing(cex.cor)) cex.cor <- 0.8 / strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
data_quantitative <- data %>% select(all_of(quantitative_vars))
pairs(data_quantitative, upper.panel = panel.smooth, lower.panel = panel.cor)
boxplot(Peso ~ Sesso)
t.test(Peso ~ Sesso)
##
## Welch Two Sample t-test
##
## data: Peso by Sesso
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M
## 3161.132 3408.215
boxplot(Peso ~ Fumatrici)
t.test(Peso ~ Fumatrici)
##
## Welch Two Sample t-test
##
## data: Peso by Fumatrici
## t = 1.034, df = 114.1, p-value = 0.3033
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -45.61354 145.22674
## sample estimates:
## mean in group 0 mean in group 1
## 3286.153 3236.346
boxplot(Peso ~ Tipo.parto)
t.test(Peso ~ Tipo.parto)
##
## Welch Two Sample t-test
##
## data: Peso by Tipo.parto
## t = -0.12968, df = 1493, p-value = 0.8968
## alternative hypothesis: true difference in means between group Ces and group Nat is not equal to 0
## 95 percent confidence interval:
## -46.27992 40.54037
## sample estimates:
## mean in group Ces mean in group Nat
## 3282.047 3284.916
boxplot(Peso ~ Ospedale)
pairwise.t.test(Peso, Ospedale, paired = F, pool.sd = T, p.adjust.method = "bonferroni")
##
## Pairwise comparisons using t tests with pooled SD
##
## data: Peso and Ospedale
##
## osp1 osp2
## osp2 1.00 -
## osp3 0.33 0.33
##
## P value adjustment method: bonferroni
I investigated the two-by-two relationships between quantitative variables, both numerically (correlation matrix) and graphically. At first glance, length, skull, gestation appear significant. I investigated the relationships between the qualitative variables and the response variable. Only Sex is significant (instead the variable Smokers - which indicates whether a mother is a smoker or not - is not significant, unlike what one might expect, with a p-value of 0.30).
mod1 <- lm(Peso ~ . , data = data)
summary(mod1)
##
## Call:
## lm(formula = Peso ~ ., data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1123.26 -181.53 -14.45 161.05 2611.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6735.7960 141.4790 -47.610 < 2e-16 ***
## Anni.madre 0.8018 1.1467 0.699 0.4845
## N.gravidanze 11.3812 4.6686 2.438 0.0148 *
## Fumatrici -30.2741 27.5492 -1.099 0.2719
## Gestazione 32.5773 3.8208 8.526 < 2e-16 ***
## Lunghezza 10.2922 0.3009 34.207 < 2e-16 ***
## Cranio 10.4722 0.4263 24.567 < 2e-16 ***
## Tipo.partoNat 29.6335 12.0905 2.451 0.0143 *
## Ospedaleosp2 -11.0912 13.4471 -0.825 0.4096
## Ospedaleosp3 28.2495 13.5054 2.092 0.0366 *
## SessoM 77.5723 11.1865 6.934 5.18e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274 on 2487 degrees of freedom
## Multiple R-squared: 0.7289, Adjusted R-squared: 0.7278
## F-statistic: 668.7 on 10 and 2487 DF, p-value: < 2.2e-16
BIC(mod1)
## [1] 35215.45
The model has R2adj = 0.72, which is a reasonable value. This first model includes 10 variables (9 actually, but for the Hospital factor two are created because it has 3 modes). Some have very high p-values, so they can most likely be removed from the model to streamline it without losing a significant amount of explained variance.
stepwise_mod <- MASS::stepAIC(mod1, direction = "both", k = log(n))
## Start: AIC=28118.61
## Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza +
## Cranio + Tipo.parto + Ospedale + Sesso
##
## Df Sum of Sq RSS AIC
## - Anni.madre 1 36710 186779904 28111
## - Fumatrici 1 90677 186833870 28112
## - Ospedale 2 687555 187430749 28112
## - N.gravidanze 1 446244 187189438 28117
## - Tipo.parto 1 451073 187194266 28117
## <none> 186743194 28119
## - Sesso 1 3610705 190353899 28159
## - Gestazione 1 5458852 192202046 28183
## - Cranio 1 45318506 232061700 28654
## - Lunghezza 1 87861708 274604902 29074
##
## Step: AIC=28111.28
## Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Ospedale + Sesso
##
## Df Sum of Sq RSS AIC
## - Fumatrici 1 91599 186871503 28105
## - Ospedale 2 693914 187473818 28105
## - Tipo.parto 1 452049 187231953 28110
## <none> 186779904 28111
## - N.gravidanze 1 631082 187410986 28112
## + Anni.madre 1 36710 186743194 28119
## - Sesso 1 3617809 190397713 28151
## - Gestazione 1 5424800 192204704 28175
## - Cranio 1 45569477 232349381 28649
## - Lunghezza 1 87852027 274631931 29066
##
## Step: AIC=28104.68
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Ospedale + Sesso
##
## Df Sum of Sq RSS AIC
## - Ospedale 2 702925 187574428 28098
## - Tipo.parto 1 444404 187315907 28103
## <none> 186871503 28105
## - N.gravidanze 1 608136 187479640 28105
## + Fumatrici 1 91599 186779904 28111
## + Anni.madre 1 37633 186833870 28112
## - Sesso 1 3601860 190473363 28145
## - Gestazione 1 5358199 192229702 28168
## - Cranio 1 45613331 232484834 28642
## - Lunghezza 1 88259386 275130889 29063
##
## Step: AIC=28098.41
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Sesso
##
## Df Sum of Sq RSS AIC
## - Tipo.parto 1 467626 188042054 28097
## <none> 187574428 28098
## - N.gravidanze 1 648873 188223301 28099
## + Ospedale 2 702925 186871503 28105
## + Fumatrici 1 100610 187473818 28105
## + Anni.madre 1 44184 187530244 28106
## - Sesso 1 3644818 191219246 28139
## - Gestazione 1 5457887 193032315 28162
## - Cranio 1 45747094 233321522 28636
## - Lunghezza 1 87955701 275530129 29051
##
## Step: AIC=28096.81
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
##
## Df Sum of Sq RSS AIC
## <none> 188042054 28097
## - N.gravidanze 1 621053 188663107 28097
## + Tipo.parto 1 467626 187574428 28098
## + Ospedale 2 726146 187315907 28103
## + Fumatrici 1 92548 187949505 28103
## + Anni.madre 1 45366 187996688 28104
## - Sesso 1 3650790 191692844 28137
## - Gestazione 1 5477493 193519547 28161
## - Cranio 1 46098547 234140601 28637
## - Lunghezza 1 87532691 275574744 29044
summary(stepwise_mod)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Sesso, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1149.37 -180.98 -15.57 163.69 2639.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6681.7251 135.8036 -49.201 < 2e-16 ***
## N.gravidanze 12.4554 4.3416 2.869 0.00415 **
## Gestazione 32.3827 3.8008 8.520 < 2e-16 ***
## Lunghezza 10.2455 0.3008 34.059 < 2e-16 ***
## Cranio 10.5410 0.4265 24.717 < 2e-16 ***
## SessoM 77.9807 11.2111 6.956 4.47e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared: 0.727, Adjusted R-squared: 0.7265
## F-statistic: 1327 on 5 and 2492 DF, p-value: < 2.2e-16
mod2 <- update(mod1, ~.-Anni.madre)
summary(mod2)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza +
## Cranio + Tipo.parto + Ospedale + Sesso, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1113.82 -180.30 -16.22 160.66 2616.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6708.6189 136.0211 -49.320 < 2e-16 ***
## N.gravidanze 12.5833 4.3400 2.899 0.00377 **
## Fumatrici -30.4268 27.5455 -1.105 0.26944
## Gestazione 32.2996 3.7997 8.501 < 2e-16 ***
## Lunghezza 10.2916 0.3008 34.209 < 2e-16 ***
## Cranio 10.4874 0.4257 24.638 < 2e-16 ***
## Tipo.partoNat 29.6654 12.0892 2.454 0.01420 *
## Ospedaleosp2 -10.9509 13.4442 -0.815 0.41541
## Ospedaleosp3 28.5171 13.4986 2.113 0.03474 *
## SessoM 77.6452 11.1849 6.942 4.91e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274 on 2488 degrees of freedom
## Multiple R-squared: 0.7288, Adjusted R-squared: 0.7279
## F-statistic: 743.1 on 9 and 2488 DF, p-value: < 2.2e-16
anova(mod2, mod1)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Ospedale + Sesso
## Model 2: Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza +
## Cranio + Tipo.parto + Ospedale + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2488 186779904
## 2 2487 186743194 1 36710 0.4889 0.4845
BIC(mod2, mod1)
## df BIC
## mod2 11 35208.12
## mod1 12 35215.45
car::vif(mod2)
## GVIF Df GVIF^(1/(2*Df))
## N.gravidanze 1.027985 1 1.013896
## Fumatrici 1.007363 1 1.003675
## Gestazione 1.677351 1 1.295126
## Lunghezza 2.086862 1 1.444597
## Cranio 1.626789 1 1.275456
## Tipo.parto 1.004213 1 1.002104
## Ospedale 1.003460 2 1.000864
## Sesso 1.040653 1 1.020124
mod3 <- update(mod2, ~.-Ospedale)
summary(mod3)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza +
## Cranio + Tipo.parto + Sesso, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1129.88 -181.36 -16.24 160.63 2634.62
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6708.8092 136.0640 -49.306 < 2e-16 ***
## N.gravidanze 12.9927 4.3439 2.991 0.00281 **
## Fumatrici -31.8823 27.5803 -1.156 0.24780
## Gestazione 32.5970 3.8039 8.569 < 2e-16 ***
## Lunghezza 10.2684 0.3011 34.098 < 2e-16 ***
## Cranio 10.5015 0.4262 24.637 < 2e-16 ***
## Tipo.partoNat 30.4244 12.1041 2.514 0.01201 *
## SessoM 78.1031 11.1998 6.974 3.94e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.4 on 2490 degrees of freedom
## Multiple R-squared: 0.7278, Adjusted R-squared: 0.7271
## F-statistic: 951.3 on 7 and 2490 DF, p-value: < 2.2e-16
BIC(mod3, mod2)
## df BIC
## mod3 9 35201.73
## mod2 11 35208.12
anova(mod3, mod2)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Sesso
## Model 2: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Ospedale + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2490 187473818
## 2 2488 186779904 2 693914 4.6216 0.009921 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod4 <- update(mod3, ~.-Fumatrici)
summary(mod4)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Sesso, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1129.14 -181.97 -16.26 160.95 2638.18
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6708.0171 136.0715 -49.298 < 2e-16 ***
## N.gravidanze 12.7356 4.3385 2.935 0.00336 **
## Gestazione 32.3253 3.7969 8.514 < 2e-16 ***
## Lunghezza 10.2833 0.3009 34.177 < 2e-16 ***
## Cranio 10.5063 0.4263 24.648 < 2e-16 ***
## Tipo.partoNat 30.1601 12.1027 2.492 0.01277 *
## SessoM 77.9171 11.1994 6.957 4.42e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.4 on 2491 degrees of freedom
## Multiple R-squared: 0.7277, Adjusted R-squared: 0.727
## F-statistic: 1109 on 6 and 2491 DF, p-value: < 2.2e-16
BIC(mod4, mod3)
## df BIC
## mod4 8 35195.25
## mod3 9 35201.73
anova(mod4, mod3)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Sesso
## Model 2: Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2491 187574428
## 2 2490 187473818 1 100610 1.3363 0.2478
mod5 <- update(mod4, ~.-Tipo.parto)
summary(mod5)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Sesso, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1149.37 -180.98 -15.57 163.69 2639.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6681.7251 135.8036 -49.201 < 2e-16 ***
## N.gravidanze 12.4554 4.3416 2.869 0.00415 **
## Gestazione 32.3827 3.8008 8.520 < 2e-16 ***
## Lunghezza 10.2455 0.3008 34.059 < 2e-16 ***
## Cranio 10.5410 0.4265 24.717 < 2e-16 ***
## SessoM 77.9807 11.2111 6.956 4.47e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared: 0.727, Adjusted R-squared: 0.7265
## F-statistic: 1327 on 5 and 2492 DF, p-value: < 2.2e-16
BIC(mod4, mod5)
## df BIC
## mod4 8 35195.25
## mod5 7 35193.65
anova(mod4, mod5)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Sesso
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2491 187574428
## 2 2492 188042054 -1 -467626 6.2101 0.01277 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod6 <- update(mod5, ~.-N.gravidanze)
summary(mod6)
##
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Sesso,
## data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1138.1 -184.4 -17.4 163.3 2626.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6651.6732 135.5952 -49.055 < 2e-16 ***
## Gestazione 31.3262 3.7884 8.269 < 2e-16 ***
## Lunghezza 10.2024 0.3009 33.909 < 2e-16 ***
## Cranio 10.6706 0.4247 25.126 < 2e-16 ***
## SessoM 79.1027 11.2205 7.050 2.31e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 275.1 on 2493 degrees of freedom
## Multiple R-squared: 0.7261, Adjusted R-squared: 0.7257
## F-statistic: 1652 on 4 and 2493 DF, p-value: < 2.2e-16
BIC(mod5, mod6)
## df BIC
## mod5 7 35193.65
## mod6 6 35194.06
anova(mod5, mod6)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ Gestazione + Lunghezza + Cranio + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2492 188042054
## 2 2493 188663107 -1 -621053 8.2304 0.004154 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
car::vif(mod5)
## N.gravidanze Gestazione Lunghezza Cranio Sesso
## 1.023462 1.669779 2.075747 1.624568 1.040184
mod10 <- update(mod5, ~.+I(Gestazione^2))
summary(mod10)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Sesso + I(Gestazione^2), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1144.0 -181.5 -12.9 165.8 2661.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4646.7158 898.6322 -5.171 2.52e-07 ***
## N.gravidanze 12.5489 4.3381 2.893 0.00385 **
## Gestazione -81.2309 49.7402 -1.633 0.10257
## Lunghezza 10.3502 0.3040 34.045 < 2e-16 ***
## Cranio 10.6376 0.4282 24.843 < 2e-16 ***
## SessoM 75.7563 11.2435 6.738 1.99e-11 ***
## I(Gestazione^2) 1.5168 0.6621 2.291 0.02206 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.5 on 2491 degrees of freedom
## Multiple R-squared: 0.7276, Adjusted R-squared: 0.7269
## F-statistic: 1109 on 6 and 2491 DF, p-value: < 2.2e-16
BIC(mod10, mod5)
## df BIC
## mod10 8 35196.21
## mod5 7 35193.65
anova(mod10, mod5)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso +
## I(Gestazione^2)
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2491 187646731
## 2 2492 188042054 -1 -395323 5.2479 0.02206 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Before doing manual tests, I looked at what I got from the function that implements the automatic stepwise procedure - using the BIC criterion (preferable to the AIC as it does not overestimate overparameterized models). I noticed that that model includes the variables N.pregnancies, Gestation, Length, Skull, Sex. Then I tried to proceed manually step by step. In the model with all the variables I noticed that Years.mother has a p-value of 0.43. Removing it, R2adj remains unchanged, the BIC has decreased and the ANOVA test did not indicate a significant difference in explained variance (p-value of 0.43). So I removed it without hesitation. The variable Hospital has two p-values (because it is qualitative with 3 modalities) of 0.40 and 0.03. Removing it, R2adj went down not even a single precentage point, BIC went down and the ANOVA test showed a significant difference (p-value of 0.009). But for such a small decrease in R2adj for removing two variables, I chose not to keep it. The variable Smokers has a p-value of 0.25. Removing it, R2adj went down only 0.1%, BIC went down and the ANOVA test showed a significant difference (p-value of 0.01). But for such a low decrease in R2adj in the face of removing a variable, I chose not to keep it. The variable in the model with the highest p-value was N.pregnancies with a p-value of 0.004, I tried to remove it. It turned out that R2 adj decreased by not even one percentage point, but the BIC for the first time increased. The ANOVA test indicated a significant difference from the previous model (p-value of 0.004). Given this, I chose to keep the variable in the model and stop there, because all the other variables have p-values of the order of 10−12 or less. I checked that there are no problems of multicollinearity (all VIFs are less than 5 so no problem). From the graphs created previously I noted a possible quadratic effect between Gestation and Y. Trying to insert it into the model, I saw that R2 adj only increased by 0.04%, the p-value of Gestation was 0.02 (but that of Gestation went from 0.3% to 11%), the BIC increases and the ANOVA indicated a significant difference between the two models. Even if graphically it seems to me that the quadratic effect could be relevant, the numerical tests did not indicate a significant improvement, so I chose not to add it to the model. I chose not to consider interactions between the variables, because thinking about it mentally I do not imagine any relationship that could be significant between the variables in the final model (excluding the anthropometric control variables).
par(mfrow = c(2,2))
plot(mod5)
shapiro.test(residuals(mod5))
##
## Shapiro-Wilk normality test
##
## data: residuals(mod5)
## W = 0.97414, p-value < 2.2e-16
plot(density(residuals(mod5)))
ggplot()+
geom_density(aes(x = residuals(mod5)))+
geom_vline(xintercept = mean(residuals(mod5)), col = 2)+
theme_bw()+
labs(x = "Residues", y = "")+
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank())
I checked, both graphically and numerically, whether the model met all the assumptions about the residuals (normality with mean 0, homoscedasticity, independence). In the first graph (which relates the estimates obtained by the model and the respective residuals), most of the points are a random cloud of points around a mean of 0, which would mean everything is normal. The only visible problem is that the left tail deviates upwards (i.e., the model underestimates the weight predictions of light newborns). In the second graph (which compares the quantiles of Weight with the quantiles of the normal) most of the points in the graph align along the line y = x, indicating that the residuals align along a normal. Again, you can see that the tails have a different pattern than most of the points. In the third graph we see, similarly to the first, mainly a random cloud of points around a value of y (this would indicate a constant variance, respecting the hypothesis of homoscedasticity). Here too, however, the left tail presents the problem of deviating a little upwards. In the fourth graph (outliers) only one value exceeds the warning threshold (Cook’s distance > 0.5) and none exceeds the alarm threshold of 1, so graphically no problems with outliers are shown. The Shapiro-Wilk test clearly rejects the normality of the residuals, but in the Q-Q plot we had predicted a reasonable possibility of normality. It can be seen that the distribution of the residuals resembles a normal one; it is mainly symmetric (with the right tail being particularly long), but it seems decidedly leptokurtic. In fact, the kurtosis, calculated with moments::kurtosis(residuals(mod5)) - 3, is 4.16, quite high.
lmtest::dwtest(mod5)
##
## Durbin-Watson test
##
## data: mod5
## DW = 1.9532, p-value = 0.1209
## alternative hypothesis: true autocorrelation is greater than 0
lmtest::bptest(mod5)
##
## studentized Breusch-Pagan test
##
## data: mod5
## BP = 90.297, df = 5, p-value < 2.2e-16
cook <- cooks.distance(mod5)
cook[cook > 0.5]
## 1551
## 0.8297645
car::outlierTest(mod5)
## rstudent unadjusted p-value Bonferroni p
## 1551 10.046230 2.6345e-23 6.5810e-20
## 155 5.025345 5.3818e-07 1.3444e-03
## 1306 4.824963 1.4848e-06 3.7092e-03
lev <- hatvalues(mod5)
p <- sum(lev)
soglia <- 2 * p/n
plot(lev)
abline(h=soglia,col=2,lwd=2)
lev[lev > soglia]; length(lev[lev > soglia])
## 13 15 34 67 89 96
## 0.005632888 0.007056483 0.006748974 0.005896189 0.012817563 0.005351869
## 101 106 131 134 151 155
## 0.007528094 0.014487904 0.007237755 0.007553514 0.010889886 0.007209736
## 161 189 190 204 205 206
## 0.020341133 0.004894598 0.005366831 0.014494112 0.005351828 0.009482503
## 220 294 305 310 312 315
## 0.007403130 0.005914307 0.005445189 0.028815300 0.013173723 0.005386836
## 378 440 442 445 486 492
## 0.015942080 0.005405733 0.007725708 0.007511227 0.005165714 0.008274392
## 497 516 582 587 592 614
## 0.005167522 0.013080707 0.011667393 0.008415872 0.006385013 0.005300091
## 638 656 657 684 697 702
## 0.006693153 0.005934494 0.005323517 0.008825948 0.005864920 0.005203163
## 729 748 750 757 765 805
## 0.005024448 0.008567254 0.006943755 0.008146487 0.006076412 0.014358658
## 828 893 895 913 928 946
## 0.007180179 0.005076266 0.005297643 0.005574016 0.022745493 0.006910965
## 947 956 985 1008 1014 1049
## 0.008409518 0.007791530 0.007040035 0.005343994 0.008474086 0.004956561
## 1067 1091 1106 1130 1166 1181
## 0.008467035 0.008940030 0.005967511 0.031739177 0.005513581 0.005678043
## 1188 1200 1219 1238 1248 1273
## 0.006481533 0.005493009 0.030697778 0.005912496 0.014631359 0.007089553
## 1291 1293 1311 1321 1325 1356
## 0.006118589 0.006074423 0.009626391 0.009295311 0.004857340 0.005306940
## 1357 1385 1395 1400 1402 1411
## 0.006967713 0.012641828 0.005129491 0.005932524 0.004816704 0.008049539
## 1420 1428 1429 1450 1505 1551
## 0.005156670 0.008195421 0.021758961 0.015106684 0.013334256 0.048802841
## 1553 1556 1573 1593 1606 1610
## 0.008507417 0.005923458 0.005049368 0.005624961 0.005009312 0.008725821
## 1617 1619 1628 1686 1693 1701
## 0.004869997 0.015069038 0.005070717 0.009356578 0.005079185 0.010846383
## 1712 1718 1727 1735 1780 1781
## 0.006993461 0.006961226 0.013303625 0.004886226 0.025544997 0.016833696
## 1809 1827 1868 1892 1962 1967
## 0.008711082 0.006068655 0.005206137 0.005333985 0.005541287 0.005339716
## 1977 2037 2040 2046 2086 2089
## 0.006928794 0.004890003 0.011504028 0.005471894 0.013194769 0.006293791
## 2098 2114 2115 2120 2140 2146
## 0.005097113 0.013318873 0.011779730 0.018667270 0.006244737 0.005804705
## 2148 2149 2157 2175 2200 2215
## 0.007930323 0.013589469 0.005910248 0.032531736 0.011679031 0.004894044
## 2216 2220 2221 2224 2225 2244
## 0.008120291 0.005415586 0.021633907 0.005841119 0.005593576 0.006929530
## 2257 2307 2317 2318 2337 2359
## 0.006171101 0.013972901 0.007677230 0.004834368 0.005230839 0.010068259
## 2408 2422 2436 2437 2452 2458
## 0.009702475 0.021536269 0.004986609 0.023951342 0.023848222 0.008509142
## 2471 2478
## 0.020905930 0.005777111
## [1] 152
The only value previously identified with Cook’s distance > 0.5 is observation 1551. Examining it, I see that Length=315 and Weight=4370. Comparing the weight with that of babies of similar length, I get extremely different numbers (around 1000) so it was probably a typing error. Since no other value exceeds the warning threshold, and no value reaches the alarm threshold, there is no problem with outliers. However, for completeness I examine separately extreme values in the response variable and in the regressor space (outlier values and leverage). The outlier Test function of the package “car” returns 3 outliers (observations 1551, 155 and 1306). Instead, the manual calculation of leverage values returns 152 values (6% of the total of 2500), which correspond to all observations that are far from the others in the regressor space.
newdata_nat <- data.frame(N.gravidanze=2, Gestazione=39, Lunghezza=mean(Lunghezza), Cranio=mean(Cranio), Tipo.parto=factor("Nat"), Sesso=factor("F"))
newdata_ces <- data.frame(N.gravidanze=2, Gestazione=39, Lunghezza=mean(Lunghezza), Cranio=mean(Cranio), Tipo.parto=factor("Ces"), Sesso=factor("F"))
pred1 <- predict(mod5, newdata = newdata_nat)
pred2 <- predict(mod5, newdata = newdata_ces)
prediction = 1 / n * (pred1 * dim(data[Tipo.parto == "Nat",])[1]
+ pred2 * dim(data[Tipo.parto == "Ces",])[1])
print(paste("Weighted Forecast is:", prediction))
## [1] "Weighted Forecast is: 3261.29732763148"
Overall, the model’s R2 adj is 0.72, meaning that 72% of the variability of the Weight is explained by the model. There are no alarming outliers. The residuals have a mean of 0 and are not autocorrelated. In light of this, even if there is heteroskedasticity and the Shapiro-Wilk test rejects normality, the model seems good enough to explain the variability of the response variable. I make a prediction for the weight of a newborn baby: the mother is in her third pregnancy (so N. pregnancies will be 2, i.e. the previous ones) and will give birth in the 39th week. No measurements from the ultrasound are available. To make a prediction in R I have to pass a data frame with all the variables, even those that I don’t have. For the quantitative ones (Length, Skull) I can simply use the average of them, while for Birth Type I choose to consider both modalities and make the weighted average. The result is a weight of 3261g, perfectly reasonable.
summary(mod5)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Sesso, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1149.37 -180.98 -15.57 163.69 2639.09
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6681.7251 135.8036 -49.201 < 2e-16 ***
## N.gravidanze 12.4554 4.3416 2.869 0.00415 **
## Gestazione 32.3827 3.8008 8.520 < 2e-16 ***
## Lunghezza 10.2455 0.3008 34.059 < 2e-16 ***
## Cranio 10.5410 0.4265 24.717 < 2e-16 ***
## SessoM 77.9807 11.2111 6.956 4.47e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.7 on 2492 degrees of freedom
## Multiple R-squared: 0.727, Adjusted R-squared: 0.7265
## F-statistic: 1327 on 5 and 2492 DF, p-value: < 2.2e-16
p1 <- ggplot(data, aes(x = Gestazione, y = Peso, col = Sesso)) +
geom_point(alpha = 0.5, position = "jitter") +
geom_smooth(method = "lm", se = FALSE, lwd = 1.5) +
labs(x = "Weeks of Gestation", y = "Weight of the Newborn (gr.)", title = "Link between Gestation and Weight") +
theme_bw() +
theme(plot.title = element_text(size = 14),
plot.margin = margin(1, 1, 2, 1))
p2 <- ggplot(data, aes(x = as.factor(N.gravidanze), y = Peso)) +
geom_boxplot() +
geom_smooth(aes(x = as.numeric(N.gravidanze), y = Peso), method = "lm", se = FALSE, lwd = 1.25) +
labs(x = "N. of sustained pregnancies", y = "Weight of the Newborn (gr)", title = "Weight based on Pregnancy") +
theme_bw() +
theme(plot.title = element_text(size = 14),
plot.margin = margin(1, 1, 2, 1))
p1 + p2
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'