DESCRIPTIVE ANALYTICS
summary(df)
## Anni.madre N.gravidanze Fumatrici Gestazione
## Min. : 0.00 Min. : 0.0000 Min. :0.0000 Min. :25.00
## 1st Qu.:25.00 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:38.00
## Median :28.00 Median : 1.0000 Median :0.0000 Median :39.00
## Mean :28.16 Mean : 0.9812 Mean :0.0416 Mean :38.98
## 3rd Qu.:32.00 3rd Qu.: 1.0000 3rd Qu.:0.0000 3rd Qu.:40.00
## Max. :46.00 Max. :12.0000 Max. :1.0000 Max. :43.00
## Peso Lunghezza Cranio Tipo.parto Ospedale Sesso
## Min. : 830 Min. :310.0 Min. :235 Ces: 728 osp1:816 F:1256
## 1st Qu.:2990 1st Qu.:480.0 1st Qu.:330 Nat:1772 osp2:849 M:1244
## Median :3300 Median :500.0 Median :340 osp3:835
## Mean :3284 Mean :494.7 Mean :340
## 3rd Qu.:3620 3rd Qu.:510.0 3rd Qu.:350
## Max. :4930 Max. :565.0 Max. :390
results <- data.frame(
Variables_categories = c(
paste("Fumatrici:", names(table(Fumatrici))),
paste("Tipo parto:", names(table(Tipo.parto)))
),
Percentage = c(
(table(Fumatrici) / length(Fumatrici)) * 100,
(table(Tipo.parto) / length(Tipo.parto)) * 100
)
)
results$Percentage <- paste0(round(results$Percentage, 1), "%")
knitr::kable(results, row.names = FALSE)
| Fumatrici: 0 |
95.8% |
| Fumatrici: 1 |
4.2% |
| Tipo parto: Ces |
29.1% |
| Tipo parto: Nat |
70.9% |
bp_osp = barplot(table(Ospedale),
col = "lightblue",
main = "Number of births per hospital",
xlab = "Ospedale",
ylim = c(0, max(table(Ospedale)) * 1.15))
text(x = bp_osp,
y = table(Ospedale),
labels = table(Ospedale),
pos = 3,
cex = 0.7)
box()

bp_ngr = barplot(table(N.gravidanze),
col = "lightgrey",
main = "Barplot of N.gravidanze",
xlab = "N.gravidanze",
ylim = c(0, 1200))
text(x = bp_ngr[1:4],
y = table(N.gravidanze)[1:4],
labels = table(N.gravidanze)[1:4],
pos = 3,
cex = 0.7)
box()

freq = cumsum(prop.table(table(N.gravidanze)))*100
x = as.numeric(names(freq))
y = as.numeric(freq)
plot(as.numeric(names(freq)),
freq,
type = "b",
pch = 19,
col = "blue",
xlab = "N.gravidanze",
ylab = "Cumulative frequency (%)",
main = "Cumulative percentage of N.gravidanze")
grid(col = "lightgray", lty = "dotted")
text(x[1:4],
y[1:4],
labels = paste0(round(y[1:4], 1), "%"),
pos = 3,
cex = 0.7,
col = "black")

From the first analysis we can see few things:
- The dimension of the dataset is
2500 observations x 10 columns
- Most of the women are non-smokers (95.8%) and
have had a natural birth (70.9%)
- All three hospitals have roughly the same
absolute frequency
- Most of the women had 3 or less than 3
pregnancies, specifically 96.2% of them
par(mfrow = c(1, 2))
plot(density(Anni.madre),
main = "Density curve Anni.madre",
col = "blue",
lwd = 2)
boxplot(Anni.madre,
main = "Boxplot Anni.madre",
col = "blue")

par(mfrow = c(1, 1))
bp_gest = barplot(table(Gestazione),
main = "Barplot of Gestazione",
col = "green",
ylim = c(0, 900))
text(x = bp_gest[13:17],
y = table(Gestazione)[13:17],
labels = table(Gestazione)[13:17],
pos = 3,
cex = 0.7,
col = "black")

skewness(Gestazione)
## [1] -2.065313
par(mfrow = c(1, 2))
plot(density(Lunghezza),
main = "Density curve of Lunghezza",
col = "orange")
boxplot(Lunghezza, horizontal = T,
main = "Boxplot of Lunghezza",
col = "orange")

skewness(Lunghezza)
## [1] -1.514699
plot(density(Cranio),
main = "Density curve of Cranio",
col = "red")
boxplot(Cranio, horizontal = T,
main = "Boxplot of Cranio",
col = "red")

par(mfrow = c(1, 1))
skewness(Cranio)
## [1] -0.7850527
par(mfrow = c(1, 2))
hist(Peso, probability = T, ylim = c(0, max(density((Peso))$y) * 1.1),
main = "Histogram and density curve of Peso")
lines(density(Peso), col = "violet", lwd = 2)
boxplot(Peso, horizontal = T,
main = "Boxplot of Peso",
col = "violet")

par(mfrow = c(1, 1))
skewness(Peso)
## [1] -0.6470308
All the plots above show negative skewness; for
some variables is more pronounced, for others less
Anni.madre shows values for 0 and
1 meaning they had a child at that age; that is
impossible and may be some sort of error (like a typing
error)
Gestazione shows most of the values between
37 and 41 weeks, that is consistent with the 9-months
pregnancy
To bear in mind is the fact that the variabile Peso
(that will be our target variable) has a slight negative
skewness and could influence the models
HYPOTHESIS TESTS
tab.cong = table(Ospedale, Tipo.parto)
ggpubr::ggballoonplot(data = as.data.frame(tab.cong), fill="blue") +
labs(title = "Baloonplot Ospedale-Tipo.parto",
x = "Ospedale",
y = "Tipo.parto")

test1 = chisq.test(Ospedale, Tipo.parto)
data.frame(
X_squared = round(test1$statistic, 3),
df = test1$parameter,
p_value = round(test1$p.value, 3)
) |>
knitr::kable() |>
kableExtra::kable_styling(full_width = FALSE)
|
|
X_squared
|
df
|
p_value
|
|
X-squared
|
1.097
|
2
|
0.578
|
plot(density(rchisq(1000000, 2)), xlim=c(0, 8),
main = "Density curve for a sample of 1Mln values generated by a Chi-Square distribution",
cex.main = 0.9)
abline(v=qchisq(0.95, 2), col=2)
points(1.0972, 0, cex=3, col=4, pch=20)

From the baloonplot seems that more
caesarean sections are being performed at
osp2
The Chisq test (H0 independence between variables)
shows a p-value > 0.05, so we can’t reject the null
hypotesis. Ospedale and Tipo.parto
seem to be independent
The same results can be seen from the
density curve plot where the test statistics is
deep inside the acceptance region
test2a = t.test(Peso, mu = 3300, conf.level = 0.95, alternative = "two.sided")
test2b = t.test(Lunghezza, mu = 500, conf.level = 0.95, alternative = "two.sided")
df_test2 = data.frame(
Variables_name = c("Peso", "Lunghezza"),
t = c(
round(as.numeric(test2a$statistic), 3),
round(as.numeric(test2b$statistic), 3)
),
df = c(
as.numeric(test2a$parameter),
as.numeric(test2b$parameter)
),
Estimate = c(
round(as.numeric(test2a$estimate), 3),
round(as.numeric(test2b$estimate), 3)
),
p_value = c(
round(test2a$p.value, 3),
round(test2b$p.value, 3)
)
)
df_test2 |>
knitr::kable(
caption = "T tests for equality of means"
) |>
kableExtra::kable_styling(full_width = FALSE)
T tests for equality of means
|
Variables_name
|
t
|
df
|
Estimate
|
p_value
|
|
Peso
|
-1.516
|
2499
|
3284.081
|
0.13
|
|
Lunghezza
|
-10.084
|
2499
|
494.692
|
0.00
|
3300 for Peso and
500 for Lunghezza are the
population means found in
literature
From the T test (H0 sample mean = population mean)
we can’t reject the null hypotesis for Peso, meaning
the sample mean is equal to 3300, and we reject the null
hypotesis for Lunghezza, meaning the sample
mean is not equal to 500
datas = df[c("Peso", "Lunghezza", "Cranio")]
variables = c("Peso", "Lunghezza", "Cranio")
shapiro_results = do.call(
rbind,
lapply(variables, function(v) {
test = shapiro.test(datas[[v]])
data.frame(
Variabile = v,
W = round(as.numeric(test$statistic), 3),
p_value = round(test$p.value, 3)
)
})
)
shapiro_results |>
knitr::kable(
caption = "Shapiro-Wilk normality tests"
) |>
kableExtra::kable_styling(full_width = FALSE)
Shapiro-Wilk normality tests
|
Variabile
|
W
|
p_value
|
|
Peso
|
0.971
|
0
|
|
Lunghezza
|
0.909
|
0
|
|
Cranio
|
0.964
|
0
|
wilcox_results = do.call(
rbind,
lapply(variables, function(v) {
test <- wilcox.test(
as.formula(paste(v, "~ Sesso")),
data = df
)
data.frame(
Variabile = v,
W = round(as.numeric(test$statistic), 3),
p_value = round(test$p.value, 3)
)
})
)
wilcox_results |>
knitr::kable(
caption = "Wilcoxon rank-sum tests by sex"
) |>
kableExtra::kable_styling(full_width = FALSE)
Wilcoxon rank-sum tests by sex
|
Variabile
|
W
|
p_value
|
|
Peso
|
538640.5
|
0
|
|
Lunghezza
|
594454.5
|
0
|
|
Cranio
|
641637.5
|
0
|
par(mfrow = c(1, 3))
boxplot(Cranio ~ Sesso,
main = "Boxplot of Cranio by Sesso",
col = "lightblue")
boxplot(Lunghezza ~ Sesso,
main = "Boxplot of Lunghezza by Sesso",
col = "lightgrey")
boxplot(Peso ~ Sesso,
main = "Boxplot of Peso by Sesso",
col = "lightgreen")

par(mfrow = c(1, 1))
After rejecting the null hypotesis for
Shapiro-Wilk test (H0 datas came from a population with
normal distribution), I decided to use the Wilcoxon rank-sum
test for non-parametric tests (H0 distribution of the
anthropometric measurement is the same for males and females).
P-values < 0.05 for Peso, Lunghezza and
Cranio show that males and females have different
distributions for every variables
REGRESSION MODELS
panel.cor = function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
par(usr = c(0, 1, 0, 1))
r = abs(cor(x, y))
txt = format(c(r, 0.123456789), digits = digits)[1]
txt = paste0(prefix, txt)
if(missing(cex.cor)) cex.cor = 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(df, upper.panel = panel.smooth, lower.panel = panel.cor,
main = "Scatterplot and Correlation Matrix")

From the above matrix we can see both
graphically and numerically the relationship
between variables
Considering Peso as our target
variable, the higher relationships are with
Lunghezza, Cranio and
Gestazione (all seem to be positive)
options(kableExtra.html.bsTable = TRUE)
model_report = function(model, title) {
cat("\n\n##", title, "\n\n")
coef_tab = as.data.frame(summary(model)$coefficients)
colnames(coef_tab) = c(
"Estimate",
"Std_Error",
"t_value",
"p_value"
)
print(knitr::kable(coef_tab, digits = 3))
stats_tab = data.frame(
Statistic = c("R_squared",
"Adj_R_squared",
"Residual_SE",
"F_statistic"),
Value = c(
summary(model)$r.squared,
summary(model)$adj.r.squared,
summary(model)$sigma,
summary(model)$fstatistic[1]
)
)
print(knitr::kable(stats_tab, digits = 3, row.names = FALSE))
}
models <- list(
"MODEL 1" = lm(Peso ~ ., data = df),
"MODEL 2" = lm(Peso ~ . -Anni.madre -Fumatrici -Ospedale, data = df),
"MODEL 3" = update(lm(Peso ~ . -Anni.madre -Fumatrici -Ospedale, data = df),
~ . + I(Gestazione^2)),
"MODEL 4" = update(lm(Peso ~ . -Anni.madre -Fumatrici -Ospedale, data = df),
~ . - I(Gestazione^2) - Tipo.parto),
"MODEL 5" = lm(Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
Sesso + Lunghezza:Gestazione, data = df)
)
for (i in names(models)) {
model_report(models[[i]], i)
}
##
##
## ## MODEL 1
##
##
##
## | | Estimate| Std_Error| t_value| p_value|
## |:-------------|---------:|---------:|-------:|-------:|
## |(Intercept) | -6738.476| 141.309| -47.686| 0.000|
## |Anni.madre | 0.892| 1.132| 0.788| 0.431|
## |N.gravidanze | 11.267| 4.661| 2.417| 0.016|
## |Fumatrici | -30.163| 27.539| -1.095| 0.273|
## |Gestazione | 32.570| 3.819| 8.529| 0.000|
## |Lunghezza | 10.295| 0.301| 34.236| 0.000|
## |Cranio | 10.471| 0.426| 24.578| 0.000|
## |Tipo.partoNat | 29.525| 12.084| 2.443| 0.015|
## |Ospedaleosp2 | -11.210| 13.438| -0.834| 0.404|
## |Ospedaleosp3 | 28.096| 13.496| 2.082| 0.037|
## |SessoM | 77.541| 11.178| 6.937| 0.000|
##
##
## |Statistic | Value|
## |:-------------|-------:|
## |R_squared | 0.729|
## |Adj_R_squared | 0.728|
## |Residual_SE | 273.925|
## |F_statistic | 669.188|
##
##
## ## MODEL 2
##
##
##
## | | Estimate| Std_Error| t_value| p_value|
## |:-------------|---------:|---------:|-------:|-------:|
## |(Intercept) | -6707.297| 135.991| -49.322| 0.000|
## |N.gravidanze | 12.756| 4.337| 2.941| 0.003|
## |Gestazione | 32.271| 3.794| 8.506| 0.000|
## |Lunghezza | 10.286| 0.301| 34.207| 0.000|
## |Cranio | 10.506| 0.426| 24.659| 0.000|
## |Tipo.partoNat | 30.034| 12.097| 2.483| 0.013|
## |SessoM | 77.929| 11.191| 6.964| 0.000|
##
##
## |Statistic | Value|
## |:-------------|--------:|
## |R_squared | 0.728|
## |Adj_R_squared | 0.727|
## |Residual_SE | 274.320|
## |F_statistic | 1110.250|
##
##
## ## MODEL 3
##
##
##
## | | Estimate| Std_Error| t_value| p_value|
## |:---------------|---------:|---------:|-------:|-------:|
## |(Intercept) | -4713.385| 897.646| -5.251| 0.000|
## |N.gravidanze | 12.841| 4.333| 2.963| 0.003|
## |Gestazione | -79.020| 49.670| -1.591| 0.112|
## |Lunghezza | 10.389| 0.304| 34.185| 0.000|
## |Cranio | 10.601| 0.428| 24.780| 0.000|
## |Tipo.partoNat | 29.566| 12.089| 2.446| 0.015|
## |SessoM | 75.767| 11.223| 6.751| 0.000|
## |I(Gestazione^2) | 1.486| 0.661| 2.247| 0.025|
##
##
## |Statistic | Value|
## |:-------------|-------:|
## |R_squared | 0.728|
## |Adj_R_squared | 0.727|
## |Residual_SE | 274.097|
## |F_statistic | 953.910|
##
##
## ## MODEL 4
##
##
##
## | | Estimate| Std_Error| t_value| p_value|
## |:------------|---------:|---------:|-------:|-------:|
## |(Intercept) | -6681.144| 135.723| -49.226| 0.000|
## |N.gravidanze | 12.475| 4.340| 2.875| 0.004|
## |Gestazione | 32.332| 3.798| 8.513| 0.000|
## |Lunghezza | 10.249| 0.301| 34.090| 0.000|
## |Cranio | 10.540| 0.426| 24.728| 0.000|
## |SessoM | 77.993| 11.202| 6.962| 0.000|
##
##
## |Statistic | Value|
## |:-------------|--------:|
## |R_squared | 0.727|
## |Adj_R_squared | 0.726|
## |Residual_SE | 274.604|
## |F_statistic | 1328.316|
##
##
## ## MODEL 5
##
##
##
## | | Estimate| Std_Error| t_value| p_value|
## |:--------------------|---------:|---------:|-------:|-------:|
## |(Intercept) | -1990.630| 920.217| -2.163| 0.031|
## |N.gravidanze | 13.044| 4.319| 3.020| 0.003|
## |Gestazione | -93.965| 24.799| -3.789| 0.000|
## |Lunghezza | -0.081| 2.027| -0.040| 0.968|
## |Cranio | 10.760| 0.426| 25.245| 0.000|
## |SessoM | 72.280| 11.200| 6.454| 0.000|
## |Gestazione:Lunghezza | 0.273| 0.053| 5.153| 0.000|
##
##
## |Statistic | Value|
## |:-------------|--------:|
## |R_squared | 0.730|
## |Adj_R_squared | 0.729|
## |Residual_SE | 273.208|
## |F_statistic | 1122.697|
I decided to investigate five multiple linear regression
models. For everyone the Adjusted R squared
seems to be more or less the same
Starting with the mod1, where I decided to bring
all the variables, I have removed the
non-significant variables in the
mod2
In mod3 I assumed a non-linear
relationship between Peso and Gestazione
Then mod4 was the one with the fewest
significant variables. In mod5 I investigated
the interaction between Gestazione and
Lunghezza
SELECTION OF THE OPTIMAL MODEL
table_ic = data.frame(
Model = c("mod1", "mod2", "mod3", "mod4", "mod5"),
AIC = c(AIC(mod1), AIC(mod2), AIC(mod3), AIC(mod4), AIC(mod5)),
BIC = c(BIC(mod1), BIC(mod2), BIC(mod3), BIC(mod4), BIC(mod5))
)
knitr::kable(table_ic, digits = 3,
caption = "Comparison between AIC and BIC models") |>
kableExtra::kable_styling(full_width = FALSE)
Comparison between AIC and BIC models
|
Model
|
AIC
|
BIC
|
|
mod1
|
35171.95
|
35241.84
|
|
mod2
|
35175.16
|
35221.75
|
|
mod3
|
35172.10
|
35224.51
|
|
mod4
|
35179.33
|
35220.10
|
|
mod5
|
35154.84
|
35201.44
|
From AIC and BIC method of comparison, the models
with the fewest value is mod5. I
decided to prefer the BIC method and
decided to compare mod4 and mod5
anova(mod5, mod4)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso +
## Lunghezza:Gestazione
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2493 186083565
## 2 2494 188065546 -1 -1981981 26.553 2.765e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value < 0.05, so there is a
significant increase in the explained variance when the
interaction term is added. Given that mod5
seems to be the best one
vif4 <- data.frame(
Model = "mod4",
Predictor = names(vif(mod4)),
VIF = as.numeric(vif(mod4))
)
vif_mod5 = vif(mod5, type = "predictor")
vif5 = data.frame(
Model = "mod5",
Predictor = rownames(vif_mod5),
VIF = vif_mod5[, "GVIF^(1/(2*Df))"]
)
table_vif = rbind(vif4, vif5)
knitr::kable(table_vif, digits = 3,
caption = "VIF values for mod4 and mod5") |>
kableExtra::kable_styling(full_width = FALSE)
VIF values for mod4 and mod5
|
Model
|
Predictor
|
VIF
|
|
mod4
|
N.gravidanze
|
1.023
|
|
mod4
|
Gestazione
|
1.669
|
|
mod4
|
Lunghezza
|
2.075
|
|
mod4
|
Cranio
|
1.624
|
|
mod4
|
Sesso
|
1.040
|
|
mod5
|
N.gravidanze
|
1.012
|
|
mod5
|
Gestazione
|
1.092
|
|
mod5
|
Lunghezza
|
1.092
|
|
mod5
|
Cranio
|
1.281
|
|
mod5
|
Sesso
|
1.025
|
With VIF I checked whether there is
multicollinearity in your model (whether some explanatory
variables are highly correlated with one another)
For mod4 all VIF are below 5 so
there is not multicollinearity
For mod5 I used the type =
“predictor” and all the GVIF are below
5
n = nrow(df)
stepwise.mod <- MASS::stepAIC(mod1, direction = "both", k = log(n))
## Start: AIC=28139.32
## Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza +
## Cranio + Tipo.parto + Ospedale + Sesso
##
## Df Sum of Sq RSS AIC
## - Anni.madre 1 46578 186809099 28132
## - Fumatrici 1 90019 186852540 28133
## - Ospedale 2 685979 187448501 28133
## - N.gravidanze 1 438452 187200974 28137
## - Tipo.parto 1 447929 187210450 28138
## <none> 186762521 28139
## - Sesso 1 3611021 190373542 28179
## - Gestazione 1 5458403 192220925 28204
## - Cranio 1 45326172 232088693 28675
## - Lunghezza 1 87951062 274713583 29096
##
## Step: AIC=28132.12
## Peso ~ N.gravidanze + Fumatrici + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Ospedale + Sesso
##
## Df Sum of Sq RSS AIC
## - Fumatrici 1 90897 186899996 28126
## - Ospedale 2 692738 187501837 28126
## - Tipo.parto 1 448222 187257321 28130
## <none> 186809099 28132
## - N.gravidanze 1 633756 187442855 28133
## + Anni.madre 1 46578 186762521 28139
## - Sesso 1 3618736 190427835 28172
## - Gestazione 1 5412879 192221978 28196
## - Cranio 1 45588236 232397335 28670
## - Lunghezza 1 87950050 274759149 29089
##
## Step: AIC=28125.51
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Ospedale + Sesso
##
## Df Sum of Sq RSS AIC
## - Ospedale 2 701680 187601677 28119
## - Tipo.parto 1 440684 187340680 28124
## <none> 186899996 28126
## - N.gravidanze 1 610840 187510837 28126
## + Fumatrici 1 90897 186809099 28132
## + Anni.madre 1 47456 186852540 28133
## - Sesso 1 3602797 190502794 28165
## - Gestazione 1 5346781 192246777 28188
## - Cranio 1 45632149 232532146 28664
## - Lunghezza 1 88355030 275255027 29086
##
## Step: AIC=28119.23
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Sesso
##
## Df Sum of Sq RSS AIC
## - Tipo.parto 1 463870 188065546 28118
## <none> 187601677 28119
## - N.gravidanze 1 651066 188252743 28120
## + Ospedale 2 701680 186899996 28126
## + Fumatrici 1 99840 187501837 28126
## + Anni.madre 1 54392 187547285 28126
## - Sesso 1 3649259 191250936 28160
## - Gestazione 1 5444109 193045786 28183
## - Cranio 1 45758101 233359778 28657
## - Lunghezza 1 88054432 275656108 29074
##
## Step: AIC=28117.58
## Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
##
## Df Sum of Sq RSS AIC
## <none> 188065546 28118
## - N.gravidanze 1 623141 188688687 28118
## + Tipo.parto 1 463870 187601677 28119
## + Ospedale 2 724866 187340680 28124
## + Fumatrici 1 91892 187973654 28124
## + Anni.madre 1 54816 188010731 28125
## - Sesso 1 3655292 191720838 28158
## - Gestazione 1 5464853 193530399 28181
## - Cranio 1 46108583 234174130 28658
## - Lunghezza 1 87632762 275698308 29066
coef_tab = as.data.frame(summary(stepwise.mod)$coefficients)
colnames(coef_tab) = c("Estimate", "Std_Error", "t_value", "p_value")
knitr::kable(coef_tab, digits = 3,
caption = "Stepwise Model Coefficients") |>
kableExtra::kable_styling(full_width = FALSE)
Stepwise Model Coefficients
|
|
Estimate
|
Std_Error
|
t_value
|
p_value
|
|
(Intercept)
|
-6681.144
|
135.723
|
-49.226
|
0.000
|
|
N.gravidanze
|
12.475
|
4.340
|
2.875
|
0.004
|
|
Gestazione
|
32.332
|
3.798
|
8.513
|
0.000
|
|
Lunghezza
|
10.249
|
0.301
|
34.090
|
0.000
|
|
Cranio
|
10.540
|
0.426
|
24.728
|
0.000
|
|
SessoM
|
77.993
|
11.202
|
6.962
|
0.000
|
stats_tab = data.frame(
Statistic = c("R_squared", "Adj_R_squared", "Residual_SE"),
Value = c(
summary(stepwise.mod)$r.squared,
summary(stepwise.mod)$adj.r.squared,
summary(stepwise.mod)$sigma
)
)
knitr::kable(stats_tab, digits = 3,
caption = "Stepwise Model Fit Statistics") |>
kableExtra::kable_styling(full_width = FALSE)
Stepwise Model Fit Statistics
|
Statistic
|
Value
|
|
R_squared
|
0.727
|
|
Adj_R_squared
|
0.726
|
|
Residual_SE
|
274.604
|
I also used the stepAIC method (with k = log(n), so
utilizing the BIC) to see which is the method chosen by the
automatic procedure
It seems the mod4 as the best
method chosen by the procedure
I also decided to chose the mod4 as the best method
because it uses simple variables and all of
them are significant (so I decided not to introduce the
interaction in the chosen model)
Below the interpretation of the model’s
coefficients:
- An increase of N.gravidanze results in an
increase of 12.475 grams of Peso
- An increase of Gestazione results in an
increase of 32.332 grams of Peso
- An increase of Lunghezza results in an
increase of 10.249 grams of Peso
- An increase of Cranio results in an
increase of 10.540 grams of Peso
- For Sesso variable, all other factors being
equal, a male weighs 77.993 grams more than a
female
MODEL QUALITY ANALYSIS
The Adjusted R squared is more or less
0.73, therefore, the model explains 73% of the
observed variability in Peso
RMSE is equal to the Residual Standard Error (~275
grams), which means that, on average, the model’s predictions
are wrong by around 275 grams (corresponding to approximately
8% of the observed mean Peso -> 275/mean(Peso))
par(mfrow=c(2,2))
plot(mod4)

par(mfrow=c(1,1))
From the residual analysis we can see four
graphs:
- The firs shows a curved pattern, so it appears
that some of the information has not been properly captured by
the regressors and has been reflected in the residuals
- In the second graph, the data points appear to lie on the
line with the exception of the upper curve,
which does not seem to fit well
- In the third graph a curvature is noticeable,
although not too excessive
- In the fourth outliers are visible; in
particular, observation 1551 is approaching the
warning threshold (set at 1 for Cook’s distance)
lev = hatvalues(mod4)
plot(lev, main = "Leverage values (hat values) mod4",
ylab = "Leverage", xlab = "Observation index")
p = sum(lev)
soglia = 2 * p/n
abline(h=soglia, col=2)

lev[lev > soglia]
## 13 15 34 67 89 96
## 0.005630918 0.007050422 0.006744143 0.005891590 0.012816470 0.005351586
## 101 106 131 134 151 155
## 0.007526751 0.014479871 0.007229339 0.007552819 0.010883937 0.007207682
## 161 189 190 204 205 206
## 0.020335483 0.004893297 0.005366557 0.014490604 0.005351634 0.009476652
## 220 294 305 310 312 315
## 0.007393997 0.005912765 0.005442061 0.028812123 0.013169272 0.005385800
## 378 440 442 445 486 492
## 0.015934061 0.005404736 0.007723662 0.007509382 0.005164446 0.008274018
## 497 516 582 587 592 614
## 0.005166306 0.013079851 0.011665555 0.008412325 0.006384116 0.005299262
## 638 656 657 684 697 702
## 0.006688287 0.005927777 0.005322685 0.008818987 0.005863826 0.005202259
## 729 748 750 757 765 805
## 0.005023115 0.008565543 0.006942097 0.008145491 0.006070298 0.014356657
## 828 893 895 913 928 946
## 0.007179817 0.005075205 0.005295896 0.005571144 0.022742332 0.006909044
## 947 956 985 1008 1014 1049
## 0.008409465 0.007784123 0.007039416 0.005343037 0.008470133 0.004956169
## 1067 1091 1106 1130 1166 1181
## 0.008465430 0.008933360 0.005967317 0.031728597 0.005513559 0.005677676
## 1188 1200 1219 1238 1248 1273
## 0.006477203 0.005492370 0.030694311 0.005908078 0.014622914 0.007085831
## 1291 1293 1311 1321 1325 1356
## 0.006117497 0.006073639 0.009625908 0.009293111 0.004857169 0.005303442
## 1357 1385 1395 1400 1402 1411
## 0.006965051 0.012636943 0.005126697 0.005925069 0.004811441 0.008048184
## 1420 1428 1429 1450 1505 1551
## 0.005155654 0.008192811 0.021757172 0.015104831 0.013330439 0.048769569
## 1553 1556 1573 1593 1606 1610
## 0.008504889 0.005919673 0.005047204 0.005623758 0.005001812 0.008722184
## 1617 1619 1628 1686 1693 1701
## 0.004866796 0.015067498 0.005069731 0.009349313 0.005077858 0.010842957
## 1712 1718 1727 1735 1780 1781
## 0.006992084 0.006958857 0.013300523 0.004884846 0.025538678 0.016832361
## 1809 1827 1868 1892 1962 1967
## 0.008707504 0.006065698 0.005205637 0.005332812 0.005540442 0.005337356
## 1977 2037 2040 2046 2086 2089
## 0.006927281 0.004889127 0.011494872 0.005471670 0.013193090 0.006293550
## 2098 2114 2115 2120 2140 2146
## 0.005094455 0.013316875 0.011772090 0.018659995 0.006244232 0.005802168
## 2148 2149 2157 2175 2200 2215
## 0.007926839 0.013583436 0.005907225 0.032527273 0.011670024 0.004892265
## 2216 2220 2221 2224 2225 2244
## 0.008117864 0.005414040 0.021628717 0.005838076 0.005591261 0.006929217
## 2257 2307 2317 2318 2337 2359
## 0.006170254 0.013965608 0.007673614 0.004831118 0.005230450 0.010067364
## 2408 2422 2436 2437 2452 2458
## 0.009696691 0.021532808 0.004986522 0.023943328 0.023838489 0.008506087
## 2471 2478
## 0.020903740 0.005775173
It seems there are several levearge points (values
that lie far apart from the other observations in the space of the
predictors)
plot(rstudent(mod4), main = "Studentized residuals mod4",
ylab = "Studentized residuals",
xlab = "Index")
abline(h=c(-2,2), col=2)

outlierTest(mod4)
## rstudent unadjusted p-value Bonferroni p
## 1551 10.051908 2.4906e-23 6.2265e-20
## 155 5.027798 5.3138e-07 1.3285e-03
## 1306 4.827238 1.4681e-06 3.6702e-03
From the graph seems there are several outlier
points (in response variable) but with the
outlierTest the Bonferroni correction is
applied to this test, only 3 otuliers points
are identified
cook = cooks.distance(mod4)
plot(cook, main = "Cook's distance mod4", ylab = "Studentized residuals", xlab = "Index", pch=19)
max_i <- which.max(cook)
points(max_i, cook[max_i],
col = "red", pch = 19, cex = 1.5)
text(max_i, cook[max_i],
labels = max_i,
pos = 1,
cex = 0.8)

knitr::kable(
df[1551, , drop = FALSE],
caption = "Observation 1551"
) |>
kableExtra::kable_styling(full_width = FALSE)
Observation 1551
|
|
Anni.madre
|
N.gravidanze
|
Fumatrici
|
Gestazione
|
Peso
|
Lunghezza
|
Cranio
|
Tipo.parto
|
Ospedale
|
Sesso
|
|
1551
|
35
|
1
|
0
|
38
|
4370
|
315
|
374
|
Nat
|
osp3
|
F
|
From Cook’s distance plot there is one
observation near the warning threshold and is the
observation 1551; this observation could have an impact
on the regression estimates
bptest(mod4)
##
## studentized Breusch-Pagan test
##
## data: mod4
## BP = 90.253, df = 5, p-value < 2.2e-16
dwtest(mod4)
##
## Durbin-Watson test
##
## data: mod4
## DW = 1.9535, p-value = 0.1224
## alternative hypothesis: true autocorrelation is greater than 0
shapiro.test(residuals(mod4))
##
## Shapiro-Wilk normality test
##
## data: residuals(mod4)
## W = 0.97408, p-value < 2.2e-16
plot(density(residuals(mod4)), main = "Density curve on residuals")

For Breusch-Pagan test the H0 hypothesis of
homoscedasticity (non-constant variance) is
rejected
For Durbin-Watson test the H0 hypothesis of
no autocorrelation in the residuals is not
rejected (the residuals are not autocorrelated)
For Shapiro-Wilk test the H0 null hypothesis of
normality of the residuals is rejected
(non-normal residuals). From the density curve of
residuals we can see an elongated right
tail
RESULTS
The model demonstrates good explanatory and
predictive power, accounting for approximately 73% of
the variability in Peso and featuring coefficients that
are all highly significant
The residual diagnostics indicate some
deviations from the ideal assumptions, with mild
heteroscedasticity confirmed by the Breusch-Pagan test, though
this is partly accentuated by the large sample size (n = 2500)
The residual plots reveal slight
non-linearity and increased dispersion for high values of the response
variable, along with a heavier right tail in the
QQ-plot and some influential outliers
Overall, the model is generally adequate, with
well-centred residuals and no
autocorrelation, but with some issues relating to
heteroscedasticity and non-normal tails
The main weaknesses concern the presence of
a few influential points, slight non-linearity
and deviations from the normality assumptions in the
tails
FORECAST
df_predict = data.frame(
N.gravidanze = 3,
Gestazione = 39,
Lunghezza = mean(df$Lunghezza, na.rm = TRUE),
Cranio = mean(df$Cranio, na.rm = TRUE),
Sesso = "F"
)
pred = predict(mod4, newdata = df_predict)
pred[[1]]
## [1] 3271.09
Using the mod4 model, when the
N.gravidanze are 3, weeks of
Gestazione are 39,
Sesso is F (considering for
Lunghezza and Cranio the
sample mean) we have a prediction on
Peso of 3271.09
VISUALIZATIONS
ggplot(df) +
geom_point(aes(x = Gestazione, y = Peso, color = Sesso), position = "jitter") +
geom_smooth(aes(x = Gestazione, y = Peso, color = Sesso),
method = "lm", se = FALSE) +
labs(title = "Peso vs Gestazione by Sesso",
x = "Gestazione",
y = "Peso",
color = "Sesso")

The plot Peso and Gestazione by Sesso show that for
both male and female an increase in weeks of Gestazione means an
increase of Peso (the slope of the line seems to be the same
for male and female). The difference is that, for the
same week of Gestazione, males have a Peso higher than
females, specifically 77.993 grams more
ggplot(df) +
geom_point(aes(x = Gestazione, y = Peso, color = as.factor(Fumatrici)), position = "jitter") +
geom_smooth(aes(x = Gestazione, y = Peso, color = as.factor(Fumatrici)),
method = "lm", se = FALSE) +
labs(title = "Peso vs Gestazione by Fumatrici",
x = "Gestazione",
y = "Peso",
color = "Fumatrici")

boxplot(Peso ~ Fumatrici, main = "Boxplot of Peso conditional on the Fumatrici variable")

Given that datas for smokers are much lower than datas of
non-smokers, the plot Peso and Gestazione by Fumatrici
show quite the same slope for both males and females,
meaning that whether or not you smoke has no effect on your
son’s or daughter’s weight
Actually the boxplot of Peso conditional on Fumatrici
variable does not show much difference between the two
graphs