#import dataset
setwd("D:/Users/Anton/Desktop/ProfessionAI/03-Statistica inferenziale/Progetto")
data=read.csv("neonati.csv",
sep=",",
stringsAsFactors = T)
# Convert Smokers binary column into a factor variable with 2 levels
data$Fumatrici <- factor(data$Fumatrici, levels = c(0, 1), labels = c("NO", "SI"))
# Caricamento delle librerie necessarie
library(knitr)
library(kableExtra)
# First row df
df = data.frame(
Anni.madre = data[1:5,1],
N.gravidanze = data[1:5,2],
Fumatrici = data[1:5,3],
Gestazione = data[1:5,4],
Peso = data[1:5,5],
Lunghezza = data[1:5,6],
Cranio = data[1:5,7],
Tipo.parto = data[1:5,8],
Ospedale = data[1:5,9],
Sesso=data[1:5,10]
)
kable(df, format = "html", row.names = T,align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Anni.madre | N.gravidanze | Fumatrici | Gestazione | Peso | Lunghezza | Cranio | Tipo.parto | Ospedale | Sesso | |
|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 26 | 0 | NO | 42 | 3380 | 490 | 325 | Nat | osp3 | M |
| 2 | 21 | 2 | NO | 39 | 3150 | 490 | 345 | Nat | osp1 | F |
| 3 | 34 | 3 | NO | 38 | 3640 | 500 | 375 | Nat | osp2 | M |
| 4 | 28 | 1 | NO | 41 | 3690 | 515 | 365 | Nat | osp2 | M |
| 5 | 20 | 0 | NO | 38 | 3700 | 480 | 335 | Nat | osp3 | F |
The dataset contains the following variables:
Anni madre: mother’s age (Quantitative discrete variable) (Qualitative variable on a nominal scale)
N.gravidanze: number of pregnancies (Quantitative discrete variable)
Fumatrici: smokers (Qualitative variable, coded with numbers 1 Yes and 0 No)
Gestazione: total number pregancie week (Quantitative discrete variable)
Peso: neonatal weight expressed in grams (Quantitative discrete variable)
Lunghezza: neonatal length expressed in millimeters (Quantitative continuous variable)
Cranio: neonatal skull diameter expressed in millimeters (Quantitative discrete variable)
Tipo.parto: type of birth (Qualitative variable)
Ospedale: hospital where the birth occurred (Qualitative variable)
Sesso: sex (Qualitative variable)
Summarize the data
n=nrow(data)
#df numeric
df_num=data[,sapply(data,is.numeric)]
summary_dfnum = summary(df_num)
#df factor
df_factor=data[,sapply(data,is.factor)]
summary_dffact = summary(df_factor)
summary_dffact[is.na(summary_dffact)] =""
kable(summary_dfnum, format = "html",align = "c", caption = "Numeric Summary Table") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Anni.madre | N.gravidanze | Gestazione | Peso | Lunghezza | Cranio | |
|---|---|---|---|---|---|---|
| Min. : 0.00 | Min. : 0.0000 | Min. :25.00 | Min. : 830 | Min. :310.0 | Min. :235 | |
| 1st Qu.:25.00 | 1st Qu.: 0.0000 | 1st Qu.:38.00 | 1st Qu.:2990 | 1st Qu.:480.0 | 1st Qu.:330 | |
| Median :28.00 | Median : 1.0000 | Median :39.00 | Median :3300 | Median :500.0 | Median :340 | |
| Mean :28.16 | Mean : 0.9812 | Mean :38.98 | Mean :3284 | Mean :494.7 | Mean :340 | |
| 3rd Qu.:32.00 | 3rd Qu.: 1.0000 | 3rd Qu.:40.00 | 3rd Qu.:3620 | 3rd Qu.:510.0 | 3rd Qu.:350 | |
| Max. :46.00 | Max. :12.0000 | Max. :43.00 | Max. :4930 | Max. :565.0 | Max. :390 |
kable(summary_dffact, format = "html",align = "c", caption = "Factor Summary Table") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Fumatrici | Tipo.parto | Ospedale | Sesso | |
|---|---|---|---|---|
| NO:2396 | Ces: 728 | osp1:816 | F:1256 | |
| SI: 104 | Nat:1772 | osp2:849 | M:1244 | |
| osp3:835 |
Looking at the numeric summary table, immediately jumps out the minimum value of the variable Anni.madre. It is for sure a data entry error.
Filtering the data will show if it is an isolated value or if there are other anomalous observation.
#Load library
library(dplyr)
#Filter anni.madre <15
data_filt=data %>% filter(data$Anni.madre<15)
#sort by anni.madre
data_filt=data_filt[order(data_filt$Anni.madre), ]
kable(data_filt$Anni.madre, format = "html", col.names = "Anni.madre", align="c")%>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Anni.madre |
|---|
| 0 |
| 1 |
| 13 |
| 14 |
| 14 |
The are two year values (0 and 1) that for sure are not possible. This observation has to be removed.
#Filter anni.madre >10
data=data %>% filter(data$Anni.madre>10)
In order to investigate if there are hospitals in which are preferred cesarean birth, it is possibile to perform a Chi-squared test.
attach(data)
library(kableExtra)
library(knitr)
#Contingency table
contingency_birthtype=table(Ospedale, Tipo.parto)
# Chi-squared Test
chi_sq_test_birthtype <- chisq.test(contingency_birthtype)
# Results dataframe
results_birthtype = data.frame(
Statistic = chi_sq_test_birthtype$statistic,
Degrees_of_Freedom = chi_sq_test_birthtype$parameter,
P_Value = chi_sq_test_birthtype$p.value
)
kable(results_birthtype, format = "html", align="c", caption = "Birthtype Chi-squared Test Result") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Statistic | Degrees_of_Freedom | P_Value | |
|---|---|---|---|
| X-squared | 1.082984 | 2 | 0.5818793 |
chisq_2grades = rchisq(1000000, 2)
library(ggplot2)
ggplot(data.frame(x = density(chisq_2grades)$x, y = density(chisq_2grades)$y),
aes(x = x, y = y)) +
geom_line() +
geom_vline(xintercept = qchisq(0.99, 2), color = "red") +
geom_point(aes(x = chi_sq_test_birthtype$statistic, y = 0), color = "blue", size = 3) +
labs(x = "Value", y = "Density", title = "Chi-square distribution (2 df) with test statistic") +
scale_y_continuous(breaks = seq(0, 0.5, by = 0.1), limits = c(0, 0.5)) +
scale_x_continuous(breaks=seq(0,30, by=5), limits=c(-1,30))+
theme_minimal()+
theme(plot.title = element_text(hjust = 0.5, size = 14), # Center Title
axis.title.x = element_text(size = 12), # axis name size
axis.title.y = element_text(size = 12))
The test returns a very high p-value, therefore the null hypothesis is not rejected. There aren’t hospital in which cesarean birth is more frequent. Also graphically it is possible to notice that the test statistic is very far from the rejection zone.
Evaluate whether the average weight and length of the sample are significantly equal to those of the population. From a quick search on google,it is found that \(\mu_{weight} = 3300\;g\) and \(\mu_{length} = 500\;mm\). To evaluate the hypothesis it is possible to perform a t-test.
#population mean weight
mu_weigth=3300
#population mean length
mu_length=500
#weigth t.test
weigth_test=t.test(Peso,
mu=mu_weigth,
conf.level=0.95,
alternative="two.sided")
# Weigth result dataframe
weigth_test_results = data.frame(
Statistic = weigth_test$statistic,
Degrees_of_Freedom = weigth_test$parameter,
P_Value = weigth_test$p.value,
`95%_CI_Lower` = weigth_test$conf.int[1],
`95%_CI_Upper` = weigth_test$conf.int[2],
Mean = weigth_test$estimate
)
kable(weigth_test_results, format = "html", align="c", caption = "Weight T Test Result") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Statistic | Degrees_of_Freedom | P_Value | X95._CI_Lower | X95._CI_Upper | Mean | |
|---|---|---|---|---|---|---|
| t | -1.505011 | 2497 | 0.1324476 | 3263.577 | 3304.791 | 3284.184 |
#length t.test
length_test=t.test(Lunghezza,
mu=mu_length,
conf.level=0.95,
alternative="two.sided")
# Weigth result dataframe
length_test_results = data.frame(
Statistic = length_test$statistic,
Degrees_of_Freedom = length_test$parameter,
P_Value = formatC(length_test$p.value, format = "e", digits = 2),
`95%_CI_Lower` = length_test$conf.int[1],
`95%_CI_Upper` = length_test$conf.int[2],
Mean = length_test$estimate
)
kable(length_test_results, format = "html",align="c", caption = "Length T Test Result") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Statistic | Degrees_of_Freedom | P_Value | X95._CI_Lower | X95._CI_Upper | Mean | |
|---|---|---|---|---|---|---|
| t | -10.06903 | 2497 | 2.10e-23 | 493.6628 | 495.7287 | 494.6958 |
weight: Since the p-value is greater than 0.05, we cannot reject the null hypothesis. This means that we do not have sufficient evidence to conclude that the average birth weight in the sample is significantly different from 3300 grams. In addition, the confidence interval includes the value 3300, which further reinforces the idea that there is no significant difference between the sample mean and the hypothesized population mean.
length: Since the p-value is extremely low, we can reject the null hypothesis. This means that we have sufficient evidence to conclude that the average birth length in the sample is significantly different from 500 millimeters. The confidence interval does not include the value 500, which further strengthens the evidence against the null hypothesis. The average length is significantly lower than the population mean.
To evaluate any difference in anthropometric measurements between the two sexes it is possible to perform a statistical test. First, we have to check if the variables weight, length and skull diameter respect the hypotheses of normal distribution and equality of variance between groups. To verify that, we can perform a Shapiro-Wilk test (normality) and a Levene test (variance equality).
library(car)
library(knitr)
library(kableExtra)
# Normality Test
shapiro_weight_M=shapiro.test(data$Peso[data$Sesso == "M"])
shapiro_weight_F=shapiro.test(data$Peso[data$Sesso == "F"])
shapiro_length_M=shapiro.test(data$Lunghezza[data$Sesso == "M"])
shapiro_length_F=shapiro.test(data$Lunghezza[data$Sesso == "F"])
shapiro_skull_M=shapiro.test(data$Cranio[data$Sesso == "M"])
shapiro_skull_F=shapiro.test(data$Cranio[data$Sesso == "F"])
# Variance Equality Test
levene_weight=leveneTest(Peso ~ Sesso, data = data)
levene_length=leveneTest(Lunghezza ~ Sesso, data = data)
levene_skull=leveneTest(Cranio ~ Sesso, data = data)
# normality dataframe
df_normality=data.frame(
Variable = c("weight", "length","skull diameter"),
Male = c(shapiro_weight_M$p.value, shapiro_length_M$p.value, shapiro_skull_M$p.value),
Female =c(shapiro_weight_F$p.value, shapiro_length_F$p.value, shapiro_skull_F$p.value))
df_normality$Male = formatC(df_normality$Male, format = "e", digits = 2)
df_normality$Female = formatC(df_normality$Female, format = "e", digits = 2)
# levene dataframe
df_levene=data.frame(
Variable = c("weight", "length","skull diameter"),
P_Value = c(levene_weight$`Pr(>F)`[1], levene_length$`Pr(>F)`[1], levene_skull$`Pr(>F)`[1]))
# Shapiro test table
kable(df_normality, format = "html", align="c",caption = "Shapiro-Wilk Test") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
column_spec(1, width = "3cm") %>%
column_spec(2, width = "5cm") %>%
column_spec(3, width = "5cm")
| Variable | Male | Female |
|---|---|---|
| weight | 2.23e-16 | 2.35e-17 |
| length | 4.22e-25 | 6.87e-28 |
| skull diameter | 2.90e-15 | 4.29e-19 |
# Levene test table
kable(df_levene, format = "html", align="c",caption = "Levene Test") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
column_spec(1, width = "3cm") %>%
column_spec(2, width = "5cm")
| Variable | P_Value |
|---|---|
| weight | 0.3646375 |
| length | 0.0011640 |
| skull diameter | 0.2714821 |
All the variables, in both groups, show a p value extremely lower than 0.05 that means the hypothesis of normal distribution is not respected. To compare the averages of anthropometric measurements between the two sexes, it is more appropriate to use a non-parametric test, such as the Wilcoxon Test.
# Wilcoxon test for weigth
wilcox_weight=wilcox.test(Peso ~ Sesso, data = data)
# Wilcoxon test for length
wilcox_length=wilcox.test(Lunghezza ~ Sesso, data = data)
# Wilcoxon test for skull diameter
wilcox_skull=wilcox.test(Cranio ~ Sesso, data = data)
# Wilcoxon dataframe
df_wilcox=data.frame(
Variable = c("weight", "length","skull diameter"),
P_Value = c(wilcox_weight$p.value, wilcox_length$p.value, wilcox_skull$p.value))
df_wilcox$P_Value = formatC(df_wilcox$P_Value, format = "e", digits = 2)
# Wilcoxon test table
kable(df_wilcox, format = "html", align="c",caption = "Wilcoxon Test") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
column_spec(1, width = "3cm") %>%
column_spec(2, width = "5cm")
| Variable | P_Value |
|---|---|
| weight | 2.91e-41 |
| length | 2.66e-25 |
| skull diameter | 7.14e-15 |
The result of the Wilcoxon test for all variables (weight, length and skull diameter) indicates that there is a significant difference between the sexes (p-value extremely lower than 0.05). The observed difference in weight, length and skull diameter between males and females is statistically significant.
Analyze the relationship between variables
First of all, evaluate the normality of the response variable, as any non-normality of the response variable could affect the residuals.
library(moments)
library(knitr)
library(kableExtra)
# shape index dataframe
df_shape=data.frame(
Index = c("Skewness", "Kurtosis"),
Value = c(round(skewness(Peso),2), round((kurtosis(Peso)-3),2)))
# shape index table
kable(df_shape, format = "html", align="c",caption = "Weight Shape Index") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left") %>%
column_spec(1, width = "3cm") %>%
column_spec(2, width = "3cm")
| Index | Value |
|---|---|
| Skewness | -0.65 |
| Kurtosis | 2.03 |
The variable weight shows a negative asymmetry (\(\gamma_{1weight}<0\)), and is leptokurtic (\(\gamma_{2weight}>0\)).
# weight test Shapiro-Wilk
shapiro_test_peso <- shapiro.test(Peso)
# Creare un data frame con i risultati del test di Shapiro-Wilk
shapiro_peso_df <- data.frame(
Statistic = shapiro_test_peso$statistic,
P_Value = formatC(shapiro_test_peso$p.value, format = "e", digits = 2),
Test = "Shapiro-Wilk",
Data = "Peso"
)
kable(shapiro_peso_df, format = "html", align="c",caption = "Weight Shapiro-Wilk Test Result") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Statistic | P_Value | Test | Data | |
|---|---|---|---|---|
| W | 0.9706841 | 3.38e-22 | Shapiro-Wilk | Peso |
With an extremely low p value, the null hypothesis is rejected. The weight variable is not normal. This non-normality of the response variable could have repercussions on the residuals of the model. This will be verified later during the residue analysis phase.
Before creating the model, it is important to investigate the relationship between the available variables.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor)
}
pairs(data, lower.panel = panel.cor, upper.panel = panel.smooth)
length: The linear correlation coefficient between weight and length is very high and positive (0.80), therefore weight is very positively correlated to length. The scatterplot also shows the presence of an elongated point cloud with a positive slope.
skull diameter: Skull diameter also has a positive and high linear correlation coefficient. The scatterplot appears to show a non-linearity for the lower values of this variable. Not to be understimated the presence of correlation coefficients of 0.60 between skull diameter and length. It is not a very high value, but in any case it is worth keeping an eye on and subsequently checking that there are no multicollinearity problems.
gestation: gestation has a relatively high and positive correlation coefficient with weight. The scatterplot shows the presence of a non linear effect at the lower value of this variable. To keep attention to the correlation coefficient between this variable and length (0.62).
sex: : with a coefficient of 0.24, the sex variable shows an effect on the response variable. As seen previously with the Wilcoxon test, there are significant differences in weight between the two sexes.
The remaining variables (type of birth, hospital, smokers, number of pregnancies and mother years), since their coefficients are extremely low, do not show correlation with the response variable.
In the first instance it is possible to create a model containing all the variables present in the dataset and subsequently update the model by removing non-significant variables or adding any non-linear or interaction effects between variables.
# Formatting Pr(>|t|) value function
format_p_value = function(p_value) {
if (p_value >= 1e-4) {
return(format(round(p_value, 4), nsmall = 4))
} else if (p_value <= 2e-16) {
return("< 2e-16")
} else {
return(format(p_value, digits = 3, scientific = TRUE))
}
}
# generates table from model function
generate_model_tables = function(model) {
model_name = deparse(substitute(model))
# mod summury
summary_model = summary(model)
# summary data frame
coefficients_table = as.data.frame(summary_model$coefficients)
# Round Estimate, Std. Error, t value
coefficients_table[, 1:3] = round(coefficients_table[, 1:3], 2)
# Significativity Pr(>|t|) value
significance_codes = symnum(coefficients_table[, 4], corr = FALSE, na = FALSE,
cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c('***', '**', '\\*', ".", " "))
# Formatting Pr(>|t|) value
coefficients_table[, 4] = sapply(coefficients_table[, 4], format_p_value)
# Adding significativity codes
coefficients_table$Signif = significance_codes
# Additional info table
additional_info = data.frame(
Description = c("Signif. codes:", "Residual standard error:", "Multiple R-squared:", "Adjusted R-squared:", "F-statistic:", "p-value:"),
Value = c("0 '\\***' 0.001 '\\**' 0.01 '\\*' 0.05 '.' 0.1 ' ' 1",
paste(round(summary_model$sigma, 0), "on", summary_model$df[2], "degrees of freedom"),
round(summary_model$r.squared, 4),
round(summary_model$adj.r.squared, 4),
paste(round(summary_model$fstatistic[1], 1), "on", summary_model$fstatistic[2], "and", summary_model$fstatistic[3], "DF"),
"< 2.2e-16")
)
# Return table
return(list(coefficients_table = coefficients_table,
additional_info = additional_info,
model_name = model_name))
}
mod1=lm(Peso ~ Anni.madre+N.gravidanze+Fumatrici+Gestazione+
Lunghezza+Cranio+Tipo.parto+Ospedale+
Sesso,data=data)
result_mod1 = generate_model_tables(mod1)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | -6735.80 | 141.48 | -47.61 | < 2e-16 | *** |
| Anni.madre | 0.80 | 1.15 | 0.70 | 0.4845 | |
| N.gravidanze | 11.38 | 4.67 | 2.44 | 0.0148 | * |
| FumatriciSI | -30.27 | 27.55 | -1.10 | 0.2719 | |
| Gestazione | 32.58 | 3.82 | 8.53 | < 2e-16 | *** |
| Lunghezza | 10.29 | 0.30 | 34.21 | < 2e-16 | *** |
| Cranio | 10.47 | 0.43 | 24.57 | < 2e-16 | *** |
| Tipo.partoNat | 29.63 | 12.09 | 2.45 | 0.0143 | * |
| Ospedaleosp2 | -11.09 | 13.45 | -0.82 | 0.4096 | |
| Ospedaleosp3 | 28.25 | 13.51 | 2.09 | 0.0366 | * |
| SessoM | 77.57 | 11.19 | 6.93 | 5.18e-12 | *** |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 274 on 2487 degrees of freedom |
| Multiple R-squared: | 0.7289 |
| Adjusted R-squared: | 0.7278 |
| F-statistic: | 668.7 on 10 and 2487 DF |
| p-value: | < 2.2e-16 |
The model that includes all available variables shows a relatively high Adjusted R-squared (0.73).
Mother’s age and smokers seem to be not significant. Hospitals number 3 seems to be lightly significant, however its unlikely that the hospitals directly affect the newborn weight.
As next step, remove mother’s age and hospitals. Let’s keep the variable smokers as it is known in scientific literature that maternal smoking affects the weight of the newborn directly and indirectly favoring premature birth.
mod2=update(mod1, ~.-(Anni.madre+Ospedale))
result_mod2 = generate_model_tables(mod2)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | -6708.81 | 136.06 | -49.31 | < 2e-16 | *** |
| N.gravidanze | 12.99 | 4.34 | 2.99 | 0.0028 | ** |
| FumatriciSI | -31.88 | 27.58 | -1.16 | 0.2478 | |
| Gestazione | 32.60 | 3.80 | 8.57 | < 2e-16 | *** |
| Lunghezza | 10.27 | 0.30 | 34.10 | < 2e-16 | *** |
| Cranio | 10.50 | 0.43 | 24.64 | < 2e-16 | *** |
| Tipo.partoNat | 30.42 | 12.10 | 2.51 | 0.0120 | * |
| SessoM | 78.10 | 11.20 | 6.97 | 3.94e-12 | *** |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 274 on 2490 degrees of freedom |
| Multiple R-squared: | 0.7278 |
| Adjusted R-squared: | 0.7271 |
| F-statistic: | 951.3 on 7 and 2490 DF |
| p-value: | < 2.2e-16 |
Removing 2 variables, the Adjusted R-squared only changed on the fourth decimal digit.The minimum reduction of the Adjusted R-squared is an acceptable compromise considering that the model has been simplified. In addition, the variables Number of pregnancies shows a greater significance. The type of birth continues to show less relevance than the other variables. Removing this variable will show how it affect the model.
mod3=update(mod2, ~.-Tipo.parto)
result_mod3 = generate_model_tables(mod3)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | -6682.26 | 135.80 | -49.21 | < 2e-16 | *** |
| N.gravidanze | 12.70 | 4.35 | 2.92 | 0.0035 | ** |
| FumatriciSI | -30.57 | 27.60 | -1.11 | 0.2682 | |
| Gestazione | 32.64 | 3.81 | 8.57 | < 2e-16 | *** |
| Lunghezza | 10.23 | 0.30 | 33.98 | < 2e-16 | *** |
| Cranio | 10.54 | 0.43 | 24.71 | < 2e-16 | *** |
| SessoM | 78.16 | 11.21 | 6.97 | 4.01e-12 | *** |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 275 on 2491 degrees of freedom |
| Multiple R-squared: | 0.7271 |
| Adjusted R-squared: | 0.7265 |
| F-statistic: | 1106.4 on 6 and 2491 DF |
| p-value: | < 2.2e-16 |
By removing the type of birth variable, the reduction in Adjusted R-squared is minimal.
During the analysis of the relationship between the variables, a correlation coefficient between length and diameter of the skull of 0.60 and a correlation coefficient between length and gestation of 0.62 emerged. Before further modifying the model, we verify that there are no multicollinearity problems.
vif_mod3 = round(vif(mod3),3)
kable(vif_mod3, format = "html", col.names = c("Variable", "VIF"), align="c",caption = "VIF Table mod3")%>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Variable | VIF |
|---|---|
| N.gravidanze | 1.026 |
| Fumatrici | 1.007 |
| Gestazione | 1.676 |
| Lunghezza | 2.080 |
| Cranio | 1.625 |
| Sesso | 1.040 |
All VIF values are less than 5, which indicates that there are no significant multicollinearity issues in the model 3. In particular, the highest value is 2.080 for the variable Length , which is still considered acceptable.
From the analysis of the correlations between variables a non-linearity of the skull diameter and gestation came out. Let’s add these effects in two separate steps.
mod4a=update(mod3, ~.+I(Gestazione^2))
result_mod4a = generate_model_tables(mod4a)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | -4665.63 | 898.78 | -5.19 | 2.26e-07 | *** |
| N.gravidanze | 12.78 | 4.34 | 2.94 | 0.0033 | ** |
| FumatriciSI | -29.35 | 27.59 | -1.06 | 0.2874 | |
| Gestazione | -79.95 | 49.75 | -1.61 | 0.1082 | |
| Lunghezza | 10.34 | 0.30 | 33.96 | < 2e-16 | *** |
| Cranio | 10.63 | 0.43 | 24.83 | < 2e-16 | *** |
| SessoM | 75.95 | 11.24 | 6.75 | 1.78e-11 | *** |
| I(Gestazione^2) | 1.50 | 0.66 | 2.27 | 0.0233 | * |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 274 on 2490 degrees of freedom |
| Multiple R-squared: | 0.7277 |
| Adjusted R-squared: | 0.7269 |
| F-statistic: | 950.7 on 7 and 2490 DF |
| p-value: | < 2.2e-16 |
The addition of the non linear effect of the gestation variable breaks the model causing the linear effect of gestation to become non significant.
mod4b=update(mod3, ~.+I(Cranio^2))
result_mod4b = generate_model_tables(mod4b)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | 56.88 | 1152.12 | 0.05 | 0.9606 | |
| N.gravidanze | 12.98 | 4.32 | 3.00 | 0.0027 | ** |
| FumatriciSI | -26.69 | 27.43 | -0.97 | 0.3306 | |
| Gestazione | 39.11 | 3.94 | 9.93 | < 2e-16 | *** |
| Lunghezza | 10.47 | 0.30 | 34.69 | < 2e-16 | *** |
| Cranio | -31.63 | 7.17 | -4.41 | 1.08e-05 | *** |
| SessoM | 73.28 | 11.17 | 6.56 | 6.45e-11 | *** |
| I(Cranio^2) | 0.06 | 0.01 | 5.89 | 4.39e-09 | *** |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 273 on 2490 degrees of freedom |
| Multiple R-squared: | 0.7309 |
| Adjusted R-squared: | 0.7301 |
| F-statistic: | 966.1 on 7 and 2490 DF |
| p-value: | < 2.2e-16 |
The addition of the non linear effect of the skull diameter variable results in a minimal increase in the Adjusted R-squared. It is not worthwhile to complicate the model in the face of a negligible increase in explained variability.
In scientific literature it is well known that maternal smoking indirectly impacts the weight of the newborn, favoring premature birth. Let’s try to introduce the combined effect of the smoking and gestation variables into the model.
mod4c=update(mod3, ~.+(Fumatrici*Gestazione))
result_mod4c = generate_model_tables(mod4c)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | -6699.69 | 136.73 | -49.00 | < 2e-16 | *** |
| N.gravidanze | 12.75 | 4.35 | 2.93 | 0.0034 | ** |
| FumatriciSI | 795.70 | 757.53 | 1.05 | 0.2936 | |
| Gestazione | 33.20 | 3.84 | 8.64 | < 2e-16 | *** |
| Lunghezza | 10.23 | 0.30 | 33.96 | < 2e-16 | *** |
| Cranio | 10.53 | 0.43 | 24.69 | < 2e-16 | *** |
| SessoM | 78.74 | 11.22 | 7.02 | 2.94e-12 | *** |
| FumatriciSI:Gestazione | -21.05 | 19.28 | -1.09 | 0.2752 |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 275 on 2490 degrees of freedom |
| Multiple R-squared: | 0.7273 |
| Adjusted R-squared: | 0.7265 |
| F-statistic: | 948.6 on 7 and 2490 DF |
| p-value: | < 2.2e-16 |
Also in this case, maternal smoking appears not to be significant and does not bring any benefit to the model. The adjusted R-square is the same as that of model 3.
There appears to be no evidence to support the thesis that maternal smoking positively impacts the model. Let’s remove this variable to understand how the model changes.
mod5=update(mod3, ~.-Fumatrici)
result_mod5 = generate_model_tables(mod5)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | -6681.73 | 135.80 | -49.20 | < 2e-16 | *** |
| N.gravidanze | 12.46 | 4.34 | 2.87 | 0.0042 | ** |
| Gestazione | 32.38 | 3.80 | 8.52 | < 2e-16 | *** |
| Lunghezza | 10.25 | 0.30 | 34.06 | < 2e-16 | *** |
| Cranio | 10.54 | 0.43 | 24.72 | < 2e-16 | *** |
| SessoM | 77.98 | 11.21 | 6.96 | 4.47e-12 | *** |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 275 on 2492 degrees of freedom |
| Multiple R-squared: | 0.727 |
| Adjusted R-squared: | 0.7265 |
| F-statistic: | 1327.3 on 5 and 2492 DF |
| p-value: | < 2.2e-16 |
The adjusted R-square is the same as that of model 3. With the starting data available, it can be concluded that the smoking variable is not significant.
The remaining variables all have high significance. It is concluded that model 5 is the best model.
The model chosen as the best includes the following variables:
Number of pregnancies: a unit increase in this variable will result in an increase in the newborn’s weight of 12.46 grams
Gestation: : a unit increase in this variable will result in an increase in the newborn’s weight of 32.38 grams.
Length: for every increment of one millimeter in the length of the newborn, the weight increases by 10.25 grams
Skull diameter: for every increment of one millimeter in the diameter of the newborn’s skull, the weight increases by 10.54 grams
Sex: a newborn male, on average, weighs 77.98 grams more than a newborn female
To verify the statement that model 5 is the best one, it is possible to use the BIC indicator. In addition it is possible to perform an automatic identification of the best model using the stepwise_AIC procedure and compare the output with the results obtained.
library(MASS)
#automatic model
stepwise_mod=stepAIC(mod1,direction = "both", k=log(n))
# BIC of all models found
bic_output = BIC(mod1, mod2, mod3, mod4a, mod4b, mod4c, mod5,stepwise_mod)
kable(bic_output, format = "html",col.names = c("Df", "BIC"),
align = "c", caption = "BIC Values for Different Models")%>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Df | BIC | |
|---|---|---|
| mod1 | 12 | 35215.45 |
| mod2 | 9 | 35201.73 |
| mod3 | 8 | 35200.24 |
| mod4a | 9 | 35202.90 |
| mod4b | 9 | 35173.50 |
| mod4c | 9 | 35206.87 |
| mod5 | 7 | 35193.65 |
| stepwise_mod | 7 | 35193.65 |
BIC has to be read as lower is better. In that case the model 4b is the lowest. However, the analysis showed that increasing the complexity of the model 4b does not lead to a significant increase in the explained variability. In addition, the automatic procedure found that the best model has same BIC of the model 5. It can be concluded that the best model is model 5.
After obtaining the model it is necessary to carry out an analysis of the residues to evaluate whether they comply with the following assumptions:
par(mfrow=c(2,2))
plot(mod5)
The following graphs are shown in the image above:
Residuals vs Fitted: The points should be randomly distributed around the horizontal zero line. The graph shows some patterns at extreme values, indicating potential problems with the assumption of residuals means equals 0.
Q-Q Plot: This plot compares the standardized residuals to the theoretical quantiles of a normal distribution. It is used to test the assumption of normality of the residuals. The points should be on the diagonal line if the residuals are normally distributed. The graph shows some deviations from the line, especially at the start and at the ends, indicating potential non-normality.
Scale-Location: This plot is used to test the homoscedasticity assumption. The dots should be randomly distributed around a horizontal line. The graph shows some patterns, indicating potential problems with homoscedasticity.
Residuals vs Leverage: This graph is used to identify influential observations. Points with high leverage and large residuals are potentially influential. The chart shows one point (1549) crossing the attention line of 0.5.
it is good practice, in addition to graphical analysis, to carry out statistical tests to verify that the model assumptions are respected.
library(lmtest)
# Test
bp_test = bptest(mod5)
dw_test = dwtest(mod5)
shapiro_test = shapiro.test(residuals(mod5))
# Result dataframe
results_residuals = data.frame(
Test = c("Breusch-Pagan", "Durbin-Watson", "Shapiro-Wilk"),
Statistic = round(c(bp_test$statistic, dw_test$statistic, shapiro_test$statistic),3),
`p-value` = c(bp_test$p.value, dw_test$p.value, shapiro_test$p.value)
)
# Formatting P value
results_residuals[, 3] = sapply(results_residuals[, 3], format_p_value)
kable(results_residuals, format = "html", align = "c", caption = "Residual Test Results")%>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Test | Statistic | p.value | |
|---|---|---|---|
| BP | Breusch-Pagan | 90.297 | < 2e-16 |
| DW | Durbin-Watson | 1.953 | 0.1209 |
| W | Shapiro-Wilk | 0.974 | < 2e-16 |
Breusch-Pagan Test: with an extremely low p-value the null hypothesis is rejected. The residuals of the model not satisfy the assumption of homoscedasticity.
Durbin-Watson Test: the p-value of 0.1209 suggests that the null hypothesis is not rejected. The residuals are not affected by autocorrelation.
Shapiro-Wilk Normality Test: with an extremely low p-value the null hypothesis is rejected. The residuals do not follow a normal distribution.
Just because the model didn’t pass the Breusch-Pagan and Shapiro-Wilk tests doesn’t mean it’s not valid. The model has an Adjusted R-square of 0.7265 which indicates that it manages to explain more than 70% of the variability of the response variable. It can be used to make predictions with awareness of its limits.
library(ggplot2)
library(car)
# Leverage
lev = hatvalues(mod5)
p = sum(lev)
soglia_lev = 2 * p / n
# Leverage Plot
leverage_plot = ggplot(data.frame(Index = 1:length(lev), Leverage = lev), aes(x = Index, y = Leverage)) +
geom_point() +
geom_hline(yintercept = soglia_lev, color = "red", size = 1.5) +
ggtitle("Leverage Values") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
# Outlier
rstudent_vals = rstudent(mod5)
# Outlier plot
outlier_plot = ggplot(data.frame(Index = 1:length(rstudent_vals), RStudent = rstudent_vals), aes(x = Index, y = RStudent)) +
geom_point() +
geom_hline(yintercept = c(-2, 2), color = "red", size = 1.5) +
ggtitle("Studentized Residuals") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
library(gridExtra)
grid.arrange(leverage_plot, outlier_plot, ncol = 2, widths = c(1, 1))
num_leverage=sum(lev>soglia_lev)
num_outlier = sum(abs(rstudent_vals) > 2)
ris_influent <- data.frame(
Influent = c("Leverage", "Outlier"),
Total = c(num_leverage, num_outlier)
)
kable(ris_influent, col.names = c("", "Total"),align="c", caption = "Leverage and Outliers total value")%>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")%>%
column_spec(1, width = "5cm")%>%
column_spec(2, width = "2cm")
| Total | |
|---|---|
| Leverage | 153 |
| Outlier | 101 |
153 leverage values and 101 outlier values has been identified. From the graphical analysis it is noted that most of these values slightly exceed the threshold. Let’s use another visualization (cook distance) to analyze the leverage values and the outlier values at the same time.
# Cook distance
cook <- cooks.distance(mod5)
# index max value
max_cook_index <- which.max(cook)
max_cook_value <- cook[max_cook_index]
ggplot(data.frame(Index = 1:length(cook), Cook = cook), aes(x = Index, y = Cook)) +
geom_point() +
geom_text(aes(label = ifelse(Index == max_cook_index, max_cook_index, '')),
vjust = -1, hjust = 1) +
geom_point(data = data.frame(Index = max_cook_index, Cook = max_cook_value),
aes(x = Index, y = Cook), color = "red", size = 3) +
ggtitle("Cook's Distance") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
ylim(0, 1)
The Cook’s Distance graph shows only one value with a high cook distance. Let’s remove the observation 1549 from the dataset to analyze the effect on the model.
detach(data)
data <- data %>% filter(row_number() != 1549)
attach(data)
mod5_new=lm(Peso ~ N.gravidanze+Gestazione+Lunghezza+Cranio+Sesso,
data=data)
result_mod5new = generate_model_tables(mod5_new)
| Estimate | Std. Error | t value | Pr(>|t|) | Signif | |
|---|---|---|---|---|---|
| (Intercept) | -6683.83 | 133.16 | -50.19 | < 2e-16 | *** |
| N.gravidanze | 13.14 | 4.26 | 3.09 | 0.0020 | ** |
| Gestazione | 29.63 | 3.74 | 7.93 | 3.27e-15 | *** |
| Lunghezza | 10.89 | 0.30 | 36.08 | < 2e-16 | *** |
| Cranio | 9.92 | 0.42 | 23.46 | < 2e-16 | *** |
| SessoM | 78.14 | 10.99 | 7.11 | 1.53e-12 | *** |
| Info | Value |
|---|---|
| Signif. codes: | 0 ’***’ 0.001 ’**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ’ ’ 1 |
| Residual standard error: | 269 on 2491 degrees of freedom |
| Multiple R-squared: | 0.7372 |
| Adjusted R-squared: | 0.7367 |
| F-statistic: | 1397.5 on 5 and 2491 DF |
| p-value: | < 2.2e-16 |
Following the removal of observation 1549, the descriptive variables present in the model continue to show all high significance. Furthermore, an increase in the adjusted R-squared is observed.
Statistical tests will help in understanding how the removal of the observation 1549 affects the residuals.
bp_test = bptest(mod5_new)
dw_test = dwtest(mod5_new)
shapiro_test = shapiro.test(residuals(mod5_new))
# Result dataframe
results_residuals = data.frame(
Test = c("Breusch-Pagan", "Durbin-Watson", "Shapiro-Wilk"),
Statistic = round(c(bp_test$statistic, dw_test$statistic, shapiro_test$statistic),3),
`p-value` = c(bp_test$p.value, dw_test$p.value, shapiro_test$p.value)
)
# Formatting P value
results_residuals[, 3] = sapply(results_residuals[, 3], format_p_value)
kable(results_residuals, format = "html", align = "c", caption = "Residual Test Results")%>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Test | Statistic | p.value | |
|---|---|---|---|
| BP | Breusch-Pagan | 11.343 | 0.0450 |
| DW | Durbin-Watson | 1.954 | 0.1227 |
| W | Shapiro-Wilk | 0.989 | 5.23e-13 |
All test statistics show an increase. The p-value of the Shapiro-Wilk test continues to be extremely low, rejecting the hypothesis of a normal distribution of the residuals. However, the p-value of the Breusch-Pagan test is borderline (still rejecting the hypothesis of homoscedasticity) at a significance level of 5%.
# Calcola i residui del modello
residuls_mod5new <- residuals(mod5_new)
# Crea il grafico di densità con ggplot
ggplot() +
geom_density(aes(x = residuls_mod5new, y = ..density.., color = "Model Residuals"), size = 1) +
geom_density(aes(x = rnorm(1000000,
mean = mean(residuls_mod5new),
sd = sd(residuls_mod5new)),
y = ..density.., color = "Normal Distribution"), size = 1) +
scale_color_manual(values = c("Model Residuals" = "black", "Normal Distribution" = "red"),
name = "Curve") +
ggtitle("Density of probability") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5)) +
labs(x = "Value", y = "Density")
From a graphical analysis seems not to be present major differences between a normal distribution and the probability density of the residuals of the final model. Considering the value of the Adjusted R-squared, the results of the statistical tests and the graphical analyses, this model can be considered acceptable.
Since the model includes 5 descriptive variables, to predict the weight of a female newborn considering a mother in her third pregnancy who will give birth at the 39th week, the values of length and skull diameter are necessary. For these last two variables, assume the average values calculated on the dataset from which the model was derived.
# new_born
new_born = data.frame( N.gravidanze = 3,
Gestazione = 39,
Lunghezza = mean(data$Lunghezza),
Cranio = mean(data$Cranio),
Sesso = "F" )
# Prediction
pred_new_born <- predict(mod5_new, newdata = new_born,interval = "confidence")
res_pred <- data.frame(
Result = "Predicted Weight in grams",
Fit = round(pred_new_born[1, "fit"],2),
Lower = round(pred_new_born[1, "lwr"],2),
Upper = round(pred_new_born[1, "upr"],2)
)
# Mostra i risultati utilizzando kable
kable(res_pred, col.names = c("", "Fit", "Lower", "Upper"), format = "html", align = "c") %>%
kable_styling(bootstrap_options = c("striped", "hover"), full_width = F, position = "left")
| Fit | Lower | Upper | |
|---|---|---|---|
| Predicted Weight in grams | 3271.98 | 3249.09 | 3294.87 |
In scientific literature, the average weight of a newborn is reported as 3300g. This value is very close to the upper limit of the predicted confidence interval. It can be considered that the prediction is valid.
After obtaining the model it is possible to explore the main relationships between variables with the help of graphs.
#Weight vs Gestation
ggplot(data, aes(x = Gestazione, y = Peso, color = Sesso)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
scale_color_manual(values = c("M" = "blue", "F" = "red")) +
scale_x_continuous(breaks = seq(24, 43, by =2), limits = c(24, 43)) +
labs(title = "Weight vs Gestation",
x = "Gestation (week)",
y = "Weight (g)",
color=NULL) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
The graph shows a positive trend for both sexes in which weight increases as the number of weeks of gestation increases.
The regression lines for the two sexes appear parallell. The regression line for the female sex is shifted lower. This indicates that for both sexes, the unit increase in the variable gestation leads to the same increase in weight with the difference that on average female newborns weigh less for the same weeks of gestation.
library(scatterplot3d)
library(car)
library(rgl)
colors = c("F" = "red", "M" = "blue")
s3d = scatterplot3d(Lunghezza, Cranio, Peso, color=colors[Sesso], pch=19,
xlab="Length (mm)", ylab="Skull diameter (mm)", zlab="Weight (g)",
main="Relation between Weight, Length e Skull Diameter")
# Regression planes by sex
regF = lm(Peso ~ Lunghezza + Cranio, data = data[data$Sesso == "F", ])
regM = lm(Peso ~ Lunghezza + Cranio, data = data[data$Sesso == "M", ])
# Adding regression planes to graph
s3d$plane3d(regF, col = "red", lty = "solid")
s3d$plane3d(regM, col = "blue", lty = "dashed")
legend("topright", legend=c("F", "M"), col=c("red", "blue"), pch=19, bty="n")
The graph shows a positive effect of both descriptive variables (length and skull diameter) on the weight of the newborn. The regression plans for the two modes of the sex variable are very similar to each other indicating that the effect of the length and skull diameter is the same for both sexes.