# library
library(ggplot2)
library(gghalves)
library(moments)
library(dplyr)
##
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
## Warning: il pacchetto 'car' è stato creato con R versione 4.4.2
## Caricamento del pacchetto richiesto: carData
## Warning: il pacchetto 'carData' è stato creato con R versione 4.4.2
##
## Caricamento pacchetto: 'car'
## Il seguente oggetto è mascherato da 'package:dplyr':
##
## recode
library(lmtest)
## Warning: il pacchetto 'lmtest' è stato creato con R versione 4.4.2
## Caricamento del pacchetto richiesto: zoo
##
## Caricamento pacchetto: 'zoo'
## I seguenti oggetti sono mascherati da 'package:base':
##
## as.Date, as.Date.numeric
file <- "neonati.csv"
dati <- read.csv(file)
summary(dati)
## Anni.madre N.gravidanze Fumatrici Gestazione
## Min. : 0.00 Min. : 0.0000 Min. :0.0000 Min. :25.00
## 1st Qu.:25.00 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:38.00
## Median :28.00 Median : 1.0000 Median :0.0000 Median :39.00
## Mean :28.16 Mean : 0.9812 Mean :0.0416 Mean :38.98
## 3rd Qu.:32.00 3rd Qu.: 1.0000 3rd Qu.:0.0000 3rd Qu.:40.00
## Max. :46.00 Max. :12.0000 Max. :1.0000 Max. :43.00
## Peso Lunghezza Cranio Tipo.parto
## Min. : 830 Min. :310.0 Min. :235 Length:2500
## 1st Qu.:2990 1st Qu.:480.0 1st Qu.:330 Class :character
## Median :3300 Median :500.0 Median :340 Mode :character
## Mean :3284 Mean :494.7 Mean :340
## 3rd Qu.:3620 3rd Qu.:510.0 3rd Qu.:350
## Max. :4930 Max. :565.0 Max. :390
## Ospedale Sesso
## Length:2500 Length:2500
## Class :character Class :character
## Mode :character Mode :character
##
##
##
print(paste("Number of record:", nrow(dati)))
## [1] "Number of record: 2500"
birth.type <- unique(dati$Tipo.parto)
hospital <- unique(dati$Ospedale)
sex <- unique(dati$Sesso)
table(dati$Tipo.parto)
##
## Ces Nat
## 728 1772
table(dati$Ospedale)
##
## osp1 osp2 osp3
## 816 849 835
table(dati$Sesso)
##
## F M
## 1256 1244
table(dati$Fumatrici)
##
## 0 1
## 2396 104
The dataset contains 2500 record about infants coming from 3 different hospitals. The variables in the dataset are:
Observing the count of records by hospital and sex the dataset is balanced, while in type of birth the natural birth has a much higher value compared to the cesarean one, even for the smokers the values are more shifted on the no smoker mothers.
Distribution analysis of the variables
#coefficient of variation
CV <- function(x){return(sd(x)/mean(x)*100)}
# report of the mesures
report.text <- function(x){
text <- paste0(
"Variability indices",
"\nIQR: ", round(IQR(x),2),
"\nVariance: ", round(var(x),2),
"\nSt.Dev: ", round(sd(x),2),
"\nCoeff.Var: ", round(CV(x),2),
"\n\nShape indices",
"\nSkewness: ", round(skewness(x),2),
"\nKurtosis: ", round(kurtosis(x)-3,2)
)
return(text)
}
# boxplot and distribution in half violin structure
half.box.violin <- function(dataset, feature, W_report=T){
col_var <- dataset[[feature]]
mean_value = mean(col_var)
g_plt <- ggplot(dataset, aes(x=0, y=col_var))+
geom_half_boxplot(aes(y=col_var),
side = "l",
fill = "pink")+
geom_half_violin(aes(y=col_var),
side = "r",
fill = "lightblue")+
scale_x_continuous(limits = c(-1, 1)) +
# Mean
annotate("segment",x = -0.375,xend = 1, y = mean_value, yend = mean_value,
color = "red", linewidth = 1) +
stat_summary(fun = mean, geom = "text",
aes(label = paste("Mean:", round(after_stat(y), 2))),
hjust = 1,vjust = -0.5, color = "red",
position = position_nudge(x = 1))+
# Quartiles
stat_summary(fun.data = function(x) {
r <- quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1))
data.frame(y = r,
label = c(paste("Min:", round(r[1], 2)),
paste("Q1:", round(r[2], 2)),
paste("Median:", round(r[3], 2)),
paste("Q3:", round(r[4], 2)),
paste("Max:", round(r[5], 2)))
)},
geom = "text",
position = position_nudge(x = -0.02),
hjust = 1, vjust = -0.5, color = "blue")+
#Labels
labs(title = paste0("Boxplot and density distribution of ", feature), y = paste0(feature, " value"))+
theme(axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
axis.title.x = element_blank())
# Text square
if (W_report){
text = report.text(col_var)
g_plt <- g_plt+
annotate("label", x =-1,y = max(col_var),
label = text,
parse = F,
color = "blue",fill = "white", size = 4, hjust = 0, vjust = 1)
}
return(g_plt)
}
find.outlier <- function(array.val){
Q1 <- quantile(array.val, 0.25)
Q3 <- quantile(array.val, 0.75)
IQR <- Q3-Q1
upper_bound <- Q3 + 1.5*IQR
lower_bound <- Q1 - 1.5*IQR
outlier_indices <- which(array.val < lower_bound | array.val > upper_bound)
return(outlier_indices)
}
distr.var <- colnames(dati)[1:(ncol(dati)-3)]
distr.var <- setdiff(distr.var, "Fumatrici")
for (var_ in distr.var) {
plot <-half.box.violin(dati, var_)
print(plot)
}
title = "Distribution of frequency"
barplot(table(dati$N.gravidanze), ylab = "frequencies", xlab= "n. pregnancies", main = title)
barplot(table(dati$Gestazione), ylab = "frequencies", xlab= "weeks of pregnancy", main = title)
for (var_ in distr.var) {
arr <- dati[[var_]]
outlier <- find.outlier(arr)
print(paste0("Outliers' counter in ", var_, ": ", length(outlier)))
}
## [1] "Outliers' counter in Anni.madre: 13"
## [1] "Outliers' counter in N.gravidanze: 246"
## [1] "Outliers' counter in Gestazione: 67"
## [1] "Outliers' counter in Peso: 69"
## [1] "Outliers' counter in Lunghezza: 59"
## [1] "Outliers' counter in Cranio: 48"
From the boxplots of the distributions we can see a lot of outliers (points out of the whiskers) for each variable, those values could influence the estimation of the regression coefficients in the following analysis.
To understand better the variables distribution N.Gravidanze and Gestazione is clearer to use a frequency distribution plot. Those 2 variables are very skewed. The number of pregnancy is positively skewed meaning that the distribution is high concentrated in the low values (mostly on 0, 1, 2 around 75%). The weeks of pregnancy are negatively skewed showing an higher concentration of values in the high values; 50% of the pregnancies occur between 38 and 40 weeks, almost all are between 35 and 42 weeks.
Anni.madre, Lunghezza, Cranio seem to show a normal-like distribution.
Anni.madre shows some values out of the domain so we check those outliers.
outlier <- find.outlier(dati$Anni.madre)
dati[outlier,]
## Anni.madre N.gravidanze Fumatrici Gestazione Peso Lunghezza Cranio
## 138 13 0 0 38 2760 470 325
## 205 45 2 0 38 3850 505 384
## 230 43 1 0 35 2050 455 305
## 260 44 1 0 40 3500 480 346
## 335 44 0 1 38 3150 465 335
## 855 43 0 0 38 3600 510 336
## 1075 14 1 0 39 3510 490 365
## 1106 46 5 0 36 2710 470 347
## 1152 1 1 0 41 3250 490 350
## 1380 0 0 0 39 3060 490 330
## 1532 14 0 0 39 3550 500 355
## 2026 44 0 0 40 3050 505 345
## 2098 44 5 0 38 3280 475 346
## Tipo.parto Ospedale Sesso
## 138 Nat osp2 F
## 205 Nat osp3 M
## 230 Nat osp2 F
## 260 Nat osp1 F
## 335 Nat osp3 F
## 855 Nat osp3 M
## 1075 Nat osp2 M
## 1106 Nat osp2 M
## 1152 Nat osp2 F
## 1380 Nat osp3 M
## 1532 Ces osp1 M
## 2026 Nat osp1 M
## 2098 Ces osp1 F
The rows 1152 and 1380 have clearly data out of the domain (a mother can’t have 0 or 1 year) so we are going to impute those values.
shapiro.test(dati$Anni.madre)
##
## Shapiro-Wilk normality test
##
## data: dati$Anni.madre
## W = 0.99308, p-value = 1.639e-09
The test shows the non-normality of the distribution so the best way to perform the imputation is to use the median value.
dati$Anni.madre[dati$Anni.madre %in% c(0,1)] <- median(dati$Anni.madre, na.rm=TRUE)
dati[c(1152,1380),]
## Anni.madre N.gravidanze Fumatrici Gestazione Peso Lunghezza Cranio
## 1152 28 1 0 41 3250 490 350
## 1380 28 0 0 39 3060 490 330
## Tipo.parto Ospedale Sesso
## 1152 Nat osp2 F
## 1380 Nat osp3 M
Peso is the target variable and shows a Gaussian curve distribution with mean, median and mode very close, even the skewness is almost 0.
The execution of some hypothesis testing will clarify this assumption.
# shapiro test to check the normality of the target vairable
shapiro.test(dati$Peso)
##
## Shapiro-Wilk normality test
##
## data: dati$Peso
## W = 0.97066, p-value < 2.2e-16
Contrary to what observed before we can’t assume the distributions’ normality of the variable Peso because the Shapiro-Wilk test shows a very low p-value making us reject the null hypothesis.
With the help of others hypothesis testing we’ll check if:
contingency.table <- table(dati$Ospedale, dati$Tipo.parto)
print("Contingency table")
## [1] "Contingency table"
print(contingency.table)
##
## Ces Nat
## osp1 242 574
## osp2 254 595
## osp3 232 603
test.chi2 <- chisq.test(contingency.table)
print(test.chi2)
##
## Pearson's Chi-squared test
##
## data: contingency.table
## X-squared = 1.0972, df = 2, p-value = 0.5778
Was run the chi-squared test to check the independence between the hospital and the type of birth. The result with a high p-value (0.58) leads us to conclude that we can’t reject the null hypothesis of independence between the type of birth and the hospital where it is performed.
So we can assert that there is no significant difference in the cesarean birth practice among the 3 hospitals.
To verify these hypothesis it’s necessary to consider a target value for each average.
Taking data from a well known research center that is Bambino Gesù-Ospedale Pediatrico the average values are: Weight: 3300 g Length: 500 mm
Going in a deeper research from the official WHO (World Health Organization) tables of weight for age and length for age it was found that the data are provided only grouped by sex and not aggregated, so the hypothesis test can be also conducted for the 2 categories.
The values of the mean weight of the population are: Boys: 3346 grams Girls: 3232 grams
The values of the mean length of the population are: Boys: 499 mm Girls: 491 mm
(N.B: the tables of WHO show the median values but we assume the normality of the distribution so median and mean are very close and we use those values as target for the population means)
arr.male.weight <- dati$Peso[dati$Sesso =="M"]
arr.female.weight <- dati$Peso[dati$Sesso =="F"]
arr.male.length<- dati$Lunghezza[dati$Sesso =="M"]
arr.female.length <- dati$Lunghezza[dati$Sesso =="F"]
shapiro.test(arr.male.weight)
##
## Shapiro-Wilk normality test
##
## data: arr.male.weight
## W = 0.96647, p-value = 2.321e-16
shapiro.test(arr.female.weight)
##
## Shapiro-Wilk normality test
##
## data: arr.female.weight
## W = 0.96285, p-value < 2.2e-16
shapiro.test(arr.male.length)
##
## Shapiro-Wilk normality test
##
## data: arr.male.length
## W = 0.92028, p-value < 2.2e-16
shapiro.test(arr.female.length)
##
## Shapiro-Wilk normality test
##
## data: arr.female.length
## W = 0.89953, p-value < 2.2e-16
Here was checked the normality assumption of the t test for the data of weight and length grouped by sex, and with very low p-values we must reject the null hypothesis, as for the arrays aggregated. Even without the normality assumption we can use the t test because the sample is big enough (n>>30) to exploit the Central Limit Theorem.
# weight t test
# no sex division
t.test(dati$Peso, mu=3300, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: dati$Peso
## t = -1.516, df = 2499, p-value = 0.1296
## alternative hypothesis: true mean is not equal to 3300
## 95 percent confidence interval:
## 3263.490 3304.672
## sample estimates:
## mean of x
## 3284.081
#sex division
t.test(arr.male.weight, mu=3346, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: arr.male.weight
## t = 4.4438, df = 1243, p-value = 9.626e-06
## alternative hypothesis: true mean is not equal to 3346
## 95 percent confidence interval:
## 3380.748 3435.683
## sample estimates:
## mean of x
## 3408.215
t.test(arr.female.weight, mu=3232, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: arr.female.weight
## t = -4.772, df = 1255, p-value = 2.038e-06
## alternative hypothesis: true mean is not equal to 3232
## 95 percent confidence interval:
## 3131.997 3190.267
## sample estimates:
## mean of x
## 3161.132
With aggregated data the pvalue is 0.1296 > 0.05 so we can’t reject the null hypothesis and conclude that the population and sample means are not significantly different.
Consider to test the sex separated the results are the opposite, so the values for both male and female are very low and in this case we must reject the null hypothesis saying the means of population and samples are significantly different.
# length t test
# no sex division
t.test(dati$Lunghezza, mu=500, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: dati$Lunghezza
## t = -10.084, df = 2499, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 500
## 95 percent confidence interval:
## 493.6598 495.7242
## sample estimates:
## mean of x
## 494.692
# sex divison
t.test(arr.male.length , mu=499, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: arr.male.length
## t = 0.97897, df = 1243, p-value = 0.3278
## alternative hypothesis: true mean is not equal to 499
## 95 percent confidence interval:
## 498.3301 501.0043
## sample estimates:
## mean of x
## 499.6672
t.test(arr.female.length, mu=491, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: arr.female.length
## t = -1.5905, df = 1255, p-value = 0.112
## alternative hypothesis: true mean is not equal to 491
## 95 percent confidence interval:
## 488.2401 491.2885
## sample estimates:
## mean of x
## 489.7643
Compared to the weight here there’s the opposite situation. For the test with aggregated data the pvalue is very low so the means of population and sample are considered significantly different, while with the sex division we mustn’t reject the null hypothesis because of the high pvalues (0.3278, 0112).
From these results we can conclude that the most important thing is to understand which population consider (and the values associated), because the t test can produce very different results even with little changes in the target values.
For anthropometric measures we intend weight, length, and the head circumference.
male <- subset(dati, Sesso == "M")
female <- subset(dati, Sesso == "F")
shapiro.test(male$Cranio)
##
## Shapiro-Wilk normality test
##
## data: male$Cranio
## W = 0.97046, p-value = 3.006e-15
shapiro.test(female$Cranio)
##
## Shapiro-Wilk normality test
##
## data: female$Cranio
## W = 0.95543, p-value < 2.2e-16
plot(density(male$Peso))
plot(density(female$Peso))
plot(density(male$Lunghezza))
plot(density(female$Lunghezza))
plot(density(male$Cranio))
plot(density(female$Cranio))
For the variables Peso and Lunghezza, we already know that aren’t normally distributed. With the shapiro test for the Cranio variable we arrive at the same conclusion (pvalues very low, so we must reject the null hypothesis of normal distribution). The density distributions show a good symmetry with some outlier in the tails.
For the same reason as before (n>>30) we use the Central Limit Theorem to be able to conduct the t test, but before we have to check the homogeneity of the variances. Without the assumption of normality we can’t use directly the F test but we perform a Levene test that is robust to not normal distributions
dati$Sesso <- as.factor(dati$Sesso)
# default value of central tendency = median, more robust to outlier
leveneTest(Peso ~ Sesso, data = dati)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 0.7931 0.3732
## 2498
leveneTest(Lunghezza ~ Sesso, data = dati)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 10.496 0.001212 **
## 2498
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
leveneTest(Cranio ~ Sesso, data = dati)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 1 1.1978 0.2739
## 2498
Looking at the results of the variance analysis we can assume that for the variables Peso and Cranio the variance is the same for both sex while in Lunghezza we must reject the homogeneity variance hypothesis.
t.test(Peso ~ Sesso, data = dati, var.equal = TRUE)
##
## Two Sample t-test
##
## data: Peso by Sesso
## t = -12.102, df = 2498, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -287.1173 -207.0493
## sample estimates:
## mean in group F mean in group M
## 3161.132 3408.215
t.test(Lunghezza ~ Sesso, data = dati, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: Lunghezza by Sesso
## t = -9.582, df = 2459.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -11.929470 -7.876273
## sample estimates:
## mean in group F mean in group M
## 489.7643 499.6672
t.test(Cranio ~ Sesso, data = dati, var.equal = TRUE)
##
## Two Sample t-test
##
## data: Cranio by Sesso
## t = -7.408, df = 2498, p-value = 1.744e-13
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -6.090285 -3.540898
## sample estimates:
## mean in group F mean in group M
## 337.6330 342.4486
# Mann-Whitney U test
wilcox.test(Peso ~ Sesso, data = dati)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Peso by Sesso
## W = 538641, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(Lunghezza ~ Sesso, data = dati)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Lunghezza by Sesso
## W = 594455, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(Cranio ~ Sesso, data = dati)
##
## Wilcoxon rank sum test with continuity correction
##
## data: Cranio by Sesso
## W = 641638, p-value = 9.633e-15
## alternative hypothesis: true location shift is not equal to 0
All the anthropometric measures (weight, length, head circumference) are significantly different (pvalues very low) between different sex (were performed even the non-parametric tests giving the same results).
First of all we are going to analyze the correlation between variables.
df.num <- dati %>% select_if(is.numeric)
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
pairs(df.num, upper.panel = panel.smooth, lower.panel = panel.cor)
Above is shown a matrix where on the diagonal there are the numeric variables, under that there are the values of Pearson correlation between each variable and above the diagonal are printed the scatter plots. The highest correlations with the target variable are:
From the scatter plots we can see a trend line that confirm those values.
Now we will perform the modelling step.
mod1 <- lm(Peso ~ ., data = dati)
summary(mod1)
##
## Call:
## lm(formula = Peso ~ ., data = dati)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1123.3 -181.2 -14.6 160.7 2612.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6735.1677 141.3977 -47.633 < 2e-16 ***
## Anni.madre 0.7983 1.1463 0.696 0.4862
## N.gravidanze 11.4118 4.6665 2.445 0.0145 *
## Fumatrici -30.1567 27.5396 -1.095 0.2736
## Gestazione 32.5265 3.8179 8.520 < 2e-16 ***
## Lunghezza 10.2951 0.3007 34.237 < 2e-16 ***
## Cranio 10.4725 0.4261 24.580 < 2e-16 ***
## Tipo.partoNat 29.5027 12.0848 2.441 0.0147 *
## Ospedaleosp2 -11.2216 13.4388 -0.835 0.4038
## Ospedaleosp3 28.0984 13.4972 2.082 0.0375 *
## SessoM 77.5473 11.1779 6.938 5.07e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 2489 degrees of freedom
## Multiple R-squared: 0.7289, Adjusted R-squared: 0.7278
## F-statistic: 669.1 on 10 and 2489 DF, p-value: < 2.2e-16
cat("AIC:", AIC(mod1), "\n")
## AIC: 35172.09
cat("BIC:", BIC(mod1), "\n")
## BIC: 35241.97
The first analysis on the linear model, including all the variables, is showing as expected high significance for the variables with high correlation with the target (Lunghezza, Cranio, Gestazione).
(The estimated coefficients for categories Ospedaleops2, Ospedaleosp3 represent the difference in the expected values compared to the reference category osp1)
Explanation of the prediction model of the infant weight:
Catch the eye some variables with p-value > 0.05, allowing us to conclude that their significance is low (if not null) for the purposes of the model goodness. On these basics we are going to eliminate the non-significant variables (Occam’s razor).
Performing a stepwise (mixed) procedure we will find the best model for the optimum trade off between variance explained (best R2) and simplicity (less variables included).
n <- nrow(dati)
mod0 <- lm(Peso ~ Lunghezza+Cranio+Gestazione, data = dati)
stepwise_AIC <- step(mod0, scope = formula(mod1),, direction = "both", k=2, trace = F)
stepwise_BIC <- step(mod0, scope = formula(mod1),, direction = "both", k=log(n), trace = F)
stepwise_AIC
##
## Call:
## lm(formula = Peso ~ Lunghezza + Cranio + Gestazione + Sesso +
## N.gravidanze + Ospedale + Tipo.parto, data = dati)
##
## Coefficients:
## (Intercept) Lunghezza Cranio Gestazione SessoM
## -6707.43 10.31 10.49 31.99 77.44
## N.gravidanze Ospedaleosp2 Ospedaleosp3 Tipo.partoNat
## 12.36 -11.02 28.64 29.28
stepwise_BIC
##
## Call:
## lm(formula = Peso ~ Lunghezza + Cranio + Gestazione + Sesso +
## N.gravidanze, data = dati)
##
## Coefficients:
## (Intercept) Lunghezza Cranio Gestazione SessoM
## -6681.14 10.25 10.54 32.33 77.99
## N.gravidanze
## 12.47
The results considering the 2 decision criterions applied are:
mod.a <- update(mod1, ~. - Anni.madre - Fumatrici)
summary(mod.a)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Ospedale + Sesso, data = dati)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1113.18 -181.16 -16.58 161.01 2620.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6707.4293 135.9438 -49.340 < 2e-16 ***
## N.gravidanze 12.3619 4.3325 2.853 0.00436 **
## Gestazione 31.9909 3.7896 8.442 < 2e-16 ***
## Lunghezza 10.3086 0.3004 34.316 < 2e-16 ***
## Cranio 10.4922 0.4254 24.661 < 2e-16 ***
## Tipo.partoNat 29.2803 12.0817 2.424 0.01544 *
## Ospedaleosp2 -11.0227 13.4363 -0.820 0.41209
## Ospedaleosp3 28.6408 13.4886 2.123 0.03382 *
## SessoM 77.4412 11.1756 6.930 5.36e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 2491 degrees of freedom
## Multiple R-squared: 0.7287, Adjusted R-squared: 0.7278
## F-statistic: 836.3 on 8 and 2491 DF, p-value: < 2.2e-16
mod.b <- update(mod.a, ~. - Tipo.parto - Ospedale)
summary(mod.b)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Sesso, data = dati)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1149.44 -180.81 -15.58 163.64 2639.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6681.1445 135.7229 -49.226 < 2e-16 ***
## N.gravidanze 12.4750 4.3396 2.875 0.00408 **
## Gestazione 32.3321 3.7980 8.513 < 2e-16 ***
## Lunghezza 10.2486 0.3006 34.090 < 2e-16 ***
## Cranio 10.5402 0.4262 24.728 < 2e-16 ***
## SessoM 77.9927 11.2021 6.962 4.26e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.6 on 2494 degrees of freedom
## Multiple R-squared: 0.727, Adjusted R-squared: 0.7265
## F-statistic: 1328 on 5 and 2494 DF, p-value: < 2.2e-16
anova(mod.b, mod.a)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Ospedale + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2494 188065546
## 2 2491 186899996 3 1165550 5.1781 0.001443 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We can see in the coefficients a non significance on Ospedaleosp2 (with p-value = 0.41), all the other coefficients are enough significant.
ANOVA shows a p-value (0.001443) < 0.05 so it’s possible to reject the null hypothesis, meaning that there’s significant difference in the explanation of the response variable adding those independent variables. Although the anova suggests to consider the more complex model, there’s a very little difference in R2 (and R2 adjusted) between the 2 models, so the decision is to take into account as baseline model the mod.b.
From the scatterplots we can recognize a possible non-linear interaction between Peso and Gestazione, so we plot the data and other possible non-linear relations.
Y=dati$Peso
X=dati$Gestazione
ggplot(dati, aes(x = X, y = Y)) +
geom_point(color = "black", alpha = 0.6) + # Punti dati
geom_smooth(method = "lm", formula = y ~ x, color = "blue", se = FALSE, aes(linetype = "Linear")) + # Lineare
geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "red", se = FALSE, aes(linetype = "Quadratic")) + # Quadratico
geom_smooth(method = "lm", formula = y ~ poly(x, 3), color = "green", se = FALSE, aes(linetype = "Cubic")) + # Cubico
labs(title = "Comparison between different trasformations of Gestazione",
x = "Gestazione",
y = "Peso",
linetype="Line Relation") +
theme_minimal()
In order to evaluate these relations we update the models and compare each of them in an ANOVA.
# quadratic
mod.b_2 <- update(mod.b, ~. + I(Gestazione^2))
summary(mod.b_2)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Sesso + I(Gestazione^2), data = dati)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1144.11 -181.17 -13.16 165.73 2662.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4650.2268 898.1706 -5.177 2.43e-07 ***
## N.gravidanze 12.5660 4.3361 2.898 0.00379 **
## Gestazione -81.0486 49.7127 -1.630 0.10316
## Lunghezza 10.3534 0.3039 34.074 < 2e-16 ***
## Cranio 10.6363 0.4280 24.854 < 2e-16 ***
## SessoM 75.7900 11.2340 6.747 1.88e-11 ***
## I(Gestazione^2) 1.5136 0.6617 2.287 0.02226 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.4 on 2493 degrees of freedom
## Multiple R-squared: 0.7276, Adjusted R-squared: 0.7269
## F-statistic: 1110 on 6 and 2493 DF, p-value: < 2.2e-16
anova(mod.b, mod.b_2)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso +
## I(Gestazione^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2494 188065546
## 2 2493 187671672 1 393875 5.2322 0.02226 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# cubic
mod.b_3 <- update(mod.b, ~. + I(Gestazione^3))
summary(mod.b_3)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Sesso + I(Gestazione^3), data = dati)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1144.44 -180.81 -13.51 165.38 2660.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.394e+03 6.293e+02 -8.572 < 2e-16 ***
## N.gravidanze 1.255e+01 4.337e+00 2.893 0.00384 **
## Gestazione -2.245e+01 2.644e+01 -0.849 0.39584
## Lunghezza 1.035e+01 3.040e-01 34.033 < 2e-16 ***
## Cranio 1.063e+01 4.280e-01 24.831 < 2e-16 ***
## SessoM 7.595e+01 1.124e+01 6.759 1.72e-11 ***
## I(Gestazione^3) 1.294e-02 6.180e-03 2.094 0.03636 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.4 on 2493 degrees of freedom
## Multiple R-squared: 0.7275, Adjusted R-squared: 0.7268
## F-statistic: 1109 on 6 and 2493 DF, p-value: < 2.2e-16
anova(mod.b, mod.b_3)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso +
## I(Gestazione^3)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2494 188065546
## 2 2493 187735354 1 330192 4.3847 0.03636 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The ANOVA results are:
The non-linear relation with more significance is the quadratic one (with lower pvalue) and creates less multicollinearity between Gestazione and the non-linear component.
It’s performed a comparison between the quadratic model with and without Gestazione.
# only with Gestazione^2
mod.b_2_noges <- update(mod.b_2, ~. - Gestazione)
summary(mod.b_2_noges)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Lunghezza + Cranio + Sesso +
## I(Gestazione^2), data = dati)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1146.87 -181.09 -15.12 165.40 2643.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.101e+03 1.235e+02 -49.380 < 2e-16 ***
## N.gravidanze 1.255e+01 4.338e+00 2.894 0.00383 **
## Lunghezza 1.026e+01 2.987e-01 34.358 < 2e-16 ***
## Cranio 1.056e+01 4.255e-01 24.816 < 2e-16 ***
## SessoM 7.733e+01 1.120e+01 6.906 6.32e-12 ***
## I(Gestazione^2) 4.379e-01 5.053e-02 8.667 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.5 on 2494 degrees of freedom
## Multiple R-squared: 0.7273, Adjusted R-squared: 0.7267
## F-statistic: 1330 on 5 and 2494 DF, p-value: < 2.2e-16
anova(mod.b_2_noges, mod.b_2)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Lunghezza + Cranio + Sesso + I(Gestazione^2)
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Sesso +
## I(Gestazione^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2494 187871765
## 2 2493 187671672 1 200094 2.658 0.1032
ANOVA results:
mod <- mod.b_2_noges
The next step is to check the assumptions about the residuals (Normality, Homoscedasticity, Independence) and the Multicollinearity.
residuals_analysis <- function(mod){
plot(mod)
bp <- bptest(mod)
dw <- dwtest(mod)
shap <- shapiro.test(residuals(mod))
plot(density(residuals(mod)))
vif <- vif(mod)
print(bp)
print(dw)
print(shap)
print(vif)
}
residuals_analysis(mod)
##
## studentized Breusch-Pagan test
##
## data: mod
## BP = 90.858, df = 5, p-value < 2.2e-16
##
##
## Durbin-Watson test
##
## data: mod
## DW = 1.9529, p-value = 0.1191
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## Shapiro-Wilk normality test
##
## data: residuals(mod)
## W = 0.97407, p-value < 2.2e-16
##
## N.gravidanze Lunghezza Cranio Sesso I(Gestazione^2)
## 1.023558 2.049668 1.620750 1.040354 1.630173
The density plot and the qqplot show a normal-like curve, except for the tails. Probably due to outliers and points of leverage the normality is challenged.
There isn’t multicollinearity (VIF all under 5).
Observing the problem with the normality we’re going to analyze possible influential points (outliers, leverage).
cook.d <- cooks.distance(mod)
cook.limit <- 4/n
outliers <- outlierTest(mod)
lev <- hatvalues(mod)
lev.limit <- 2 * (length(coef(mod))) / n
dati[which(cook.d>0.5),]
## Anni.madre N.gravidanze Fumatrici Gestazione Peso Lunghezza Cranio
## 1551 35 1 0 38 4370 315 374
## Tipo.parto Ospedale Sesso
## 1551 Nat osp3 F
dati[which(lev>lev.limit),]
## Anni.madre N.gravidanze Fumatrici Gestazione Peso Lunghezza Cranio
## 13 36 5 0 38 3060 455 325
## 15 33 3 0 34 2400 470 298
## 34 27 0 0 39 3150 480 382
## 67 29 0 0 33 2400 450 320
## 89 36 8 0 39 3610 500 351
## 96 39 3 0 37 3640 510 376
## 101 31 0 0 34 1370 390 287
## 106 29 4 0 30 1340 400 273
## 131 30 0 0 34 2290 450 285
## 134 38 6 0 37 3950 500 350
## 151 20 0 0 41 2280 450 280
## 155 30 0 0 36 3610 410 330
## 161 35 9 0 42 3760 540 348
## 189 40 4 0 38 3350 505 320
## 190 26 2 0 39 4050 525 390
## 204 30 8 0 40 3850 518 340
## 205 45 2 0 38 3850 505 384
## 206 39 1 0 31 1500 405 295
## 220 23 1 0 40 3520 445 363
## 294 34 5 0 40 3100 500 330
## 305 41 2 0 33 2300 450 323
## 310 40 3 0 28 1560 420 379
## 312 26 1 0 32 1280 360 276
## 315 33 2 0 35 2910 450 355
## 378 27 0 0 28 1285 400 274
## 440 38 5 0 37 3290 495 336
## 442 35 6 1 38 2430 460 324
## 445 27 0 0 32 1550 410 289
## 486 22 0 0 33 2250 445 310
## 492 34 2 0 33 1410 380 295
## 497 35 5 0 37 2880 490 340
## 516 40 8 0 38 3520 470 341
## 582 30 7 0 35 2220 470 316
## 587 16 1 0 31 1900 440 300
## 592 30 1 0 32 2260 440 322
## 614 37 5 0 40 3700 500 360
## 638 25 0 0 33 1720 420 300
## 656 38 3 0 41 2320 450 330
## 657 32 3 0 41 2970 520 318
## 684 30 1 0 39 3000 475 390
## 697 30 0 0 39 2820 510 300
## 702 25 0 0 33 2220 450 320
## 729 30 3 0 39 3900 555 355
## 748 35 0 0 33 1390 390 277
## 750 24 0 0 35 1450 405 280
## 757 30 6 0 35 2680 450 322
## 765 26 1 0 33 1970 445 300
## 805 30 2 0 29 1190 360 272
## 828 32 6 0 40 4200 510 350
## 893 35 5 0 38 3100 495 335
## 895 30 1 0 34 2580 470 347
## 913 34 5 0 40 3060 485 335
## 928 25 0 0 28 830 310 254
## 946 36 5 0 35 2900 490 340
## 947 34 3 0 32 1615 390 297
## 949 24 3 0 42 3060 480 334
## 956 25 0 0 41 2210 430 310
## 985 24 5 0 42 3600 510 335
## 1008 32 5 0 36 2400 470 333
## 1014 17 0 0 37 2050 390 295
## 1049 35 5 0 40 3950 500 350
## 1067 26 3 0 31 1960 420 300
## 1091 30 1 0 33 1770 410 275
## 1106 46 5 0 36 2710 470 347
## 1130 33 11 0 43 3400 475 360
## 1166 36 3 0 39 2950 505 307
## 1181 30 1 0 36 4070 500 373
## 1188 21 0 0 40 4140 550 320
## 1200 21 0 0 40 3200 525 310
## 1219 38 12 0 39 3350 490 344
## 1238 19 0 0 33 2270 440 315
## 1248 26 1 0 30 1170 370 266
## 1273 32 1 0 33 2040 480 307
## 1291 39 5 0 37 3360 490 320
## 1293 30 3 0 38 4600 485 380
## 1311 40 6 0 34 1840 430 305
## 1321 36 6 0 40 4340 515 383
## 1325 36 5 0 38 3500 500 343
## 1356 32 1 0 40 4090 525 390
## 1357 22 0 0 32 2340 445 304
## 1385 33 0 0 29 1620 410 292
## 1395 30 2 0 39 3790 505 304
## 1400 22 0 0 34 2590 485 314
## 1411 32 6 0 39 3290 480 360
## 1420 32 1 0 38 3770 530 380
## 1428 30 1 0 36 1280 385 292
## 1429 24 4 0 29 1280 390 355
## 1450 36 8 0 41 3730 480 335
## 1505 30 8 0 39 2860 490 337
## 1551 35 1 0 38 4370 315 374
## 1553 30 4 0 35 4520 520 360
## 1556 37 0 0 41 2420 490 300
## 1560 17 0 0 41 2800 455 328
## 1573 34 4 0 40 4260 540 366
## 1593 41 3 0 35 1500 420 304
## 1606 28 0 0 41 3050 460 352
## 1610 37 3 0 33 2000 470 293
## 1617 28 4 0 41 3020 485 330
## 1619 31 0 0 31 990 340 278
## 1628 31 0 0 35 2120 410 312
## 1686 27 0 0 31 1800 430 308
## 1693 25 0 0 41 3100 500 305
## 1701 22 0 0 32 1430 380 301
## 1712 28 0 0 39 3800 520 300
## 1718 34 4 0 42 2660 500 320
## 1727 36 8 0 36 2860 460 334
## 1735 15 0 0 37 2750 440 345
## 1780 25 2 0 25 900 325 253
## 1781 35 9 0 37 3150 490 335
## 1809 35 0 0 32 1780 420 277
## 1827 32 1 0 33 2100 420 310
## 1868 29 0 0 40 3470 525 390
## 1892 33 5 0 38 3090 505 340
## 1962 38 4 0 38 4370 530 340
## 1967 40 5 0 40 3250 500 340
## 1977 39 4 0 34 2970 480 350
## 2037 42 3 0 37 4020 525 368
## 2040 27 1 0 38 3240 410 359
## 2046 35 5 1 40 4300 530 360
## 2086 26 8 0 40 3250 500 355
## 2089 32 1 1 33 1780 400 305
## 2098 44 5 0 38 3280 475 346
## 2114 36 0 0 31 1180 355 270
## 2115 35 1 0 32 1890 500 309
## 2120 32 0 0 27 1140 370 267
## 2140 30 2 0 33 1600 410 290
## 2146 36 5 0 39 3770 520 360
## 2148 33 6 0 37 2900 465 350
## 2149 39 3 0 30 1300 380 276
## 2157 30 5 0 41 3540 515 347
## 2175 37 8 0 28 930 355 235
## 2200 33 0 0 30 1750 410 294
## 2215 29 1 0 40 2440 465 298
## 2216 22 0 0 32 2580 470 330
## 2220 23 3 1 41 2620 475 313
## 2221 35 10 0 39 2950 495 335
## 2224 41 1 0 33 2000 425 312
## 2225 27 0 0 35 3140 465 290
## 2244 35 6 0 39 3300 500 350
## 2257 40 0 0 34 1580 400 300
## 2307 26 1 0 30 1170 370 273
## 2317 25 6 0 38 2530 460 340
## 2318 17 0 0 41 3100 500 307
## 2337 31 3 1 37 3440 460 362
## 2359 25 6 0 33 2230 430 313
## 2408 37 2 0 31 1690 405 290
## 2422 33 10 0 40 3090 485 353
## 2436 36 5 0 40 3800 510 346
## 2437 28 1 0 27 980 320 265
## 2452 28 0 0 26 930 345 245
## 2458 31 0 0 31 1730 430 300
## 2471 34 10 0 38 2880 470 345
## 2478 32 1 0 33 2740 475 324
## Tipo.parto Ospedale Sesso
## 13 Ces osp1 F
## 15 Ces osp3 M
## 34 Nat osp1 F
## 67 Nat osp2 M
## 89 Ces osp1 M
## 96 Ces osp2 M
## 101 Nat osp2 F
## 106 Ces osp1 M
## 131 Nat osp2 M
## 134 Nat osp3 M
## 151 Ces osp3 M
## 155 Nat osp1 M
## 161 Nat osp2 F
## 189 Ces osp3 M
## 190 Nat osp1 M
## 204 Nat osp3 F
## 205 Nat osp3 M
## 206 Nat osp3 M
## 220 Nat osp1 F
## 294 Ces osp2 F
## 305 Nat osp1 M
## 310 Nat osp3 F
## 312 Nat osp2 M
## 315 Nat osp3 F
## 378 Nat osp1 F
## 440 Nat osp1 F
## 442 Nat osp2 F
## 445 Nat osp1 F
## 486 Nat osp3 F
## 492 Nat osp2 F
## 497 Nat osp2 F
## 516 Nat osp3 M
## 582 Nat osp3 M
## 587 Nat osp2 F
## 592 Ces osp3 F
## 614 Ces osp2 M
## 638 Nat osp1 M
## 656 Nat osp2 F
## 657 Ces osp2 M
## 684 Ces osp2 F
## 697 Ces osp3 F
## 702 Nat osp1 F
## 729 Nat osp1 M
## 748 Nat osp1 F
## 750 Nat osp1 F
## 757 Nat osp1 F
## 765 Nat osp3 M
## 805 Nat osp2 F
## 828 Nat osp1 M
## 893 Ces osp3 F
## 895 Nat osp2 M
## 913 Ces osp2 F
## 928 Nat osp1 F
## 946 Nat osp3 M
## 947 Nat osp3 F
## 949 Nat osp1 F
## 956 Nat osp3 F
## 985 Ces osp3 M
## 1008 Nat osp1 F
## 1014 Nat osp2 F
## 1049 Nat osp3 M
## 1067 Nat osp2 F
## 1091 Nat osp3 M
## 1106 Nat osp2 M
## 1130 Nat osp1 M
## 1166 Nat osp1 F
## 1181 Nat osp2 M
## 1188 Ces osp1 M
## 1200 Nat osp1 F
## 1219 Nat osp2 M
## 1238 Nat osp1 M
## 1248 Nat osp2 M
## 1273 Ces osp1 F
## 1291 Nat osp2 M
## 1293 Nat osp1 M
## 1311 Nat osp1 F
## 1321 Nat osp1 M
## 1325 Nat osp1 M
## 1356 Ces osp3 F
## 1357 Nat osp1 F
## 1385 Nat osp3 F
## 1395 Ces osp3 M
## 1400 Nat osp2 M
## 1411 Ces osp2 M
## 1420 Nat osp3 F
## 1428 Nat osp2 F
## 1429 Nat osp1 F
## 1450 Nat osp3 M
## 1505 Ces osp2 F
## 1551 Nat osp3 F
## 1553 Nat osp2 F
## 1556 Ces osp1 M
## 1560 Nat osp3 M
## 1573 Nat osp3 F
## 1593 Nat osp1 M
## 1606 Nat osp3 F
## 1610 Ces osp1 F
## 1617 Nat osp1 F
## 1619 Ces osp2 F
## 1628 Nat osp1 F
## 1686 Ces osp3 M
## 1693 Nat osp2 F
## 1701 Nat osp1 M
## 1712 Nat osp3 F
## 1718 Nat osp2 F
## 1727 Nat osp2 F
## 1735 Nat osp2 M
## 1780 Nat osp3 F
## 1781 Nat osp2 M
## 1809 Ces osp1 F
## 1827 Nat osp1 M
## 1868 Nat osp1 M
## 1892 Ces osp1 F
## 1962 Nat osp3 F
## 1967 Ces osp2 F
## 1977 Ces osp2 F
## 2037 Ces osp1 M
## 2040 Ces osp1 F
## 2046 Nat osp3 M
## 2086 Nat osp2 M
## 2089 Ces osp1 F
## 2098 Ces osp1 F
## 2114 Nat osp3 F
## 2115 Nat osp2 F
## 2120 Nat osp3 F
## 2140 Ces osp1 F
## 2146 Nat osp3 F
## 2148 Ces osp2 F
## 2149 Nat osp1 M
## 2157 Nat osp1 F
## 2175 Nat osp1 F
## 2200 Nat osp2 M
## 2215 Nat osp2 F
## 2216 Nat osp1 F
## 2220 Nat osp3 M
## 2221 Nat osp1 F
## 2224 Ces osp3 M
## 2225 Nat osp2 F
## 2244 Nat osp3 M
## 2257 Nat osp2 F
## 2307 Nat osp3 M
## 2317 Nat osp1 F
## 2318 Ces osp1 M
## 2337 Ces osp1 M
## 2359 Nat osp3 F
## 2408 Nat osp2 M
## 2422 Nat osp3 M
## 2436 Nat osp3 M
## 2437 Nat osp1 M
## 2452 Ces osp3 F
## 2458 Nat osp3 F
## 2471 Ces osp2 M
## 2478 Ces osp2 F
outliers
## rstudent unadjusted p-value Bonferroni p
## 1551 10.071523 2.0566e-23 5.1414e-20
## 155 5.047936 4.7884e-07 1.1971e-03
## 1306 4.812882 1.5766e-06 3.9414e-03
subset(dati, Lunghezza<330 & Lunghezza>300)
## Anni.madre N.gravidanze Fumatrici Gestazione Peso Lunghezza Cranio
## 928 25 0 0 28 830 310 254
## 1551 35 1 0 38 4370 315 374
## 1780 25 2 0 25 900 325 253
## 2437 28 1 0 27 980 320 265
## Tipo.parto Ospedale Sesso
## 928 Nat osp1 F
## 1551 Nat osp3 F
## 1780 Nat osp3 F
## 2437 Nat osp1 M
Usually the limit value to consider an influential point with the cook’s distance is calculated as 4/n (126 points).
As we can see in the graph of residuals exist only an extreme point with a very high value of cook’s distance (>0.5) and is the record 1551. That newborn compared with others of similar length shows a huge weight, so we can conclude that this probably was an error of data entry.
The leverage points that are greater the usual limit of 2k/n are 153. None of the leverage points seem to have errors. Outliers are only 3.
In order to evaluate the distortion given from the extreme point (that is a leverage point and an outlier too) we’ll perform the model without that point and analyze the residuals.
mod.no.extreme <- update(mod, data = dati[-which(cook.d > 0.5), ])
summary(mod.no.extreme)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Lunghezza + Cranio + Sesso +
## I(Gestazione^2), data = dati[-which(cook.d > 0.5), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -1163.05 -179.04 -12.42 164.08 1415.24
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.152e+03 1.212e+02 -50.742 < 2e-16 ***
## N.gravidanze 1.326e+01 4.253e+00 3.117 0.00185 **
## Lunghezza 1.090e+01 2.996e-01 36.380 < 2e-16 ***
## Cranio 9.934e+00 4.218e-01 23.549 < 2e-16 ***
## SessoM 7.752e+01 1.098e+01 7.060 2.15e-12 ***
## I(Gestazione^2) 4.029e-01 4.966e-02 8.113 7.67e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 269.1 on 2493 degrees of freedom
## Multiple R-squared: 0.7375, Adjusted R-squared: 0.737
## F-statistic: 1401 on 5 and 2493 DF, p-value: < 2.2e-16
residuals_analysis(mod.no.extreme)
##
## studentized Breusch-Pagan test
##
## data: mod
## BP = 10.032, df = 5, p-value = 0.07435
##
##
## Durbin-Watson test
##
## data: mod
## DW = 1.9534, p-value = 0.1219
## alternative hypothesis: true autocorrelation is greater than 0
##
##
## Shapiro-Wilk normality test
##
## data: residuals(mod)
## W = 0.98898, p-value = 5.836e-13
##
## N.gravidanze Lunghezza Cranio Sesso I(Gestazione^2)
## 1.023833 2.105523 1.653933 1.039944 1.637994
In this case all the tests are better compared to the base model:
It’s made the last check on models quality comparing R2, R2 adjusted and RMSE.
rmse <- function(mod, data= NULL){
pred <- predict(mod, newdata = data)
rmse <- sqrt(mean((data$Peso - pred)^2))
return(rmse)
}
cat("Base model: RMSE= ", rmse(mod, dati), " R2= ", summary(mod)$r.squared," R2 adj= ", summary(mod)$adj.r.squared, "\n")
## Base model: RMSE= 274.1326 R2= 0.7272828 R2 adj= 0.726736
cat("No extr. model: RMSE= ", rmse(mod.no.extreme, dati[-which(cook.d > 0.5), ]), " R2= ", summary(mod.no.extreme)$r.squared, " R2 adj= ", summary(mod.no.extreme)$adj.r.squared)
## No extr. model: RMSE= 268.774 R2= 0.7374958 R2 adj= 0.7369693
The indicators of goodness between base model and the no extreme one put us in the position to make a clear choice. The model without the extreme point is better in all the indicators.
The 73.7% of the target variable variance is explained by the model selected.
mod <- mod.no.extreme
dati.mod <- dati[-which(cook.d > 0.5), ]
We are going to use the model to estimate the weight of a female newborn considering that the mother is at her third pregnancy and will give birth at the 39th week.
To perform the prediction we have to take some assumptions for the missing data of the case:
newdata <- data.frame(N.gravidanze=2, Gestazione=39, Lunghezza=mean(dati.mod$Lunghezza), Cranio=mean(dati.mod$Cranio), Sesso=factor("F"))
pred <- predict(mod, newdata = newdata, interval = "confidence", level = 0.95)
pred
## fit lwr upr
## 1 3257.761 3240.311 3275.211
The estimation of the baby girl mean weight is 3257 grams and with the 95% of possibility will be between 3240 and 3275 grams.
Obviously with all the data about length, head circumference the prediction could be more detailed and restricted in the values range.
Are shown below the relations between target variable (Peso) and the independent variables (Lunghezza, Cranio, Gestazione) and is plotted the regression line about the model. For the body measurements is highlighted the sex division.
attach(dati.mod)
ylab <- as.character("Weight of the newborn (g)")
col_scale <- scale_color_manual(values = c("M" = "blue", "F" = "red"))
p1 <- ggplot()+
geom_point(aes(x = Gestazione, y = Peso, col = Sesso), alpha = 0.5, position = "jitter")+
geom_smooth(aes(x = Gestazione[Sesso == "M"], y = Peso[Sesso == "M"]),
col = "black", lwd = 1.5, method = "lm", formula = y ~ x + I(x^2), se = F)+
geom_smooth(aes(x = Gestazione[Sesso == "F"], y = Peso[Sesso == "F"]),
col = "black", lwd = 1.5, method = "lm", formula = y ~ x + I(x^2), se = F)+
geom_smooth(aes(x = Gestazione, y = Peso, col = Sesso),
method = "lm", formula = y ~ x + I(x^3), se = F)+
col_scale +
labs(x = "Week of pregnancy",
y = ylab)+
theme_bw()+
theme(legend.position = "none")
p2 <- ggplot()+
geom_boxplot(aes(x = as.factor(N.gravidanze), y = Peso))+
geom_smooth(aes(x = 1+N.gravidanze, y = Peso),
lwd = 1.25, method = "lm", se = F)+
labs(x = "N° of previous pregnancies",
y = ylab)+
theme_bw()
p3 <- ggplot()+
geom_point(aes(x = Lunghezza, y = Peso, col = Sesso), alpha = 0.25, position = "jitter")+
geom_smooth(aes(x = Lunghezza[Sesso == "M"], y = Peso[Sesso == "M"]),
col = 1, lwd = 1.5, method = "lm", se = F)+
geom_smooth(aes(x = Lunghezza[Sesso == "F"], y = Peso[Sesso == "F"]),
col = 1, lwd = 1.5, method = "lm", se = F)+
geom_smooth(aes(x = Lunghezza, y = Peso, col = Sesso), method = "lm", se = F)+
col_scale+
scale_x_continuous(breaks = seq(300, 550, 50))+
scale_y_continuous(breaks = seq(1000, 5000, 500))+
labs(x = "Length of the newborn (mm)",
y = ylab)+
theme_bw()+
theme(legend.position = "none")
p4 <- ggplot()+
geom_point(aes(x = Cranio, y = Peso, col = Sesso), alpha = 0.4, position = "jitter")+
geom_smooth(aes(x = Cranio[Sesso == "M"], y = Peso[Sesso == "M"]),
col = 1, lwd = 1.5, method = "lm", se = F)+
geom_smooth(aes(x = Cranio[Sesso == "F"], y = Peso[Sesso == "F"]),
col = 1, lwd = 1.5, method = "lm", se = F)+
geom_smooth(aes(x = Cranio, y = Peso, col = Sesso), method = "lm", se = F)+
col_scale+
scale_x_continuous(breaks = seq(260, 380, 40))+
scale_y_continuous(breaks = seq(1000, 5000, 500))+
labs(x = "Head circumference of the newborn (mm)",
y = ylab)+
theme_bw()+
theme(legend.position = "none")
p1
p2
## `geom_smooth()` using formula = 'y ~ x'
p3
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
p4
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
The model has a good prediction value (R2=73,7%) but it needs to be perfected in order to give to the health system operators a reliable tool.
Obviously the amount of data has to be increased to be able to find other patterns and discover if are significant those that were excluded in this case.
More complex is a model, higher are the chance to describe in the best way the real word; so it’s possible that the Linear Regression couldn’t be the best solution to predict the newborns weight.