library(knitr)
library(dplyr)
##
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
library(moments)
library(ggplot2)
library(kableExtra)
## Warning: il pacchetto 'kableExtra' è stato creato con R versione 4.5.2
##
## Caricamento pacchetto: 'kableExtra'
## Il seguente oggetto è mascherato da 'package:dplyr':
##
## group_rows
dati<-read.csv("neonati_1.csv",fileEncoding = "UTF-8",stringsAsFactors = T,sep=";")
Load of the data
summary(dati)
## Anni.madre N.gravidanze Fumatrici Gestazione
## Min. : 0.00 Min. : 0.0000 Min. :0.0000 Min. :25.00
## 1st Qu.:25.00 1st Qu.: 0.0000 1st Qu.:0.0000 1st Qu.:38.00
## Median :28.00 Median : 1.0000 Median :0.0000 Median :39.00
## Mean :28.16 Mean : 0.9812 Mean :0.0416 Mean :38.98
## 3rd Qu.:32.00 3rd Qu.: 1.0000 3rd Qu.:0.0000 3rd Qu.:40.00
## Max. :46.00 Max. :12.0000 Max. :1.0000 Max. :43.00
## Peso Lunghezza Cranio Tipo.parto Ospedale Sesso
## Min. : 830 Min. :310.0 Min. :235 Ces: 728 osp1:816 F:1256
## 1st Qu.:2990 1st Qu.:480.0 1st Qu.:330 Nat:1772 osp2:849 M:1244
## Median :3300 Median :500.0 Median :340 osp3:835
## Mean :3284 Mean :494.7 Mean :340
## 3rd Qu.:3620 3rd Qu.:510.0 3rd Qu.:350
## Max. :4930 Max. :565.0 Max. :390
attach(dati)
Overview of the data
str(dati)
## 'data.frame': 2500 obs. of 10 variables:
## $ Anni.madre : int 26 21 34 28 20 32 26 25 22 23 ...
## $ N.gravidanze: int 0 2 3 1 0 0 1 0 1 0 ...
## $ Fumatrici : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Gestazione : int 42 39 38 41 38 40 39 40 40 41 ...
## $ Peso : int 3380 3150 3640 3690 3700 3200 3100 3580 3670 3700 ...
## $ Lunghezza : int 490 490 500 515 480 495 480 510 500 510 ...
## $ Cranio : int 325 345 375 365 335 340 345 349 335 362 ...
## $ Tipo.parto : Factor w/ 2 levels "Ces","Nat": 2 2 2 2 2 2 2 2 1 1 ...
## $ Ospedale : Factor w/ 3 levels "osp1","osp2",..: 3 1 2 2 3 2 3 1 2 2 ...
## $ Sesso : Factor w/ 2 levels "F","M": 2 1 2 2 1 1 1 2 1 1 ...
Here presented the variables in the dataset.
tab <- table(dati$Tipo.parto, dati$Ospedale)
tab
##
## osp1 osp2 osp3
## Ces 242 254 232
## Nat 574 595 603
prop_ces <- prop.table(tab, margin = 2)["Ces", ]
prop_ces
## osp1 osp2 osp3
## 0.2965686 0.2991755 0.2778443
In hospital nr 2 there are more cesarean operation, calculated as number of cesarean operation over the total amount of birth per hospital.
mu_peso <- sum(dati$Peso)/length(dati$Peso)
t.test(dati$Peso, mu = mu_peso)
##
## One Sample t-test
##
## data: dati$Peso
## t = 0, df = 2499, p-value = 1
## alternative hypothesis: true mean is not equal to 3284.081
## 95 percent confidence interval:
## 3263.490 3304.672
## sample estimates:
## mean of x
## 3284.081
mu_lunghezza <- sum(dati$Lunghezza)/length(dati$Lunghezza)
t.test(dati$Lunghezza, mu = mu_lunghezza)
##
## One Sample t-test
##
## data: dati$Lunghezza
## t = 0, df = 2499, p-value = 1
## alternative hypothesis: true mean is not equal to 494.692
## 95 percent confidence interval:
## 493.6598 495.7242
## sample estimates:
## mean of x
## 494.692
In both cases (Peso and Lughezza) the mean of the sample is not different from the mean of the population because p-value > 0.05 on a t test.
summary_table <- dati %>%
group_by(Sesso) %>%
summarise(avg_Peso = mean(Peso, na.rm = TRUE),
avg_Lunghezza = mean(Lunghezza, na.rm = TRUE),
avg_Cranio = mean(Cranio, na.rm = TRUE))
kable(summary_table)%>%
kable_styling() %>%
column_spec(1, color = "blue")
| Sesso | avg_Peso | avg_Lunghezza | avg_Cranio |
|---|---|---|---|
| F | 3161.132 | 489.7643 | 337.6330 |
| M | 3408.215 | 499.6672 | 342.4486 |
t.test(Peso ~ Sesso, data = dati)
##
## Welch Two Sample t-test
##
## data: Peso by Sesso
## t = -12.106, df = 2490.7, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -287.1051 -207.0615
## sample estimates:
## mean in group F mean in group M
## 3161.132 3408.215
t.test(Lunghezza ~ Sesso, data = dati)
##
## Welch Two Sample t-test
##
## data: Lunghezza by Sesso
## t = -9.582, df = 2459.3, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -11.929470 -7.876273
## sample estimates:
## mean in group F mean in group M
## 489.7643 499.6672
t.test(Cranio ~ Sesso, data = dati)
##
## Welch Two Sample t-test
##
## data: Cranio by Sesso
## t = -7.4102, df = 2491.4, p-value = 1.718e-13
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -6.089912 -3.541270
## sample estimates:
## mean in group F mean in group M
## 337.6330 342.4486
Male newborn present significantly more weight, length and cranium dimensions.
moments::skewness(dati$Peso)
## [1] -0.6470308
moments::kurtosis(dati$Peso)-3
## [1] 2.031532
shapiro.test(dati$Peso)
##
## Shapiro-Wilk normality test
##
## data: dati$Peso
## W = 0.97066, p-value < 2.2e-16
moments::skewness(dati$Lunghezza)
## [1] -1.514699
moments::kurtosis(dati$Lunghezza)-3
## [1] 6.487174
shapiro.test(dati$Lunghezza)
##
## Shapiro-Wilk normality test
##
## data: dati$Lunghezza
## W = 0.90941, p-value < 2.2e-16
plot(density(dati$Peso))
plot(density(dati$Lunghezza))
Both Peso and Lunghezza present a density distribution which is similar to a bell shape. Both are also positively skewed.
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...)
{
usr <- par("usr"); on.exit(par(usr = usr))
par(usr = c(0, 1, 0, 1))
r <- (cor(x, y))
txt <- format(c(r, 1), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = 1)
}
#correlazioni
pairs(dati,lower.panel=panel.cor, upper.panel=panel.smooth)
High positive correlation: gestation period vs weight, gestation period vs lenght, gestation period vs skull circumference, weight vs length, skull circumference vs weight, skull circumference vs lenght.
mod<-lm(Peso~Anni.madre+N.gravidanze+Fumatrici+Gestazione+Lunghezza+Cranio+Tipo.parto+Ospedale+Sesso)
summary(mod)
##
## Call:
## lm(formula = Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione +
## Lunghezza + Cranio + Tipo.parto + Ospedale + Sesso)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1124.40 -181.66 -14.42 160.91 2611.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6738.4762 141.3087 -47.686 < 2e-16 ***
## Anni.madre 0.8921 1.1323 0.788 0.4308
## N.gravidanze 11.2665 4.6608 2.417 0.0157 *
## Fumatrici -30.1631 27.5386 -1.095 0.2735
## Gestazione 32.5696 3.8187 8.529 < 2e-16 ***
## Lunghezza 10.2945 0.3007 34.236 < 2e-16 ***
## Cranio 10.4707 0.4260 24.578 < 2e-16 ***
## Tipo.partoNat 29.5254 12.0844 2.443 0.0146 *
## Ospedaleosp2 -11.2095 13.4379 -0.834 0.4043
## Ospedaleosp3 28.0958 13.4957 2.082 0.0375 *
## SessoM 77.5409 11.1776 6.937 5.08e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 273.9 on 2489 degrees of freedom
## Multiple R-squared: 0.7289, Adjusted R-squared: 0.7278
## F-statistic: 669.2 on 10 and 2489 DF, p-value: < 2.2e-16
The mod1 is statistically significant because the p value is < 2.2e-16. About 73% of the variability in birth rate (R^2) is explained by the predictors. Among the coefficient the most effective are: gestazione, lunghezza, cranio and sesso. Less effective but still relevant: n. Gravidanze and tipo.parto.
mod2<-lm(Peso~N.gravidanze+Gestazione+Lunghezza+Cranio+Tipo.parto+Sesso)
summary(mod2)
##
## Call:
## lm(formula = Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio +
## Tipo.parto + Sesso)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1129.31 -181.70 -16.31 161.07 2638.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6707.2971 135.9911 -49.322 < 2e-16 ***
## N.gravidanze 12.7558 4.3366 2.941 0.0033 **
## Gestazione 32.2713 3.7941 8.506 < 2e-16 ***
## Lunghezza 10.2864 0.3007 34.207 < 2e-16 ***
## Cranio 10.5057 0.4260 24.659 < 2e-16 ***
## Tipo.partoNat 30.0342 12.0969 2.483 0.0131 *
## SessoM 77.9285 11.1905 6.964 4.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 274.3 on 2493 degrees of freedom
## Multiple R-squared: 0.7277, Adjusted R-squared: 0.727
## F-statistic: 1110 on 6 and 2493 DF, p-value: < 2.2e-16
By removing ospedale, fumatrice and anni.madre, we do not obtain an improvement in residual error and p-value at overall level but another two factors: n. gravidanze and tipo parto show an improvement in the significance. R^2 and residual standard error are more or less identical.
mod3<-lm(Peso~Gestazione+Lunghezza+Cranio+Sesso)
summary(mod3)
##
## Call:
## lm(formula = Peso ~ Gestazione + Lunghezza + Cranio + Sesso)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1138.2 -184.3 -17.6 163.3 2627.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6651.1188 135.5172 -49.080 < 2e-16 ***
## Gestazione 31.2737 3.7856 8.261 2.31e-16 ***
## Lunghezza 10.2054 0.3007 33.939 < 2e-16 ***
## Cranio 10.6704 0.4245 25.139 < 2e-16 ***
## SessoM 79.1049 11.2117 7.056 2.22e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 275 on 2495 degrees of freedom
## Multiple R-squared: 0.7261, Adjusted R-squared: 0.7257
## F-statistic: 1654 on 4 and 2495 DF, p-value: < 2.2e-16
By removing also n.gravidanze and tipo.parto overall results are almost identical.
anova(mod2,mod)
## Analysis of Variance Table
##
## Model 1: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Sesso
## Model 2: Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza +
## Cranio + Tipo.parto + Ospedale + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2493 187601677
## 2 2489 186762521 4 839155 2.7959 0.02479 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod3,mod2)
## Analysis of Variance Table
##
## Model 1: Peso ~ Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ N.gravidanze + Gestazione + Lunghezza + Cranio + Tipo.parto +
## Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2495 188688687
## 2 2493 187601677 2 1087011 7.2225 0.0007453 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(mod3,mod)
## Analysis of Variance Table
##
## Model 1: Peso ~ Gestazione + Lunghezza + Cranio + Sesso
## Model 2: Peso ~ Anni.madre + N.gravidanze + Fumatrici + Gestazione + Lunghezza +
## Cranio + Tipo.parto + Ospedale + Sesso
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 2495 188688687
## 2 2489 186762521 6 1926166 4.2784 0.0002689 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
With the ANOVA test we test whether the more complex model is able to explain better the variability. In this case the best model by p-value is model 3.
BIC(mod,mod2)
## df BIC
## mod 12 35241.84
## mod2 8 35221.75
BIC(mod2,mod3)
## df BIC
## mod2 8 35221.75
## mod3 6 35220.54
BIC(mod,mod3)
## df BIC
## mod 12 35241.84
## mod3 6 35220.54
BIC presents a compromise between adaptability of the model and complexity. Also in this case the best model is the mod3.
car::vif(mod)
## GVIF Df GVIF^(1/(2*Df))
## Anni.madre 1.187454 1 1.089704
## N.gravidanze 1.186428 1 1.089233
## Fumatrici 1.007392 1 1.003689
## Gestazione 1.695810 1 1.302233
## Lunghezza 2.085755 1 1.444214
## Cranio 1.630796 1 1.277026
## Tipo.parto 1.004242 1 1.002119
## Ospedale 1.004071 2 1.001016
## Sesso 1.040643 1 1.020119
car::vif(mod2)
## N.gravidanze Gestazione Lunghezza Cranio Tipo.parto Sesso
## 1.024171 1.669258 2.080039 1.626199 1.003438 1.040060
car::vif(mod3)
## Gestazione Lunghezza Cranio Sesso
## 1.653502 2.069517 1.606131 1.038813
Vif explains the multicollinearity between variables. In this case the level is pretty modest in all 3 models.
AIC(mod,mod2,mod3)
## df AIC
## mod 12 35171.95
## mod2 8 35175.16
## mod3 6 35185.60
AIC compares models adaptability to data penalizing complexity. In this case mod1 is the best. However, we will consider the BIC test and the previous ones to select the definitive one because more rigid than AIC.
The model selected is mod3.
par(mfrow=c(2,2))
plot(mod3)
From the residuals vs fitted we see that the number of outliers is very small: 1 to 5 maximum values. The shape of the representation do not lead to any pattern. In Q-Q residuals representation residuals follow the diagonal, there is only 1 outlier that affect the whole scheme. Scale-location shows variance pretty much costant except one value. In the residuals vs leverage we can see there is a point with high leverage.
lmtest::bptest(mod3)
##
## studentized Breusch-Pagan test
##
## data: mod3
## BP = 89.148, df = 4, p-value < 2.2e-16
lmtest::dwtest(mod3)
##
## Durbin-Watson test
##
## data: mod3
## DW = 1.9557, p-value = 0.1337
## alternative hypothesis: true autocorrelation is greater than 0
shapiro.test(mod3$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod3$residuals
## W = 0.9742, p-value < 2.2e-16
plot(density(residuals(mod3)))
Variance is not constant among the residuals and no autocorrelation
among residuals. Shapiro Wilk is confirming that the residuals have a
normal distribution.
plot(rstudent(mod3))
abline(h=c(-2,2))
car::outlierTest(mod3)
## rstudent unadjusted p-value Bonferroni p
## 1551 9.986149 4.7193e-23 1.1798e-19
## 155 4.951276 7.8654e-07 1.9663e-03
## 1306 4.781188 1.8440e-06 4.6100e-03
Most of the residuals are between -2 < rstudent < +2 Also with these test the value on row 1551 is an outlier.
cook<-cooks.distance(mod3)
plot(cook,ylim = c(0,1))
Analyzing the Cook distance the same value (1551) appear as outlier. All the residual analysis is highlighting that specific outlier so we decided to regenerate the model and perform the analysis without it.
lev <- hatvalues(mod3)
plot(lev)
soglia <- 9*mean(lev) # regola comune
abline(h=soglia, col="red")
lev[lev > soglia]
## 310 928 1429 1551 1780 2120 2175
## 0.02875776 0.02209535 0.02103704 0.04852182 0.02552886 0.01800254 0.02116734
## 2437 2452
## 0.02375701 0.02322740
Here we identify the value, number -1551, as a specific outlier.
dati_clean <- dati[-1551, ]
We exclude it from the dataset.
mod4<-lm(dati_clean$Peso~dati_clean$Gestazione+dati_clean$Lunghezza+dati_clean$Cranio+dati_clean$Sesso)
summary(mod4)
##
## Call:
## lm(formula = dati_clean$Peso ~ dati_clean$Gestazione + dati_clean$Lunghezza +
## dati_clean$Cranio + dati_clean$Sesso)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1153.79 -181.99 -14.94 163.85 1391.36
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6651.7241 132.9131 -50.046 < 2e-16 ***
## dati_clean$Gestazione 28.4861 3.7233 7.651 2.83e-14 ***
## dati_clean$Lunghezza 10.8440 0.3018 35.935 < 2e-16 ***
## dati_clean$Cranio 10.0591 0.4208 23.906 < 2e-16 ***
## dati_clean$SessoM 79.3075 10.9963 7.212 7.27e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 269.7 on 2494 degrees of freedom
## Multiple R-squared: 0.7362, Adjusted R-squared: 0.7358
## F-statistic: 1740 on 4 and 2494 DF, p-value: < 2.2e-16
This is the adjusted model, mod4. It does not present differences with mod3 in variables p-values and adjusted R^2.
par(mfrow=c(2,2))
plot(mod4)
Here the reperforming analysis of the residuals. We do not see other relevant outliers to be excluded and all the results are still consistent and acceptable.
lmtest::bptest(mod4)
##
## studentized Breusch-Pagan test
##
## data: mod4
## BP = 9.3632, df = 4, p-value = 0.05263
lmtest::dwtest(mod4)
##
## Durbin-Watson test
##
## data: mod4
## DW = 1.956, p-value = 0.1354
## alternative hypothesis: true autocorrelation is greater than 0
shapiro.test(mod4$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod4$residuals
## W = 0.98864, p-value = 3.278e-13
plot(density(residuals(mod4)))
By running further tests on the residuals we can see the both mod3 and mod4(adjusted for the outlier) do not present any exception on density, Shapiro Wilk and other tests.
#leverage
lev<-hatvalues(mod4)
plot(lev)
p<-sum(lev)
n<-length(lev)
soglia=2*p/n
abline(h=soglia,col=2)
lev[lev>soglia]
## 15 34 42 61 67 96
## 0.006216127 0.006419346 0.004284156 0.004751731 0.005345377 0.004810057
## 101 106 117 131 151 155
## 0.007251251 0.012815138 0.004747775 0.007009499 0.010847654 0.006897851
## 190 205 206 220 249 295
## 0.005309175 0.005344517 0.009412881 0.007718992 0.004716504 0.004011599
## 304 305 310 312 315 328
## 0.004585991 0.005382998 0.029002218 0.013237384 0.005449381 0.004022997
## 348 378 383 445 471 486
## 0.004334722 0.015378831 0.004433232 0.007097064 0.004450677 0.004741196
## 492 540 565 587 592 601
## 0.008301360 0.004066992 0.004671172 0.008390672 0.006318340 0.004053859
## 615 629 638 656 666 684
## 0.004684496 0.004047512 0.006224667 0.004853782 0.004482028 0.009021694
## 697 701 702 726 729 748
## 0.005959882 0.004032325 0.004719964 0.004115623 0.004106355 0.008258852
## 750 765 805 821 895 928
## 0.006728478 0.006059381 0.014347455 0.004058874 0.005211017 0.022273516
## 947 956 964 968 991 1014
## 0.007871923 0.007792368 0.004781050 0.004224510 0.004261335 0.008390845
## 1067 1072 1091 1096 1130 1157
## 0.007869602 0.004048867 0.008928154 0.004295875 0.006750051 0.004146814
## 1166 1181 1188 1200 1238 1248
## 0.004110134 0.005598732 0.006668017 0.005601912 0.005376885 0.014587832
## 1273 1275 1283 1293 1294 1323
## 0.007175323 0.004022293 0.004067765 0.005670664 0.004939173 0.004106355
## 1356 1357 1361 1385 1395 1400
## 0.005317649 0.006536670 0.004206499 0.012017646 0.004750654 0.005629558
## 1402 1420 1428 1429 1552 1555
## 0.004987887 0.005130893 0.008347152 0.021373260 0.006837072 0.005898733
## 1559 1592 1605 1609 1618 1627
## 0.004671854 0.004888954 0.004928537 0.007873201 0.014722961 0.004764679
## 1633 1685 1691 1692 1700 1704
## 0.004670587 0.008715904 0.004308371 0.005085978 0.010329336 0.004002764
## 1711 1734 1779 1801 1805 1808
## 0.007159551 0.004524722 0.025570593 0.004165349 0.004214850 0.008400142
## 1826 1857 1867 1892 1910 1919
## 0.006034318 0.004329163 0.004777518 0.004272483 0.004625073 0.004239332
## 1976 2036 2039 2065 2088 2113
## 0.005385205 0.004240886 0.011996929 0.004150851 0.006329055 0.012939287
## 2114 2119 2139 2148 2174 2199
## 0.011990671 0.018003618 0.006099280 0.013047765 0.021170010 0.011030766
## 2214 2215 2219 2223 2224 2256
## 0.004834379 0.007558794 0.004024902 0.005798946 0.005472383 0.005842930
## 2306 2317 2336 2358 2390 2407
## 0.013927622 0.004810336 0.004833496 0.004762420 0.004454478 0.009636146
## 2432 2436 2451 2457 2475 2477
## 0.004087473 0.023909862 0.023228278 0.008004959 0.004173925 0.005767626
Now there are not values significantly above the thresold line.
plot(rstudent(mod4))
abline(h=c(-2,2))
car::outlierTest(mod4)
## rstudent unadjusted p-value Bonferroni p
## 155 5.203448 2.1156e-07 0.00052869
## 1306 4.892231 1.0605e-06 0.00265030
## 1399 -4.297520 1.7935e-05 0.04481900
cook<-cooks.distance(mod4)
plot(cook,ylim = c(0,1))
Results are more consistent with rstudent and Cook’s distance.
nuovo <- data.frame(
Anni.madre = mean(dati$Anni.madre, na.rm = TRUE),
N.gravidanze = 3,
Fumatrici = 0,
Gestazione = 39,
Lunghezza = mean(dati$Lunghezza, na.rm = TRUE),
Cranio = mean(dati$Cranio, na.rm = TRUE),
Tipo.parto = "Nat",
Ospedale = "osp1",
Sesso = "F"
)
predict(mod3, newdata = nuovo)
## 1
## 3245.331
Considering the similar results in terms of prediction we test mod3 in a specific case.
ggplot(data=dati)+
geom_point(aes(x=Gestazione,
y=Peso,
col=Sesso),position = "jitter")+
geom_smooth(aes(x=Gestazione,
y=Peso,
col=Sesso),se=F,method = "lm", formula = y ~ x)
The chart shows how both sesso e gestazione are affecting the weight of
the newborn. Males weight more and a major number of weeks in Gestazione
potentially determine a major weight.
ggplot(data=dati)+
geom_point(aes(x=Gestazione,
y=Peso,
col=Lunghezza),position = "jitter")+
geom_smooth(aes(x=Gestazione,
y=Peso,
col=Lunghezza),method = "lm", formula = y ~ x)
## Warning: The following aesthetics were dropped during statistical transformation:
## colour.
## ℹ This can happen when ggplot fails to infer the correct grouping structure in
## the data.
## ℹ Did you forget to specify a `group` aesthetic or to convert a numerical
## variable into a factor?
Moving now to the visual study of Peso, Gestazione and Lunghezza we can see that the Peso is proportional also with Gestazione and Lunghezza.
ggplot(data=dati)+
geom_point(aes(x=Gestazione,
y=Peso,
col=Fumatrici),position = "jitter")
The number of Fumatrici is low compared with the number of Non
Fumatrici, this does not provide a sample big enough to be studied.
However we can see that the Fumatrici variable does not seem to
influence Peso and Gestazione.
ggplot(dati, aes(x = Gestazione, y = Peso)) +
geom_point(aes(color = Sesso), alpha = 0.6) +
geom_smooth(aes(color = Sesso), method = "lm") +
facet_wrap(~ Fumatrici)
## `geom_smooth()` using formula = 'y ~ x'
This chart confirms the previous consideration.
Conclusions: - the dataset available presents the following variables: Anni.madre: int N.gravidanze: int Fumatrici: int Gestazione: int Peso: int Lunghezza : int Cranio: int Tipo.parto: Factor 2 options Ospedale: Factor 3 options Sesso: Factor 2 options