install.packages(“sandwich”) install.packages(“lmtest”) install.packages(“dplyr”) install.packages(“ggplot2”) library(sandwich) library(lmtest) library(ggplot2) library(dplyr)
#1.Data Collection and Dataset Structure
#upload database dati<-read.csv(“neonati.csv”)
attach(dati) summary(dati) n<-nrow(dati)
moments::skewness(Peso) #negatively skewed distribution: higher frequency of values below the average compared to a symmetrical #distribution
moments::kurtosis(Peso)-3 #leptokurtic distribution: dataset may contain more outliers or extreme values than a normal distribution.
shapiro.test(Peso )#the data do not follow a normal distribution.
#relationship between the weight of newborns and variables of the
dataset panel.cor <- function(x, y, digits = 2, prefix = ““, cex.cor,
…) { usr <- par(”usr”); on.exit(par(usr)) par(usr = c(0, 1, 0, 1)) r
<- cor(x, y, use = “complete.obs”) txt <- format(c(r, 1), digits =
digits)[1] txt <- paste0(prefix, txt) if(missing(cex.cor)) cex.cor
<- 0.8/strwidth(txt) text(0.5, 0.5, txt, cex = cex.cor) } x11()
pairs(dati[sapply(dati, is.numeric)], lower.panel=panel.cor,
upper.panel=panel.smooth)
#moderate positive correlation between “Gestazione” e “Peso”. A scatterplot that is similar to a logarithmic function.
par(mfrow=c(1,2)) boxplot(Peso) boxplot(Peso~Sesso) boxplot(Peso~Fumatrici)
wilcox.test(Peso ~ Fumatrici, data = dati) #verify whether the observed average difference is statistically significant. #There is a tendency towards a difference, but it is not strong enough to be confirmed with the sample available. #Use the Wilcoxon test instead of the t-test because the variable ‘peso’ does not follow a normal distribution.
mean(Peso[Fumatrici==0]) mean(Peso[Fumatrici==1]) #The average weight for children of non-smoking mothers is slightly higher (3286 g) than for children of smoking mothers (3236 g), #indicating a possible difference, albeit small.
#The presence of numerous outliers could indicate significant weight variability within the groups.
#2.Analysis and Modelling
#2.1Preliminary Analysis
#2.1.1 Analysis of the relationship between ‘peso’ and variables (“Lungghezza”, “Peso” and “Cranio”). par(mfrow=c(1,3)) boxplot(Peso~Sesso) boxplot(Lunghezza~Sesso) boxplot(Cranio~Sesso) wilcox.test(Peso ~ Sesso, data = dati) wilcox.test(Lunghezza ~ Sesso, data = dati) wilcox.test(Cranio ~ Sesso, data = dati)
#The difference in the weight, length and dimensions of the skulls of males and females is statistically significant.
#2.1.2 In some hospitals, more caesarean sections are performed. #H0: the proportion of caesarean sections is the same in all hospitals. tabella <- table(Ospedale, Tipo.parto) print(tabella) test_chi <- chisq.test(tabella) print(test_chi) #There is insufficient statistical evidence to claim that the proportion of caesarean sections differs between the three hospitals #in the sample.
#2.1.3 The average weight and length of the sample of newborns are significantly equal to those of the population. #From OMS(Organizzazione Mondiale della Sanità), mean weight population newborns mu_m_weight <- 3.3464 #kg mu_f_weight <- 3.2322 #Kg mu_m_length <- 498.842 #millimeters mu_f_length <- 491.477 #millimeters
dati_m <- subset(dati, Sesso == “M”) dati_f <- subset(dati, Sesso == “F”)
weight_male_newborns_sample <- mean(dati_m\(Peso) weight_female_newborns_sample <- mean(dati_f\)Peso)
#Normal test for variable ‘peso’ divided by variable ‘sesso’. shapiro.test(dati_m\(Peso) shapiro.test(dati_f\)Peso) #the data do not follow a normal distribution.
#H0: the median weight of males in the sample is equal to 3.3464 kg wilcox.test(dati_m$Peso, mu_m_weight) #p_value>0.05: I can’t refuse H0
#H0: the median weight of females in the sample is equal to 3.2322 kg wilcox.test(dati_f$Peso, mu_f_weight) #p_value>0.05: I can’t refuse H0
#H0: the median length of males in the sample is equal to 498.842 mm wilcox.test(dati_f$Lunghezza, mu_m_length) #p_value>0.05: I can’t refuse H0
#H0: the median weight of females in the sample is equal to 491.477 mm wilcox.test(dati_f$Lunghezza, mu_f_length) #p_value>0.05: I can’t refuse H0
#In light of the results obtained: the average weight and length of this sample of newborns are significantly equal to those of the population.
#2.2 Creation of the Regression Model
mod<-lm(Peso~.,data=dati) summary(mod)
mod1 <- lm(Peso ~ . - Anni.madre - N.gravidanze, data = dati) summary(mod1)
mod2 <- lm(Peso ~ Gestazione + Lunghezza + Cranio + Sesso, data = dati) summary(mod2)
full_model <- lm(Peso ~ (Gestazione + Lunghezza + Cranio + Sesso + Fumatrici + Tipo.parto)^2, data = dati) mod3 <- step(full_model, direction = “both”) summary(mod3)
#2.3 Selection of the Optimal Model BIC(mod,mod2,mod1,mod3) #step_model could be the best model
#2.4 Model Quality Analysis par(mfrow=c(2,2)) plot(mod3) #1) It seems that the homoscedasticity assumption is being respected. Observations are distributed in a random way around the #line located at value 0. #2) Most of the points follow the diagonal line in the Q-Q plot, it means that the model residuals are approximately normal. #3) Even with the Scale-Location, the model seems to respect the homoscedasticity assumption. #4) Most of the residuals are randomly distributed around zero, suggesting a good overall fit.
lmtest::bptest(mod3) #p-value<0.5: the model exhibits heteroscedasticity lmtest::dwtest(mod3) #there is no evidence of positive autocorrelation in the residuals. shapiro.test(mod3$residuals) #the residues do not follow a normal distribution. plot(density(residuals(mod3)))
#condition of residuals aren’t respected. #log trasformation dati\(logPeso <- log(dati\)Peso) full_model_log <- lm(logPeso ~ (Gestazione + Lunghezza + Cranio + Sesso + Fumatrici + Tipo.parto)^2, data = dati) mod4 <- step(full_model_log, direction = “both”) summary(mod4)
#residue analysis lmtest::bptest(mod4) lmtest::dwtest(mod4) shapiro.test(mod4$residuals) plot(density(residuals(mod4)))
#Calculation of robust standard errors of the “HC1” type robust_vcov <- vcovHC(mod4, type = “HC1”) coeftest(mod4, vcov = robust_vcov)
#new model without the variability “cranio” with a p-value= 0.94 mod5 <- update(mod4, . ~ . - Cranio) summary(mod5) lmtest::bptest(mod5) lmtest::dwtest(mod5) shapiro.test(mod5$residuals) robust_vcov <- vcovHC(mod5, type = “HC1”) coeftest(mod5, vcov = robust_vcov)
BIC(mod,mod1,mod2,mod3,mod4,mod5)# mod4 has a lower BIC
#I have chosen model 4 because it applies a log transformation, addressing the issue that the residual assumptions of the first #four models were not met. Although heteroscedasticity and non-normality of residuals remain issues in model 4, it yields better #overall performance, particularly evidenced by a higher adjusted R-squared and a lower BIC.
#3 Forecasts and Results #Since the number of pregnancies was not significant in the model, our prediction of neonatal weight is based on other relevant #variables included in the model, such as length, sex and gestation duration.
new_case <- data.frame( Gestazione = 39, Lunghezza = mean(dati_f\(Lunghezza), Cranio = mean(dati_f\)Cranio), Sesso = “F”, Fumatrici = 0, Tipo.parto = “Nat” )
predicted_logPeso <- predict(mod4, newdata = new_case) predicted_Peso <- exp(predicted_logPeso) predicted_Peso #3154.35 #the prediction is 3154.35 g
#4Views
#dataframe with the varabiles: “Gestazione” in range (37,42) and “Fumatrici” (0,1) data <- expand.grid( Gestazione = seq(37, 42, by = 1), Fumatrici = c(0, 1), Sesso = c(“F”,“M”) ) #costant variables data\(Lunghezza <- mean(dati_f\)Lunghezza) data\(Cranio <- mean(dati_f\)Cranio) data$Tipo.parto <- “Nat”
predicted_logPeso_view <- predict(mod4, newdata = data) predicted_Peso_view <- exp(predicted_logPeso_view) data$Peso_predetto <- predicted_Peso_view predicted_Peso_view
data\(Fumatrici <- factor(data\)Fumatrici, levels = c(0,1), labels = c(“Non Fumatore”, “Fumatore”))
ggplot(data, aes(x = Gestazione, y = Peso_predetto, color = Fumatrici, linetype = Sesso)) + geom_line(size = 1) + labs( title = “Impatto di Gestazione, Fumo e Sesso sul Peso Predetto”, x = “Settimane di Gestazione”, y = “Peso Predetto (grammi)”, color = “Fumatrici”, linetype = “Sesso” ) + theme_minimal()
#The results show how the Weight variable is influenced by Gender and Smoking. The graphic representation is a line showing the #predicted weight as a function of weeks of gestation.
#It can be seen that the predicted weight for male newborns is generally higher than that for females, whether the mother is a #smoker or not.
#An interesting aspect is the different trend in predicted weight in relation to maternal smoking. If the mother is a non-smoker, #weight increases as the weeks of gestation progress. Conversely, if the mother is a smoker, the model shows a tendency for #weight to decrease as gestation progresses.
#These results suggest an interaction between maternal smoking and gestation in determining birth weight, with a negative effect #of smoking that could attenuate or reverse the expected weight gain as gestation progresses.
#Dataset and Model Limitations #1) The dataset used is limited in size, which may reduce the statistical power of some tests and limit the generalisability of the #results.
#2) Preliminary analyses showed that the response variable Weight does not follow a normal distribution and exhibits outliers. #Although a logarithmic transformation was adopted to mitigate this effect, the normality and heteroscedasticity of the residuals #were not completely resolved, which may affect the validity of the confidence intervals and statistical tests associated with #the model.