Tractament inicial de les dades
#Dades
#library(Ecdat)
data("OFP")
dades <- OFP
#Mirem si hi ha NAs
colMeans(apply(dades, 2, is.na))
#VARIABLE RESPOSTA
dades$visit<-as.factor(ifelse(OFP$ofp < 9, "poc", "molt"))
dades$visit<-relevel(dades$visit, ref="poc")
#MODIFIQUEM VARIABLES
#Limitacions de la vida diària com a factor
dades$adldiff <- as.factor(ifelse(OFP$adldiff == 1, "yes", "no"))
#multiplicacio de l'edat per 10
dades$age <- OFP$age*10
# Creem nova variable a euros anuals (ingresos familiars)
dades$ingeur <- OFP$faminc * 0.92 * 10000
# Traiem els casos de renda negativa a la bbdd
dades <- dades[!dades$faminc<0,]
# nombre d'hospitalitzacions a categòrica
dades$hospCat <- as.factor(ifelse(dades$hosp == 0, "no", "yes"))
#Malalties cròniques
dades$cronic[dades$numchron == 0] <- "0" # 0 = no enfermetats croniques
dades$cronic[dades$numchron == 1] <- "1" # 1 = una enfermetat cronica
dades$cronic[dades$numchron >= 2] <- "2" # 2 = dos o m©s enfermetats croniques
dades$cronic <- as.factor(dades$cronic)
Taula 1: Descriptiva univariant i bivariant
#Variables categòriques
descriptiva_cat<-function(x, y){
# noms<-paste0(c(deparse(substitute(x)), levels(x)))
noms<-(c(deparse(substitute(x)), levels(x)))
#DESCRIPTIVA
p<-round(as.vector(prop.table(table(x))*100),2)
n<-as.vector(table(x))
t<-table(x, y)
tp<-round(prop.table(t, 1)*100, 2)
#PVALOR
chi <- chisq.test(y, x)
chisq_param <- round(chi$p.value,4)
CHI<-c(rep(NA, nlevels(x)))
#OR
mod<-glm(y~x, family="binomial")
coefs<-summary(mod)$coefficients
coefs<-matrix(coefs[-(rownames(coefs)=="(Intercept)"),], nrow=(nlevels(x)-1), byrow=FALSE)
OR<-c("Ref", round(exp(coefs[,1]),2))
#NAGELKERKE
nagel<-nagelkerke(mod) #r quadrat tot
rquaa<-round(nagel$Pseudo.R.squared.for.model.vs.null[3,],4)
RQ<-c(rep(NA, nlevels(x)))
taula1<-matrix( c(paste0(n, " ", "(",p,"%",")"),
paste0(t, " ", "(", tp, "%", ")"),
CHI,
RQ,
OR),
byrow=FALSE, nrow=nlevels(x))
# a<-as.vector(rep(NA, 3), rquaa, NA)
a<-matrix(c(NA, NA, NA, chisq_param, rquaa, NA), nrow=1, byrow=TRUE)
taula<-rbind(data.frame(a), data.frame(taula1))
row.names(taula)<-noms
return(taula)
}
#Variables contínues
descriptiva_cont<-function(x,y){
nom<-deparse(substitute(x))
#DESCRIPTIVA
x<-x[!is.na(x)]
n<-round(mean(x),2)
p<-round(sd(x),2)
#TAULA
m1<-round(mean(x[y==levels(y)[1]]),2)
m2<-round(mean(x[y==levels(y)[2]]),2)
sd1<-round(sd(x[y==levels(y)[1]]),2)
sd2<-round(sd(x[y==levels(y)[2]]),2)
#PVALOR
ttest <- t.test(x ~ y)
ttest_para<-round(ttest$p.value,4)
#NAGELKERKE
mod<-glm(y~x, family="binomial")
nagel<-nagelkerke(mod) #r quadrat tot
rquaa<-round(nagel$Pseudo.R.squared.for.model.vs.null[3,],4)
#OR
coefs<-summary(mod)$coefficients
coefs<-matrix(coefs[-(rownames(coefs)=="(Intercept)"),], nrow=1, byrow=FALSE)
OR<-round(exp(coefs[,1]),2)
#taula
taula<-data.frame(paste0(n, " ", "(", p, ")"),
paste0(m1, " ", "(", sd1, ")"),
paste0(m2, " ", "(", sd2, ")"),
ttest_para,
rquaa,
OR)
colnames(taula)<-c("X1", "X2", "X3", "X4", "X5", "X6")
row.names(taula)<-nom
return(taula)
}
#Destria si són categòriques o contínues
cat_or_cont<-function(x, y){
n<-ifelse(length(table(x))< 6, 1, 2)
switch(n, descriptiva_cat(x,y), descriptiva_cont(x,y))
}
dades_taula1<-subset(dades, select=c(adldiff, black, sex, maried, employed, privins,
medicaid, region, hlth, hospCat, cronic, age, school, emr, ingeur))
taula1<-do.call(rbind,lapply(dades_taula1, FUN=cat_or_cont, y=dades$visit) )
ttaula1<-xtable(taula1)
Taula 2: Estimació dels paràmetres del model final
#Ajuntem region midwest amb region other perquè no són significatives
dades$region[dades$region=="midwest"] <- "other"
mod_inter <- glm(visit ~ cronic + privins + hospCat + hlth + sex + region
+ emr + school + hospCat*hlth, data = dades, family = "binomial")
smi<-summary(mod_inter)
z<-qnorm(0.975,mean=0,sd=1)
INF<-exp(smi$coefficients[,"Estimate"]-z*smi$coefficients[,"Std. Error"])
SUP<-exp(smi$coefficients[,"Estimate"]+z*smi$coefficients[,"Std. Error"])
taula2<-cbind(smi$coefficients[,"Estimate"],
exp(smi$coefficients[,"Estimate"]),
INF, SUP,
smi$coefficients[,"Pr(>|z|)"])
ttaula2<-xtable(taula2,
caption="Estimació dels paràmetres del model final",
label="t2"
)
colnames(ttaula2)<-c("\beta", "OR", "Inferior", "Superior", "p-valor")
Gràfic
dades$ofp_f <- dades$ofp
ofp_f <- dades$ofp_f[dades$ofp_f <= 30]
ofp_f <- as.factor(ofp_f)
rep <- data.frame( table(ofp_f) )
colnames(rep) <- c("Var1", "Freq")
ggplot(data = rep, aes(x = Var1, y = Freq)) +
theme_light() +
geom_bar(stat = "identity", position = position_dodge() ) +
geom_text(aes(label = Freq), position = position_dodge(1), vjust=-0.3 ) +
theme( plot.caption = element_text(hjust = 0)) +
xlab("Nombre de visites al consultori mèdic") +
ylab("Freqüència absoluta") +
ylim(0, 700) +
annotate("text", x=25, y=550, label= "Mitjana: 5.8", size = 4.5) +
annotate("text", x=25, y=500, label= "Mediana: 4", size = 4.5) +
annotate("text", x=25, y=450, label= "Percentil 75: 8", size = 4.5)
