We import our dataset using the package foreign to read a native file of SPSS .sav:
setwd ("D:/Users/Carlos/Desktop/Ejercicios R")
dataset <- foreign::read.spss('percepcion_patrimonio_cultural_2022.sav', to.data.frame=T)
we call the packages that we’re going to use most often:
library (dplyr)
library(summarytools)
We extract the variables of interest in a separate dataset:
F1 (sex), F2R (ordinal age), P20 (net household income). P19_EDUCACTION (human capital), P1.4 (reading books in leisure time, dichotomous)
d <- dataset %>% select(F1,F2R,P20,P19_EDUCATION,P1.4)
We recoded the variable P19_EDUCATION to make it more manageable for our objectives. our objectives:
d <- d %>%
mutate(
P19_EDUCATION = case_when(
P19_EDUCATION == "Sin estudios (Estudios primarios sin terminar)" ~ "Estudios primarios o inferiores",
P19_EDUCATION == "Primer Grado (Certificado escolar, EGB 1ª etapa, más o menos 10 años)" ~ "Estudios primarios o inferiores",
P19_EDUCATION == "Segundo Grado. 1er Ciclo (Graduado escolar, o EGB 2ª etapa, 1º y 2º ESO-1er ciclo- hasta 14 años)" ~ "Estudios secundarios",
P19_EDUCATION == "Segundo Grado. 2º Ciclo (FP Iº y IIº, Bachiller superior, BUP, 3º y 4º de ESO (2º ciclo) COU, PREU, 1º y 2º Bachillerato" ~ "Estudios secundarios",
P19_EDUCATION == "Licenciatura, Grado. 2º Ciclo (Universitarios, Licenciados superior, Facultades, Escuelas técnicas superiores, etc" ~ "Estudios superiores",
P19_EDUCATION == "Tercer Grado. 1er Ciclo (Equivalente a Ingeniero técnico, 3 años, Escuelas universitarias, Ingenieros técnicos, Arquitec" ~ "Estudios superiores",
P19_EDUCATION == "Tercer Grado (Máster)" ~ "Estudios superiores",
P19_EDUCATION == "Tercer grado (Doctorado)" ~ "Estudios superiores"
)
)
Since the methodology stipulates that the survey is for age >=18 we eliminated the 0-17 category:
d$F2R<- droplevels(d$F2R[d$F2R!= "0_17"])
freq.edad <- freq(d$F2R, report.nas = FALSE)
We tabulated and requested the frequencies of our variables:
freq.sex <- freq(d$F1, report.nas = FALSE)
freq.edad <- freq(d$F2R, report.nas = FALSE)
freq.ingresos <- freq(d$P20, report.nas = FALSE)
freq.estudios <- freq(d$P19_EDUCATION, report.nas = FALSE)
freq.libros <- freq(d$P1.4, report.nas = FALSE)
freq.sex
## Frequencies
## d$F1
## Type: Factor
##
## Freq % % Cum.
## ------------ ------ -------- --------
## Hombre 846 46.97 46.97
## Mujer 955 53.03 100.00
## Total 1801 100.00 100.00
freq.edad
## Frequencies
## d$F2R
## Type: Factor
##
## Freq % % Cum.
## ----------- ------ -------- --------
## 18_24 168 9.33 9.33
## 25_34 263 14.60 23.93
## 35_44 349 19.38 43.31
## 45_54 338 18.77 62.08
## 55_64 272 15.10 77.18
## 65_+ 411 22.82 100.00
## Total 1801 100.00 100.00
freq.ingresos
## Frequencies
## d$P20
## Type: Factor
##
## Freq % % Cum.
## --------------------------------------------- ------ -------- --------
## Muy superiores (más del doble) 366 20.32 20.32
## Superiores 686 38.09 58.41
## Alrededor de esa cifra 530 29.43 87.84
## Inferiores 156 8.66 96.50
## Bastante inferiores (menos de la mitad) 63 3.50 100.00
## Total 1801 100.00 100.00
freq.estudios
## Frequencies
## d$P19_EDUCATION
## Type: Character
##
## Freq % % Cum.
## ------------------------------------- ------ -------- --------
## Estudios primarios o inferiores 104 5.77 5.77
## Estudios secundarios 836 46.42 52.19
## Estudios superiores 861 47.81 100.00
## Total 1801 100.00 100.00
freq.libros
## Frequencies
## d$P1.4
## Type: Factor
##
## Freq % % Cum.
## ----------- ------ -------- --------
## 0 727 40.37 40.37
## 1 1074 59.63 100.00
## Total 1801 100.00 100.00
We note that the modal categories of our sample are female, 65 and over, income over €1,100, higher education and leisure readers.
In order to perform the linear probability model we intend, we must first prepare the variables to be introduced the variables to be introduced must first be prepared
d$P1.4 <- as.numeric(as.character(d$P1.4))
datos_dummy <- fastDummies::dummy_cols(d, select_columns = c("F1", "F2R", "P20", "P19_EDUCATION"))
We check if there is multicollinearity in the model we are going to propose by performing a LPM (linear probability model).
d$P1.4 <- as.numeric(d$P1.4)
LPM <- datos_dummy %>% lm(P1.4 ~F1_Mujer+F2R_25_34+F2R_35_44+F2R_45_54+F2R_55_64+
`F2R_65_+`+P20_Superiores+`P20_Alrededor de esa cifra`+
P20_Inferiores+`P20_Bastante inferiores (menos de la mitad)`
+`P19_EDUCATION_Estudios secundarios`+`P19_EDUCATION_Estudios superiores`,data=.)
VIF <-car::vif (LPM)
VIF
## F1_Mujer
## 1.102258
## F2R_25_34
## 2.233041
## F2R_35_44
## 2.538948
## F2R_45_54
## 2.531650
## F2R_55_64
## 2.366305
## `F2R_65_+`
## 2.854206
## P20_Superiores
## 1.851573
## `P20_Alrededor de esa cifra`
## 1.918387
## P20_Inferiores
## 1.453162
## `P20_Bastante inferiores (menos de la mitad)`
## 1.178135
## `P19_EDUCATION_Estudios secundarios`
## 5.004381
## `P19_EDUCATION_Estudios superiores`
## 5.386090
We note that the variable studies (P19) introduces collinearity to our model so we run an alternative LPM without it:
LPM2 <- datos_dummy %>% lm(P1.4 ~F1_Mujer+F2R_25_34+F2R_35_44+F2R_45_54+F2R_55_64+
`F2R_65_+`+P20_Superiores+`P20_Alrededor de esa cifra`+
P20_Inferiores+`P20_Bastante inferiores (menos de la mitad)`
,data=.)
VIF2 <-car::vif (LPM2)
VIF2
## F1_Mujer
## 1.100520
## F2R_25_34
## 2.219042
## F2R_35_44
## 2.532336
## F2R_45_54
## 2.492293
## F2R_55_64
## 2.295845
## `F2R_65_+`
## 2.813669
## P20_Superiores
## 1.808465
## `P20_Alrededor de esa cifra`
## 1.796853
## P20_Inferiores
## 1.355013
## `P20_Bastante inferiores (menos de la mitad)`
## 1.143551
Binary logistic regression:
logit <- d %>% glm(P1.4 ~F1+F2R+P20,data=.,family=binomial())
logit
##
## Call: glm(formula = P1.4 ~ F1 + F2R + P20, family = binomial(), data = .)
##
## Coefficients:
## (Intercept)
## 0.42023
## F1Mujer
## 0.65898
## F2R25_34
## -0.03973
## F2R35_44
## -0.05772
## F2R45_54
## -0.16567
## F2R55_64
## 0.00124
## F2R65_+
## 0.41085
## P20Superiores
## -0.32793
## P20Alrededor de esa cifra
## -0.54027
## P20Inferiores
## -0.93684
## P20Bastante inferiores (menos de la mitad)
## -1.25515
##
## Degrees of Freedom: 1800 Total (i.e. Null); 1790 Residual
## Null Deviance: 2429
## Residual Deviance: 2348 AIC: 2370
summary(logit)
##
## Call:
## glm(formula = P1.4 ~ F1 + F2R + P20, family = binomial(), data = .)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8403 -1.2174 0.8087 0.9810 1.6209
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.42023 0.20541 2.046 0.040779
## F1Mujer 0.65898 0.10406 6.333 2.41e-10
## F2R25_34 -0.03973 0.20754 -0.191 0.848179
## F2R35_44 -0.05772 0.19804 -0.291 0.770706
## F2R45_54 -0.16567 0.19790 -0.837 0.402523
## F2R55_64 0.00124 0.20746 0.006 0.995233
## F2R65_+ 0.41085 0.19926 2.062 0.039221
## P20Superiores -0.32793 0.13914 -2.357 0.018427
## P20Alrededor de esa cifra -0.54027 0.14679 -3.681 0.000233
## P20Inferiores -0.93683 0.20271 -4.622 3.81e-06
## P20Bastante inferiores (menos de la mitad) -1.25515 0.28781 -4.361 1.29e-05
##
## (Intercept) *
## F1Mujer ***
## F2R25_34
## F2R35_44
## F2R45_54
## F2R55_64
## F2R65_+ *
## P20Superiores *
## P20Alrededor de esa cifra ***
## P20Inferiores ***
## P20Bastante inferiores (menos de la mitad) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2429.4 on 1800 degrees of freedom
## Residual deviance: 2347.6 on 1790 degrees of freedom
## AIC: 2369.6
##
## Number of Fisher Scoring iterations: 4
confint(logit,level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 0.02020745 0.82625041
## F1Mujer 0.45576765 0.86377928
## F2R25_34 -0.44837866 0.36600126
## F2R35_44 -0.44830162 0.32883512
## F2R45_54 -0.55602086 0.22055547
## F2R55_64 -0.40721474 0.40683383
## F2R65_+ 0.01842373 0.80037620
## P20Superiores -0.60241951 -0.05670596
## P20Alrededor de esa cifra -0.82972571 -0.25402108
## P20Inferiores -1.33597095 -0.54059105
## P20Bastante inferiores (menos de la mitad) -1.83044438 -0.69799816
exp <- exp(coefficients(logit))
exp
## (Intercept)
## 1.5223122
## F1Mujer
## 1.9328144
## F2R25_34
## 0.9610468
## F2R35_44
## 0.9439154
## F2R45_54
## 0.8473298
## F2R55_64
## 1.0012403
## F2R65_+
## 1.5080996
## P20Superiores
## 0.7204123
## P20Alrededor de esa cifra
## 0.5825933
## P20Inferiores
## 0.3918660
## P20Bastante inferiores (menos de la mitad)
## 0.2850323
We check the residuals of our model and plot them:
resid_pearson <- residuals(logit, type = "pearson")
resid_desviacion <- residuals(logit, type = "deviance")
plot(resid_pearson, main="Residuos de Pearson", ylab="Residuos", xlab="Índice", pch=16)
abline(h=0, col="red", lwd=2)
plot(resid_desviacion, main="Residuos de desviación", ylab="Residuos", xlab="Índice", pch=16)
abline(h=0, col="red", lwd=2)
We check the goodness of our fit:
anova(logit,test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: P1.4
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 1800 2429.4
## F1 1 25.546 1799 2403.9 4.320e-07 ***
## F2R 5 20.057 1794 2383.8 0.001219 **
## P20 4 36.262 1790 2347.6 2.555e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
HLtest <- ResourceSelection::hoslem.test(logit$y,fitted(logit))
HLtest
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: logit$y, fitted(logit)
## X-squared = 5.0787, df = 8, p-value = 0.7491
We evaluate the Nagelkerke pseudo R-squared:
logit1 <- glm(d$P1.4 ~d$F1+d$F2R+d$P20,data=d,family=binomial())
nagelkerke <- rcompanion::nagelkerke(logit1)
rm(logit1)
nagelkerke
## $Models
##
## Model: "glm, d$P1.4 ~ d$F1 + d$F2R + d$P20, binomial(), d"
## Null: "glm, d$P1.4 ~ 1, binomial(), d"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.0336972
## Cox and Snell (ML) 0.0444379
## Nagelkerke (Cragg and Uhler) 0.0600119
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -10 -40.933 81.865 2.1612e-13
##
## $Number.of.observations
##
## Model: 1801
## Null: 1801
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
We created a dataset with ficticious cases to compute the probability of reading books controlled by our variables by changing only one variable from the modal profile above to see the change that occurs when all other variables are held constant:
perfiles.ficticios <- data.frame(
F1 = c(rep("Mujer", 11)),
F2R = c("65_+", "55_64", "45_54", "35_44", "25_34", "18_24",
rep("65_+", 5)),
P20 = c("Superiores", "Superiores", "Superiores", "Superiores", "Superiores", "Superiores",
"Muy superiores (más del doble)", "Superiores", "Alrededor de esa cifra", "Inferiores",
"Bastante inferiores (menos de la mitad)"),
stringsAsFactors = FALSE
)
We observe the change in the probability of the fictitious cases:
perfiles.ficticios$prob <- predict.glm(logit,newdata = perfiles.ficticios,type="response")
tail(perfiles.ficticios,11)
## F1 F2R P20 prob
## 1 Mujer 65_+ Superiores 0.7617189
## 2 Mujer 55_64 Superiores 0.6797266
## 3 Mujer 45_54 Superiores 0.6423574
## 4 Mujer 35_44 Superiores 0.6667578
## 5 Mujer 25_34 Superiores 0.6707422
## 6 Mujer 18_24 Superiores 0.6794567
## 7 Mujer 65_+ Muy superiores (más del doble) 0.8160870
## 8 Mujer 65_+ Superiores 0.7617189
## 9 Mujer 65_+ Alrededor de esa cifra 0.7210733
## 10 Mujer 65_+ Inferiores 0.6348829
## 11 Mujer 65_+ Bastante inferiores (menos de la mitad) 0.5584577
Conclusions:
Our model explains just the 6% of the vairability (0-1) of the event read books in the free time of the interviewed.
Checking the Hosmer Lemeshow test we can say that the model is good fitted. The Pearson’s residuasls are between 0’75 and -0’75 and the deviance residuals are around 1 and 1’25 and the threshold of the 2 index are 3 and -3 to make innacurate the model.
Being woman enhance the relative risk of reading books (controlled by the grouped age and the family income) in a ~93’28% compared wit the man.
Having a family income above 1100 € net per month decrease the relative risk of reading books (controlled by the grouped age and sex) in a ~28% compared to income well above 1100 €.
Having a family income around 1100 € net per month decrease the relative risk of reading books (controlled by the grouped age and sex) in a ~41’75% compared to income well above 1100 €.
Having a family income lower than 1100 € net per month decrease the relative risk of reading books (controlled by the grouped age and sex) in a ~60% compared to income well above 1100 €.
Having a family income significantly lower than 1100 € net per month decrease the relative risk of reading books (controlled by the grouped age and sex) in a ~71,50% compared to income well above 1100 €.
Being 65 years old or more enhace the relative risk of reading books (controlled by sex and the family income) in a 50’8% compared to the individuals who be 18-24 years old.