## This is lavaan 0.6-19
## lavaan is FREE software! Please report any bugs.
data <- read.csv("data.csv",sep = ";")
head(data, 5)
## school sex age address famsize Pstatus Medu Fedu Mjob Fjob reason
## 1 GP F 18 U GT3 A 4 4 at_home teacher course
## 2 GP F 17 U GT3 T 1 1 at_home other course
## 3 GP F 15 U LE3 T 1 1 at_home other other
## 4 GP F 15 U GT3 T 4 2 health services home
## 5 GP F 16 U GT3 T 3 3 other other home
## guardian traveltime studytime failures schoolsup famsup paid activities
## 1 mother 2 2 0 yes no no no
## 2 father 1 2 0 no yes no no
## 3 mother 1 2 3 yes no yes no
## 4 mother 1 3 0 no yes yes yes
## 5 father 1 2 0 no yes yes no
## nursery higher internet romantic famrel freetime goout Dalc Walc health
## 1 yes yes no no 4 3 4 1 1 3
## 2 no yes yes no 5 3 3 1 1 3
## 3 yes yes yes no 4 3 2 2 3 3
## 4 yes yes yes yes 3 2 2 1 1 5
## 5 yes yes no no 4 3 2 1 2 5
## absences G1 G2 G3
## 1 6 5 6 6
## 2 4 5 5 6
## 3 10 7 8 10
## 4 2 15 14 15
## 5 4 6 10 10
school - student’s school (binary: “GP” - Gabriel
Pereira or “MS” - Mousinho da Silveira)
sex -
student’s sex (binary: “F” - female or “M” - male)
age - student’s age (numeric: from 15 to 22)
address - student’s home address type (binary: “U” -
urban or “R” - rural)
famsize - family size
(binary: “LE3” - less or equal to 3 or “GT3” - greater than 3)
Pstatus - parent’s cohabitation status (binary: “T” -
living together or “A” - apart)
Medu - mother’s
education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th
to 9th grade, 3 – secondary education or 4 – higher education)
Fedu - father’s education (numeric: 0 - none, 1 -
primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary
education or 4 – higher education)
Mjob - mother’s
job (nominal: “teacher”, “health” care related, civil “services”
(e.g. administrative or police), “at_home” or “other”)
Fjob - father’s job (nominal: “teacher”, “health” care
related, civil “services” (e.g. administrative or police), “at_home” or
“other”)
reason - reason to choose this school
(nominal: close to “home”, school “reputation”, “course” preference or
“other”)
guardian - student’s guardian (nominal:
“mother”, “father” or “other”)
traveltime - home
to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 -
30 min. to 1 hour, or 4 - >1 hour)
studytime -
weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to
10 hours, or 4 - >10 hours)
failures - number
of past class failures (numeric: n if 1<=n<3, else 4)
schoolsup - extra educational support (binary: yes or
no)
** famsup** - family educational support (binary: yes or no)
paid - extra paid classes within the course
subject (Math or Portuguese) (binary: yes or no)
activities - extra-curricular activities (binary: yes
or no)
nursery - attended nursery school (binary:
yes or no)
higher - wants to take higher education
(binary: yes or no)
internet - Internet access at
home (binary: yes or no)
romantic - with a
romantic relationship (binary: yes or no)
famrel -
quality of family relationships (numeric: from 1 - very bad to 5 -
excellent)
freetime - free time after school
(numeric: from 1 - very low to 5 - very high)
goout - going out with friends (numeric: from 1 - very
low to 5 - very high)
Dalc - workday alcohol
consumption (numeric: from 1 - very low to 5 - very high)
Walc - weekend alcohol consumption (numeric: from 1 -
very low to 5 - very high)
health - current health
status (numeric: from 1 - very bad to 5 - very good)
absences - number of school absences (numeric: from 0
to 93)
G1 - first period grade (numeric: from 0 to 20)
G2 - second period grade (numeric: from 0 to 20)
G3 - final grade (numeric: from 0 to 20, output target)
lets check for missing values
colSums(is.na(data))
## school sex age address famsize Pstatus Medu
## 0 0 0 0 0 0 0
## Fedu Mjob Fjob reason guardian traveltime studytime
## 0 0 0 0 0 0 0
## failures schoolsup famsup paid activities nursery higher
## 0 0 0 0 0 0 0
## internet romantic famrel freetime goout Dalc Walc
## 0 0 0 0 0 0 0
## health absences G1 G2 G3
## 0 0 0 0 0
No missing values in our dataset.
lets build the CFA model.
CFAmodel <- 'Parents_Demo=~ Medu+Fedu +Mjob+Fjob
Support_Act=~ schoolsup+famsup+paid+activities
Personal_life=~romantic+famrel+freetime+goout'
CFAmodel.fitted <- cfa(model = CFAmodel, data = data, std.lv = TRUE,
missing = "fiml", mimic = "Mplus")
summary(CFAmodel.fitted)
## lavaan 0.6-19 ended normally after 48 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 39
##
## Number of observations 395
## Number of missing patterns 1
##
## Model Test User Model:
##
## Test statistic 68.453
## Degrees of freedom 51
## P-value (Chi-square) 0.052
##
## Parameter Estimates:
##
## Standard errors Standard
## Information Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## Parents_Demo =~
## Medu 1.063 0.067 15.873 0.000
## Fedu 0.696 0.060 11.656 0.000
## Mjob 0.571 0.063 9.010 0.000
## Fjob 0.147 0.048 3.041 0.002
## Support_Act =~
## schoolsup -0.028 0.026 -1.086 0.278
## famsup -0.305 0.070 -4.385 0.000
## paid -0.234 0.055 -4.243 0.000
## activities -0.011 0.036 -0.306 0.759
## Personal_life =~
## romantic 0.009 0.031 0.275 0.783
## famrel -0.179 0.069 -2.592 0.010
## freetime -0.738 0.232 -3.180 0.001
## goout -0.428 0.143 -2.993 0.003
##
## Covariances:
## Estimate Std.Err z-value P(>|z|)
## Parents_Demo ~~
## Support_Act -0.315 0.082 -3.839 0.000
## Personal_life -0.057 0.073 -0.777 0.437
## Support_Act ~~
## Personal_life -0.044 0.099 -0.439 0.661
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|)
## .Medu 2.749 0.055 49.977 0.000
## .Fedu 2.522 0.055 46.111 0.000
## .Mjob 3.170 0.062 51.386 0.000
## .Fjob 3.281 0.043 75.609 0.000
## .schoolsup 1.129 0.017 66.922 0.000
## .famsup 1.613 0.025 65.794 0.000
## .paid 1.458 0.025 58.167 0.000
## .activities 1.509 0.025 59.985 0.000
## .romantic 1.334 0.024 56.214 0.000
## .famrel 3.944 0.045 87.537 0.000
## .freetime 3.235 0.050 64.458 0.000
## .goout 3.109 0.056 55.571 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .Medu 0.065 0.114 0.568 0.570
## .Fedu 0.696 0.069 10.086 0.000
## .Mjob 1.177 0.088 13.312 0.000
## .Fjob 0.722 0.052 13.941 0.000
## .schoolsup 0.112 0.008 13.912 0.000
## .famsup 0.144 0.042 3.456 0.001
## .paid 0.194 0.027 7.151 0.000
## .activities 0.250 0.018 14.046 0.000
## .romantic 0.222 0.016 14.050 0.000
## .famrel 0.770 0.058 13.295 0.000
## .freetime 0.450 0.339 1.328 0.184
## .goout 1.053 0.136 7.751 0.000
## Parents_Demo 1.000
## Support_Act 1.000
## Personal_life 1.000
lets calculate the standardized estimates.
summary(CFAmodel.fitted, standardized = TRUE)
## lavaan 0.6-19 ended normally after 48 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 39
##
## Number of observations 395
## Number of missing patterns 1
##
## Model Test User Model:
##
## Test statistic 68.453
## Degrees of freedom 51
## P-value (Chi-square) 0.052
##
## Parameter Estimates:
##
## Standard errors Standard
## Information Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Parents_Demo =~
## Medu 1.063 0.067 15.873 0.000 1.063 0.972
## Fedu 0.696 0.060 11.656 0.000 0.696 0.641
## Mjob 0.571 0.063 9.010 0.000 0.571 0.465
## Fjob 0.147 0.048 3.041 0.002 0.147 0.171
## Support_Act =~
## schoolsup -0.028 0.026 -1.086 0.278 -0.028 -0.085
## famsup -0.305 0.070 -4.385 0.000 -0.305 -0.627
## paid -0.234 0.055 -4.243 0.000 -0.234 -0.469
## activities -0.011 0.036 -0.306 0.759 -0.011 -0.022
## Personal_life =~
## romantic 0.009 0.031 0.275 0.783 0.009 0.018
## famrel -0.179 0.069 -2.592 0.010 -0.179 -0.200
## freetime -0.738 0.232 -3.180 0.001 -0.738 -0.740
## goout -0.428 0.143 -2.993 0.003 -0.428 -0.385
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Parents_Demo ~~
## Support_Act -0.315 0.082 -3.839 0.000 -0.315 -0.315
## Personal_life -0.057 0.073 -0.777 0.437 -0.057 -0.057
## Support_Act ~~
## Personal_life -0.044 0.099 -0.439 0.661 -0.044 -0.044
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .Medu 2.749 0.055 49.977 0.000 2.749 2.515
## .Fedu 2.522 0.055 46.111 0.000 2.522 2.320
## .Mjob 3.170 0.062 51.386 0.000 3.170 2.586
## .Fjob 3.281 0.043 75.609 0.000 3.281 3.804
## .schoolsup 1.129 0.017 66.922 0.000 1.129 3.367
## .famsup 1.613 0.025 65.794 0.000 1.613 3.310
## .paid 1.458 0.025 58.167 0.000 1.458 2.927
## .activities 1.509 0.025 59.985 0.000 1.509 3.018
## .romantic 1.334 0.024 56.214 0.000 1.334 2.828
## .famrel 3.944 0.045 87.537 0.000 3.944 4.404
## .freetime 3.235 0.050 64.458 0.000 3.235 3.243
## .goout 3.109 0.056 55.571 0.000 3.109 2.796
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .Medu 0.065 0.114 0.568 0.570 0.065 0.054
## .Fedu 0.696 0.069 10.086 0.000 0.696 0.590
## .Mjob 1.177 0.088 13.312 0.000 1.177 0.783
## .Fjob 0.722 0.052 13.941 0.000 0.722 0.971
## .schoolsup 0.112 0.008 13.912 0.000 0.112 0.993
## .famsup 0.144 0.042 3.456 0.001 0.144 0.607
## .paid 0.194 0.027 7.151 0.000 0.194 0.780
## .activities 0.250 0.018 14.046 0.000 0.250 1.000
## .romantic 0.222 0.016 14.050 0.000 0.222 1.000
## .famrel 0.770 0.058 13.295 0.000 0.770 0.960
## .freetime 0.450 0.339 1.328 0.184 0.450 0.452
## .goout 1.053 0.136 7.751 0.000 1.053 0.852
## Parents_Demo 1.000 1.000 1.000
## Support_Act 1.000 1.000 1.000
## Personal_life 1.000 1.000 1.000
The above results suggest that majority of our observed factors are statistically significant having p value less than 0.05. However, there are a few factors which are not significant such as schoolsup, activities belonging to supoort_activities latent factor. Similarly, romantic is also not significant belonging to personal life latent factor.We can remove these and re run CFA just to see if our results improve.
r_squared_values <- inspect(CFAmodel.fitted, "rsquare")
print(r_squared_values)
## Medu Fedu Mjob Fjob schoolsup famsup paid
## 0.946 0.410 0.217 0.029 0.007 0.393 0.220
## activities romantic famrel freetime goout
## 0.000 0.000 0.040 0.548 0.148
The output above reports the explained variance for the respective observed variable. The higher the value the better it is and vice versa. Lets now visualize our 3 factor fitted model.
semPaths(CFAmodel.fitted, "par", weighted = FALSE, nCharNodes = 7, shapeMan = "rectangle",
sizeMan = 8, sizeMan2 = 5)
lets examine fit measures to see if our results are reliable.
# Extract a selection of fit measures
selected_fit_measures <- fitMeasures(CFAmodel.fitted, c("chisq","cfi", "rmsea", "srmr"))
# Print selected fit measures
selected_fit_measures
## chisq cfi rmsea srmr
## 68.453 0.956 0.029 0.040
Model diagnostic results are convincing. Chi-square test statistic value is lower and its pvalue: p-value for > 0.05 which indicates a good fit, Comparative Fit Index, value close to 1 indicates a good fit.Root Mean Square Error of Approximation is less than 0.08 which also suggests a good model fit and similarly Standardized Root Mean Square Residual value as well.
Lets now re run the estimation by removing observed variables which have higher p values greater than 0.05. In this case i am removing romatic and activities as both had p value equal to 0.7
CFAmodel_1 <- 'Parents_Demo=~ Medu+Fedu +Mjob+Fjob
Support_Act=~ schoolsup+famsup+paid
Personal_life=~famrel+freetime+goout'
CFAmodel.fitted_1 <- cfa(model = CFAmodel_1, data = data, std.lv = TRUE,
missing = "fiml", mimic = "Mplus")
summary(CFAmodel.fitted_1, standardized = TRUE)
## lavaan 0.6-19 ended normally after 39 iterations
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 33
##
## Number of observations 395
## Number of missing patterns 1
##
## Model Test User Model:
##
## Test statistic 47.878
## Degrees of freedom 32
## P-value (Chi-square) 0.035
##
## Parameter Estimates:
##
## Standard errors Standard
## Information Observed
## Observed information based on Hessian
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Parents_Demo =~
## Medu 1.064 0.067 15.852 0.000 1.064 0.973
## Fedu 0.696 0.060 11.640 0.000 0.696 0.640
## Mjob 0.570 0.063 9.003 0.000 0.570 0.465
## Fjob 0.147 0.048 3.035 0.002 0.147 0.171
## Support_Act =~
## schoolsup 0.028 0.026 1.075 0.282 0.028 0.084
## famsup 0.307 0.072 4.256 0.000 0.307 0.629
## paid 0.234 0.057 4.109 0.000 0.234 0.470
## Personal_life =~
## famrel 0.178 0.069 2.589 0.010 0.178 0.198
## freetime 0.740 0.236 3.131 0.002 0.740 0.742
## goout 0.427 0.145 2.945 0.003 0.427 0.384
##
## Covariances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## Parents_Demo ~~
## Support_Act 0.311 0.081 3.836 0.000 0.311 0.311
## Personal_life 0.057 0.074 0.777 0.437 0.057 0.057
## Support_Act ~~
## Personal_life -0.046 0.099 -0.462 0.644 -0.046 -0.046
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .Medu 2.749 0.055 49.977 0.000 2.749 2.515
## .Fedu 2.522 0.055 46.111 0.000 2.522 2.320
## .Mjob 3.170 0.062 51.386 0.000 3.170 2.586
## .Fjob 3.281 0.043 75.609 0.000 3.281 3.804
## .schoolsup 1.129 0.017 66.922 0.000 1.129 3.367
## .famsup 1.613 0.025 65.794 0.000 1.613 3.310
## .paid 1.458 0.025 58.167 0.000 1.458 2.927
## .famrel 3.944 0.045 87.537 0.000 3.944 4.404
## .freetime 3.235 0.050 64.458 0.000 3.235 3.243
## .goout 3.109 0.056 55.571 0.000 3.109 2.796
##
## Variances:
## Estimate Std.Err z-value P(>|z|) Std.lv Std.all
## .Medu 0.064 0.115 0.553 0.580 0.064 0.053
## .Fedu 0.697 0.069 10.083 0.000 0.697 0.590
## .Mjob 1.178 0.088 13.310 0.000 1.178 0.784
## .Fjob 0.722 0.052 13.941 0.000 0.722 0.971
## .schoolsup 0.112 0.008 13.910 0.000 0.112 0.993
## .famsup 0.143 0.043 3.314 0.001 0.143 0.604
## .paid 0.193 0.028 6.938 0.000 0.193 0.779
## .famrel 0.770 0.058 13.312 0.000 0.770 0.961
## .freetime 0.448 0.345 1.297 0.195 0.448 0.450
## .goout 1.054 0.138 7.661 0.000 1.054 0.852
## Parents_Demo 1.000 1.000 1.000
## Support_Act 1.000 1.000 1.000
## Personal_life 1.000 1.000 1.000
lets compare models and see which one is better.
anova(CFAmodel.fitted, CFAmodel.fitted_1)
## Warning: lavaan->lavTestLRT():
## some models are based on a different set of observed variables
##
## Chi-Squared Difference Test
##
## Df AIC BIC Chisq Chisq diff RMSEA Df diff
## CFAmodel.fitted_1 32 9073.6 9204.9 47.878
## CFAmodel.fitted 51 10186.0 10341.2 68.453 20.574 0.014483 19
## Pr(>Chisq)
## CFAmodel.fitted_1
## CFAmodel.fitted 0.3608
Model CFAmodel.fitted_1 seems to be better fitted than the first model as AIC value is low and also the Chisq value.