the summed variable of the sauere term of X1^2 + X2^2 will have the
same 95 quantiles of a random variable variable that follows a
Chi-squared distribution with degree freedom at 2
draws <- 10000
storageN <- rep(NA, draws)
storageC <- rep(NA, draws)
for (i in 1:draws) {
x1 <- rnorm(n=1, mean=0, sd=1)
x2 <- rnorm(n=1, mean=0, sd=1)
storageN[i] <- x1^2 + x2^2
storageC[i] <- rchisq(n = 1, df = 2)
}
quantile(storageN, prob = 0.95)
## 95%
## 5.946008
quantile(storageC, prob = 0.95)
## 95%
## 6.03983
sample size = 100000 is too large for the computer to handle QQ, so I
set 10000.
draws <- 10000
storage <- rep(NA, draws)
for (i in 1:draws) {
y <- rbinom(n = 10000, size = 10000, prob = 0.0001)
storage[i] <- mean(y) # repeated draw sample & calculate average 10000 times
}
mean(storage)
## [1] 1.000079
hist(storage)
QB3Dat <- read.csv("C:/Users/Liz/Desktop/NTNU/test/midterm/QB3Dat.csv")
str(QB3Dat) # all num variable
fa.parallel(QB3Dat, fa="fa") # suggests that the number of factors=2
## Parallel analysis suggests that the number of factors = 2 and the number of components = NA
EFAres2 <- fa(QB3Dat, nfactors = 2, fm ="ml", rotate ="oblimin" )
EFAres3 <- fa(QB3Dat, nfactors = 3, fm ="ml", rotate ="oblimin" )
print.psych(EFAres2, cut = .32) # TLI=0.963 RMSEA=0.033
## Factor Analysis using method = ml
## Call: fa(r = QB3Dat, nfactors = 2, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 h2 u2 com
## joy 0.66 0.46 0.54 1.0
## interest 0.56 0.30 0.70 1.0
## Pride 0.37 0.14 0.86 1.0
## depression 0.44 0.31 0.69 1.5
## Anxiety 0.46 0.24 0.76 1.1
## anger 0.60 0.32 0.68 1.1
## fear 0.43 0.24 0.76 1.2
##
## ML1 ML2
## SS loadings 1.02 0.99
## Proportion Var 0.15 0.14
## Cumulative Var 0.15 0.29
## Proportion Explained 0.51 0.49
## Cumulative Proportion 0.51 1.00
##
## With factor correlations of
## ML1 ML2
## ML1 1.00 -0.33
## ML2 -0.33 1.00
##
## Mean item complexity = 1.1
## Test of the hypothesis that 2 factors are sufficient.
##
## df null model = 21 with the objective function = 0.7 with Chi Square = 208.2
## df of the model are 8 and the objective function was 0.04
##
## The root mean square of the residuals (RMSR) is 0.03
## The df corrected root mean square of the residuals is 0.05
##
## The harmonic n.obs is 300 with the empirical chi square 12.01 with prob < 0.15
## The total n.obs was 300 with Likelihood Chi Square = 10.63 with prob < 0.22
##
## Tucker Lewis Index of factoring reliability = 0.963
## RMSEA index = 0.033 and the 90 % confidence intervals are 0 0.08
## BIC = -35
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy
## ML1 ML2
## Correlation of (regression) scores with factors 0.79 0.77
## Multiple R square of scores with factors 0.62 0.59
## Minimum correlation of possible factor scores 0.25 0.18
print.psych(EFAres3, cut = .32) # TLI=1.043 RMSEA=0
## Factor Analysis using method = ml
## Call: fa(r = QB3Dat, nfactors = 3, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
## ML1 ML2 ML3 h2 u2 com
## joy 0.52 0.35 0.645 1.2
## interest 0.63 0.39 0.609 1.0
## Pride 0.99 1.00 0.005 1.0
## depression 0.43 0.31 0.690 1.7
## Anxiety 0.45 0.24 0.761 1.2
## anger 0.62 0.37 0.628 1.2
## fear 0.43 0.26 0.741 1.6
##
## ML1 ML2 ML3
## SS loadings 1.03 1.00 0.88
## Proportion Var 0.15 0.14 0.13
## Cumulative Var 0.15 0.29 0.42
## Proportion Explained 0.35 0.34 0.30
## Cumulative Proportion 0.35 0.70 1.00
##
## With factor correlations of
## ML1 ML2 ML3
## ML1 1.00 -0.15 0.28
## ML2 -0.15 1.00 -0.24
## ML3 0.28 -0.24 1.00
##
## Mean item complexity = 1.3
## Test of the hypothesis that 3 factors are sufficient.
##
## df null model = 21 with the objective function = 0.7 with Chi Square = 208.2
## df of the model are 3 and the objective function was 0.01
##
## The root mean square of the residuals (RMSR) is 0.01
## The df corrected root mean square of the residuals is 0.03
##
## The harmonic n.obs is 300 with the empirical chi square 1.84 with prob < 0.61
## The total n.obs was 300 with Likelihood Chi Square = 1.85 with prob < 0.6
##
## Tucker Lewis Index of factoring reliability = 1.043
## RMSEA index = 0 and the 90 % confidence intervals are 0 0.081
## BIC = -15.27
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy
## ML1 ML2 ML3
## Correlation of (regression) scores with factors 1.00 0.77 0.77
## Multiple R square of scores with factors 0.99 0.60 0.59
## Minimum correlation of possible factor scores 0.99 0.19 0.18
anova(EFAres2, EFAres3)
## Model 1 = fa(r = QB3Dat, nfactors = 2, rotate = "oblimin", fm = "ml")
## Model 2 = fa(r = QB3Dat, nfactors = 3, rotate = "oblimin", fm = "ml")
maybe factors=3 better ? but we don’t want factors that has less than
three items
factors=2 TLI=0.963 RMSEA=0.033 is acceptable
Two factors:positive = (joy + interest + Pride) negative = (depression
+ Anxiety + anger + fear)
QB4Dat <- read.csv("C:/Users/Liz/Desktop/NTNU/test/midterm/QB4Dat.csv")
str(QB4Dat) # all int variable
summary(QB4Dat) # all 1 or 2
mod1 <- '
QOL =~ physical + enviroment + psychology + socialLife
'
res1 <- cfa(mod1, data= QB4Dat, estimator= "WLSMv",
ordered = c("psychology", "socialLife", "physical", "enviroment" ) )
summary(res1, fit.measures = T)
## lavaan 0.6.16 ended normally after 18 iterations
##
## Estimator DWLS
## Optimization method NLMINB
## Number of model parameters 8
##
## Number of observations 5000
##
## Model Test User Model:
## Standard Scaled
## Test Statistic 824.269 908.194
## Degrees of freedom 2 2
## P-value (Chi-square) 0.000 0.000
## Scaling correction factor 0.908
## Shift parameter 0.299
## simple second-order correction
##
## Model Test Baseline Model:
##
## Test statistic 5730.672 4861.203
## Degrees of freedom 6 6
## P-value 0.000 0.000
## Scaling correction factor 1.179
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 0.856 0.813
## Tucker-Lewis Index (TLI) 0.569 0.440
##
## Robust Comparative Fit Index (CFI) 0.628
## Robust Tucker-Lewis Index (TLI) -0.117
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.287 0.301
## 90 Percent confidence interval - lower 0.270 0.285
## 90 Percent confidence interval - upper 0.303 0.318
## P-value H_0: RMSEA <= 0.050 0.000 0.000
## P-value H_0: RMSEA >= 0.080 1.000 1.000
##
## Robust RMSEA 0.533
## 90 Percent confidence interval - lower 0.498
## 90 Percent confidence interval - upper 0.568
## P-value H_0: Robust RMSEA <= 0.050 0.000
## P-value H_0: Robust RMSEA >= 0.080 1.000
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.178 0.178
##
## Parameter Estimates:
##
## Standard errors Robust.sem
## Information Expected
## Information saturated (h1) model Unstructured
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## QOL =~
## physical 1.000
## enviroment 0.610 0.029 20.906 0.000
## psychology 1.215 0.054 22.559 0.000
## socialLife 0.646 0.029 22.232 0.000
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|)
## .physical 0.000
## .enviroment 0.000
## .psychology 0.000
## .socialLife 0.000
## QOL 0.000
##
## Thresholds:
## Estimate Std.Err z-value P(>|z|)
## physical|t1 0.477 0.018 25.797 0.000
## enviroment|t1 0.500 0.019 26.965 0.000
## psychology|t1 0.473 0.018 25.629 0.000
## socialLife|t1 0.512 0.019 27.549 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .physical 0.411
## .enviroment 0.781
## .psychology 0.130
## .socialLife 0.754
## QOL 0.589 0.030 19.410 0.000
##
## Scales y*:
## Estimate Std.Err z-value P(>|z|)
## physical 1.000
## enviroment 1.000
## psychology 1.000
## socialLife 1.000
mod1 CFI=0.813 TLI=0.44 RMSEA=0.301 unacceptable.
modindices(res1, sort = T) #repair
mod2 <- '
QOL =~ physical + enviroment + psychology + socialLife
physical ~~ psychology
'
res2 <- cfa(mod2, data= QB4Dat, estimator= "WLSMv",
ordered = c("psychology", "socialLife", "physical", "enviroment" ) )
summary(res2, fit.measures = T) # CFI=1 TLI=1.0001 RMSEA=0
## lavaan 0.6.16 ended normally after 50 iterations
##
## Estimator DWLS
## Optimization method NLMINB
## Number of model parameters 9
##
## Number of observations 5000
##
## Model Test User Model:
## Standard Scaled
## Test Statistic 0.137 0.391
## Degrees of freedom 1 1
## P-value (Chi-square) 0.711 0.532
## Scaling correction factor 0.352
## Shift parameter -0.000
## simple second-order correction
##
## Model Test Baseline Model:
##
## Test statistic 5730.672 4861.203
## Degrees of freedom 6 6
## P-value 0.000 0.000
## Scaling correction factor 1.179
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 1.000 1.000
## Tucker-Lewis Index (TLI) 1.001 1.001
##
## Robust Comparative Fit Index (CFI) 1.000
## Robust Tucker-Lewis Index (TLI) 1.005
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000 0.000
## 90 Percent confidence interval - lower 0.000 0.000
## 90 Percent confidence interval - upper 0.027 0.032
## P-value H_0: RMSEA <= 0.050 0.999 0.998
## P-value H_0: RMSEA >= 0.080 0.000 0.000
##
## Robust RMSEA 0.000
## 90 Percent confidence interval - lower 0.000
## 90 Percent confidence interval - upper 0.104
## P-value H_0: Robust RMSEA <= 0.050 0.721
## P-value H_0: Robust RMSEA >= 0.080 0.125
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.003 0.003
##
## Parameter Estimates:
##
## Standard errors Robust.sem
## Information Expected
## Information saturated (h1) model Unstructured
##
## Latent Variables:
## Estimate Std.Err z-value P(>|z|)
## QOL =~
## physical 1.000
## enviroment 4.027 0.587 6.855 0.000
## psychology 1.979 0.231 8.555 0.000
## socialLife 4.797 0.752 6.378 0.000
##
## Covariances:
## Estimate Std.Err z-value P(>|z|)
## .physical ~~
## .psychology 0.716 0.016 44.775 0.000
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|)
## .physical 0.000
## .enviroment 0.000
## .psychology 0.000
## .socialLife 0.000
## QOL 0.000
##
## Thresholds:
## Estimate Std.Err z-value P(>|z|)
## physical|t1 0.477 0.018 25.797 0.000
## enviroment|t1 0.500 0.019 26.965 0.000
## psychology|t1 0.473 0.018 25.629 0.000
## socialLife|t1 0.512 0.019 27.549 0.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .physical 0.968
## .enviroment 0.485
## .psychology 0.876
## .socialLife 0.270
## QOL 0.032 0.009 3.510 0.000
##
## Scales y*:
## Estimate Std.Err z-value P(>|z|)
## physical 1.000
## enviroment 1.000
## psychology 1.000
## socialLife 1.000
mod2 CFI=1 TLI=1.0001 RMSEA=0 is better.
QB5Dat <- read.csv("C:/Users/Liz/Desktop/NTNU/test/midterm/QB5Dat.csv")
str(QB5Dat)
OLSreQ5 <- lm(grade ~ studyHr + support, data = QB5Dat)
summary(OLSreQ5)
##
## Call:
## lm(formula = grade ~ studyHr + support, data = QB5Dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.0637 -0.5690 -0.1134 0.5770 1.3187
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.1288 0.9375 59.870 < 2e-16 ***
## studyHr 0.7015 0.1760 3.985 0.000959 ***
## support -0.1642 0.2162 -0.760 0.457934
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7898 on 17 degrees of freedom
## Multiple R-squared: 0.4857, Adjusted R-squared: 0.4252
## F-statistic: 8.027 on 2 and 17 DF, p-value: 0.003511
# Extract OLS estimates
OLSslope <- coef(OLSreQ5)["studyHr"]
OLSresidualvar <- summary(OLSreQ5)$sigma^2
model <- 'grade ~ studyHr + support'
MLreQ5 <- sem(model, data = QB5Dat, estimator = "ML")
summary(MLreQ5)
## lavaan 0.6.16 ended normally after 1 iteration
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 3
##
## Number of observations 20
##
## Model Test User Model:
##
## Test statistic 0.000
## Degrees of freedom 0
##
## Parameter Estimates:
##
## Standard errors Standard
## Information Expected
## Information saturated (h1) model Structured
##
## Regressions:
## Estimate Std.Err z-value P(>|z|)
## grade ~
## studyHr 0.701 0.162 4.322 0.000
## support -0.164 0.199 -0.824 0.410
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .grade 0.530 0.168 3.162 0.002
# Extract ML estimates
MLslope <- parameterEstimates(MLreQ5)$est[1]
MLresidualvar <- parameterEstimates(MLreQ5)$est[3]
OLSslope
## studyHr
## 0.7014689
MLslope
## [1] 0.7014689
OLSresidualvar
## [1] 0.6238261
MLresidualvar
## [1] 0.5302522
MLresidualvar is underestimated
ML and OLS both estimators will generate unbiased slope
the residual variance of ML will be biased, unless the sample size is
big
draws <- 10000
slope1 <- rep(NA, draws)
slope2 <- rep(NA, draws)
for (i in 1:draws) {
X <- rnorm(1000)
NorE <-rnorm(1000)
LapE <- rlaplace(1000) # Laplace distribution
Y1 <- 0.2 + 0.5 * X + NorE
Y2 <- 0.2 + 0.5 * X + LapE
# Fit regression model
model1 <- lm(Y1 ~ X)
model2 <- lm(Y2 ~ X)
# Extract the coefficients
slope1[i] <- coef(model1)["X"]
slope2[i] <- coef(model2)["X"]
}
mean(slope1)
## [1] 0.499404
mean(slope2)
## [1] 0.5003527
sd(slope1)
## [1] 0.03179522
sd(slope2)
## [1] 0.04530654
both unbiased but sd of laplas is bigger
par(mfrow = c(1, 2))
hist(slope1, xlim = c(0.35, 0.65), ylim = c(0, 3000))
hist(slope2, xlim = c(0.35, 0.65), ylim = c(0, 3000))
draws <- 10000
slope1 <- rep(NA, draws)
slope2 <- rep(NA, draws)
for (i in 1:draws) {
X <- rnorm(1000)
X_prime <- X + rnorm(1000) # add normal noise
NorE <- rnorm(1000)
Y1 <- 0.2 + 0.5*X + NorE
Y2 <- 0.2 + 0.5*X_prime + NorE
# Fit a regression model
model1 <- lm(Y1 ~ X)
model2 <- lm(Y2 ~ X_prime)
# Extract the slope coefficient and store it
slope1[i] <- coef(model1)["X"]
slope2[i] <- coef(model2)["X_prime"]
}
mean(slope1)
## [1] 0.4999464
mean(slope2)
## [1] 0.5000155
sd(slope1)
## [1] 0.03124199
sd(slope2)
## [1] 0.02203813
add noise make sd converge
hist couldn’t see the different QQ
# long data frame
slopes_df <- data.frame(Slope = c(slope1, slope2), Model = rep(c("Model1", "Model2"), each = draws))
# boxplot
library(ggplot2)
ggplot(slopes_df, aes(x = Model, y = Slope, fill = Model)) +
geom_boxplot() +
geom_point(stat = "summary", fun = "mean", shape = 3, size = 3, aes(group = Model)) +
labs(x = "Model", y = "Slope", title = "Slopes (Model 1 vs. Model 2)") +
theme_minimal()