Q1

The sum of squares X1^2 + X2^2, where X1 and X2 are independent standard normal variables, should have approximately the same 95th percentile as a random variable that follows a chi-squared distribution with 2 degrees of freedom.

draws <- 10000
storageN <- rep(NA, draws)   # sums of two squared standard normals
storageC <- rep(NA, draws)   # direct draws from chi-squared with df = 2

for (i in 1:draws) {
  x1 <- rnorm(n = 1, mean = 0, sd = 1)
  x2 <- rnorm(n = 1, mean = 0, sd = 1)
  storageN[i] <- x1^2 + x2^2
  storageC[i] <- rchisq(n = 1, df = 2)
}
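
The same simulation can also be written without a loop; a vectorized sketch of the block above:

# Vectorized equivalent of the loop
storageN <- rnorm(draws)^2 + rnorm(draws)^2
storageC <- rchisq(draws, df = 2)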

quantile(storageN, prob = 0.95)
##      95% 
## 5.946008
quantile(storageC, prob = 0.95)
##     95% 
## 6.03983
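
For reference, the theoretical 95th percentile of a chi-squared distribution with 2 degrees of freedom is about 5.99, and both simulated quantiles are close to it:

qchisq(0.95, df = 2)  # theoretical 95th percentile, ~5.99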

Q2

A sample size of 100,000 was too large for my computer to handle, so I used 10,000 draws instead.

draws <- 10000
storage <- rep(NA, draws)
for (i in 1:draws) {
  y <- rbinom(n = 10000, size = 10000, prob = 0.0001)  # draw one sample of 10000 binomial values
  storage[i] <- mean(y)                                # store the sample average
}

mean(storage)
## [1] 1.000079
hist(storage)
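
As a sanity check, the theoretical mean of a single draw is size * prob = 1, and the standard error of an average of 10,000 such draws is about 0.01, which matches the simulated mean and the spread seen in the histogram:

size <- 10000; p <- 0.0001; n <- 10000
size * p                      # theoretical mean of one binomial draw: 1
sqrt(size * p * (1 - p) / n)  # theoretical SE of the average: ~0.01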

Q3

data

QB3Dat <- read.csv("C:/Users/Liz/Desktop/NTNU/test/midterm/QB3Dat.csv")
str(QB3Dat) # all variables are numeric

EFA

library(psych)                 # needed for fa.parallel(), fa(), and print.psych()
fa.parallel(QB3Dat, fa="fa")   # suggests that the number of factors = 2

## Parallel analysis suggests that the number of factors =  2  and the number of components =  NA
EFAres2 <- fa(QB3Dat, nfactors = 2, fm ="ml", rotate ="oblimin" )
EFAres3 <- fa(QB3Dat, nfactors = 3, fm ="ml", rotate ="oblimin" )

print.psych(EFAres2, cut = .32) # TLI=0.963 RMSEA=0.033 
## Factor Analysis using method =  ml
## Call: fa(r = QB3Dat, nfactors = 2, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##              ML1   ML2   h2   u2 com
## joy         0.66       0.46 0.54 1.0
## interest    0.56       0.30 0.70 1.0
## Pride       0.37       0.14 0.86 1.0
## depression        0.44 0.31 0.69 1.5
## Anxiety           0.46 0.24 0.76 1.1
## anger             0.60 0.32 0.68 1.1
## fear              0.43 0.24 0.76 1.2
## 
##                        ML1  ML2
## SS loadings           1.02 0.99
## Proportion Var        0.15 0.14
## Cumulative Var        0.15 0.29
## Proportion Explained  0.51 0.49
## Cumulative Proportion 0.51 1.00
## 
##  With factor correlations of 
##       ML1   ML2
## ML1  1.00 -0.33
## ML2 -0.33  1.00
## 
## Mean item complexity =  1.1
## Test of the hypothesis that 2 factors are sufficient.
## 
## df null model =  21  with the objective function =  0.7 with Chi Square =  208.2
## df of  the model are 8  and the objective function was  0.04 
## 
## The root mean square of the residuals (RMSR) is  0.03 
## The df corrected root mean square of the residuals is  0.05 
## 
## The harmonic n.obs is  300 with the empirical chi square  12.01  with prob <  0.15 
## The total n.obs was  300  with Likelihood Chi Square =  10.63  with prob <  0.22 
## 
## Tucker Lewis Index of factoring reliability =  0.963
## RMSEA index =  0.033  and the 90 % confidence intervals are  0 0.08
## BIC =  -35
## Fit based upon off diagonal values = 0.98
## Measures of factor score adequacy             
##                                                    ML1  ML2
## Correlation of (regression) scores with factors   0.79 0.77
## Multiple R square of scores with factors          0.62 0.59
## Minimum correlation of possible factor scores     0.25 0.18
print.psych(EFAres3, cut = .32) # TLI=1.043 RMSEA=0
## Factor Analysis using method =  ml
## Call: fa(r = QB3Dat, nfactors = 3, rotate = "oblimin", fm = "ml")
## Standardized loadings (pattern matrix) based upon correlation matrix
##              ML1   ML2   ML3   h2    u2 com
## joy                     0.52 0.35 0.645 1.2
## interest                0.63 0.39 0.609 1.0
## Pride       0.99             1.00 0.005 1.0
## depression        0.43       0.31 0.690 1.7
## Anxiety           0.45       0.24 0.761 1.2
## anger             0.62       0.37 0.628 1.2
## fear              0.43       0.26 0.741 1.6
## 
##                        ML1  ML2  ML3
## SS loadings           1.03 1.00 0.88
## Proportion Var        0.15 0.14 0.13
## Cumulative Var        0.15 0.29 0.42
## Proportion Explained  0.35 0.34 0.30
## Cumulative Proportion 0.35 0.70 1.00
## 
##  With factor correlations of 
##       ML1   ML2   ML3
## ML1  1.00 -0.15  0.28
## ML2 -0.15  1.00 -0.24
## ML3  0.28 -0.24  1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 3 factors are sufficient.
## 
## df null model =  21  with the objective function =  0.7 with Chi Square =  208.2
## df of  the model are 3  and the objective function was  0.01 
## 
## The root mean square of the residuals (RMSR) is  0.01 
## The df corrected root mean square of the residuals is  0.03 
## 
## The harmonic n.obs is  300 with the empirical chi square  1.84  with prob <  0.61 
## The total n.obs was  300  with Likelihood Chi Square =  1.85  with prob <  0.6 
## 
## Tucker Lewis Index of factoring reliability =  1.043
## RMSEA index =  0  and the 90 % confidence intervals are  0 0.081
## BIC =  -15.27
## Fit based upon off diagonal values = 1
## Measures of factor score adequacy             
##                                                    ML1  ML2  ML3
## Correlation of (regression) scores with factors   1.00 0.77 0.77
## Multiple R square of scores with factors          0.99 0.60 0.59
## Minimum correlation of possible factor scores     0.99 0.19 0.18
anova(EFAres2, EFAres3)
## Model 1 = fa(r = QB3Dat, nfactors = 2, rotate = "oblimin", fm = "ml")
## Model 2 = fa(r = QB3Dat, nfactors = 3, rotate = "oblimin", fm = "ml")

A three-factor solution might fit slightly better, but we do not want factors with fewer than three items (in the three-factor solution, Pride essentially forms a factor on its own).
The two-factor solution (TLI = 0.963, RMSEA = 0.033) is acceptable.
Two factors: positive affect = (joy + interest + Pride); negative affect = (depression + Anxiety + anger + fear).
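
A path diagram makes the two-factor structure easier to see (a quick sketch, assuming the psych package is loaded as above):

fa.diagram(EFAres2)  # plot the items and their loadings on the two factors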

Q4

data

QB4Dat <- read.csv("C:/Users/Liz/Desktop/NTNU/test/midterm/QB4Dat.csv")
str(QB4Dat)     # all variables are integers
summary(QB4Dat) # every value is 1 or 2 (binary items)

CFA

mod1

mod1 <- '
QOL =~ physical + enviroment + psychology + socialLife
'

library(lavaan)
res1 <- cfa(mod1, data = QB4Dat, estimator = "WLSMV",
            ordered = c("psychology", "socialLife", "physical", "enviroment"))

summary(res1, fit.measures = T)
## lavaan 0.6.16 ended normally after 18 iterations
## 
##   Estimator                                       DWLS
##   Optimization method                           NLMINB
##   Number of model parameters                         8
## 
##   Number of observations                          5000
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                               824.269     908.194
##   Degrees of freedom                                 2           2
##   P-value (Chi-square)                           0.000       0.000
##   Scaling correction factor                                  0.908
##   Shift parameter                                            0.299
##     simple second-order correction                                
## 
## Model Test Baseline Model:
## 
##   Test statistic                              5730.672    4861.203
##   Degrees of freedom                                 6           6
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.179
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    0.856       0.813
##   Tucker-Lewis Index (TLI)                       0.569       0.440
##                                                                   
##   Robust Comparative Fit Index (CFI)                         0.628
##   Robust Tucker-Lewis Index (TLI)                           -0.117
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.287       0.301
##   90 Percent confidence interval - lower         0.270       0.285
##   90 Percent confidence interval - upper         0.303       0.318
##   P-value H_0: RMSEA <= 0.050                    0.000       0.000
##   P-value H_0: RMSEA >= 0.080                    1.000       1.000
##                                                                   
##   Robust RMSEA                                               0.533
##   90 Percent confidence interval - lower                     0.498
##   90 Percent confidence interval - upper                     0.568
##   P-value H_0: Robust RMSEA <= 0.050                         0.000
##   P-value H_0: Robust RMSEA >= 0.080                         1.000
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.178       0.178
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model        Unstructured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   QOL =~                                              
##     physical          1.000                           
##     enviroment        0.610    0.029   20.906    0.000
##     psychology        1.215    0.054   22.559    0.000
##     socialLife        0.646    0.029   22.232    0.000
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .physical          0.000                           
##    .enviroment        0.000                           
##    .psychology        0.000                           
##    .socialLife        0.000                           
##     QOL               0.000                           
## 
## Thresholds:
##                    Estimate  Std.Err  z-value  P(>|z|)
##     physical|t1       0.477    0.018   25.797    0.000
##     enviroment|t1     0.500    0.019   26.965    0.000
##     psychology|t1     0.473    0.018   25.629    0.000
##     socialLife|t1     0.512    0.019   27.549    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .physical          0.411                           
##    .enviroment        0.781                           
##    .psychology        0.130                           
##    .socialLife        0.754                           
##     QOL               0.589    0.030   19.410    0.000
## 
## Scales y*:
##                    Estimate  Std.Err  z-value  P(>|z|)
##     physical          1.000                           
##     enviroment        1.000                           
##     psychology        1.000                           
##     socialLife        1.000

mod1: CFI = 0.813, TLI = 0.44, RMSEA = 0.301 (scaled values); the fit is unacceptable.

modindices(res1, sort = T)  # inspect modification indices to decide how to revise the model

mod2

mod2 <- '
QOL =~ physical + enviroment + psychology + socialLife
physical ~~ psychology
'

res2 <- cfa(mod2, data = QB4Dat, estimator = "WLSMV",
            ordered = c("psychology", "socialLife", "physical", "enviroment"))

summary(res2, fit.measures = T) # CFI = 1, TLI = 1.001, RMSEA = 0
## lavaan 0.6.16 ended normally after 50 iterations
## 
##   Estimator                                       DWLS
##   Optimization method                           NLMINB
##   Number of model parameters                         9
## 
##   Number of observations                          5000
## 
## Model Test User Model:
##                                               Standard      Scaled
##   Test Statistic                                 0.137       0.391
##   Degrees of freedom                                 1           1
##   P-value (Chi-square)                           0.711       0.532
##   Scaling correction factor                                  0.352
##   Shift parameter                                           -0.000
##     simple second-order correction                                
## 
## Model Test Baseline Model:
## 
##   Test statistic                              5730.672    4861.203
##   Degrees of freedom                                 6           6
##   P-value                                        0.000       0.000
##   Scaling correction factor                                  1.179
## 
## User Model versus Baseline Model:
## 
##   Comparative Fit Index (CFI)                    1.000       1.000
##   Tucker-Lewis Index (TLI)                       1.001       1.001
##                                                                   
##   Robust Comparative Fit Index (CFI)                         1.000
##   Robust Tucker-Lewis Index (TLI)                            1.005
## 
## Root Mean Square Error of Approximation:
## 
##   RMSEA                                          0.000       0.000
##   90 Percent confidence interval - lower         0.000       0.000
##   90 Percent confidence interval - upper         0.027       0.032
##   P-value H_0: RMSEA <= 0.050                    0.999       0.998
##   P-value H_0: RMSEA >= 0.080                    0.000       0.000
##                                                                   
##   Robust RMSEA                                               0.000
##   90 Percent confidence interval - lower                     0.000
##   90 Percent confidence interval - upper                     0.104
##   P-value H_0: Robust RMSEA <= 0.050                         0.721
##   P-value H_0: Robust RMSEA >= 0.080                         0.125
## 
## Standardized Root Mean Square Residual:
## 
##   SRMR                                           0.003       0.003
## 
## Parameter Estimates:
## 
##   Standard errors                           Robust.sem
##   Information                                 Expected
##   Information saturated (h1) model        Unstructured
## 
## Latent Variables:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   QOL =~                                              
##     physical          1.000                           
##     enviroment        4.027    0.587    6.855    0.000
##     psychology        1.979    0.231    8.555    0.000
##     socialLife        4.797    0.752    6.378    0.000
## 
## Covariances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##  .physical ~~                                         
##    .psychology        0.716    0.016   44.775    0.000
## 
## Intercepts:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .physical          0.000                           
##    .enviroment        0.000                           
##    .psychology        0.000                           
##    .socialLife        0.000                           
##     QOL               0.000                           
## 
## Thresholds:
##                    Estimate  Std.Err  z-value  P(>|z|)
##     physical|t1       0.477    0.018   25.797    0.000
##     enviroment|t1     0.500    0.019   26.965    0.000
##     psychology|t1     0.473    0.018   25.629    0.000
##     socialLife|t1     0.512    0.019   27.549    0.000
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .physical          0.968                           
##    .enviroment        0.485                           
##    .psychology        0.876                           
##    .socialLife        0.270                           
##     QOL               0.032    0.009    3.510    0.000
## 
## Scales y*:
##                    Estimate  Std.Err  z-value  P(>|z|)
##     physical          1.000                           
##     enviroment        1.000                           
##     psychology        1.000                           
##     socialLife        1.000

mod2: CFI = 1, TLI = 1.001, RMSEA = 0; the fit is much better.
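
To compare the two models side by side, the scaled fit indices can also be pulled directly with lavaan's fitMeasures() (a sketch; the index names are standard lavaan labels):

fit_idx <- c("cfi.scaled", "tli.scaled", "rmsea.scaled", "srmr")
rbind(mod1 = fitMeasures(res1, fit_idx),
      mod2 = fitMeasures(res2, fit_idx))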

Q5

data

QB5Dat <- read.csv("C:/Users/Liz/Desktop/NTNU/test/midterm/QB5Dat.csv")
str(QB5Dat)

Regression

OLS

OLSreQ5 <- lm(grade ~ studyHr + support, data = QB5Dat)
summary(OLSreQ5)
## 
## Call:
## lm(formula = grade ~ studyHr + support, data = QB5Dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0637 -0.5690 -0.1134  0.5770  1.3187 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  56.1288     0.9375  59.870  < 2e-16 ***
## studyHr       0.7015     0.1760   3.985 0.000959 ***
## support      -0.1642     0.2162  -0.760 0.457934    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7898 on 17 degrees of freedom
## Multiple R-squared:  0.4857, Adjusted R-squared:  0.4252 
## F-statistic: 8.027 on 2 and 17 DF,  p-value: 0.003511
# Extract OLS estimates
OLSslope <- coef(OLSreQ5)["studyHr"]
OLSresidualvar <- summary(OLSreQ5)$sigma^2

ML

model <- 'grade ~ studyHr + support'
MLreQ5 <- sem(model, data = QB5Dat, estimator = "ML")
summary(MLreQ5)
## lavaan 0.6.16 ended normally after 1 iteration
## 
##   Estimator                                         ML
##   Optimization method                           NLMINB
##   Number of model parameters                         3
## 
##   Number of observations                            20
## 
## Model Test User Model:
##                                                       
##   Test statistic                                 0.000
##   Degrees of freedom                                 0
## 
## Parameter Estimates:
## 
##   Standard errors                             Standard
##   Information                                 Expected
##   Information saturated (h1) model          Structured
## 
## Regressions:
##                    Estimate  Std.Err  z-value  P(>|z|)
##   grade ~                                             
##     studyHr           0.701    0.162    4.322    0.000
##     support          -0.164    0.199   -0.824    0.410
## 
## Variances:
##                    Estimate  Std.Err  z-value  P(>|z|)
##    .grade             0.530    0.168    3.162    0.002
# Extract ML estimates
# (rows of parameterEstimates(): 1 = grade ~ studyHr, 3 = residual variance of grade)
MLslope <- parameterEstimates(MLreQ5)$est[1]
MLresidualvar <- parameterEstimates(MLreQ5)$est[3]

compare

OLSslope
##   studyHr 
## 0.7014689
MLslope
## [1] 0.7014689
OLSresidualvar
## [1] 0.6238261
MLresidualvar
## [1] 0.5302522

The ML residual variance is underestimated.
Both ML and OLS give an unbiased slope estimate (the point estimates are identical here).
The ML residual variance, however, is biased downward because ML divides the residual sum of squares by n rather than by n - p; the bias becomes negligible only when the sample size is large.
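
The gap between the two estimates is exactly this degrees-of-freedom correction: OLS divides the residual sum of squares by n - p (here 20 - 3 = 17) while ML divides by n (20), so rescaling the ML estimate recovers the OLS value:

MLresidualvar * 20 / (20 - 3)  # ~0.624, matches OLSresidualvar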

Q6

draws <- 10000
slope1 <- rep(NA, draws)
slope2 <- rep(NA, draws)

for (i in 1:draws) {
  X <- rnorm(1000)
  NorE <- rnorm(1000)    # normally distributed errors
  LapE <- rlaplace(1000) # Laplace-distributed errors; rlaplace() comes from a package such as extraDistr or VGAM
  Y1 <- 0.2 + 0.5 * X + NorE
  Y2 <- 0.2 + 0.5 * X + LapE
  # Fit regression model
  model1 <- lm(Y1 ~ X)
  model2 <- lm(Y2 ~ X)
  # Extract the coefficients
  slope1[i] <- coef(model1)["X"]
  slope2[i] <- coef(model2)["X"]
 }

mean(slope1)
## [1] 0.499404
mean(slope2)
## [1] 0.5003527
sd(slope1)
## [1] 0.03179522
sd(slope2)
## [1] 0.04530654

Both slope estimates are unbiased, but the sampling SD is larger with Laplace errors.
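
This matches the theoretical slope standard errors: with n = 1000 and sd(X) = 1, the slope SE is roughly the error SD divided by sqrt(n). Assuming rlaplace() uses a unit scale parameter (so the Laplace errors have variance 2), we expect:

n <- 1000
1 / sqrt(n)        # expected slope SE with N(0, 1) errors, ~0.032
sqrt(2) / sqrt(n)  # expected slope SE with standard Laplace errors (variance 2), ~0.045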

par(mfrow = c(1, 2))
hist(slope1, xlim = c(0.35, 0.65), ylim = c(0, 3000))
hist(slope2, xlim = c(0.35, 0.65), ylim = c(0, 3000))

Q7

draws <- 10000
slope1 <- rep(NA, draws)
slope2 <- rep(NA, draws)

for (i in 1:draws) {
  X <- rnorm(1000)
  X_prime <- X + rnorm(1000) # add normal noise
  NorE <- rnorm(1000)
  
  Y1 <- 0.2 + 0.5*X + NorE
  Y2 <- 0.2 + 0.5*X_prime + NorE
  
  # Fit a regression model
  model1 <- lm(Y1 ~ X)
  model2 <- lm(Y2 ~ X_prime)
  
  # Extract the slope coefficient and store it
  slope1[i] <- coef(model1)["X"]
  slope2[i] <- coef(model2)["X_prime"]

}

mean(slope1)
## [1] 0.4999464
mean(slope2)
## [1] 0.5000155
sd(slope1)
## [1] 0.03124199
sd(slope2)
## [1] 0.02203813

Adding noise to X increases the predictor's variance, which makes the sampling SD of the slope smaller.
The difference is hard to see in histograms, so a boxplot is used below.
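
The drop in the SD also matches theory: the slope SE is roughly the error SD divided by sd(predictor) * sqrt(n), and X_prime has variance 1 + 1 = 2, so its slope SE should be smaller by a factor of sqrt(2):

n <- 1000
1 / (1 * sqrt(n))        # expected slope SE when regressing on X (sd = 1), ~0.032
1 / (sqrt(2) * sqrt(n))  # expected slope SE when regressing on X_prime (sd = sqrt(2)), ~0.022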

# long data frame
slopes_df <- data.frame(Slope = c(slope1, slope2), Model = rep(c("Model1", "Model2"), each = draws))

# boxplot
library(ggplot2)
ggplot(slopes_df, aes(x = Model, y = Slope, fill = Model)) +
  geom_boxplot() +
  geom_point(stat = "summary", fun = "mean", shape = 3, size = 3, aes(group = Model)) +
  labs(x = "Model", y = "Slope", title = "Slopes (Model 1 vs. Model 2)") +
  theme_minimal()