Chương trình tập huấn phân tích dữ liệu bằng ngôn ngữ R - BV SIS Cần Thơ (2-6/1/2025)

Ngày 4: Phân tích theo thời gian

Phân tích tương quan

Việc 1. Phân tích trước-sau cho 1 nhóm

1.1 Đọc dữ liệu vào R

df = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\Pre-Post study one group.csv")
head(df)
##   EmpID Before After
## 1    26     43    66
## 2    27     58    74
## 3    28     52    62
## 4    29     47    84
## 5    30     43    78
## 6    31     50    73

1.2 Đánh giá hiệu quả can thiệp bằng paired t-test

library(lessR)
## Warning: package 'lessR' was built under R version 4.3.3
## 
## lessR 4.3.9                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()
ttest(Before, After, paired = TRUE, data = df)
## 
## 
## ------ Describe ------
## 
## Difference:  n.miss = 0,  n = 25,   mean = 19.640,  sd = 11.597
## 
## 
## ------ Normality Assumption ------
## 
## Null hypothesis is a normal distribution of Difference.
## Shapiro-Wilk normality test:  W = 0.9687,  p-value = 0.613
## 
## 
## ------ Infer ------
## 
## t-cutoff for 95% range of variation: tcut =  2.064 
## Standard Error of Mean: SE =  2.319 
## 
## Hypothesized Value H0: mu = 0 
## Hypothesis Test of Mean:  t-value = 8.468,  df = 24,  p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  4.787
## 95% Confidence Interval for Mean:  14.853 to 24.427
## 
## 
## ------ Effect Size ------
## 
## Distance of sample mean from hypothesized:  19.640
## Standardized Distance, Cohen's d:  1.694
## 
## 
## ------ Graphics Smoothing Parameter ------
## 
## Density bandwidth for 6.941

Việc 2. Phân tích trước-sau cho 2 nhóm

2.1 Đọc dữ liệu

df2 = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\Pre-Post test dataset.csv")
head(df2)
##   EmpID Treatment Before After
## 1    26       New     41    66
## 2    27       New     56    74
## 3    28       New     50    62
## 4    29       New     45    84
## 5    30       New     41    78
## 6    31       New     48    73

2.2 So sánh hiệu quả của 2 can thiệp bằng t-test

df2$Diff = df2$After - df2$Before
ttest(Diff ~ Treatment, data = df2)
## 
## Compare Diff across Treatment with levels New and Old 
## Grouping Variable:  Treatment
## Response Variable:  Diff
## 
## 
## ------ Describe ------
## 
## Diff for Treatment New:  n.miss = 0,  n = 25,  mean = 22.040,  sd = 11.238
## Diff for Treatment Old:  n.miss = 0,  n = 25,  mean = 13.280,  sd = 12.785
## 
## Mean Difference of Diff:  8.760
## 
## Weighted Average Standard Deviation:   12.036 
## 
## 
## ------ Assumptions ------
## 
## Note: These hypothesis tests can perform poorly, and the 
##       t-test is typically robust to violations of assumptions. 
##       Use as heuristic guides instead of interpreting literally. 
## 
## Null hypothesis, for each group, is a normal distribution of Diff.
## Group New  Shapiro-Wilk normality test:  W = 0.964,  p-value = 0.502
## Group Old  Shapiro-Wilk normality test:  W = 0.954,  p-value = 0.309
## 
## Null hypothesis is equal variances of Diff, homogeneous.
## Variance Ratio test:  F = 163.460/126.290 = 1.294,  df = 24;24,  p-value = 0.532
## Levene's test, Brown-Forsythe:  t = -0.983,  df = 48,  p-value = 0.331
## 
## 
## ------ Infer ------
## 
## --- Assume equal population variances of Diff for each Treatment 
## 
## t-cutoff for 95% range of variation: tcut =  2.011 
## Standard Error of Mean Difference: SE =  3.404 
## 
## Hypothesis Test of 0 Mean Diff:  t-value = 2.573,  df = 48,  p-value = 0.013
## 
## Margin of Error for 95% Confidence Level:  6.845
## 95% Confidence Interval for Mean Difference:  1.915 to 15.605
## 
## 
## --- Do not assume equal population variances of Diff for each Treatment 
## 
## t-cutoff: tcut =  2.011 
## Standard Error of Mean Difference: SE =  3.404 
## 
## Hypothesis Test of 0 Mean Diff:  t = 2.573,  df = 47.223, p-value = 0.013
## 
## Margin of Error for 95% Confidence Level:  6.848
## 95% Confidence Interval for Mean Difference:  1.912 to 15.608
## 
## 
## ------ Effect Size ------
## 
## --- Assume equal population variances of Diff for each Treatment 
## 
## Standardized Mean Difference of Diff, Cohen's d:  0.728
## 
## 
## ------ Practical Importance ------
## 
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
## 
## 
## ------ Graphics Smoothing Parameter ------
## 
## Density bandwidth for Treatment New: 6.726
## Density bandwidth for Treatment Old: 7.655

2.3 So sánh hiệu quả của 2 can thiệp bằng oha6n tích hiệp biến

reg(After ~ Before + Treatment + Before*Treatment, data = df2)
## 
## >>>  Treatment is not numeric. Converted to indicator variables.

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## reg(After ~ Before + Treatment + Before * Treatment, data=df2, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  df2 
##  
## Response Variable: After 
## Predictor Variable 1: Before 
## Predictor Variable 2: TreatmentOld 
## Predictor Variable 3: Before.TreatmentOld 
##  
## Number of cases (rows) of data:  50 
## Number of cases retained for analysis:  50 
## 
## 
##   BASIC ANALYSIS 
## 
##                      Estimate    Std Err  t-value  p-value   Lower 95%   Upper 95% 
##         (Intercept)    90.468     12.149    7.446    0.000      66.012     114.923 
##              Before    -0.360      0.239   -1.503    0.140      -0.842       0.122 
##        TreatmentOld    -9.181     16.948   -0.542    0.591     -43.295      24.933 
## Before.TreatmentOld    -0.056      0.342   -0.163    0.871      -0.744       0.632 
## 
## Standard deviation of After: 9.795 
##  
## Standard deviation of residuals:  7.880 for df=46 
## 95% range of residuals:  31.723 = 2 * (2.013 * 7.880) 
##  
## R-squared: 0.392    Adjusted R-squared: 0.353    PRESS R-squared: NA 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 9.901     df: 3 and 46     p-value:  0.000 
## 
## -- Analysis of Variance from Type II Sums of Squares 
##  
##                       df    Sum Sq   Mean Sq   F-value   p-value 
##              Before    1   319.163   319.163     5.249     0.026 
##        TreatmentOld    1  1724.303  1724.303    27.769     0.000 
## Before.TreatmentOld    1     1.655     1.655     0.027     0.871 
## Residuals             46  2856.382    62.095 
## 
## -- Test of Interaction 
##   
## Before:Treatment  df: 1  df resid: 46  SS: 1.655  F: 0.027  p-value: 0.871 
##   
## -- Assume parallel lines, no interaction of Treatment with Before 
##  
## Level New: y^_After = 90.468 + -0.360(x_Before) 
## Level Old: y^_After = 81.287 + -0.360(x_Before) 
##   
## -- Visualize Separately Computed Regression Lines 
## 
## Plot(Before, After, by=Treatment, fit="lm") 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##                       After Before TreatmentOld Before.TreatmentOld 
##                 After  1.00  -0.16        -0.57               -0.60 
##                Before -0.16   1.00        -0.17               -0.04 
##          TreatmentOld -0.57  -0.17         1.00                0.98 
##   Before.TreatmentOld -0.60  -0.04         0.98                1.00 
## 
##                       Tolerance       VIF 
##                Before     0.494     2.024 
##          TreatmentOld     0.017    57.821 
##   Before.TreatmentOld     0.018    56.198 
## 
##  Before TreatmentOld Before.TreatmentOld    R2adj    X's 
##       1            1                   0    0.366      2 
##       1            0                   1    0.362      2 
##       1            1                   1    0.353      3 
##       0            0                   1    0.342      1 
##       0            1                   1    0.335      2 
##       0            1                   0    0.310      1 
##       1            0                   0    0.005      1 
##  
## [based on Thomas Lumley's leaps function from the leaps package] 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 50 rows of data, or do n_res_rows="all"] 
## -------------------------------------------------------------------------------------- 
##      Before TreatmentOld Before.TreatmentOld  After fitted   resid rstdnt dffits cooks 
##   40     43            1                  43     42 63.415 -21.415 -3.053 -0.800 0.136 
##   19     64            0                   0     77 67.437   9.563  1.381  0.718 0.126 
##   50     60            1                  60     46 56.349 -10.349 -1.466 -0.679 0.113 
##   34     51            1                  51     79 60.090  18.910  2.611  0.589 0.077 
##    7     64            0                   0     60 67.437  -7.437 -1.065 -0.554 0.076 
##   32     44            1                  44     79 62.999  16.001  2.172  0.527 0.064 
##    1     41            0                   0     66 75.714  -9.714 -1.325 -0.490 0.059 
##    8     46            0                   0     61 73.915 -12.915 -1.724 -0.425 0.043 
##   26     51            1                  51     74 60.090  13.910  1.857  0.419 0.042 
##   46     47            1                  47     49 61.752 -12.752 -1.685 -0.349 0.029 
##    4     45            0                   0     84 74.274   9.726  1.286  0.342 0.029 
##   18     43            0                   0     83 74.994   8.006  1.066  0.334 0.028 
##   17     53            0                   0     61 71.396 -10.396 -1.364 -0.302 0.022 
##   16     43            0                   0     82 74.994   7.006  0.930  0.292 0.021 
##   10     47            0                   0     83 73.555   9.445  1.237  0.284 0.020 
##    3     50            0                   0     62 72.475 -10.475 -1.370 -0.280 0.019 
##   45     51            1                  51     51 60.090  -9.090 -1.188 -0.268 0.018 
##   30     42            1                  42     70 63.830   6.170  0.811  0.231 0.013 
##   36     50            1                  50     53 60.505  -7.505 -0.973 -0.208 0.011 
##   39     50            1                  50     53 60.505  -7.505 -0.973 -0.208 0.011 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##      Before TreatmentOld Before.TreatmentOld  After   pred s_pred pi.lwr pi.upr  width 
##   50     60            1                  60     46 56.349  8.549 39.142 73.557 34.415 
##   35     59            1                  59     59 56.765  8.469 39.718 73.811 34.093 
##   47     58            1                  58     60 57.180  8.395 40.282 74.078 33.796 
## ... 
##   48     49            1                  49     62 60.921  8.040 44.738 77.104 32.365 
##   41     48            1                  48     60 61.337  8.036 45.161 77.512 32.352 
##   27     47            1                  47     65 61.752  8.040 45.568 77.936 32.368 
## ... 
##   44     36            1                  36     67 66.324  8.555 49.103 83.545 34.442 
##    7     64            0                   0     60 67.437  8.678 49.970 84.905 34.935 
##   19     64            0                   0     77 67.437  8.678 49.970 84.905 34.935 
## 
## ------------------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## Plot 3: Scatterplot and Least-Squares Lines 
## -------------------------------------------

Việc 3. Phân tích đánh giá thay đổi theo thời gian

3.1 Đọc dữ liệu

df3 = read.csv("C:\\Thach\\VN trips\\2024_4Dec\\SIS Can Tho\\Datasets\\Orthodont.csv")
head(df3)
##    id  sex age distance
## 1 M01 Male   8     26.0
## 2 M01 Male  10     25.0
## 3 M01 Male  12     29.0
## 4 M01 Male  14     31.0
## 5 M02 Male   8     21.5
## 6 M02 Male  10     22.5

3.2 Vẽ biểu đồ bánh tằm

library(ggplot2)
ggplot(data = df3, aes(x = age, y = distance, group = id, col = id)) + geom_point() + geom_line()

3.3 Thực hiện mô hình hồi qui tuyến tính

m.1 = lm(distance ~ age, data = df3)
summary(m.1)
## 
## Call:
## lm(formula = distance ~ age, data = df3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.5037 -1.5778 -0.1833  1.3519  6.3167 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  16.7611     1.2256  13.676 < 0.0000000000000002 ***
## age           0.6602     0.1092   6.047         0.0000000225 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.537 on 106 degrees of freedom
## Multiple R-squared:  0.2565, Adjusted R-squared:  0.2495 
## F-statistic: 36.56 on 1 and 106 DF,  p-value: 0.00000002248

3.4 Thực hiện mô hình phân tích hỗn hợp

library(lme4)
## Loading required package: Matrix
m.2 = lmer(distance ~ 1 + age + (1 + age | id), data = df3)
summary(m.2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: distance ~ 1 + age + (1 + age | id)
##    Data: df3
## 
## REML criterion at convergence: 442.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.2229 -0.4938  0.0073  0.4721  3.9161 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  id       (Intercept) 5.41657  2.3274        
##           age         0.05128  0.2264   -0.61
##  Residual             1.71616  1.3100        
## Number of obs: 108, groups:  id, 27
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept) 16.76111    0.77527  21.620
## age          0.66019    0.07126   9.265
## 
## Correlation of Fixed Effects:
##     (Intr)
## age -0.848

Việc 4. Đánh giá ảnh hưởng của giới tính lên sự thay đổi của răng theo thời gian

4.1 Thực hiện mô hình phân tích hỗn hợp

m.3 = lmer(distance ~ 1 + age + sex + age*sex + (1 + age | id), data = df3)
summary(m.3)
## Linear mixed model fit by REML ['lmerMod']
## Formula: distance ~ 1 + age + sex + age * sex + (1 + age | id)
##    Data: df3
## 
## REML criterion at convergence: 432.6
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.1694 -0.3862  0.0070  0.4454  3.8490 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev. Corr 
##  id       (Intercept) 5.77449  2.4030        
##           age         0.03245  0.1801   -0.67
##  Residual             1.71663  1.3102        
## Number of obs: 108, groups:  id, 27
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  17.3727     1.2281  14.147
## age           0.4795     0.1037   4.625
## sexMale      -1.0321     1.5953  -0.647
## age:sexMale   0.3048     0.1347   2.263
## 
## Correlation of Fixed Effects:
##             (Intr) age    sexMal
## age         -0.880              
## sexMale     -0.770  0.678       
## age:sexMale  0.678 -0.770 -0.880
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following objects are masked from 'package:lessR':
## 
##     bc, recode, sp
Anova(m.3, type = 3)
## Analysis of Deviance Table (Type III Wald chisquare tests)
## 
## Response: distance
##                Chisq Df            Pr(>Chisq)    
## (Intercept) 200.1256  1 < 0.00000000000000022 ***
## age          21.3860  1           0.000003755 ***
## sex           0.4186  1               0.51765    
## age:sex       5.1208  1               0.02364 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Việc 5. Ghi lại tất cả các hàm/lệnh trên và chia sẻ lên tài khoản rpubs