DAY 4- BAI TAP-13052025

Việc 1. Đọc dữ liệu vào R

# Load the bone data set into a data frame.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
df <- read.csv("D:\\R\\DU LIEU THUC HANH TS ThACH GUI\\Bone data.csv")

##Việc 2. Đánh giá mối liên quan giữa cân nặng và mật độ xương cổ xương đùi

###2.1 Vẽ biểu đồ đánh giá mối liên quan giữa cân nặng và mật độ xương cổ xương đùi

library(lessR)

## 
## lessR 4.4.3                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()

## 
## Attaching package: 'lessR'

## The following object is masked from 'package:base':
## 
##     sort_by

# lessR scatterplot of fnbmd (y) against weight (x) with a loess fit line.
Plot(
  x = weight,
  y = fnbmd,
  data = df,
  fit = "loess"
)

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(weight, fnbmd, enhance=TRUE)  # many options
## Plot(weight, fnbmd, fill="skyblue")  # interior fill color of points
## Plot(weight, fnbmd, out_cut=.10)  # label top 10% from center as outliers 
## 
##    Loess Model MSE = 0.0156
##

###GPT1

# Base-R scatterplot. A formula is read as y ~ x, so fnbmd ~ weight puts
# weight on the x-axis and fnbmd on the y-axis, matching the lessR
# Plot(weight, fnbmd) call used for the same comparison.
plot(fnbmd ~ weight, data = df)

##2.2 Phân tích đánh giá tương quan giữa cân nặng và mật độ cổ xương đùi

# lessR correlation analysis of weight and fnbmd.
Correlation(
  x = weight,
  y = fnbmd,
  data = df
)

## Correlation Analysis for Variables weight and fnbmd 
##   
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 2121 
## Number of cases (rows of data) deleted: 41 
## 
## Sample Covariance: s = 1.269 
##  
## Sample Correlation: r = 0.581 
## 
## Hypothesis Test of 0 Correlation:  t = 32.882,  df = 2119,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.552 to 0.609

###GPT2

# Base-R Pearson correlation test between weight and fnbmd; the result is
# stored and then printed explicitly.
cor_result <- cor.test(
  x = df$weight,
  y = df$fnbmd,
  method = "pearson"
)
print(cor_result)

## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$fnbmd
## t = 32.882, df = 2119, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5523536 0.6087528
## sample estimates:
##       cor 
## 0.5812509

###GPT2-Prompt 2: install.packages("correlation")

library(correlation)
# Correlation table for the weight and fnbmd columns of df.
cor_result <- correlation(
  data = df[, c("weight", "fnbmd")]
)
print(cor_result)

## # Correlation Matrix (pearson-method)
## 
## Parameter1 | Parameter2 |    r |       95% CI | t(2119) |         p
## -------------------------------------------------------------------
## weight     |      fnbmd | 0.58 | [0.55, 0.61] |   32.88 | < .001***
## 
## p-value adjustment method: Holm (1979)
## Observations: 2121

###GPT2-Prompt 3

library(correlation)
# Same weight/fnbmd correlation table as the previous prompt, re-run here.
cor_result <- correlation(
  data = df[, c("weight", "fnbmd")]
)
print(cor_result)

## # Correlation Matrix (pearson-method)
## 
## Parameter1 | Parameter2 |    r |       95% CI | t(2119) |         p
## -------------------------------------------------------------------
## weight     |      fnbmd | 0.58 | [0.55, 0.61] |   32.88 | < .001***
## 
## p-value adjustment method: Holm (1979)
## Observations: 2121

###Tô màu xanh - line đỏ chart

####Cài install.packages("corrplot") - không chạy chart màu

##Hồi qui tuyến tính

##Nhập file.choose() (Obesity data) lên R

##Việc 3. Đọc dữ liệu vào R

# Load the obesity data set into a data frame.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
ob <- read.csv("D:\\R\\DU LIEU THUC HANH TS ThACH GUI\\Obesity data.csv")

##Việc 4. So sánh tỉ trọng mỡ giữa nam và nữ

###4.1 Đánh giá phân bố của tỉ trọng mỡ

library(lessR)
# Histogram of pcfat (percentage of body fat) with blue bars and custom axis labels.
Histogram(
  x = pcfat,
  data = ob,
  fill = "blue",
  xlab = "Percentage of fat (%)",
  ylab = "Frequency"
)

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## Histogram(pcfat, density=TRUE)  # smoothed curve + histogram 
## Plot(pcfat)  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- pcfat --- 
##  
##        n   miss            mean              sd             min             mdn             max 
##      1217      0       31.604786        7.182862        9.200000       32.400000       48.400000 
##  
## 
##   
## --- Outliers ---     from the box plot: 10 
##  
## Small       Large 
## -----       ----- 
##   9.2            
##   9.7            
##   9.8            
##  10.3            
##  10.3            
##  10.7            
##  11.0            
##  11.4            
##  11.7            
##  11.9            
## 
## 
## Bin Width: 5 
## Number of Bins: 9 
##  
##      Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ------------------------------------------------- 
##   5 > 10     7.5      3    0.00        3     0.00 
##  10 > 15    12.5     26    0.02       29     0.02 
##  15 > 20    17.5     61    0.05       90     0.07 
##  20 > 25    22.5    128    0.11      218     0.18 
##  25 > 30    27.5    244    0.20      462     0.38 
##  30 > 35    32.5    338    0.28      800     0.66 
##  35 > 40    37.5    294    0.24     1094     0.90 
##  40 > 45    42.5    107    0.09     1201     0.99 
##  45 > 50    47.5     16    0.01     1217     1.00 
##

###4.2 Sử dụng kiểm định t

# lessR two-group t-test: compares mean pcfat between the levels of gender.
# The printed report covers both the equal-variance and the unequal-variance analysis.
ttest(pcfat ~ gender, data = ob)

## 
## Compare pcfat across gender with levels F and M 
## Grouping Variable:  gender
## Response Variable:  pcfat
## 
## 
## ------ Describe ------
## 
## pcfat for gender F:  n.miss = 0,  n = 862,  mean = 34.672,  sd = 5.187
## pcfat for gender M:  n.miss = 0,  n = 355,  mean = 24.156,  sd = 5.764
## 
## Mean Difference of pcfat:  10.516
## 
## Weighted Average Standard Deviation:   5.362 
## 
## 
## ------ Assumptions ------
## 
## Note: These hypothesis tests can perform poorly, and the 
##       t-test is typically robust to violations of assumptions. 
##       Use as heuristic guides instead of interpreting literally. 
## 
## Null hypothesis, for each group, is a normal distribution of pcfat.
## Group F: Sample mean assumed normal because n > 30, so no test needed.
## Group M: Sample mean assumed normal because n > 30, so no test needed.
## 
## Null hypothesis is equal variances of pcfat, homogeneous.
## Variance Ratio test:  F = 33.223/26.909 = 1.235,  df = 354;861,  p-value = 0.016
## Levene's test, Brown-Forsythe:  t = -2.232,  df = 1215,  p-value = 0.026
## 
## 
## ------ Infer ------
## 
## --- Assume equal population variances of pcfat for each gender 
## 
## t-cutoff for 95% range of variation: tcut =  1.962 
## Standard Error of Mean Difference: SE =  0.338 
## 
## Hypothesis Test of 0 Mean Diff:  t-value = 31.101,  df = 1215,  p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.663
## 95% Confidence Interval for Mean Difference:  9.853 to 11.180
## 
## 
## --- Do not assume equal population variances of pcfat for each gender 
## 
## t-cutoff: tcut =  1.964 
## Standard Error of Mean Difference: SE =  0.353 
## 
## Hypothesis Test of 0 Mean Diff:  t = 29.768,  df = 602.015, p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.694
## 95% Confidence Interval for Mean Difference:  9.823 to 11.210
## 
## 
## ------ Effect Size ------
## 
## --- Assume equal population variances of pcfat for each gender 
## 
## Standardized Mean Difference of pcfat, Cohen's d:  1.961
## 
## 
## ------ Practical Importance ------
## 
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
## 
## 
## ------ Graphics Smoothing Parameter ------
## 
## Density bandwidth for gender F: 1.475
## Density bandwidth for gender M: 1.867

###4.3 Sử dụng mô hình hồi qui tuyến tính

# Linear regression of pcfat on gender. With a two-level predictor the
# genderM coefficient is the M-minus-F difference in mean pcfat.
m.1 <- lm(pcfat ~ gender, data = ob)
summary(m.1)

## 
## Call:
## lm(formula = pcfat ~ gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.0724  -3.2724   0.1484   3.6276  14.8439 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept)  34.6724     0.1826   189.9 <0.0000000000000002 ***
## genderM     -10.5163     0.3381   -31.1 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared:  0.4432, Adjusted R-squared:  0.4428 
## F-statistic: 967.3 on 1 and 1215 DF,  p-value: < 0.00000000000000022

###4.3 Dùng gói lessR

library(lessR)
# Same pcfat ~ gender model fitted with lessR's reg(); the returned object
# is stored so the full report can be printed afterwards.
m.2 <- reg(pcfat ~ gender, data = ob)

## 
## >>>  gender is not numeric. Converted to indicator variables.

# Auto-print the stored lessR regression object to display its full report.
m.2

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## reg(pcfat ~ gender, data=ob, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  ob 
##  
## Response Variable: pcfat 
## Predictor Variable: genderM 
##  
## Number of cases (rows) of data:  1217 
## Number of cases retained for analysis:  1217 
## 
## 
##   BASIC ANALYSIS 
## 
##               Estimate    Std Err  t-value  p-value    Lower 95%    Upper 95% 
## (Intercept)  34.672413   0.182622  189.859    0.000    34.314123    35.030703 
##     genderM -10.516344   0.338131  -31.101    0.000   -11.179729    -9.852959 
## 
## Standard deviation of pcfat: 7.182861 
##  
## Standard deviation of residuals:  5.361759 for df=1215 
## 95% range of residuals:  21.038669 = 2 * (1.962 * 5.361759) 
##  
## R-squared: 0.443    Adjusted R-squared: 0.443    PRESS R-squared: 0.441 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 967.297     df: 1 and 1215     p-value:  0.000 
## 
## -- Analysis of Variance 
##  
##                df        Sum Sq       Mean Sq     F-value   p-value 
## Model           1  27808.311497  27808.311497  967.297285     0.000 
## Residuals    1215  34929.384159     28.748464 
## pcfat        1216  62737.695656     51.593500 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##           pcfat genderM 
##     pcfat  1.00   -0.67 
##   genderM -0.67    1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"] 
## --------------------------------------------------------------------------- 
##         genderM     pcfat    fitted      resid    rstdnt    dffits    cooks 
##    210        1  9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020 
##    509        1 39.000000 24.156069  14.843931  2.780055  0.147758 0.010860 
##    179        1 38.700000 24.156069  14.543931  2.723523  0.144754 0.010420 
##    518        1  9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300 
##    200        1  9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150 
##    563        1 38.300000 24.156069  14.143931  2.648179  0.140749 0.009860 
##    318        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    972        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    388        1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920 
##    203        1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530 
##   1137        0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150 
##    893        0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070 
##    688        1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020 
##    403        1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640 
##    858        1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400 
##    158        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##   1106        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##    827        1 36.000000 24.156069  11.843931  2.215637  0.117760 0.006910 
##    756        1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810 
##    196        1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##         genderM     pcfat      pred   s_pred    pi.lwr    pi.upr     width 
##      2        1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      5        1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
## ... 
##   1209        1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      1        0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##      3        0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869 
## ... 
##   1215        0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1216        0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1217        0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------

###4.3 Kiểm tra giả định

# Show the lm diagnostic plots for m.1 in a 2x2 grid.
# par() returns the previous settings, which are restored afterwards so
# later base-graphics plots are not squeezed into the 2x2 layout.
old_par <- par(mfrow = c(2, 2))
plot(m.1)
par(old_par)

###Dùng gói ggfortify

library(ggfortify)

## Loading required package: ggplot2

# ggplot2-based diagnostic plots for the pcfat ~ gender model (via ggfortify).
autoplot(object = m.1)

###Mô hình

# lessR scatterplot of pcfat (y) against weight (x) with a fitted linear regression line.
Plot(
  x = weight,
  y = pcfat,
  data = ob,
  fit = "lm"
)

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(weight, pcfat, enhance=TRUE)  # many options
## Plot(weight, pcfat, color="red")  # exterior edge color of points
## Plot(weight, pcfat, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 1217 
## Sample Correlation of weight and pcfat: r = 0.057 
##   
## Hypothesis Test of 0 Correlation:  t = 1.975,  df = 1215,  p-value = 0.049 
## 95% Confidence Interval for Correlation:  0.000 to 0.112 
##   
## 
##   Line: b0 = 29.222947    b1 = 0.043193    Linear Model MSE = 51.470803   Rsq = 0.003
##

###4.4 GPT

###Prompt 1

# Welch two-sample t-test of pcfat by gender (base R).
# Run the test once, keep the result, and print that stored object
# instead of computing the same test a second time.
t_test_result <- t.test(pcfat ~ gender, data = ob)
print(t_test_result)

## 
##  Welch Two Sample t-test
## 
## data:  pcfat by gender
## t = 29.768, df = 602.01, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
##   9.822548 11.210140
## sample estimates:
## mean in group F mean in group M 
##        34.67241        24.15607

###Prompt 2

# Fit pcfat on gender again under the name `model`; the fit is stored, not printed.
model <- lm(
  formula = pcfat ~ gender,
  data = ob
)

###Prompt 3 chưa làm

##Việc 5. Đánh giá mối liên quan giữa cân nặng và tỉ trọng mỡ

###5.1 Vẽ biểu đồ tán xạ - Lỗi - chạy code dưới

# lessR scatterplot for Task 5: pcfat (y) against weight (x) with a linear fit line.
Plot(
  x = weight,
  y = pcfat,
  data = ob,
  fit = "lm"
)

## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(weight, pcfat, enhance=TRUE)  # many options
## Plot(weight, pcfat, color="red")  # exterior edge color of points
## Plot(weight, pcfat, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 1217 
## Sample Correlation of weight and pcfat: r = 0.057 
##   
## Hypothesis Test of 0 Correlation:  t = 1.975,  df = 1215,  p-value = 0.049 
## 95% Confidence Interval for Correlation:  0.000 to 0.112 
##   
## 
##   Line: b0 = 29.222947    b1 = 0.043193    Linear Model MSE = 51.470803   Rsq = 0.003
##

###5.2 Sử dụng mô hình hồi qui tuyến tính

# Simple linear regression of pcfat on weight; summary() prints the
# coefficient table and model fit statistics.
m.3 <- lm(pcfat ~ weight, data = ob)
summary(m.3)

## 
## Call:
## lm(formula = pcfat ~ weight, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.3122  -4.5234   0.8902   5.2695  16.9742 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) 29.22295    1.22370  23.881 <0.0000000000000002 ***
## weight       0.04319    0.02188   1.975              0.0485 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared:  0.003199,   Adjusted R-squared:  0.002378 
## F-statistic: 3.899 on 1 and 1215 DF,  p-value: 0.04855

###5.2 Chart Mô hình hồi quy tuyến tính

library(ggfortify)
# ggplot2-based diagnostic plots for the pcfat ~ weight model.
autoplot(object = m.3)

###5.3 GPT

DAY 4- BAI TAP-13052025

NGUYEN VAN PHUC

2025-05-13

Việc 1. Đọc dữ liệu vào R