Day3

# Load the dataset from a CSV file picked through an interactive file dialog,
# then preview the first 10 rows to check that the columns were parsed.
# NOTE(review): file.choose() needs an interactive session; a fixed path would
# make the analysis reproducible -- confirm before changing.
ob <- read.csv(file.choose())
head(ob, n = 10)

##    id gender height weight  bmi age  bmc  bmd   fat  lean pcfat
## 1   1      F    150     49 21.8  53 1312 0.88 17802 28600  37.3
## 2   2      M    165     52 19.1  65 1309 0.84  8381 40229  16.8
## 3   3      F    157     57 23.1  64 1230 0.84 19221 36057  34.0
## 4   4      F    156     53 21.8  56 1171 0.80 17472 33094  33.8
## 5   5      M    160     51 19.9  54 1681 0.98  7336 40621  14.8
## 6   6      F    153     47 20.1  52 1358 0.91 14904 30068  32.2
## 7   7      F    155     58 24.1  66 1546 0.96 20233 35599  35.3
## 8   8      M    167     65 23.3  50 2276 1.11 17749 43301  28.0
## 9   9      M    165     54 19.8  61 1778 0.96 10795 38613  21.1
## 10 10      F    158     60 24.0  58 1404 0.86 21365 35534  36.6

## Việc 2. So sánh tỉ trọng mỡ giữa nam và nữ

2.1 Vẽ biểu đồ histogram đánh giá phân bố tỉ trọng mỡ

library(lessR)

## 
## lessR 4.5                            feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is the default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation to pivot tables.
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including modern time series forecasting
##   and many, new Plotly interactive visualizations output. Most
##   visualization functions are now reorganized to three functions:
##      Chart(): type="bar", "pie", "radar", "bubble", "treemap", "icicle"
##      X(): type="histogram", "density", "vbs" and more
##      XY(): type="scatter" for a scatterplot, or "contour", "smooth"
##    Most previous function calls still work, such as:
##      BarChart(), Histogram, and Plot().
##   Enter: news(package="lessR"), or ?Chart, ?X, or ?XY
## There is also Flows() for Sankey flow diagrams, see ?Flows
## 
## Interactive data analysis for constructing visualizations.
##   Enter: interact()

# Histogram of body-fat percentage (pcfat) across all subjects in `ob`.
# Histogram() is deprecated as of lessR 4.5; X(..., type = "histogram") is the
# replacement named by the package's own deprecation notice.
# Axis labels and title are user-facing text and are kept verbatim.
X(pcfat,
  data = ob,
  type = "histogram",
  fill = "blue",
  xlab = "Tỉ trọng mỡ (%)",
  ylab = "Số người",
  main = "Phân bố tỉ trọng mỡ")

## lessR visualizations are now unified over just three core functions:
##   - Chart() for pivot tables, such as bar charts. More info: ?Chart
##   - X() for a single variable x, such as histograms. More info: ?X
##   - XY() for scatterplots of two variables, x and y. More info: ?XY
## 
## Histogram() is deprecated, though still working for now.
## Please use X(..., type = "histogram") going forward.

## [Interactive plot from the Plotly R package (Sievert, 2020)]

## >>> Suggestions 
## bin_width: set the width of each bin 
## bin_start: set the start of the first bin 
## bin_end: set the end of the last bin 
## X(pcfat, type="density")  # smoothed curve + histogram 
## X(pcfat, type="vbs")  # Violin/Box/Scatterplot (VBS) plot 
## 
## --- pcfat --- 
##  
##        n   miss            mean              sd             min             mdn             max 
##      1217      0       31.604786        7.182862        9.200000       32.400000       48.400000 
##  
## 
##   
## --- Outliers ---     from the box plot: 10 
##  
## Small       Large 
## -----       ----- 
##   9.2            
##   9.7            
##   9.8            
##  10.3            
##  10.3            
##  10.7            
##  11.0            
##  11.4            
##  11.7            
##  11.9            
## 
## 
## Bin Width: 5 
## Number of Bins: 9 
##  
##      Bin  Midpnt  Count    Prop  Cumul.c  Cumul.p 
## ------------------------------------------------- 
##   5 > 10     7.5      3    0.00        3     0.00 
##  10 > 15    12.5     26    0.02       29     0.02 
##  15 > 20    17.5     61    0.05       90     0.07 
##  20 > 25    22.5    128    0.11      218     0.18 
##  25 > 30    27.5    244    0.20      462     0.38 
##  30 > 35    32.5    338    0.28      800     0.66 
##  35 > 40    37.5    294    0.24     1094     0.90 
##  40 > 45    42.5    107    0.09     1201     0.99 
##  45 > 50    47.5     16    0.01     1217     1.00 
##

2.2 So sánh tỉ trọng mỡ giữa nam và nữ bằng t-test

library(table1)

## 
## Attaching package: 'table1'

## The following object is masked from 'package:lessR':
## 
##     label

## The following objects are masked from 'package:base':
## 
##     units, units<-

# Descriptive summary of pcfat stratified by gender
# (one column per gender level plus an overall column).
table1(
  ~ pcfat | gender,
  data = ob
)

	F (N=862)	M (N=355)	Overall (N=1217)
pcfat
Mean (SD)	34.7 (5.19)	24.2 (5.76)	31.6 (7.18)
Median [Min, Max]	34.7 [14.6, 48.4]	24.6 [9.20, 39.0]	32.4 [9.20, 48.4]

# Two-sample comparison of mean pcfat between the gender groups using
# lessR's ttest(); the report covers both the equal-variance and the
# unequal-variance versions of the test.
ttest(
  pcfat ~ gender,
  data = ob
)

## 
## Compare pcfat across gender with levels F and M 
## Grouping Variable:  gender
## Response Variable:  pcfat
## 
## 
## ------ Describe ------
## 
## pcfat for gender F:  n.miss = 0,  n = 862,  mean = 34.672,  sd = 5.187
## pcfat for gender M:  n.miss = 0,  n = 355,  mean = 24.156,  sd = 5.764
## 
## Mean Difference of pcfat:  10.516
## 
## Weighted Average Standard Deviation:   5.362 
## 
## 
## ------ Assumptions ------
## 
## Note: These hypothesis tests can perform poorly, and the 
##       t-test is typically robust to violations of assumptions. 
##       Use as heuristic guides instead of interpreting literally. 
## 
## Null hypothesis, for each group, is a normal distribution of pcfat.
## Group F: Sample mean assumed normal because n > 30, so no test needed.
## Group M: Sample mean assumed normal because n > 30, so no test needed.
## 
## Null hypothesis is equal variances of pcfat, homogeneous.
## Variance Ratio test:  F = 33.223/26.909 = 1.235,  df = 354;861,  p-value = 0.016
## Levene's test, Brown-Forsythe:  t = -2.232,  df = 1215,  p-value = 0.026
## 
## 
## ------ Infer ------
## 
## --- Assume equal population variances of pcfat for each gender 
## 
## t-cutoff for 95% range of variation: tcut =  1.962 
## Standard Error of Mean Difference: SE =  0.338 
## 
## Hypothesis Test of 0 Mean Diff:  t-value = 31.101,  df = 1215,  p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.663
## 95% Confidence Interval for Mean Difference:  9.853 to 11.180
## 
## 
## --- Do not assume equal population variances of pcfat for each gender 
## 
## t-cutoff: tcut =  1.964 
## Standard Error of Mean Difference: SE =  0.353 
## 
## Hypothesis Test of 0 Mean Diff:  t = 29.768,  df = 602.015, p-value = 0.000
## 
## Margin of Error for 95% Confidence Level:  0.694
## 95% Confidence Interval for Mean Difference:  9.823 to 11.210
## 
## 
## ------ Effect Size ------
## 
## --- Assume equal population variances of pcfat for each gender 
## 
## Standardized Mean Difference of pcfat, Cohen's d:  1.961
## 
## 
## ------ Practical Importance ------
## 
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
## 
## 
## ------ Graphics Smoothing Parameter ------
## 
## Density bandwidth for gender F: 1.475
## Density bandwidth for gender M: 1.867

2.3 So sánh tỉ trọng mỡ giữa nam và nữ bằng hồi qui tuyến tính

# Same comparison expressed as a linear model: with gender as the only
# predictor, the genderM coefficient is the M-minus-F difference in mean pcfat.
model <- lm(formula = pcfat ~ gender, data = ob)
summary(model)

## 
## Call:
## lm(formula = pcfat ~ gender, data = ob)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.0724  -3.2724   0.1484   3.6276  14.8439 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  34.6724     0.1826   189.9   <2e-16 ***
## genderM     -10.5163     0.3381   -31.1   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared:  0.4432, Adjusted R-squared:  0.4428 
## F-statistic: 967.3 on 1 and 1215 DF,  p-value: < 2.2e-16

# Approach 2: fit the same model with lessR's Regression(), with
# graphics = TRUE so the residual plots are drawn as well.
Regression(
  pcfat ~ gender,
  data = ob,
  graphics = TRUE
)

## 
## >>>  gender is not numeric. Converted to indicator variables.

## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## Regression(my_formula=pcfat ~ gender, data=ob, graphics=TRUE, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  ob 
##  
## Response Variable: pcfat 
## Predictor Variable: genderM 
##  
## Number of cases (rows) of data:  1217 
## Number of cases retained for analysis:  1217 
## 
## 
##   BASIC ANALYSIS 
## 
##               Estimate    Std Err  t-value  p-value    Lower 95%    Upper 95% 
## (Intercept)  34.672413   0.182622  189.859    0.000    34.314123    35.030703 
##     genderM -10.516344   0.338131  -31.101    0.000   -11.179729    -9.852959 
## 
## Standard deviation of pcfat: 7.182861 
##  
## Standard deviation of residuals:  5.361759 for df=1215 
## 95% range of residuals:  21.038669 = 2 * (1.962 * 5.361759) 
##  
## R-squared: 0.443    Adjusted R-squared: 0.443    PRESS R-squared: 0.441 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 967.297     df: 1 and 1215     p-value:  0.000 
## 
## -- Analysis of Variance 
##  
##                df        Sum Sq       Mean Sq     F-value   p-value 
## Model           1  27808.311497  27808.311497  967.297285     0.000 
## Residuals    1215  34929.384159     28.748464 
## pcfat        1216  62737.695656     51.593500 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##           pcfat genderM 
##     pcfat  1.00   -0.67 
##   genderM -0.67    1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"] 
## --------------------------------------------------------------------------- 
##         genderM     pcfat    fitted      resid    rstdnt    dffits    cooks 
##    210        1  9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020 
##    509        1 39.000000 24.156069  14.843931  2.780055  0.147758 0.010860 
##    179        1 38.700000 24.156069  14.543931  2.723523  0.144754 0.010420 
##    518        1  9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300 
##    200        1  9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150 
##    563        1 38.300000 24.156069  14.143931  2.648179  0.140749 0.009860 
##    318        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    972        1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460 
##    388        1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920 
##    203        1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530 
##   1137        0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150 
##    893        0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070 
##    688        1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020 
##    403        1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640 
##    858        1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400 
##    158        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##   1106        1 36.300000 24.156069  12.143931  2.271993  0.120755 0.007270 
##    827        1 36.000000 24.156069  11.843931  2.215637  0.117760 0.006910 
##    756        1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810 
##    196        1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##    [to see all intervals add n_pred_rows="all"] 
##  ---------------------------------------------- 
## 
##         genderM     pcfat      pred   s_pred    pi.lwr    pi.upr     width 
##      2        1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      5        1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280 
## ... 
##   1209        1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280 
##      1        0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##      3        0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869 
## ... 
##   1215        0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1216        0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869 
##   1217        0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------

Day3

tt

2026-01-08