# Load the bone data set from its local CSV file into `df`.
df <- read.csv(
  file = "D:\\R\\DU LIEU THUC HANH TS ThACH GUI\\Bone data.csv"
)
## Việc 2. Đánh giá mối liên quan giữa cân nặng và mật độ xương cổ xương đùi
### 2.1 Vẽ biểu đồ đánh giá mối liên quan giữa cân nặng và mật độ xương cổ xương đùi
library(lessR)
##
## lessR 4.4.3 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read data file, many formats available, e.g., Excel
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
##
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
##
## sort_by
# lessR scatterplot of weight and fnbmd from `df`, with a loess fit curve.
Plot(
  weight, fnbmd,
  data = df,
  fit = "loess"
)
##
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(weight, fnbmd, enhance=TRUE) # many options
## Plot(weight, fnbmd, fill="skyblue") # interior fill color of points
## Plot(weight, fnbmd, out_cut=.10) # label top 10% from center as outliers
##
## Loess Model MSE = 0.0156
##
###GPT1
# Base-R scatterplot of the same pair of variables.
# A plotting formula reads `y ~ x`, so fnbmd goes on the left-hand side: this
# puts weight on the x-axis and bone density on the y-axis, matching the
# lessR Plot(weight, fnbmd) call earlier in the script.
plot(fnbmd ~ weight, data = df)
##2.2 Phân tích đánh giá tương quan giữa cân nặng và mật độ cổ xương đùi
# lessR correlation analysis of weight and fnbmd in `df`; the captured output
# below reports Pearson's r, its hypothesis test and a 95% confidence interval.
Correlation(weight, fnbmd, data = df)
## Correlation Analysis for Variables weight and fnbmd
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 2121
## Number of cases (rows of data) deleted: 41
##
## Sample Covariance: s = 1.269
##
## Sample Correlation: r = 0.581
##
## Hypothesis Test of 0 Correlation: t = 32.882, df = 2119, p-value = 0.000
## 95% Confidence Interval for Correlation: 0.552 to 0.609
###GP2
# Base-R Pearson correlation test between weight and fnbmd; the result is kept
# in `cor_result` and printed explicitly.
cor_result <- cor.test(
  x = df$weight,
  y = df$fnbmd,
  method = "pearson"
)
print(cor_result)
##
## Pearson's product-moment correlation
##
## data: df$weight and df$fnbmd
## t = 32.882, df = 2119, p-value < 0.00000000000000022
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.5523536 0.6087528
## sample estimates:
## cor
## 0.5812509
###GP2-Prompt 2: install.packages("correlation")
# Correlation table for the weight/fnbmd pair using the 'correlation' package.
library(correlation)
cor_result <- correlation(data = df[c("weight", "fnbmd")])
print(cor_result)
## # Correlation Matrix (pearson-method)
##
## Parameter1 | Parameter2 | r | 95% CI | t(2119) | p
## -------------------------------------------------------------------
## weight | fnbmd | 0.58 | [0.55, 0.61] | 32.88 | < .001***
##
## p-value adjustment method: Holm (1979)
## Observations: 2121
###GP2-Prompt3
# Prompt 3: the same 'correlation' package analysis on the weight and fnbmd
# columns of `df`; the result overwrites `cor_result` and is printed.
library(correlation)
cor_result <- correlation(
  data = df[c("weight", "fnbmd")]
)
print(cor_result)
## # Correlation Matrix (pearson-method)
##
## Parameter1 | Parameter2 | r | 95% CI | t(2119) | p
## -------------------------------------------------------------------
## weight | fnbmd | 0.58 | [0.55, 0.61] | 32.88 | < .001***
##
## p-value adjustment method: Holm (1979)
## Observations: 2121
### Tô màu xanh, đường (line) màu đỏ cho biểu đồ
#### Cài install.packages("corrplot") - không chạy được biểu đồ màu
## Hồi qui tuyến tính
## Nhập file bằng file.choose() (Obesity data) lên R
## Việc 3. Đọc dữ liệu vào R
# Load the obesity data set from its local CSV file into `ob`.
ob <- read.csv(
  file = "D:\\R\\DU LIEU THUC HANH TS ThACH GUI\\Obesity data.csv"
)
## Việc 4. So sánh tỉ trọng mỡ giữa nam và nữ
### 4.1 Đánh giá phân bố của tỉ trọng mỡ
# lessR histogram of pcfat (percent body fat) from `ob`, with blue bars and
# explicit axis labels.
library(lessR)
Histogram(
  pcfat,
  data = ob,
  fill = "blue",
  xlab = "Percentage of fat (%)",
  ylab = "Frequency"
)
## >>> Suggestions
## bin_width: set the width of each bin
## bin_start: set the start of the first bin
## bin_end: set the end of the last bin
## Histogram(pcfat, density=TRUE) # smoothed curve + histogram
## Plot(pcfat) # Violin/Box/Scatterplot (VBS) plot
##
## --- pcfat ---
##
## n miss mean sd min mdn max
## 1217 0 31.604786 7.182862 9.200000 32.400000 48.400000
##
##
##
## --- Outliers --- from the box plot: 10
##
## Small Large
## ----- -----
## 9.2
## 9.7
## 9.8
## 10.3
## 10.3
## 10.7
## 11.0
## 11.4
## 11.7
## 11.9
##
##
## Bin Width: 5
## Number of Bins: 9
##
## Bin Midpnt Count Prop Cumul.c Cumul.p
## -------------------------------------------------
## 5 > 10 7.5 3 0.00 3 0.00
## 10 > 15 12.5 26 0.02 29 0.02
## 15 > 20 17.5 61 0.05 90 0.07
## 20 > 25 22.5 128 0.11 218 0.18
## 25 > 30 27.5 244 0.20 462 0.38
## 30 > 35 32.5 338 0.28 800 0.66
## 35 > 40 37.5 294 0.24 1094 0.90
## 40 > 45 42.5 107 0.09 1201 0.99
## 45 > 50 47.5 16 0.01 1217 1.00
##
###4.2 Sử dụng kiểm định t
# lessR two-group t-test of pcfat by gender in `ob`; the captured output below
# reports both the equal-variance and the unequal-variance results.
ttest(pcfat ~ gender, data = ob)
##
## Compare pcfat across gender with levels F and M
## Grouping Variable: gender
## Response Variable: pcfat
##
##
## ------ Describe ------
##
## pcfat for gender F: n.miss = 0, n = 862, mean = 34.672, sd = 5.187
## pcfat for gender M: n.miss = 0, n = 355, mean = 24.156, sd = 5.764
##
## Mean Difference of pcfat: 10.516
##
## Weighted Average Standard Deviation: 5.362
##
##
## ------ Assumptions ------
##
## Note: These hypothesis tests can perform poorly, and the
## t-test is typically robust to violations of assumptions.
## Use as heuristic guides instead of interpreting literally.
##
## Null hypothesis, for each group, is a normal distribution of pcfat.
## Group F: Sample mean assumed normal because n > 30, so no test needed.
## Group M: Sample mean assumed normal because n > 30, so no test needed.
##
## Null hypothesis is equal variances of pcfat, homogeneous.
## Variance Ratio test: F = 33.223/26.909 = 1.235, df = 354;861, p-value = 0.016
## Levene's test, Brown-Forsythe: t = -2.232, df = 1215, p-value = 0.026
##
##
## ------ Infer ------
##
## --- Assume equal population variances of pcfat for each gender
##
## t-cutoff for 95% range of variation: tcut = 1.962
## Standard Error of Mean Difference: SE = 0.338
##
## Hypothesis Test of 0 Mean Diff: t-value = 31.101, df = 1215, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.663
## 95% Confidence Interval for Mean Difference: 9.853 to 11.180
##
##
## --- Do not assume equal population variances of pcfat for each gender
##
## t-cutoff: tcut = 1.964
## Standard Error of Mean Difference: SE = 0.353
##
## Hypothesis Test of 0 Mean Diff: t = 29.768, df = 602.015, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.694
## 95% Confidence Interval for Mean Difference: 9.823 to 11.210
##
##
## ------ Effect Size ------
##
## --- Assume equal population variances of pcfat for each gender
##
## Standardized Mean Difference of pcfat, Cohen's d: 1.961
##
##
## ------ Practical Importance ------
##
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
##
##
## ------ Graphics Smoothing Parameter ------
##
## Density bandwidth for gender F: 1.475
## Density bandwidth for gender M: 1.867
###4.3 Sử dụng mô hình hồi qui tuyến tính
# Linear regression of pcfat on gender; the fit is kept in `m.1` because the
# diagnostic plots later in the script reuse it.
m.1 <- lm(formula = pcfat ~ gender, data = ob)
summary(m.1)
##
## Call:
## lm(formula = pcfat ~ gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <0.0000000000000002 ***
## genderM -10.5163 0.3381 -31.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 0.00000000000000022
###4.3 Dùng gói lessR
# Same pcfat ~ gender regression fitted with lessR's reg(); the result is
# stored in `m.2` and then displayed by evaluating the object.
library(lessR)
m.2 <- reg(pcfat ~ gender, data = ob)
##
## >>> gender is not numeric. Converted to indicator variables.
m.2
## >>> Suggestion
## # Create an R markdown file for interpretative output with Rmd = "file_name"
## reg(pcfat ~ gender, data=ob, Rmd="eg")
##
##
## BACKGROUND
##
## Data Frame: ob
##
## Response Variable: pcfat
## Predictor Variable: genderM
##
## Number of cases (rows) of data: 1217
## Number of cases retained for analysis: 1217
##
##
## BASIC ANALYSIS
##
## Estimate Std Err t-value p-value Lower 95% Upper 95%
## (Intercept) 34.672413 0.182622 189.859 0.000 34.314123 35.030703
## genderM -10.516344 0.338131 -31.101 0.000 -11.179729 -9.852959
##
## Standard deviation of pcfat: 7.182861
##
## Standard deviation of residuals: 5.361759 for df=1215
## 95% range of residuals: 21.038669 = 2 * (1.962 * 5.361759)
##
## R-squared: 0.443 Adjusted R-squared: 0.443 PRESS R-squared: 0.441
##
## Null hypothesis of all 0 population slope coefficients:
## F-statistic: 967.297 df: 1 and 1215 p-value: 0.000
##
## -- Analysis of Variance
##
## df Sum Sq Mean Sq F-value p-value
## Model 1 27808.311497 27808.311497 967.297285 0.000
## Residuals 1215 34929.384159 28.748464
## pcfat 1216 62737.695656 51.593500
##
##
## K-FOLD CROSS-VALIDATION
##
##
## RELATIONS AMONG THE VARIABLES
##
## pcfat genderM
## pcfat 1.00 -0.67
## genderM -0.67 1.00
##
##
## RESIDUALS AND INFLUENCE
##
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance
## [sorted by Cook's Distance]
## [n_res_rows = 20, out of 1217 rows of data, or do n_res_rows="all"]
## ---------------------------------------------------------------------------
## genderM pcfat fitted resid rstdnt dffits cooks
## 210 1 9.200000 24.156069 -14.956069 -2.801192 -0.148882 0.011020
## 509 1 39.000000 24.156069 14.843931 2.780055 0.147758 0.010860
## 179 1 38.700000 24.156069 14.543931 2.723523 0.144754 0.010420
## 518 1 9.700000 24.156069 -14.456069 -2.706970 -0.143874 0.010300
## 200 1 9.800000 24.156069 -14.356069 -2.688132 -0.142873 0.010150
## 563 1 38.300000 24.156069 14.143931 2.648179 0.140749 0.009860
## 318 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 972 1 10.300000 24.156069 -13.856069 -2.593980 -0.137869 0.009460
## 388 1 10.700000 24.156069 -13.456069 -2.518700 -0.133867 0.008920
## 203 1 11.000000 24.156069 -13.156069 -2.462262 -0.130868 0.008530
## 1137 0 14.600000 34.672413 -20.072413 -3.766065 -0.128347 0.008150
## 893 0 14.700000 34.672413 -19.972413 -3.747085 -0.127700 0.008070
## 688 1 11.400000 24.156069 -12.756069 -2.387042 -0.126870 0.008020
## 403 1 11.700000 24.156069 -12.456069 -2.330649 -0.123873 0.007640
## 858 1 11.900000 24.156069 -12.256069 -2.293064 -0.121875 0.007400
## 158 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 1106 1 36.300000 24.156069 12.143931 2.271993 0.120755 0.007270
## 827 1 36.000000 24.156069 11.843931 2.215637 0.117760 0.006910
## 756 1 12.400000 24.156069 -11.756069 -2.199135 -0.116883 0.006810
## 196 1 12.500000 24.156069 -11.656069 -2.180355 -0.115885 0.006690
##
##
## PREDICTION ERROR
##
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals
## [sorted by lower bound of prediction interval]
## [to see all intervals add n_pred_rows="all"]
## ----------------------------------------------
##
## genderM pcfat pred s_pred pi.lwr pi.upr width
## 2 1 16.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## 5 1 14.800000 24.156069 5.369306 13.621929 34.690209 21.068280
## ...
## 1209 1 26.400000 24.156069 5.369306 13.621929 34.690209 21.068280
## 1 0 37.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 3 0 34.000000 34.672413 5.364869 24.146979 45.197847 21.050869
## ...
## 1215 0 34.400000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1216 0 41.300000 34.672413 5.364869 24.146979 45.197847 21.050869
## 1217 0 33.200000 34.672413 5.364869 24.146979 45.197847 21.050869
##
## ----------------------------------
## Plot 1: Distribution of Residuals
## Plot 2: Residuals vs Fitted Values
## ----------------------------------
###4.3Kiểm tra giả định
# Draw the base-R diagnostic plots for m.1 on a 2 x 2 grid.
# par() returns the previous settings, so they are saved and restored
# afterwards; otherwise the 2 x 2 layout would stay active and later
# base-graphics plots would be drawn into a quarter of the device.
old_par <- par(mfrow = c(2, 2))
plot(m.1)
par(old_par)
###Dùng gói ggfortify
# ggplot2-based diagnostic plots for m.1 via ggfortify's autoplot() method.
library(ggfortify)
## Loading required package: ggplot2
autoplot(object = m.1)
###Mô hình
# lessR scatterplot of weight and pcfat from `ob`, with a linear fit line.
Plot(
  weight, pcfat,
  data = ob,
  fit = "lm"
)
##
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(weight, pcfat, enhance=TRUE) # many options
## Plot(weight, pcfat, color="red") # exterior edge color of points
## Plot(weight, pcfat, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 1217
## Sample Correlation of weight and pcfat: r = 0.057
##
## Hypothesis Test of 0 Correlation: t = 1.975, df = 1215, p-value = 0.049
## 95% Confidence Interval for Correlation: 0.000 to 0.112
##
##
## Line: b0 = 29.222947 b1 = 0.043193 Linear Model MSE = 51.470803 Rsq = 0.003
##
### 4.4 GPT
### Prompt 1
# Base-R t-test of pcfat by gender (Welch, per the captured output below).
# The test is run once and the stored result is displayed, instead of calling
# t.test() a second time and leaving `t_test_result` unused.
t_test_result <- t.test(pcfat ~ gender, data = ob)
t_test_result
##
## Welch Two Sample t-test
##
## data: pcfat by gender
## t = 29.768, df = 602.01, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## 9.822548 11.210140
## sample estimates:
## mean in group F mean in group M
## 34.67241 24.15607
###Prompt 2
# Prompt 2: fit the pcfat ~ gender linear model and keep it in `model`
# (nothing is printed here).
model <- lm(
  formula = pcfat ~ gender,
  data = ob
)
### Prompt 3: chưa làm
## Việc 5. Đánh giá mối liên quan giữa cân nặng và tỉ trọng mỡ
### 5.1 Vẽ biểu đồ tán xạ - Lỗi - chạy code dưới
# Scatterplot for section 5.1: lessR plot of weight and pcfat from `ob`, with
# a linear fit line.
Plot(
  weight, pcfat,
  data = ob,
  fit = "lm"
)
##
##
## >>> Suggestions or enter: style(suggest=FALSE)
## Plot(weight, pcfat, enhance=TRUE) # many options
## Plot(weight, pcfat, color="red") # exterior edge color of points
## Plot(weight, pcfat, MD_cut=6) # Mahalanobis distance from center > 6 is an outlier
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 1217
## Sample Correlation of weight and pcfat: r = 0.057
##
## Hypothesis Test of 0 Correlation: t = 1.975, df = 1215, p-value = 0.049
## 95% Confidence Interval for Correlation: 0.000 to 0.112
##
##
## Line: b0 = 29.222947 b1 = 0.043193 Linear Model MSE = 51.470803 Rsq = 0.003
##
###5.2 Sử dụng mô hình hồi qui tuyến tính
# Linear regression of pcfat on weight; the fit is kept in `m.3` for the
# diagnostic plots that follow.
m.3 <- lm(formula = pcfat ~ weight, data = ob)
summary(m.3)
##
## Call:
## lm(formula = pcfat ~ weight, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.3122 -4.5234 0.8902 5.2695 16.9742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.22295 1.22370 23.881 <0.0000000000000002 ***
## weight 0.04319 0.02188 1.975 0.0485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared: 0.003199, Adjusted R-squared: 0.002378
## F-statistic: 3.899 on 1 and 1215 DF, p-value: 0.04855
###5.2 Chart Mô hình hồi quy tuyến tính
# ggplot2-based diagnostic plots for the pcfat ~ weight model m.3.
library(ggfortify)
autoplot(object = m.3)
###5.3 GPT