df = read.csv("D:\\Deskop\\Học R tại CT\\DỮ LIỆU ĐÍNH KÈM BÀI TẬP\\Bone data.csv")
head(df)
## id sex age weight height prior.fx fnbmd smoking fx
## 1 1 Male 73 98 175 0 1.08 1 0
## 2 2 Female 68 72 166 0 0.97 0 0
## 3 3 Male 68 87 184 0 1.01 0 0
## 4 4 Female 62 72 173 0 0.84 1 0
## 5 5 Male 61 72 173 0 0.81 1 0
## 6 6 Female 76 57 156 0 0.74 0 0
library(ggplot2)
library(ggthemes)
p= ggplot (data=df, aes(x= weight, y= fnbmd)) + geom_point()+ geom_smooth(col="red") + theme_tufte()
p
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
## Warning: Removed 41 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 41 rows containing missing values or values outside the scale range
## (`geom_point()`).
##2.1 Phân tích đánh giá tương quan giữa cân nặng và mật độ cổ xương đùi
library(lessR)
##
## lessR 4.3.9 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read text, Excel, SPSS, SAS, or R data file
## d is default data frame, data= in analysis routines optional
##
## Many examples of reading, writing, and manipulating data,
## graphics, testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables
## Enter: browseVignettes("lessR")
##
## View lessR updates, now including time series forecasting
## Enter: news(package="lessR")
##
## Interactive data analysis
## Enter: interact()
##
## Attaching package: 'lessR'
## The following object is masked from 'package:base':
##
## sort_by
Correlation(weight, fnbmd, data=df)
## Correlation Analysis for Variables weight and fnbmd
##
##
## >>> Pearson's product-moment correlation
##
## Number of paired values with neither missing, n = 2121
## Number of cases (rows of data) deleted: 41
##
## Sample Covariance: s = 1.269
##
## Sample Correlation: r = 0.581
##
## Hypothesis Test of 0 Correlation: t = 32.882, df = 2119, p-value = 0.000
## 95% Confidence Interval for Correlation: 0.552 to 0.609
###95% CI nói lên 2 điều: 1. có /hay không liên quan (không chứa số 0), 2. mức độ dao động ntn *một số tập san đã bỏ giá trị p, 95%CI có ý nghĩa hơn vì nói được nhiều điều.
vars = df [, c("sex","age","weight","height","fnbmd")]
library (GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
head(vars)
## sex age weight height fnbmd
## 1 Male 73 98 175 1.08
## 2 Female 68 72 166 0.97
## 3 Male 68 87 184 1.01
## 4 Female 62 72 173 0.84
## 5 Male 61 72 173 0.81
## 6 Female 76 57 156 0.74
ggpairs(data=vars, mapping = aes(color=sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#HỒI QUI TUYẾN TÍNH # Việc 4. Đọc dữ liệu “Obesity.data”
ob = read.csv ("D:\\Deskop\\Học R tại CT\\DỮ LIỆU ĐÍNH KÈM BÀI TẬP\\Obesity data.csv")
head(ob)
## id gender height weight bmi age bmc bmd fat lean pcfat
## 1 1 F 150 49 21.8 53 1312 0.88 17802 28600 37.3
## 2 2 M 165 52 19.1 65 1309 0.84 8381 40229 16.8
## 3 3 F 157 57 23.1 64 1230 0.84 19221 36057 34.0
## 4 4 F 156 53 21.8 56 1171 0.80 17472 33094 33.8
## 5 5 M 160 51 19.9 54 1681 0.98 7336 40621 14.8
## 6 6 F 153 47 20.1 52 1358 0.91 14904 30068 32.2
p = ggplot(data = ob, aes(x = gender, y = pcfat, col = gender))
p1 = p + geom_boxplot()
p1
##5.2 Kiểm định t
library(lessR)
ttest(pcfat ~ gender, data = ob)
##
## Compare pcfat across gender with levels F and M
## Grouping Variable: gender
## Response Variable: pcfat
##
##
## ------ Describe ------
##
## pcfat for gender F: n.miss = 0, n = 862, mean = 34.672, sd = 5.187
## pcfat for gender M: n.miss = 0, n = 355, mean = 24.156, sd = 5.764
##
## Mean Difference of pcfat: 10.516
##
## Weighted Average Standard Deviation: 5.362
##
##
## ------ Assumptions ------
##
## Note: These hypothesis tests can perform poorly, and the
## t-test is typically robust to violations of assumptions.
## Use as heuristic guides instead of interpreting literally.
##
## Null hypothesis, for each group, is a normal distribution of pcfat.
## Group F: Sample mean assumed normal because n > 30, so no test needed.
## Group M: Sample mean assumed normal because n > 30, so no test needed.
##
## Null hypothesis is equal variances of pcfat, homogeneous.
## Variance Ratio test: F = 33.223/26.909 = 1.235, df = 354;861, p-value = 0.016
## Levene's test, Brown-Forsythe: t = -2.232, df = 1215, p-value = 0.026
##
##
## ------ Infer ------
##
## --- Assume equal population variances of pcfat for each gender
##
## t-cutoff for 95% range of variation: tcut = 1.962
## Standard Error of Mean Difference: SE = 0.338
##
## Hypothesis Test of 0 Mean Diff: t-value = 31.101, df = 1215, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.663
## 95% Confidence Interval for Mean Difference: 9.853 to 11.180
##
##
## --- Do not assume equal population variances of pcfat for each gender
##
## t-cutoff: tcut = 1.964
## Standard Error of Mean Difference: SE = 0.353
##
## Hypothesis Test of 0 Mean Diff: t = 29.768, df = 602.015, p-value = 0.000
##
## Margin of Error for 95% Confidence Level: 0.694
## 95% Confidence Interval for Mean Difference: 9.823 to 11.210
##
##
## ------ Effect Size ------
##
## --- Assume equal population variances of pcfat for each gender
##
## Standardized Mean Difference of pcfat, Cohen's d: 1.961
##
##
## ------ Practical Importance ------
##
## Minimum Mean Difference of practical importance: mmd
## Minimum Standardized Mean Difference of practical importance: msmd
## Neither value specified, so no analysis
##
##
## ------ Graphics Smoothing Parameter ------
##
## Density bandwidth for gender F: 1.475
## Density bandwidth for gender M: 1.867
m.1 = lm(pcfat ~ gender, data = ob)
summary(m.1)
##
## Call:
## lm(formula = pcfat ~ gender, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -20.0724 -3.2724 0.1484 3.6276 14.8439
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.6724 0.1826 189.9 <0.0000000000000002 ***
## genderM -10.5163 0.3381 -31.1 <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.362 on 1215 degrees of freedom
## Multiple R-squared: 0.4432, Adjusted R-squared: 0.4428
## F-statistic: 967.3 on 1 and 1215 DF, p-value: < 0.00000000000000022
library(ggfortify)
autoplot(m.1)
m.3 = lm(pcfat ~ weight, data = ob)
summary(m.3)
##
## Call:
## lm(formula = pcfat ~ weight, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.3122 -4.5234 0.8902 5.2695 16.9742
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.22295 1.22370 23.881 <0.0000000000000002 ***
## weight 0.04319 0.02188 1.975 0.0485 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.174 on 1215 degrees of freedom
## Multiple R-squared: 0.003199, Adjusted R-squared: 0.002378
## F-statistic: 3.899 on 1 and 1215 DF, p-value: 0.04855
autoplot(m.3)
m.4 = lm(pcfat ~ weight + age + gender + height, data = ob)
summary(m.4)
##
## Call:
## lm(formula = pcfat ~ weight + age + gender + height, data = ob)
##
## Residuals:
## Min 1Q Median 3Q Max
## -18.208 -2.543 0.019 2.582 15.706
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.368722 3.505431 13.798 < 0.0000000000000002 ***
## weight 0.439169 0.015594 28.163 < 0.0000000000000002 ***
## age 0.056166 0.007404 7.585 0.0000000000000658 ***
## genderM -11.483254 0.344343 -33.348 < 0.0000000000000002 ***
## height -0.257013 0.023768 -10.813 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.974 on 1212 degrees of freedom
## Multiple R-squared: 0.695, Adjusted R-squared: 0.694
## F-statistic: 690.4 on 4 and 1212 DF, p-value: < 0.00000000000000022
autoplot (m.4)
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-6)
yvar = ob[, c("pcfat")]
xvar = ob[, c("gender", "age", "height", "weight", "bmi")]
m.bma = bicreg(xvar, yvar, strict = FALSE, OR = 20)
summary(m.bma)
##
## Call:
## bicreg(x = xvar, y = yvar, strict = FALSE, OR = 20)
##
##
## 3 models were selected
## Best 3 models (cumulative posterior probability = 1 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 5.26146 4.582901 7.95773 -0.79279 8.13735
## genderM 100.0 -11.25139 0.429659 -11.44430 -11.42764 -10.80625
## age 100.0 0.05259 0.008048 0.05497 0.05473 0.04715
## height 31.4 0.01759 0.028494 . 0.05598 .
## weight 39.2 0.03102 0.042611 0.07921 . .
## bmi 100.0 1.01265 0.111625 0.89419 1.08852 1.08936
##
## nVar 4 4 3
## r2 0.697 0.696 0.695
## BIC -1423.06312 -1422.62198 -1422.49027
## post prob 0.392 0.314 0.294
imageplot.bma(m.bma)
library(relaimpo)
## Loading required package: MASS
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:robustbase':
##
## salinity
## The following object is masked from 'package:survival':
##
## aml
## Loading required package: survey
## Loading required package: grid
## Loading required package: Matrix
##
## Attaching package: 'survey'
## The following object is masked from 'package:graphics':
##
## dotchart
## Loading required package: mitools
## This is the global version of package relaimpo.
## If you are a non-US user, a version with the interesting additional metric pmvd is available
## from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
ob$sex = ifelse (ob$gender=="M",1,0)
m.4 = lm (pcfat ~ sex + age + bmi+ weight, data = ob)
calc.relimp(m.4, type="lmg", rela =T, rank= T)
## Response variable: pcfat
## Total response variance: 51.5935
## Analysis based on 1217 observations
##
## 4 Regressors:
## sex age bmi weight
## Proportion of variance explained by model: 69.66%
## Metrics are normalized to sum to 100% (rela=TRUE).
##
## Relative importance metrics:
##
## lmg
## sex 0.59317775
## age 0.06893066
## bmi 0.24613695
## weight 0.09175463
##
## Average coefficients for different model sizes:
##
## 1X 2Xs 3Xs 4Xs
## sex -10.51634414 -11.71834412 -11.80453842 -11.44430262
## age 0.12768705 0.10445197 0.05168496 0.05496623
## bmi 1.03619023 1.50631405 1.54278433 0.89419395
## weight 0.04319324 -0.05539405 -0.06907993 0.07920690