A = c(8, 9, 11, 4, 7, 8, 5)
B = c(7, 17, 10, 14, 12, 24, 11, 22)
C = c(28, 21, 26, 11, 24, 19)
D = c(26, 16, 13, 12, 9, 10, 11, 17, 15)
weight = c(A, B, C, D)
group = c(rep("A", 7), rep("B", 8), rep("C", 6), rep("D", 9))
data = data.frame(weight, group)
data##    weight group
## 1       8     A
## 2       9     A
## 3      11     A
## 4       4     A
## 5       7     A
## 6       8     A
## 7       5     A
## 8       7     B
## 9      17     B
## 10     10     B
## 11     14     B
## 12     12     B
## 13     24     B
## 14     11     B
## 15     22     B
## 16     28     C
## 17     21     C
## 18     26     C
## 19     11     C
## 20     24     C
## 21     19     C
## 22     26     D
## 23     16     D
## 24     13     D
## 25     12     D
## 26      9     D
## 27     10     D
## 28     11     D
## 29     17     D
## 30     15     Dlibrary(table1)## 
## Attaching package: 'table1'## The following objects are masked from 'package:base':
## 
##     units, units<-table1(~ weight | group, data = data, render.continuous = c(. = "Mean (SD)", . = "Median [Q1, Q3]"))| A (N=7) | B (N=8) | C (N=6) | D (N=9) | Overall (N=30) | |
|---|---|---|---|---|---|
| weight | |||||
| Mean (SD) | 7.43 (2.37) | 14.6 (5.95) | 21.5 (6.09) | 14.3 (5.15) | 14.2 (6.75) | 
| Median [Q1, Q3] | 8.00 [6.00, 8.50] | 13.0 [10.8, 18.3] | 22.5 [19.5, 25.5] | 13.0 [11.0, 16.0] | 12.0 [9.25, 18.5] | 
1.3 Phân tích sự khác biệt về cân nặng giữa 4 nhóm. Diễn giải kết quả.
av = aov(weight ~ group, data = data)
summary(av)##             Df Sum Sq Mean Sq F value   Pr(>F)    
## group        3  642.3  214.09   8.197 0.000528 ***
## Residuals   26  679.1   26.12                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 11.4 Thực hiện phân tích hậu định (post-hoc analysis) để xác định cụ thể nhóm có khác biệt về cân nặng. Diễn giải kết quả.
TukeyHSD(av)##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = weight ~ group, data = data)
## 
## $group
##           diff          lwr        upr     p adj
## B-A  7.1964286  -0.05969765 14.4525548 0.0525014
## C-A 14.0714286   6.27132726 21.8715299 0.0002134
## D-A  6.9047619  -0.16073856 13.9702624 0.0571911
## C-B  6.8750000  -0.69675602 14.4467560 0.0850381
## D-B -0.2916667  -7.10424368  6.5209103 0.9994049
## D-C -7.1666667 -14.55594392  0.2226106 0.0597131plot(TukeyHSD(av), las = 2, col = "blue")df = read.csv("C:\\Users\\ADMIN\\Downloads\\Demo.csv")2.2 Mô tả đặc điểm cân nặng (weight) và chiều cao (height).
library(table1)
table1(~ weight + height, data = df)| Overall (N=1217) | |
|---|---|
| weight | |
| Mean (SD) | 55.1 (9.40) | 
| Median [Min, Max] | 54.0 [34.0, 95.0] | 
| height | |
| Mean (SD) | 157 (7.98) | 
| Median [Min, Max] | 155 [136, 185] | 
2.3 Vẽ biểu đồ tán xạ đánh giá mối liên quan giữa cân nặng (weight) và chiều cao (height). Nhận xét kết quả.
plot(height ~ weight, data = df)library(ggplot2)
ggplot(data = df, aes(x = weight, y = height)) + geom_point() + geom_smooth()## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'2.4 Tiến hành phân tích tương quan định lượng mối liên quan giữa cân nặng (weight) và chiều cao (height). Nhận xét kết quả.
cor.test(df$weight, df$height)## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$height
## t = 25.984, df = 1215, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5602911 0.6326135
## sample estimates:
##       cor 
## 0.59766672.5 Tiến hành phân tích tương quan định lượng mối liên quan giữa chiều cao (height) và tỉ trọng mỡ (pcfat). Nhận xét kết quả.
library(lessR)## 
## lessR 4.4.5                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")  Read data file, many formats available, e.g., Excel
##   d is default data frame, data= in analysis routines optional
## 
## Many examples of reading, writing, and manipulating data, 
## graphics, testing means and proportions, regression, factor analysis,
## customization, forecasting, and aggregation from pivot tables
##   Enter: browseVignettes("lessR")
## 
## View lessR updates, now including time series forecasting
##   Enter: news(package="lessR")
## 
## Interactive data analysis
##   Enter: interact()## 
## Attaching package: 'lessR'## The following object is masked from 'package:table1':
## 
##     labelPlot(height, pcfat, data = df, fit = "lm")## 
## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(height, pcfat, enhance=TRUE)  # many options
## Plot(height, pcfat, fill="skyblue")  # interior fill color of points
## Plot(height, pcfat, MD_cut=6)  # Mahalanobis distance from center > 6 is an outlier 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 1217 
## Sample Correlation of height and pcfat: r = -0.480 
##   
## Hypothesis Test of 0 Correlation:  t = -19.063,  df = 1215,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  -0.522 to -0.435 
##   
## 
##   Line: b0 = 99.311633    b1 = -0.432014    Linear Model MSE = 39.747926   Rsq = 0.230
## cor.test(df$weight, df$pcfat)## 
##  Pearson's product-moment correlation
## 
## data:  df$weight and df$pcfat
## t = 1.9745, df = 1215, p-value = 0.04855
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0003640405 0.1123913870
## sample estimates:
##        cor 
## 0.05655573dữ liệu của Việt Nam.
library(gapminder)
data(gapminder)
vn = subset(gapminder, country == "Vietnam")
library(lessR)
Plot(lifeExp, year, data = vn, xlab = "Life expectancy (years)", ylab = "Year")## 
## >>> Suggestions  or  enter: style(suggest=FALSE)
## Plot(lifeExp, year, enhance=TRUE)  # many options
## Plot(lifeExp, year, color="red")  # exterior edge color of points
## Plot(lifeExp, year, fit="lm", fit_se=c(.90,.99))  # fit line, stnd errors
## Plot(lifeExp, year, out_cut=.10)  # label top 10% from center as outliers 
## 
## 
## >>> Pearson's product-moment correlation 
##  
## Number of paired values with neither missing, n = 12 
## Sample Correlation of lifeExp and year: r = 0.995 
##   
## Hypothesis Test of 0 Correlation:  t = 30.569,  df = 10,  p-value = 0.000 
## 95% Confidence Interval for Correlation:  0.981 to 0.999 
## 2007, tuổi thọ của người Việt Nam gia tăng như thế nào?
m.1 = lm(lifeExp ~ year, data = vn)
summary(m.1)## 
## Call:
## lm(formula = lifeExp ~ year, data = vn)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1884 -0.5840  0.1335  0.7396  1.7873 
## 
## Coefficients:
##                Estimate  Std. Error t value        Pr(>|t|)    
## (Intercept) -1271.98315    43.49240  -29.25 0.0000000000510 ***
## year            0.67162     0.02197   30.57 0.0000000000329 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.314 on 10 degrees of freedom
## Multiple R-squared:  0.9894, Adjusted R-squared:  0.9884 
## F-statistic: 934.5 on 1 and 10 DF,  p-value: 0.00000000003289m.2 = Regression(lifeExp ~ year, data = vn)m.2## >>> Suggestion
## # Create an R markdown file for interpretative output with  Rmd = "file_name"
## Regression(my_formula=lifeExp ~ year, data=vn, Rmd="eg")  
## 
## 
##   BACKGROUND 
## 
## Data Frame:  vn 
##  
## Response Variable: lifeExp 
## Predictor Variable: year 
##  
## Number of cases (rows) of data:  12 
## Number of cases retained for analysis:  12 
## 
## 
##   BASIC ANALYSIS 
## 
##               Estimate    Std Err  t-value  p-value    Lower 95%    Upper 95% 
## (Intercept) -1271.9832    43.4924  -29.246    0.000   -1368.8903   -1175.0760 
##        year     0.6716     0.0220   30.569    0.000       0.6227       0.7206 
## 
## Standard deviation of lifeExp: 12.17233 
##  
## Standard deviation of residuals:  1.31365 for df=10 
## 95% range of residuals:  5.85399 = 2 * (2.228 * 1.31365) 
##  
## R-squared: 0.989    Adjusted R-squared: 0.988    PRESS R-squared: 0.984 
## 
## Null hypothesis of all 0 population slope coefficients:
##   F-statistic: 934.455     df: 1 and 10     p-value:  0.000 
## 
## -- Analysis of Variance 
##  
##             df     Sum Sq    Mean Sq   F-value   p-value 
## Model        1  1612.5653  1612.5653  934.4554     0.000 
## Residuals   10    17.2567     1.7257 
## lifeExp     11  1629.8221   148.1656 
## 
## 
##   K-FOLD CROSS-VALIDATION 
## 
## 
##   RELATIONS AMONG THE VARIABLES 
## 
##           lifeExp year 
##   lifeExp    1.00 0.99 
##      year    0.99 1.00 
## 
## 
##   RESIDUALS AND INFLUENCE 
## 
## -- Data, Fitted, Residual, Studentized Residual, Dffits, Cook's Distance 
##    [sorted by Cook's Distance] 
##    [n_res_rows = 12, out of 12 ] 
## ------------------------------------------------------------- 
##           year lifeExp  fitted   resid  rstdnt  dffits  cooks 
##   12      2007 74.2490 75.9489 -1.6999 -1.6742 -1.0827 0.4965 
##    1      1952 40.4120 39.0101  1.4019  1.3167  0.8515 0.3377 
##    5      1972 50.2540 52.4424 -2.1884 -2.0016 -0.6637 0.1694 
##    9      1992 67.6620 65.8747  1.7873  1.5563  0.5937 0.1543 
##   10      1997 70.6720 69.2328  1.4392  1.2327  0.5559 0.1469 
##    4      1967 47.8380 49.0843 -1.2463 -1.0172 -0.3880 0.0750 
##    2      1957 42.8870 42.3682  0.5188  0.4300  0.2316 0.0292 
##   11      2002 73.0170 72.5908  0.4262  0.3520  0.1896 0.0197 
##    3      1962 45.3630 45.7262 -0.3632 -0.2891 -0.1304 0.0094 
##    7      1982 58.8160 59.1585 -0.3425 -0.2596 -0.0792 0.0035 
##    8      1987 62.8200 62.5166  0.3034  0.2315  0.0768 0.0032 
##    6      1977 55.7640 55.8005 -0.0365 -0.0275 -0.0084 0.0000 
## 
## 
##   PREDICTION ERROR 
## 
## -- Data, Predicted, Standard Error of Prediction, 95% Prediction Intervals 
##    [sorted by lower bound of prediction interval] 
##  ---------------------------------------------- 
## 
##           year lifeExp    pred s_pred  pi.lwr  pi.upr  width 
##    1      1952 40.4120 39.0101 1.4948 35.6794 42.3408 6.6614 
##    2      1957 42.8870 42.3682 1.4539 39.1286 45.6077 6.4790 
##    3      1962 45.3630 45.7262 1.4203 42.5616 48.8909 6.3293 
##    4      1967 47.8380 49.0843 1.3946 45.9770 52.1917 6.2147 
##    5      1972 50.2540 52.4424 1.3772 49.3738 55.5109 6.1371 
##    6      1977 55.7640 55.8005 1.3684 52.7515 58.8494 6.0979 
##    7      1982 58.8160 59.1585 1.3684 56.1096 62.2075 6.0979 
##    8      1987 62.8200 62.5166 1.3772 59.4481 65.5852 6.1371 
##    9      1992 67.6620 65.8747 1.3946 62.7673 68.9820 6.2147 
##   10      1997 70.6720 69.2328 1.4203 66.0681 72.3974 6.3293 
##   11      2002 73.0170 72.5908 1.4539 69.3513 75.8304 6.4790 
##   12      2007 74.2490 75.9489 1.4948 72.6182 79.2796 6.6614 
## 
## ---------------------------------- 
## Plot 1: Distribution of Residuals 
## Plot 2: Residuals vs Fitted Values 
## ----------------------------------