Thực hành ngày 4

Phân tích tương quan

Học viên Đặng Bảo Đăng - Mã số R1F019

library(ggplot2); library(tidyverse); library(gridExtra); library(readxl); library(DescTools)

Việc 1: Đọc dữ liệu nghiên cứu tim mạch “Framingham dataset.csv”

# Read the Framingham cardiovascular study data from the local course folder.
fmh <- read.csv(
  file = "D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Framingham dataset.csv"
)

# Print the DescTools descriptive summary for the whole data frame.
Desc(fmh)
## ------------------------------------------------------------------------------ 
## Describe fmh (data.frame):
## 
## data frame:  11627 obs. of  39 variables
##      2236 complete cases (19.2%)
## 
##   Nr  ColName       Class    NAs           Levels
##   1   id            integer     .                
##   2   sex           integer     .                
##   3   tot.chol      integer   409 (3.5%)         
##   4   age           integer     .                
##   5   sysbp         numeric     .                
##   6   diasbp        numeric     .                
##   7   smoker        integer     .                
##   8   cigs.day      integer    79 (0.7%)         
##   9   bmi           numeric    52 (0.4%)         
##   10  diabetes      integer     .                
##   11  bpmed         integer   593 (5.1%)         
##   12  heart.rate    integer     6 (0.1%)         
##   13  glucose       integer  1440 (12.4%)        
##   14  educ          integer   295 (2.5%)         
##   15  prev.chd      integer     .                
##   16  prev.ap       integer     .                
##   17  prev.mi       integer     .                
##   18  prev.stroke   integer     .                
##   19  prev.hyp      integer     .                
##   20  time          integer     .                
##   21  period        integer     .                
##   22  hdlc          integer  8600 (74.0%)        
##   23  ldlc          integer  8601 (74.0%)        
##   24  death         integer     .                
##   25  angina        integer     .                
##   26  hosp.mi       integer     .                
##   27  mi.fchd       integer     .                
##   28  any.chd       integer     .                
##   29  stroke        integer     .                
##   30  cvd           integer     .                
##   31  hypertension  integer     .                
##   32  time.ap       integer     .                
##   33  time.mi       integer     .                
##   34  time.mi.1     integer     .                
##   35  time.chd      integer     .                
##   36  time.stroke   integer     .                
##   37  time.cvd      integer     .                
##   38  time.dth      integer     .                
##   39  time.hyp      integer     .                
## 
## 
## ------------------------------------------------------------------------------ 
## 1 - id (integer)
## 
##         length             n           NAs        unique            0s'
##         11'627        11'627             0         4'434             0
##                       100.0%          0.0%                        0.0%
##                                                                       
##            .05           .10           .25        median           .75
##     538'481.90    977'484.60  2'474'378.00  5'006'008.00  7'472'730.00
##                                                                       
##          range            sd         vcoef           mad           IQR
##   9'996'864.00  2'900'877.44          0.58  3'710'052.31  4'998'352.00
##                                                                       
##           mean        meanCI
##   5'004'740.92  4'952'007.14
##                 5'057'474.69
##                             
##            .90           .95
##   9'040'424.20  9'460'638.00
##                             
##           skew          kurt
##          -0.00         -1.22
##                             
## lowest : 2'448 (2), 6'238 (3), 9'428 (2), 10'552 (2), 11'252 (3)
## highest: 9'990'894 (3), 9'993'179 (3), 9'995'546 (2), 9'998'212 (3), 9'999'312 (3)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 2 - sex (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 1  5'022  43.2%   42.3%   44.1%
## 2  6'605  56.8%   55.9%   57.7%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 3 - tot.chol (integer)
## 
##   length       n     NAs  unique      0s    mean  meanCI'
##   11'627  11'218     409     299       0  241.16  240.32
##            96.5%    3.5%            0.0%          242.00
##                                                         
##      .05     .10     .25  median     .75     .90     .95
##   174.00  187.00  210.00  238.00  268.00  298.00  319.00
##                                                         
##    range      sd   vcoef     mad     IQR    skew    kurt
##   589.00   45.37    0.19   43.00   58.00    0.82    3.38
##                                                         
## lowest : 107, 112, 113, 115, 117
## highest: 600, 614, 625, 638, 696
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 4 - age (integer)
## 
##   length       n    NAs  unique     0s   mean  meanCI'
##   11'627  11'627      0      50      0  54.79   54.62
##           100.0%   0.0%           0.0%          54.97
##                                                      
##      .05     .10    .25  median    .75    .90     .95
##    40.00   42.00  48.00   54.00  62.00  68.00   71.00
##                                                      
##    range      sd  vcoef     mad    IQR   skew    kurt
##    49.00    9.56   0.17   10.38  14.00   0.14   -0.66
##                                                      
## lowest : 32, 33 (5), 34 (18), 35 (42), 36 (84)
## highest: 77 (33), 78 (18), 79 (21), 80 (6), 81 (3)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 5 - sysbp (numeric)
## 
##   length       n     NAs  unique      0s    mean  meanCI'
##   11'627  11'627       0     260       0  136.32  135.91
##           100.0%    0.0%            0.0%          136.74
##                                                         
##      .05     .10     .25  median     .75     .90     .95
##   106.00  110.00  120.00  132.00  149.00  167.00  180.00
##                                                         
##    range      sd   vcoef     mad     IQR    skew    kurt
##   211.50   22.80    0.17   20.76   29.00    0.94    1.37
##                                                         
## lowest : 83.5 (2), 85.0, 85.5, 86.0 (2), 88.0 (2)
## highest: 254.0, 265.0, 267.0, 282.0, 295.0
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 6 - diasbp (numeric)
## 
##   length       n    NAs  unique     0s   mean  meanCI'
##   11'627  11'627      0     160      0  83.04   82.83
##           100.0%   0.0%           0.0%          83.25
##                                                      
##      .05     .10    .25  median    .75    .90     .95
##    66.00   70.00  75.00   82.00  90.00  98.00  104.00
##                                                      
##    range      sd  vcoef     mad    IQR   skew    kurt
##   120.00   11.66   0.14   11.12  15.00   0.55    0.91
##                                                      
## lowest : 30.0, 37.0, 46.0, 47.0, 48.0
## highest: 136.0 (2), 140.0, 141.0, 142.5, 150.0
## 
## heap(?): remarkable frequency (7.4%) for the mode(s) (= 80)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 7 - smoker (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  6'598  56.7%   55.8%   57.6%
## 1  5'029  43.3%   42.4%   44.2%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 8 - cigs.day (integer)
## 
##   length       n    NAs  unique     0s   mean  meanCI'
##   11'627  11'548     79      45  6'598   8.25    8.03
##            99.3%   0.7%          56.7%           8.47
##                                                      
##      .05     .10    .25  median    .75    .90     .95
##     0.00    0.00   0.00    0.00  20.00  25.00   30.00
##                                                      
##    range      sd  vcoef     mad    IQR   skew    kurt
##    90.00   12.19   1.48    0.00  20.00   1.51    2.13
##                                                      
## lowest : 0 (6'598), 1 (162), 2 (98), 3 (183), 4 (65)
## highest: 55 (2), 60 (27), 70 (3), 80 (5), 90 (3)
## 
## heap(?): remarkable frequency (57.1%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 9 - bmi (numeric)
## 
##   length       n     NAs  unique      0s    mean  meanCI'
##   11'627  11'575      52   1'818       0  25.877  25.803
##            99.6%    0.4%            0.0%          25.952
##                                                         
##      .05     .10     .25  median     .75     .90     .95
##   20.120  21.170  23.095  25.480  28.070  30.930  33.013
##                                                         
##    range      sd   vcoef     mad     IQR    skew    kurt
##   42.370   4.103   0.159   3.647   4.975   0.983   2.814
##                                                         
## lowest : 14.43, 14.53, 15.16, 15.32, 15.33
## highest: 48.64, 51.28, 52.94, 55.31, 56.8 (3)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 10 - diabetes (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  11'097  95.4%   95.0%   95.8%
## 1     530   4.6%    4.2%    5.0%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 11 - bpmed (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'034    593      2
##           94.9%   5.1%       
## 
##      freq   perc  lci.95  uci.95'
## 0  10'090  91.4%   90.9%   92.0%
## 1     944   8.6%    8.0%    9.1%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 12 - heart.rate (integer)
## 
##   length       n    NAs  unique     0s   mean  meanCI'
##   11'627  11'621      6      83      0  76.78   76.55
##            99.9%   0.1%           0.0%          77.01
##                                                      
##      .05     .10    .25  median    .75    .90     .95
##    60.00   60.00  69.00   75.00  85.00  94.00  100.00
##                                                      
##    range      sd  vcoef     mad    IQR   skew    kurt
##   183.00   12.46   0.16   10.38  16.00   0.68    2.08
##                                                      
## lowest : 37, 42, 43 (2), 44 (4), 45 (9)
## highest: 135, 140 (2), 143, 150, 220
## 
## heap(?): remarkable frequency (11.9%) for the mode(s) (= 75)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 13 - glucose (integer)
## 
##   length       n    NAs  unique     0s    mean  meanCI'
##   11'627  10'187  1'440     211      0   84.12   83.64
##            87.6%  12.4%           0.0%           84.61
##                                                       
##      .05     .10    .25  median    .75     .90     .95
##    62.00   66.00  72.00   80.00  89.00  103.00  116.00
##                                                       
##    range      sd  vcoef     mad    IQR    skew    kurt
##   439.00   24.99   0.30   11.86  17.00    5.48   50.17
##                                                       
## lowest : 39, 40 (3), 43 (2), 44 (4), 45 (4)
## highest: 394 (2), 410, 420, 423, 478
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 14 - educ (integer)
## 
##   length       n    NAs  unique    0s  mean  meanCI'
##   11'627  11'332    295       4     0  1.99    1.97
##            97.5%   2.5%          0.0%          2.01
##                                                    
##      .05     .10    .25  median   .75   .90     .95
##     1.00    1.00   1.00    2.00  3.00  4.00    4.00
##                                                    
##    range      sd  vcoef     mad   IQR  skew    kurt
##     3.00    1.03   0.52    1.48  2.00  0.68   -0.75
##                                                    
## 
##    level   freq   perc  cumfreq  cumperc
## 1      1  4'690  41.4%    4'690    41.4%
## 2      2  3'410  30.1%    8'100    71.5%
## 3      3  1'885  16.6%    9'985    88.1%
## 4      4  1'347  11.9%   11'332   100.0%
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 15 - prev.chd (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  10'785  92.8%   92.3%   93.2%
## 1     842   7.2%    6.8%    7.7%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 16 - prev.ap (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  11'000  94.6%   94.2%   95.0%
## 1     627   5.4%    5.0%    5.8%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 17 - prev.mi (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  11'253  96.8%   96.4%   97.1%
## 1     374   3.2%    2.9%    3.6%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 18 - prev.stroke (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  11'475  98.7%   98.5%   98.9%
## 1     152   1.3%    1.1%    1.5%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 19 - prev.hyp (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  6'283  54.0%   53.1%   54.9%
## 1  5'344  46.0%   45.1%   46.9%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 20 - time (integer)
## 
##     length         n    NAs    unique        0s      mean    meanCI'
##     11'627    11'627      0       932     4'434  1'957.02  1'925.05
##               100.0%   0.0%               38.1%            1'988.99
##                                                                    
##        .05       .10    .25    median       .75       .90       .95
##       0.00      0.00   0.00  2'156.00  4'252.50  4'385.00  4'418.00
##                                                                    
##      range        sd  vcoef       mad       IQR      skew      kurt
##   4'854.00  1'758.78   0.90  3'196.49  4'252.50      0.19     -1.44
##                                                                    
## lowest : 0 (4'434), 1'577, 1'633, 1'734, 1'735
## highest: 4'761, 4'770, 4'807, 4'816, 4'854
## 
## heap(?): remarkable frequency (38.1%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 21 - period (integer)
## 
##   length       n    NAs  unique    0s  mean  meanCI'
##   11'627  11'627      0       3     0  1.90    1.88
##           100.0%   0.0%          0.0%          1.91
##                                                    
##      .05     .10    .25  median   .75   .90     .95
##     1.00    1.00   1.00    2.00  3.00  3.00    3.00
##                                                    
##    range      sd  vcoef     mad   IQR  skew    kurt
##     2.00    0.81   0.43    1.48  2.00  0.18   -1.44
##                                                    
## 
##    level   freq   perc  cumfreq  cumperc
## 1      1  4'434  38.1%    4'434    38.1%
## 2      2  3'930  33.8%    8'364    71.9%
## 3      3  3'263  28.1%   11'627   100.0%
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 22 - hdlc (integer)
## 
##   length      n    NAs  unique     0s   mean  meanCI'
##   11'627  3'027  8'600     105      0  49.36   48.81
##           26.0%  74.0%           0.0%          49.92
##                                                     
##      .05    .10    .25  median    .75    .90     .95
##    28.00  32.00  39.00   48.00  58.00  69.00   78.00
##                                                     
##    range     sd  vcoef     mad    IQR   skew    kurt
##   179.00  15.63   0.32   14.83  19.00   1.06    3.65
##                                                     
## lowest : 10, 11 (2), 12, 14, 15 (5)
## highest: 121, 122, 138, 141, 189
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 23 - ldlc (integer)
## 
##   length       n     NAs  unique      0s    mean  meanCI'
##   11'627   3'026   8'601     262       0  176.47  174.80
##            26.0%   74.0%            0.0%          178.14
##                                                         
##      .05     .10     .25  median     .75     .90     .95
##   107.00  121.00  145.00  173.00  205.00  236.00  257.00
##                                                         
##    range      sd   vcoef     mad     IQR    skew    kurt
##   545.00   46.86    0.27   43.00   60.00    0.69    2.53
##                                                         
## lowest : 20, 34, 44, 45, 51
## highest: 376, 381, 428, 452, 565
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 24 - death (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  8'100  69.7%   68.8%   70.5%
## 1  3'527  30.3%   29.5%   31.2%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 25 - angina (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  9'725  83.6%   83.0%   84.3%
## 1  1'902  16.4%   15.7%   17.0%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 26 - hosp.mi (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  10'473  90.1%   89.5%   90.6%
## 1   1'154   9.9%    9.4%   10.5%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 27 - mi.fchd (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  9'839  84.6%   84.0%   85.3%
## 1  1'788  15.4%   14.7%   16.0%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 28 - any.chd (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  8'469  72.8%   72.0%   73.6%
## 1  3'158  27.2%   26.4%   28.0%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 29 - stroke (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##      freq   perc  lci.95  uci.95'
## 0  10'566  90.9%   90.3%   91.4%
## 1   1'061   9.1%    8.6%    9.7%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 30 - cvd (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  8'728  75.1%   74.3%   75.8%
## 1  2'899  24.9%   24.2%   25.7%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 31 - hypertension (integer - dichotomous)
## 
##   length      n    NAs unique
##   11'627 11'627      0      2
##          100.0%   0.0%       
## 
##     freq   perc  lci.95  uci.95'
## 0  2'985  25.7%   24.9%   26.5%
## 1  8'642  74.3%   73.5%   75.1%
## 
## ' 95%-CI (Wilson)

## ------------------------------------------------------------------------------ 
## 32 - time.ap (integer)
## 
##     length         n       NAs    unique        0s      mean    meanCI'
##     11'627    11'627         0     1'606       307  7'241.56  7'196.51
##               100.0%      0.0%                2.6%            7'286.60
##                                                                       
##        .05       .10       .25    median       .75       .90       .95
##   1'156.70  3'201.00  6'224.00  8'766.00  8'766.00  8'766.00  8'766.00
##                                                                       
##      range        sd     vcoef       mad       IQR      skew      kurt
##   8'766.00  2'477.78      0.34      0.00  2'542.00     -1.57      1.32
##                                                                       
## lowest : 0 (307), 26, 46, 53, 55
## highest: 8'750 (3), 8'753 (3), 8'759 (3), 8'764 (3), 8'766 (7'013)
## 
## heap(?): remarkable frequency (60.3%) for the mode(s) (= 8766)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 33 - time.mi (integer)
## 
##     length         n       NAs    unique        0s      mean    meanCI'
##     11'627    11'627         0     1'529       141  7'593.85  7'555.00
##               100.0%      0.0%                1.2%            7'632.69
##                                                                       
##        .05       .10       .25    median       .75       .90       .95
##   2'484.00  4'201.00  7'212.00  8'766.00  8'766.00  8'766.00  8'766.00
##                                                                       
##      range        sd     vcoef       mad       IQR      skew      kurt
##   8'766.00  2'136.73      0.28      0.00  1'554.00     -1.91      2.77
##                                                                       
## lowest : 0 (141), 26, 27 (3), 34, 40
## highest: 8'747 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (7'631)
## 
## heap(?): remarkable frequency (65.6%) for the mode(s) (= 8766)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 34 - time.mi.1 (integer)
## 
##     length         n       NAs    unique        0s      mean    meanCI'
##     11'627    11'627         0     1'543       161  7'543.04  7'503.19
##               100.0%      0.0%                1.4%            7'582.89
##                                                                       
##        .05       .10       .25    median       .75       .90       .95
##   2'334.10  4'050.60  7'049.50  8'766.00  8'766.00  8'766.00  8'766.00
##                                                                       
##      range        sd     vcoef       mad       IQR      skew      kurt
##   8'766.00  2'192.12      0.29      0.00  1'716.50     -1.86      2.53
##                                                                       
## lowest : 0 (161), 26, 27 (3), 34, 40
## highest: 8'747 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (7'552)
## 
## heap(?): remarkable frequency (65.0%) for the mode(s) (= 8766)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 35 - time.chd (integer)
## 
##     length         n       NAs    unique        0s      mean    meanCI'
##     11'627    11'627         0     1'716       407  7'008.15  6'960.14
##               100.0%      0.0%                3.5%            7'056.17
##                                                                       
##        .05       .10       .25    median       .75       .90       .95
##     747.00  2'538.00  5'598.50  8'766.00  8'766.00  8'766.00  8'766.00
##                                                                       
##      range        sd     vcoef       mad       IQR      skew      kurt
##   8'766.00  2'641.34      0.38      0.00  3'167.50     -1.36      0.61
##                                                                       
## lowest : 0 (407), 26, 27 (3), 46, 53
## highest: 8'750 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (6'604)
## 
## heap(?): remarkable frequency (56.8%) for the mode(s) (= 8766)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 36 - time.stroke (integer)
## 
##     length         n       NAs    unique        0s      mean    meanCI'
##     11'627    11'627         0     1'525        60  7'660.88  7'624.32
##               100.0%      0.0%                0.5%            7'697.44
##                                                                       
##        .05       .10       .25    median       .75       .90       .95
##   2'941.00  4'484.00  7'295.00  8'766.00  8'766.00  8'766.00  8'766.00
##                                                                       
##      range        sd     vcoef       mad       IQR      skew      kurt
##   8'766.00  2'011.08      0.26      0.00  1'471.00     -1.91      2.82
##                                                                       
## lowest : 0 (60), 22, 26, 45, 47
## highest: 8'744 (6), 8'747 (3), 8'753 (3), 8'759 (3), 8'766 (7'666)
## 
## heap(?): remarkable frequency (65.9%) for the mode(s) (= 8766)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 37 - time.cvd (integer)
## 
##     length         n       NAs    unique        0s      mean    meanCI'
##     11'627    11'627         0     1'634       331  7'166.08  7'119.88
##               100.0%      0.0%                2.8%            7'212.29
##                                                                       
##        .05       .10       .25    median       .75       .90       .95
##   1'110.00  2'862.80  6'004.00  8'766.00  8'766.00  8'766.00  8'766.00
##                                                                       
##      range        sd     vcoef       mad       IQR      skew      kurt
##   8'766.00  2'541.67      0.35      0.00  2'762.00     -1.49      1.03
##                                                                       
## lowest : 0 (331), 26, 27 (3), 47, 58
## highest: 8'747 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (6'950)
## 
## heap(?): remarkable frequency (59.8%) for the mode(s) (= 8766)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 38 - time.dth (integer)
## 
##     length         n       NAs    unique        0s      mean    meanCI'
##     11'627    11'627         0     1'419         0  7'854.10  7'821.59
##               100.0%      0.0%                0.0%            7'886.61
##                                                                       
##        .05       .10       .25    median       .75       .90       .95
##   3'607.00  5'024.00  7'797.50  8'766.00  8'766.00  8'766.00  8'766.00
##                                                                       
##      range        sd     vcoef       mad       IQR      skew      kurt
##   8'740.00  1'788.37      0.23      0.00    968.50     -2.10      3.73
##                                                                       
## lowest : 26, 34, 40, 45, 46
## highest: 8'744 (6), 8'747 (3), 8'753 (3), 8'759 (3), 8'766 (8'100)
## 
## heap(?): remarkable frequency (69.7%) for the mode(s) (= 8766)
## 
## ' 95%-CI (classic)

## ------------------------------------------------------------------------------ 
## 39 - time.hyp (integer)
## 
##     length         n    NAs    unique        0s      mean    meanCI'
##     11'627    11'627      0     1'519     3'518  3'598.96  3'535.98
##               100.0%   0.0%               30.3%            3'661.93
##                                                                    
##        .05       .10    .25    median       .75       .90       .95
##       0.00      0.00   0.00  2'429.00  7'329.00  8'766.00  8'766.00
##                                                                    
##      range        sd  vcoef       mad       IQR      skew      kurt
##   8'766.00  3'464.16   0.96  3'601.24  7'329.00      0.41     -1.44
##                                                                    
## lowest : 0 (3'518), 45, 58, 87, 133
## highest: 8'754 (3), 8'756 (3), 8'761 (3), 8'764 (3), 8'766 (2'247)
## 
## heap(?): remarkable frequency (30.3%) for the mode(s) (= 0)
## 
## ' 95%-CI (classic)

Việc 2: Kiểm tra tương quan giữa bmi & sysbp, bmi & tot.chol

# Scatter plot of systolic blood pressure against BMI, drawn with
# semi-transparent blue points.
ggplot(data = fmh, mapping = aes(x = bmi, y = sysbp)) +
  geom_point(colour = "blue", alpha = 0.15) +
  labs(x = "Body Mass Index", y = "Systolic Blood Pressure")
## Warning: Removed 52 rows containing missing values (geom_point).

# Correlation (cor's default method, Pearson) between BMI and systolic blood
# pressure, computed only on rows where both values are present.
with(fmh, cor(bmi, sysbp, use = "complete.obs"))
## [1] 0.2749543
# Scatter plot of total cholesterol against BMI, drawn with
# semi-transparent blue points.
ggplot(data = fmh, mapping = aes(x = bmi, y = tot.chol)) +
  geom_point(colour = "blue", alpha = 0.15) +
  labs(x = "Body Mass Index", y = "Total Cholesterol")
## Warning: Removed 454 rows containing missing values (geom_point).

# Correlation (cor's default method, Pearson) between BMI and total
# cholesterol, computed only on rows where both values are present.
with(fmh, cor(bmi, tot.chol, use = "complete.obs"))
## [1] 0.08017361

Việc 3: Kiểm tra tương quan giữa các biến age, bmi, sysbp, diasbp, tot.chol, heart.rate

library(GGally); library(gridExtra)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Keep only the six continuous variables under study, selected by name so the
# selection does not depend on column positions in the CSV.
fmh1 <- fmh[, c("age", "bmi", "sysbp", "diasbp", "tot.chol", "heart.rate")]

# Pairwise scatter plots, densities and correlations for those variables.
# The subset is plotted directly (instead of indexing `fmh` by position), so
# the panels follow the order listed above.
ggpairs(fmh1, mapping = aes(alpha = 0.5))
## Warning: Removed 409 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 454 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 413 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 454 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).

## Warning: Removed 52 rows containing missing values (geom_point).

## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 54 rows containing missing values
## Warning: Removed 413 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).

## Warning: Removed 6 rows containing missing values (geom_point).

## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 54 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing non-finite values (stat_density).

Việc 4: Đọc dữ liệu “Insurance dataset.xlsx”

Dữ liệu này bao gồm các biến số:
‘age’ Tuổi
‘sex’ Giới tính (male / female)
‘bmi’ Tỷ trọng cơ thể
‘children’ Số con
‘smoker’ Hút thuốc lá (yes / no)
‘region’ Vùng miền (northeast / northwest / southeast / southwest)
‘charge’ Tiền bảo hiểm ($)
Dùng mô hình hồi qui tuyến tính để đánh giá mối liên quan giữa tuổi ‘age’ và giá bảo hiểm ‘charge’:
\(charge = \alpha + \beta \times age\)

# Load the insurance data set from the local Excel workbook.
ins <- read_excel(
  "D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Insurance dataset.xlsx"
)
# Show the dimensions first, then a per-column summary.
dim(ins)
summary(ins)
## [1] 1338    7
##       age            sex                 bmi           children    
##  Min.   :18.00   Length:1338        Min.   :15.96   Min.   :0.000  
##  1st Qu.:27.00   Class :character   1st Qu.:26.30   1st Qu.:0.000  
##  Median :39.00   Mode  :character   Median :30.40   Median :1.000  
##  Mean   :39.21                      Mean   :30.66   Mean   :1.095  
##  3rd Qu.:51.00                      3rd Qu.:34.69   3rd Qu.:2.000  
##  Max.   :64.00                      Max.   :53.13   Max.   :5.000  
##     smoker             region              charge     
##  Length:1338        Length:1338        Min.   : 1122  
##  Class :character   Class :character   1st Qu.: 4740  
##  Mode  :character   Mode  :character   Median : 9382  
##                                        Mean   :13270  
##                                        3rd Qu.:16640  
##                                        Max.   :63770
# Density-scaled histogram of insurance charges with a density curve overlaid.
# after_stat(density) replaces the deprecated ..density.. notation.
ins %>%
  ggplot(aes(x = charge)) +
  geom_histogram(aes(y = after_stat(density)), col = "white", fill = "blue") +
  geom_density(col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Scatter plot of insurance charge against age, with semi-transparent points.
ggplot(data = ins, mapping = aes(x = age, y = charge)) +
  geom_point(colour = "blue", alpha = 0.15)

# Simple linear regression of insurance charge on age.
model <- lm(formula = charge ~ age, data = ins)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(model)
## 
## Call:
## lm(formula = charge ~ age, data = ins)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -8059  -6671  -5939   5440  47829 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3165.9      937.1   3.378 0.000751 ***
## age            257.7       22.5  11.453  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared:  0.08941,    Adjusted R-squared:  0.08872 
## F-statistic: 131.2 on 1 and 1336 DF,  p-value: < 2.2e-16
# Analysis-of-variance table for the fitted model. It is stored under its own
# name so that the anova() function is not shadowed by a variable.
anova_table <- anova(model)
anova_table
## Analysis of Variance Table
## 
## Response: charge
##             Df     Sum Sq    Mean Sq F value    Pr(>F)    
## age          1 1.7530e+10 1.7530e+10  131.17 < 2.2e-16 ***
## Residuals 1336 1.7854e+11 1.3364e+08                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Mean squared error: the "Mean Sq" entry of the "Residuals" row.
anova_table["Residuals", "Mean Sq"]
## [1] 133640741
Tham số Ước số
Intercept (SE) 3165.9 (937.1)
Slope (SE) 257.7 (22.5)
\(R^{2}\) 0.08941
MSE 133640741
# Draw the base-graphics diagnostic plots for the model on a 2 x 2 grid.
# par() returns the previous settings, which are restored afterwards so that
# later base plots are not forced into the same layout.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.0.5
# Diagnostic plots for the fitted model, drawn with autoplot() (ggfortify is
# loaded just above).
autoplot(object = model)