library(ggplot2); library(tidyverse); library(gridExtra); library(readxl); library(DescTools)
# Load the Framingham data set from CSV and print a DescTools summary of
# every column.
# NOTE(review): the path is absolute and machine-specific -- confirm it exists
# on the machine running this script.
fmh <- read.csv(
  "D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Framingham dataset.csv"
)
Desc(fmh)
## ------------------------------------------------------------------------------
## Describe fmh (data.frame):
##
## data frame: 11627 obs. of 39 variables
## 2236 complete cases (19.2%)
##
## Nr ColName Class NAs Levels
## 1 id integer .
## 2 sex integer .
## 3 tot.chol integer 409 (3.5%)
## 4 age integer .
## 5 sysbp numeric .
## 6 diasbp numeric .
## 7 smoker integer .
## 8 cigs.day integer 79 (0.7%)
## 9 bmi numeric 52 (0.4%)
## 10 diabetes integer .
## 11 bpmed integer 593 (5.1%)
## 12 heart.rate integer 6 (0.1%)
## 13 glucose integer 1440 (12.4%)
## 14 educ integer 295 (2.5%)
## 15 prev.chd integer .
## 16 prev.ap integer .
## 17 prev.mi integer .
## 18 prev.stroke integer .
## 19 prev.hyp integer .
## 20 time integer .
## 21 period integer .
## 22 hdlc integer 8600 (74.0%)
## 23 ldlc integer 8601 (74.0%)
## 24 death integer .
## 25 angina integer .
## 26 hosp.mi integer .
## 27 mi.fchd integer .
## 28 any.chd integer .
## 29 stroke integer .
## 30 cvd integer .
## 31 hypertension integer .
## 32 time.ap integer .
## 33 time.mi integer .
## 34 time.mi.1 integer .
## 35 time.chd integer .
## 36 time.stroke integer .
## 37 time.cvd integer .
## 38 time.dth integer .
## 39 time.hyp integer .
##
##
## ------------------------------------------------------------------------------
## 1 - id (integer)
##
## length n NAs unique 0s'
## 11'627 11'627 0 4'434 0
## 100.0% 0.0% 0.0%
##
## .05 .10 .25 median .75
## 538'481.90 977'484.60 2'474'378.00 5'006'008.00 7'472'730.00
##
## range sd vcoef mad IQR
## 9'996'864.00 2'900'877.44 0.58 3'710'052.31 4'998'352.00
##
## mean meanCI
## 5'004'740.92 4'952'007.14
## 5'057'474.69
##
## .90 .95
## 9'040'424.20 9'460'638.00
##
## skew kurt
## -0.00 -1.22
##
## lowest : 2'448 (2), 6'238 (3), 9'428 (2), 10'552 (2), 11'252 (3)
## highest: 9'990'894 (3), 9'993'179 (3), 9'995'546 (2), 9'998'212 (3), 9'999'312 (3)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 2 - sex (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 1 5'022 43.2% 42.3% 44.1%
## 2 6'605 56.8% 55.9% 57.7%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 3 - tot.chol (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'218 409 299 0 241.16 240.32
## 96.5% 3.5% 0.0% 242.00
##
## .05 .10 .25 median .75 .90 .95
## 174.00 187.00 210.00 238.00 268.00 298.00 319.00
##
## range sd vcoef mad IQR skew kurt
## 589.00 45.37 0.19 43.00 58.00 0.82 3.38
##
## lowest : 107, 112, 113, 115, 117
## highest: 600, 614, 625, 638, 696
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 4 - age (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 50 0 54.79 54.62
## 100.0% 0.0% 0.0% 54.97
##
## .05 .10 .25 median .75 .90 .95
## 40.00 42.00 48.00 54.00 62.00 68.00 71.00
##
## range sd vcoef mad IQR skew kurt
## 49.00 9.56 0.17 10.38 14.00 0.14 -0.66
##
## lowest : 32, 33 (5), 34 (18), 35 (42), 36 (84)
## highest: 77 (33), 78 (18), 79 (21), 80 (6), 81 (3)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 5 - sysbp (numeric)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 260 0 136.32 135.91
## 100.0% 0.0% 0.0% 136.74
##
## .05 .10 .25 median .75 .90 .95
## 106.00 110.00 120.00 132.00 149.00 167.00 180.00
##
## range sd vcoef mad IQR skew kurt
## 211.50 22.80 0.17 20.76 29.00 0.94 1.37
##
## lowest : 83.5 (2), 85.0, 85.5, 86.0 (2), 88.0 (2)
## highest: 254.0, 265.0, 267.0, 282.0, 295.0
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 6 - diasbp (numeric)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 160 0 83.04 82.83
## 100.0% 0.0% 0.0% 83.25
##
## .05 .10 .25 median .75 .90 .95
## 66.00 70.00 75.00 82.00 90.00 98.00 104.00
##
## range sd vcoef mad IQR skew kurt
## 120.00 11.66 0.14 11.12 15.00 0.55 0.91
##
## lowest : 30.0, 37.0, 46.0, 47.0, 48.0
## highest: 136.0 (2), 140.0, 141.0, 142.5, 150.0
##
## heap(?): remarkable frequency (7.4%) for the mode(s) (= 80)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 7 - smoker (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 6'598 56.7% 55.8% 57.6%
## 1 5'029 43.3% 42.4% 44.2%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 8 - cigs.day (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'548 79 45 6'598 8.25 8.03
## 99.3% 0.7% 56.7% 8.47
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 0.00 20.00 25.00 30.00
##
## range sd vcoef mad IQR skew kurt
## 90.00 12.19 1.48 0.00 20.00 1.51 2.13
##
## lowest : 0 (6'598), 1 (162), 2 (98), 3 (183), 4 (65)
## highest: 55 (2), 60 (27), 70 (3), 80 (5), 90 (3)
##
## heap(?): remarkable frequency (57.1%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 9 - bmi (numeric)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'575 52 1'818 0 25.877 25.803
## 99.6% 0.4% 0.0% 25.952
##
## .05 .10 .25 median .75 .90 .95
## 20.120 21.170 23.095 25.480 28.070 30.930 33.013
##
## range sd vcoef mad IQR skew kurt
## 42.370 4.103 0.159 3.647 4.975 0.983 2.814
##
## lowest : 14.43, 14.53, 15.16, 15.32, 15.33
## highest: 48.64, 51.28, 52.94, 55.31, 56.8 (3)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 10 - diabetes (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 11'097 95.4% 95.0% 95.8%
## 1 530 4.6% 4.2% 5.0%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 11 - bpmed (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'034 593 2
## 94.9% 5.1%
##
## freq perc lci.95 uci.95'
## 0 10'090 91.4% 90.9% 92.0%
## 1 944 8.6% 8.0% 9.1%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 12 - heart.rate (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'621 6 83 0 76.78 76.55
## 99.9% 0.1% 0.0% 77.01
##
## .05 .10 .25 median .75 .90 .95
## 60.00 60.00 69.00 75.00 85.00 94.00 100.00
##
## range sd vcoef mad IQR skew kurt
## 183.00 12.46 0.16 10.38 16.00 0.68 2.08
##
## lowest : 37, 42, 43 (2), 44 (4), 45 (9)
## highest: 135, 140 (2), 143, 150, 220
##
## heap(?): remarkable frequency (11.9%) for the mode(s) (= 75)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 13 - glucose (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 10'187 1'440 211 0 84.12 83.64
## 87.6% 12.4% 0.0% 84.61
##
## .05 .10 .25 median .75 .90 .95
## 62.00 66.00 72.00 80.00 89.00 103.00 116.00
##
## range sd vcoef mad IQR skew kurt
## 439.00 24.99 0.30 11.86 17.00 5.48 50.17
##
## lowest : 39, 40 (3), 43 (2), 44 (4), 45 (4)
## highest: 394 (2), 410, 420, 423, 478
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 14 - educ (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'332 295 4 0 1.99 1.97
## 97.5% 2.5% 0.0% 2.01
##
## .05 .10 .25 median .75 .90 .95
## 1.00 1.00 1.00 2.00 3.00 4.00 4.00
##
## range sd vcoef mad IQR skew kurt
## 3.00 1.03 0.52 1.48 2.00 0.68 -0.75
##
##
## level freq perc cumfreq cumperc
## 1 1 4'690 41.4% 4'690 41.4%
## 2 2 3'410 30.1% 8'100 71.5%
## 3 3 1'885 16.6% 9'985 88.1%
## 4 4 1'347 11.9% 11'332 100.0%
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 15 - prev.chd (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 10'785 92.8% 92.3% 93.2%
## 1 842 7.2% 6.8% 7.7%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 16 - prev.ap (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 11'000 94.6% 94.2% 95.0%
## 1 627 5.4% 5.0% 5.8%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 17 - prev.mi (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 11'253 96.8% 96.4% 97.1%
## 1 374 3.2% 2.9% 3.6%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 18 - prev.stroke (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 11'475 98.7% 98.5% 98.9%
## 1 152 1.3% 1.1% 1.5%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 19 - prev.hyp (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 6'283 54.0% 53.1% 54.9%
## 1 5'344 46.0% 45.1% 46.9%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 20 - time (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 932 4'434 1'957.02 1'925.05
## 100.0% 0.0% 38.1% 1'988.99
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 2'156.00 4'252.50 4'385.00 4'418.00
##
## range sd vcoef mad IQR skew kurt
## 4'854.00 1'758.78 0.90 3'196.49 4'252.50 0.19 -1.44
##
## lowest : 0 (4'434), 1'577, 1'633, 1'734, 1'735
## highest: 4'761, 4'770, 4'807, 4'816, 4'854
##
## heap(?): remarkable frequency (38.1%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 21 - period (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 3 0 1.90 1.88
## 100.0% 0.0% 0.0% 1.91
##
## .05 .10 .25 median .75 .90 .95
## 1.00 1.00 1.00 2.00 3.00 3.00 3.00
##
## range sd vcoef mad IQR skew kurt
## 2.00 0.81 0.43 1.48 2.00 0.18 -1.44
##
##
## level freq perc cumfreq cumperc
## 1 1 4'434 38.1% 4'434 38.1%
## 2 2 3'930 33.8% 8'364 71.9%
## 3 3 3'263 28.1% 11'627 100.0%
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 22 - hdlc (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 3'027 8'600 105 0 49.36 48.81
## 26.0% 74.0% 0.0% 49.92
##
## .05 .10 .25 median .75 .90 .95
## 28.00 32.00 39.00 48.00 58.00 69.00 78.00
##
## range sd vcoef mad IQR skew kurt
## 179.00 15.63 0.32 14.83 19.00 1.06 3.65
##
## lowest : 10, 11 (2), 12, 14, 15 (5)
## highest: 121, 122, 138, 141, 189
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 23 - ldlc (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 3'026 8'601 262 0 176.47 174.80
## 26.0% 74.0% 0.0% 178.14
##
## .05 .10 .25 median .75 .90 .95
## 107.00 121.00 145.00 173.00 205.00 236.00 257.00
##
## range sd vcoef mad IQR skew kurt
## 545.00 46.86 0.27 43.00 60.00 0.69 2.53
##
## lowest : 20, 34, 44, 45, 51
## highest: 376, 381, 428, 452, 565
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 24 - death (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 8'100 69.7% 68.8% 70.5%
## 1 3'527 30.3% 29.5% 31.2%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 25 - angina (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 9'725 83.6% 83.0% 84.3%
## 1 1'902 16.4% 15.7% 17.0%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 26 - hosp.mi (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 10'473 90.1% 89.5% 90.6%
## 1 1'154 9.9% 9.4% 10.5%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 27 - mi.fchd (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 9'839 84.6% 84.0% 85.3%
## 1 1'788 15.4% 14.7% 16.0%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 28 - any.chd (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 8'469 72.8% 72.0% 73.6%
## 1 3'158 27.2% 26.4% 28.0%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 29 - stroke (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 10'566 90.9% 90.3% 91.4%
## 1 1'061 9.1% 8.6% 9.7%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 30 - cvd (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 8'728 75.1% 74.3% 75.8%
## 1 2'899 24.9% 24.2% 25.7%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 31 - hypertension (integer - dichotomous)
##
## length n NAs unique
## 11'627 11'627 0 2
## 100.0% 0.0%
##
## freq perc lci.95 uci.95'
## 0 2'985 25.7% 24.9% 26.5%
## 1 8'642 74.3% 73.5% 75.1%
##
## ' 95%-CI (Wilson)
## ------------------------------------------------------------------------------
## 32 - time.ap (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'606 307 7'241.56 7'196.51
## 100.0% 0.0% 2.6% 7'286.60
##
## .05 .10 .25 median .75 .90 .95
## 1'156.70 3'201.00 6'224.00 8'766.00 8'766.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'766.00 2'477.78 0.34 0.00 2'542.00 -1.57 1.32
##
## lowest : 0 (307), 26, 46, 53, 55
## highest: 8'750 (3), 8'753 (3), 8'759 (3), 8'764 (3), 8'766 (7'013)
##
## heap(?): remarkable frequency (60.3%) for the mode(s) (= 8766)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 33 - time.mi (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'529 141 7'593.85 7'555.00
## 100.0% 0.0% 1.2% 7'632.69
##
## .05 .10 .25 median .75 .90 .95
## 2'484.00 4'201.00 7'212.00 8'766.00 8'766.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'766.00 2'136.73 0.28 0.00 1'554.00 -1.91 2.77
##
## lowest : 0 (141), 26, 27 (3), 34, 40
## highest: 8'747 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (7'631)
##
## heap(?): remarkable frequency (65.6%) for the mode(s) (= 8766)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 34 - time.mi.1 (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'543 161 7'543.04 7'503.19
## 100.0% 0.0% 1.4% 7'582.89
##
## .05 .10 .25 median .75 .90 .95
## 2'334.10 4'050.60 7'049.50 8'766.00 8'766.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'766.00 2'192.12 0.29 0.00 1'716.50 -1.86 2.53
##
## lowest : 0 (161), 26, 27 (3), 34, 40
## highest: 8'747 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (7'552)
##
## heap(?): remarkable frequency (65.0%) for the mode(s) (= 8766)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 35 - time.chd (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'716 407 7'008.15 6'960.14
## 100.0% 0.0% 3.5% 7'056.17
##
## .05 .10 .25 median .75 .90 .95
## 747.00 2'538.00 5'598.50 8'766.00 8'766.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'766.00 2'641.34 0.38 0.00 3'167.50 -1.36 0.61
##
## lowest : 0 (407), 26, 27 (3), 46, 53
## highest: 8'750 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (6'604)
##
## heap(?): remarkable frequency (56.8%) for the mode(s) (= 8766)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 36 - time.stroke (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'525 60 7'660.88 7'624.32
## 100.0% 0.0% 0.5% 7'697.44
##
## .05 .10 .25 median .75 .90 .95
## 2'941.00 4'484.00 7'295.00 8'766.00 8'766.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'766.00 2'011.08 0.26 0.00 1'471.00 -1.91 2.82
##
## lowest : 0 (60), 22, 26, 45, 47
## highest: 8'744 (6), 8'747 (3), 8'753 (3), 8'759 (3), 8'766 (7'666)
##
## heap(?): remarkable frequency (65.9%) for the mode(s) (= 8766)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 37 - time.cvd (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'634 331 7'166.08 7'119.88
## 100.0% 0.0% 2.8% 7'212.29
##
## .05 .10 .25 median .75 .90 .95
## 1'110.00 2'862.80 6'004.00 8'766.00 8'766.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'766.00 2'541.67 0.35 0.00 2'762.00 -1.49 1.03
##
## lowest : 0 (331), 26, 27 (3), 47, 58
## highest: 8'747 (3), 8'753 (3), 8'754 (3), 8'758 (3), 8'766 (6'950)
##
## heap(?): remarkable frequency (59.8%) for the mode(s) (= 8766)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 38 - time.dth (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'419 0 7'854.10 7'821.59
## 100.0% 0.0% 0.0% 7'886.61
##
## .05 .10 .25 median .75 .90 .95
## 3'607.00 5'024.00 7'797.50 8'766.00 8'766.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'740.00 1'788.37 0.23 0.00 968.50 -2.10 3.73
##
## lowest : 26, 34, 40, 45, 46
## highest: 8'744 (6), 8'747 (3), 8'753 (3), 8'759 (3), 8'766 (8'100)
##
## heap(?): remarkable frequency (69.7%) for the mode(s) (= 8766)
##
## ' 95%-CI (classic)
## ------------------------------------------------------------------------------
## 39 - time.hyp (integer)
##
## length n NAs unique 0s mean meanCI'
## 11'627 11'627 0 1'519 3'518 3'598.96 3'535.98
## 100.0% 0.0% 30.3% 3'661.93
##
## .05 .10 .25 median .75 .90 .95
## 0.00 0.00 0.00 2'429.00 7'329.00 8'766.00 8'766.00
##
## range sd vcoef mad IQR skew kurt
## 8'766.00 3'464.16 0.96 3'601.24 7'329.00 0.41 -1.44
##
## lowest : 0 (3'518), 45, 58, 87, 133
## highest: 8'754 (3), 8'756 (3), 8'761 (3), 8'764 (3), 8'766 (2'247)
##
## heap(?): remarkable frequency (30.3%) for the mode(s) (= 0)
##
## ' 95%-CI (classic)
# Scatter plot of systolic blood pressure against BMI, drawn with
# semi-transparent blue points.
ggplot(fmh, aes(x = bmi, y = sysbp)) +
  geom_point(colour = "blue", alpha = 0.15) +
  labs(x = "Body Mass Index", y = "Systolic Blood Pressure")
## Warning: Removed 52 rows containing missing values (geom_point).
# Correlation between BMI and systolic blood pressure, computed only on rows
# where both values are present.
with(fmh, cor(bmi, sysbp, use = "complete.obs"))
## [1] 0.2749543
# Scatter plot of total cholesterol against BMI, drawn with semi-transparent
# blue points.
ggplot(fmh, aes(x = bmi, y = tot.chol)) +
  geom_point(colour = "blue", alpha = 0.15) +
  labs(x = "Body Mass Index", y = "Total Cholesterol")
## Warning: Removed 454 rows containing missing values (geom_point).
# Correlation between BMI and total cholesterol, computed only on rows where
# both values are present.
with(fmh, cor(bmi, tot.chol, use = "complete.obs"))
## [1] 0.08017361
library(GGally); library(gridExtra)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Subset of continuous measurements used for the pairwise overview.
fmh1 <- fmh[, c("age", "bmi", "sysbp", "diasbp", "tot.chol", "heart.rate")]

# Scatter-plot matrix of the subset. Columns are selected by name rather than
# by position in `fmh`, so the plot cannot silently pick the wrong variables
# if the column order of the CSV changes. The order below reproduces the
# panel order of the previous positional selection c(3, 4, 5, 6, 9, 12).
ggpairs(
  fmh1,
  columns = c("tot.chol", "age", "sysbp", "diasbp", "bmi", "heart.rate"),
  mapping = aes(alpha = 0.5)
)
## Warning: Removed 409 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 409 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 454 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 413 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 409 rows containing missing values (geom_point).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 52 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 6 rows containing missing values
## Warning: Removed 454 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing missing values (geom_point).
## Warning: Removed 52 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 54 rows containing missing values
## Warning: Removed 413 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 54 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing non-finite values (stat_density).
Dữ liệu này bao gồm các biến số:
‘age’ Tuổi
‘sex’ Giới tính (male / female)
‘bmi’ Chỉ số khối cơ thể (BMI)
‘children’ Số con
‘smoker’ Hút thuốc lá (yes / no)
‘region’ Vùng miền (northeast / northwest / southeast / southwest)
‘charge’ Tiền bảo hiểm ($)
Dùng mô hình hồi qui tuyến tính để đánh giá mối liên quan giữa tuổi
‘age’ và giá bảo hiểm ‘charge’:
\(charge = \alpha + \beta \times age\)
# Read the insurance data from the Excel workbook, then show its dimensions
# and a per-column summary.
# NOTE(review): the path is absolute and machine-specific -- confirm it exists
# on the machine running this script.
ins <- read_excel(
  "D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Insurance dataset.xlsx"
)
dim(ins)
summary(ins)
## [1] 1338 7
## age sex bmi children
## Min. :18.00 Length:1338 Min. :15.96 Min. :0.000
## 1st Qu.:27.00 Class :character 1st Qu.:26.30 1st Qu.:0.000
## Median :39.00 Mode :character Median :30.40 Median :1.000
## Mean :39.21 Mean :30.66 Mean :1.095
## 3rd Qu.:51.00 3rd Qu.:34.69 3rd Qu.:2.000
## Max. :64.00 Max. :53.13 Max. :5.000
## smoker region charge
## Length:1338 Length:1338 Min. : 1122
## Class :character Class :character 1st Qu.: 4740
## Mode :character Mode :character Median : 9382
## Mean :13270
## 3rd Qu.:16640
## Max. :63770
# Histogram of charge on the density scale with a kernel density curve on top.
# after_stat(density) replaces the deprecated `..density..` notation
# (requires ggplot2 >= 3.3.0); the drawn plot is unchanged.
ins %>%
  ggplot(aes(x = charge)) +
  geom_histogram(
    aes(y = after_stat(density)),
    col = "white",
    fill = "blue"
  ) +
  geom_density(col = "red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Scatter plot of charge against age, drawn with semi-transparent blue points.
ggplot(ins, aes(x = age, y = charge)) +
  geom_point(colour = "blue", alpha = 0.15)

# Simple linear regression of charge on age, followed by its summary
# (coefficients, residual standard error, R-squared, F-test).
model <- lm(charge ~ age, data = ins)
summary(model)
##
## Call:
## lm(formula = charge ~ age, data = ins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8059 -6671 -5939 5440 47829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3165.9 937.1 3.378 0.000751 ***
## age 257.7 22.5 11.453 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11560 on 1336 degrees of freedom
## Multiple R-squared: 0.08941, Adjusted R-squared: 0.08872
## F-statistic: 131.2 on 1 and 1336 DF, p-value: < 2.2e-16
# Analysis-of-variance table for the fitted model.
# The result is stored under the name `anova`, the same name as the function;
# calls of the form anova(...) still resolve to the function.
anova <- anova(model)
print(anova)
## Analysis of Variance Table
##
## Response: charge
## Df Sum Sq Mean Sq F value Pr(>F)
## age 1 1.7530e+10 1.7530e+10 131.17 < 2.2e-16 ***
## Residuals 1336 1.7854e+11 1.3364e+08
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Residual mean square (the MSE of the fit), read from the ANOVA table by
# row and column name.
anova["Residuals", "Mean Sq", drop = TRUE]
## [1] 133640741
| Tham số | Ước số |
|---|---|
| Intercept (SE) | 3165.9 (937.1) |
| Slope (SE) | 257.7 (22.5) |
| \(R^{2}\) | 0.08941 |
| MSE | 133640741 |
# Base-graphics regression diagnostics for `model`, arranged on a 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into later base plots.
old_par <- par(mfrow = c(2, 2))
plot(model)
par(old_par)

# The same diagnostics drawn with ggplot2 through ggfortify's autoplot method.
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.0.5
autoplot(model)