# Pick the CSV file interactively and read it into the `heart` data frame.
t <- file.choose()
heart <- read.csv(file = t)

# Column names of the loaded data.
colnames(heart)
##  [1] "age"      "sex"      "cp"       "trestbps" "chol"     "fbs"     
##  [7] "restecg"  "thalach"  "exang"    "oldpeak"  "slope"    "ca"      
## [13] "thal"     "target"

# Compact view of the structure: dimensions, column types, leading values.
str(object = heart)
## 'data.frame':    1025 obs. of  14 variables:
##  $ age     : int  52 53 70 61 62 58 58 55 46 54 ...
##  $ sex     : int  1 1 1 1 0 0 1 1 1 1 ...
##  $ cp      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trestbps: int  125 140 145 148 138 100 114 160 120 122 ...
##  $ chol    : int  212 203 174 203 294 248 318 289 249 286 ...
##  $ fbs     : int  0 1 0 0 1 0 0 0 0 0 ...
##  $ restecg : int  1 0 1 1 1 0 2 0 0 0 ...
##  $ thalach : int  168 155 125 161 106 122 140 145 144 116 ...
##  $ exang   : int  0 1 1 0 0 0 0 1 0 1 ...
##  $ oldpeak : num  1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
##  $ slope   : int  2 0 0 2 1 1 0 1 2 1 ...
##  $ ca      : int  2 0 0 1 3 0 3 1 0 2 ...
##  $ thal    : int  3 3 3 3 2 2 1 3 3 2 ...
##  $ target  : int  0 0 0 0 0 1 0 0 0 0 ...

# Per-column summary statistics (min, quartiles, mean, max).
summary(object = heart)
##       age             sex               cp            trestbps    
##  Min.   :29.00   Min.   :0.0000   Min.   :0.0000   Min.   : 94.0  
##  1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:120.0  
##  Median :56.00   Median :1.0000   Median :1.0000   Median :130.0  
##  Mean   :54.43   Mean   :0.6956   Mean   :0.9424   Mean   :131.6  
##  3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:2.0000   3rd Qu.:140.0  
##  Max.   :77.00   Max.   :1.0000   Max.   :3.0000   Max.   :200.0  
##       chol          fbs            restecg          thalach     
##  Min.   :126   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
##  1st Qu.:211   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:132.0  
##  Median :240   Median :0.0000   Median :1.0000   Median :152.0  
##  Mean   :246   Mean   :0.1493   Mean   :0.5298   Mean   :149.1  
##  3rd Qu.:275   3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:166.0  
##  Max.   :564   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
##      exang           oldpeak          slope             ca        
##  Min.   :0.0000   Min.   :0.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.800   Median :1.000   Median :0.0000  
##  Mean   :0.3366   Mean   :1.072   Mean   :1.385   Mean   :0.7541  
##  3rd Qu.:1.0000   3rd Qu.:1.800   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :6.200   Max.   :2.000   Max.   :4.0000  
##       thal           target      
##  Min.   :0.000   Min.   :0.0000  
##  1st Qu.:2.000   1st Qu.:0.0000  
##  Median :2.000   Median :1.0000  
##  Mean   :2.324   Mean   :0.5132  
##  3rd Qu.:3.000   3rd Qu.:1.0000  
##  Max.   :3.000   Max.   :1.0000
# summarise(), count() and %>% are used below; attach dplyr before first use
# (attaching it again later in the script is harmless).
library(dplyr)

# One-row overview: number of observations and the share of rows with
# target == 1. summarise() called without any summary expressions only
# returns an empty one-row data frame, so the expressions are spelled out.
summarise(heart, n_obs = n(), disease_rate = mean(target))
##   n_obs disease_rate
## 1  1025    0.5131707

# First six rows (head()'s default).
head(heart)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  52   1  0      125  212   0       1     168     0     1.0     2  2    3
## 2  53   1  0      140  203   1       0     155     1     3.1     0  0    3
## 3  70   1  0      145  174   0       1     125     1     2.6     0  0    3
## 4  61   1  0      148  203   0       1     161     0     0.0     2  1    3
## 5  62   0  0      138  294   1       1     106     0     1.9     1  3    2
## 6  58   0  0      100  248   0       0     122     0     1.0     1  0    2
##   target
## 1      0
## 2      0
## 3      0
## 4      0
## 5      0
## 6      1

# First three rows only.
head(heart, n = 3)
##   age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1  52   1  0      125  212   0       1     168     0     1.0     2  2    3
## 2  53   1  0      140  203   1       0     155     1     3.1     0  0    3
## 3  70   1  0      145  174   0       1     125     1     2.6     0  0    3
##   target
## 1      0
## 2      0
## 3      0

# Number of rows for each distinct age.
heart %>% count(age)
##    age  n
## 1   29  4
## 2   34  6
## 3   35 15
## 4   37  6
## 5   38 12
## 6   39 14
## 7   40 11
## 8   41 32
## 9   42 26
## 10  43 26
## 11  44 36
## 12  45 25
## 13  46 23
## 14  47 18
## 15  48 23
## 16  49 17
## 17  50 21
## 18  51 39
## 19  52 43
## 20  53 26
## 21  54 53
## 22  55 30
## 23  56 39
## 24  57 57
## 25  58 68
## 26  59 46
## 27  60 37
## 28  61 31
## 29  62 37
## 30  63 32
## 31  64 34
## 32  65 27
## 33  66 25
## 34  67 31
## 35  68 12
## 36  69  9
## 37  70 14
## 38  71 11
## 39  74  3
## 40  76  3
## 41  77  3
# Frequency table of the ages (base R counterpart of count(age)).
table(heart[["age"]])
## 
## 29 34 35 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 
##  4  6 15  6 12 14 11 32 26 26 36 25 23 18 23 17 21 39 43 26 53 30 39 57 68 46 
## 60 61 62 63 64 65 66 67 68 69 70 71 74 76 77 
## 37 31 37 32 34 27 25 31 12  9 14 11  3  3  3

# Pairwise correlation matrix of all columns.
cor(x = heart)
##                  age         sex          cp    trestbps        chol
## age       1.00000000 -0.10324030 -0.07196627  0.27112141  0.21982253
## sex      -0.10324030  1.00000000 -0.04111909 -0.07897377 -0.19825787
## cp       -0.07196627 -0.04111909  1.00000000  0.03817742 -0.08164102
## trestbps  0.27112141 -0.07897377  0.03817742  1.00000000  0.12797743
## chol      0.21982253 -0.19825787 -0.08164102  0.12797743  1.00000000
## fbs       0.12124348  0.02720046  0.07929359  0.18176662  0.02691716
## restecg  -0.13269617 -0.05511721  0.04358061 -0.12379409 -0.14741024
## thalach  -0.39022708 -0.04936524  0.30683928 -0.03926407 -0.02177209
## exang     0.08816338  0.13915681 -0.40151271  0.06119697  0.06738223
## oldpeak   0.20813668  0.08468656 -0.17473348  0.18743411  0.06488031
## slope    -0.16910511 -0.02666629  0.13163278 -0.12044531 -0.01424787
## ca        0.27155053  0.11172891 -0.17620647  0.10455372  0.07425934
## thal      0.07229745  0.19842425 -0.16334148  0.05927618  0.10024418
## target   -0.22932355 -0.27950076  0.43485425 -0.13877173 -0.09996559
##                   fbs     restecg      thalach       exang     oldpeak
## age       0.121243479 -0.13269617 -0.390227075  0.08816338  0.20813668
## sex       0.027200461 -0.05511721 -0.049365243  0.13915681  0.08468656
## cp        0.079293586  0.04358061  0.306839282 -0.40151271 -0.17473348
## trestbps  0.181766624 -0.12379409 -0.039264069  0.06119697  0.18743411
## chol      0.026917164 -0.14741024 -0.021772091  0.06738223  0.06488031
## fbs       1.000000000 -0.10405124 -0.008865857  0.04926057  0.01085948
## restecg  -0.104051244  1.00000000  0.048410637 -0.06560553 -0.05011425
## thalach  -0.008865857  0.04841064  1.000000000 -0.38028087 -0.34979616
## exang     0.049260570 -0.06560553 -0.380280872  1.00000000  0.31084376
## oldpeak   0.010859481 -0.05011425 -0.349796163  0.31084376  1.00000000
## slope    -0.061902374  0.08608609  0.395307843 -0.26733547 -0.57518854
## ca        0.137156259 -0.07807235 -0.207888416  0.10784854  0.22181603
## thal     -0.042177320 -0.02050406 -0.098068165  0.19720104  0.20267203
## target   -0.041163547  0.13446821  0.422895496 -0.43802855 -0.43844127
##                slope          ca        thal      target
## age      -0.16910511  0.27155053  0.07229745 -0.22932355
## sex      -0.02666629  0.11172891  0.19842425 -0.27950076
## cp        0.13163278 -0.17620647 -0.16334148  0.43485425
## trestbps -0.12044531  0.10455372  0.05927618 -0.13877173
## chol     -0.01424787  0.07425934  0.10024418 -0.09996559
## fbs      -0.06190237  0.13715626 -0.04217732 -0.04116355
## restecg   0.08608609 -0.07807235 -0.02050406  0.13446821
## thalach   0.39530784 -0.20788842 -0.09806817  0.42289550
## exang    -0.26733547  0.10784854  0.19720104 -0.43802855
## oldpeak  -0.57518854  0.22181603  0.20267203 -0.43844127
## slope     1.00000000 -0.07344041 -0.09409006  0.34551175
## ca       -0.07344041  1.00000000  0.14901387 -0.38208529
## thal     -0.09409006  0.14901387  1.00000000 -0.33783815
## target    0.34551175 -0.38208529 -0.33783815  1.00000000
# corrplot() comes from the corrplot package; attach it before first use.
library(corrplot)

# Compute the correlation matrix once and reuse it for both plots.
heart_cor <- cor(heart)

# Upper triangle drawn as numbers.
corrplot(heart_cor, type = "upper", method = "number")

# Upper triangle drawn as circles.
corrplot(heart_cor, type = "upper", method = "circle")

# Shapiro-Wilk normality test on age.
shapiro.test(heart$age)
## 
##  Shapiro-Wilk normality test
## 
## data:  heart$age
## W = 0.98436, p-value = 5.039e-09
# p is far below 0.05, so age is not normally distributed.

# Drop the chol, restecg and fbs columns from the working data frame.
heart <- subset(heart, select = -c(chol, restecg, fbs))

library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(dplyr)

# Translate integer codes into text labels through a lookup vector keyed by
# the code (as a string). Codes without an entry come out as NA, which is what
# one-at-a-time assignment produced as well (e.g. thal == 0 has no label).
label_codes <- function(codes, labels) {
  unname(labels[as.character(codes)])
}

sex_labels <- c("0" = "female", "1" = "male")
cp_labels <- c(
  "0" = "typical angina",
  "1" = "atypical angina",
  "2" = "non angina pain",
  "3" = "asymptomatic"
)
exang_labels <- c("0" = "no", "1" = "yes")
slope_labels <- c("0" = "upsloping", "1" = "flat", "2" = "downsloping")
thal_labels <- c(
  "1" = "normal",
  "2" = "fixed defect",
  "3" = "reversible defect"
)
target_labels <- c("0" = "no heart disease", "1" = "heart disease")

# Add a labelled companion column next to each coded column.
heart$sex1 <- label_codes(heart$sex, sex_labels)
heart$cp1 <- label_codes(heart$cp, cp_labels)
heart$exang1 <- label_codes(heart$exang, exang_labels)
heart$slope1 <- label_codes(heart$slope, slope_labels)
heart$thal1 <- label_codes(heart$thal, thal_labels)
heart$target1 <- label_codes(heart$target, target_labels)

# Show the first rows as an interactive table.
heart %>%
  head() %>%
  datatable()

# Keep sex as a 0/1 indicator (1 = male). The earlier comparison of the
# integer codes against "nam" never matched, so it overwrote every row with 0.
heart$sex <- ifelse(heart$sex1 == "male", 1L, 0L)
# Class balance of the outcome label, first with dplyr ...
count(heart, target1)
##            target1   n
## 1    heart disease 526
## 2 no heart disease 499

# ... then with base R, as counts and as shares.
target_counts <- table(heart$target1)
target_counts
## 
##    heart disease no heart disease 
##              526              499
target_share <- prop.table(target_counts)
target_share
## 
##    heart disease no heart disease 
##        0.5131707        0.4868293

# Shares rounded to two decimals.
round(target_share, digits = 2)
## 
##    heart disease no heart disease 
##             0.51             0.49
# ggplot(), aes() and geom_histogram() come from ggplot2; attach it before use.
library(ggplot2)

# Histogram of age. bins = 30 is the value stat_bin() was already falling back
# to; stating it keeps the plot unchanged and silences the
# "Pick better value with `binwidth`" message.
ggplot(data = heart, mapping = aes(x = age)) +
  geom_histogram(bins = 30, color = "red", fill = "#7A9B57") +
  ggtitle("")

# The same histogram built through a pipe, with a title.
heart %>%
  ggplot(aes(age)) +
  geom_histogram(bins = 30, color = "red", fill = "#7A9B57") +
  ggtitle("ABC")

# Sex distribution, first with base R and then with dplyr.
table(heart[["sex1"]])
## 
## female   male 
##    312    713
count(heart, sex1)
##     sex1   n
## 1 female 312
## 2   male 713

# Number of rows for each distinct age.
count(heart, age)
##    age  n
## 1   29  4
## 2   34  6
## 3   35 15
## 4   37  6
## 5   38 12
## 6   39 14
## 7   40 11
## 8   41 32
## 9   42 26
## 10  43 26
## 11  44 36
## 12  45 25
## 13  46 23
## 14  47 18
## 15  48 23
## 16  49 17
## 17  50 21
## 18  51 39
## 19  52 43
## 20  53 26
## 21  54 53
## 22  55 30
## 23  56 39
## 24  57 57
## 25  58 68
## 26  59 46
## 27  60 37
## 28  61 31
## 29  62 37
## 30  63 32
## 31  64 34
## 32  65 27
## 33  66 25
## 34  67 31
## 35  68 12
## 36  69  9
## 37  70 14
## 38  71 11
## 39  74  3
## 40  76  3
## 41  77  3
# Frequency table of the ages (base R).
table(heart[["age"]])
## 
## 29 34 35 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 
##  4  6 15  6 12 14 11 32 26 26 36 25 23 18 23 17 21 39 43 26 53 30 39 57 68 46 
## 60 61 62 63 64 65 66 67 68 69 70 71 74 76 77 
## 37 31 37 32 34 27 25 31 12  9 14 11  3  3  3

# Bin age into 4-year intervals (25,29], (29,33], ..., (73,77] and count the
# rows in each bin.
heart$age_grp1 <- cut(heart$age, breaks = seq(from = 25, to = 77, by = 4))
count(heart, age_grp1)
##    age_grp1   n
## 1   (25,29]   4
## 2   (33,37]  27
## 3   (37,41]  69
## 4   (41,45] 113
## 5   (45,49]  81
## 6   (49,53] 129
## 7   (53,57] 179
## 8   (57,61] 182
## 9   (61,65] 130
## 10  (65,69]  77
## 11  (69,73]  25
## 12  (73,77]   9
# Base R frequency table of the 4-year age bins; unlike count(), it also lists
# the empty (29,33] level.
table(heart$age_grp1)
## 
## (25,29] (29,33] (33,37] (37,41] (41,45] (45,49] (49,53] (53,57] (57,61] (61,65] 
##       4       0      27      69     113      81     129     179     182     130 
## (65,69] (69,73] (73,77] 
##      77      25       9

# Bin age into 5-year intervals. The breaks run up to 80 so that the oldest
# patients (76 and 77) land in (75,80]; a sequence that stops at 75 leaves
# them outside every interval and cut() returns NA for them.
heart$age_grp <- cut(heart$age, breaks = seq(25, 80, by = 5))

# Number of rows with target == 1 in each age bin.
target_by_age <- heart %>%
  group_by(age_grp) %>%
  summarise(heart_disease = sum(target))
target_by_age
## # A tibble: 11 x 2
##    age_grp heart_disease
##    <fct>           <int>
##  1 (25,30]             4
##  2 (30,35]            14
##  3 (35,40]            27
##  4 (40,45]           111
##  5 (45,50]            55
##  6 (50,55]           117
##  7 (55,60]            87
##  8 (60,65]            56
##  9 (65,70]            38
## 10 (70,75]            14
## 11 (75,80]             3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5

# Bar chart of the per-bin counts.
target_by_age %>%
  ggplot(aes(x = age_grp, y = heart_disease)) +
  geom_bar(stat = "identity", fill = "#f68060", alpha = .6, width = .4) +
  xlab("Age group") +
  ylab("No. of people with heart disease") +
  ggtitle("No. of heart disease in age group") +
  theme_economist()

# Bin age into 5-year intervals. The breaks run up to 80 so that the oldest
# patients (76 and 77) land in (75,80]; a sequence that stops at 75 leaves
# them outside every interval and cut() returns NA for them.
heart$age_grp <- cut(heart$age, breaks = seq(25, 80, by = 5))

# Share of rows with target == 1 in each age bin (mean of a 0/1 column).
target_by_age1 <- heart %>%
  group_by(age_grp) %>%
  summarise(heart_disease = mean(target))
target_by_age1
## # A tibble: 11 x 2
##    age_grp heart_disease
##    <fct>           <dbl>
##  1 (25,30]         1    
##  2 (30,35]         0.667
##  3 (35,40]         0.628
##  4 (40,45]         0.766
##  5 (45,50]         0.539
##  6 (50,55]         0.613
##  7 (55,60]         0.352
##  8 (60,65]         0.348
##  9 (65,70]         0.418
## 10 (70,75]         1    
## 11 (75,80]         0.5
library(ggthemes)

# Bar chart of the per-bin shares. The axis label and title say "Proportion"
# because the plotted value is a share, not a head count.
target_by_age1 %>%
  ggplot(aes(x = age_grp, y = heart_disease)) +
  geom_bar(stat = "identity", fill = "#f68060", alpha = .6, width = .4) +
  xlab("Age group") +
  ylab("Proportion of people with heart disease") +
  ggtitle("Proportion of heart disease in age group") +
  theme_economist()

# Load the built-in `women` data set and inspect it.
data(women)
colnames(women)
## [1] "height" "weight"
str(object = women)
## 'data.frame':    15 obs. of  2 variables:
##  $ height: num  58 59 60 61 62 63 64 65 66 67 ...
##  $ weight: num  115 117 120 123 126 129 132 135 139 142 ...

# First rows as an interactive table.
datatable(head(women))

# Correlation between height and weight, printed and then plotted twice.
women_cor <- cor(women)
women_cor
##           height    weight
## height 1.0000000 0.9954948
## weight 0.9954948 1.0000000
corrplot(women_cor, type = "upper", method = "number")

corrplot(women_cor, type = "upper", method = "circle")

# Simple linear regression of weight on height.
fit <- lm(formula = weight ~ height, data = women)
summary(object = fit)
## 
## Call:
## lm(formula = weight ~ height, data = women)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7333 -1.1333 -0.3833  0.7417  3.1167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -87.51667    5.93694  -14.74 1.71e-09 ***
## height        3.45000    0.09114   37.85 1.09e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared:  0.991,  Adjusted R-squared:  0.9903 
## F-statistic:  1433 on 1 and 13 DF,  p-value: 1.091e-14
# Fitted weights for the first six observations.
head(fitted(fit))
##        1        2        3        4        5        6 
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333

# Store the model's fitted values and residuals on the data frame.
# NOTE: despite its name, prediction_value1 holds the residuals of the fit,
# not predictions.
fit_values <- fitted(fit)
fit_residuals <- residuals(fit)
women$prediction_value <- fit_values
women$prediction_value1 <- fit_residuals

# Residuals.
women$prediction_value1
##  [1]  2.41666667  0.96666667  0.51666667  0.06666667 -0.38333333 -0.83333333
##  [7] -1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333
## [13]  0.01666667  1.56666667  3.11666667

# Fitted values.
women$prediction_value
##  [1] 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333
##  [9] 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833