# Attach the packages used below before their first use: dplyr supplies
# `%>%`, count(), group_by() and summarise(); corrplot draws the correlation
# plots; ggplot2 draws the histograms and bar charts.
library(dplyr)
library(corrplot)
library(ggplot2)

# Pick the heart-disease CSV interactively and read it into a data frame.
t <- file.choose()
heart <- read.csv(t)
# List the column names to confirm the expected variables are present.
names(heart)
## [1] "age" "sex" "cp" "trestbps" "chol" "fbs"
## [7] "restecg" "thalach" "exang" "oldpeak" "slope" "ca"
## [13] "thal" "target"
# Print the compact structure of the data frame (column types, first values).
str(heart)
## 'data.frame': 1025 obs. of 14 variables:
## $ age : int 52 53 70 61 62 58 58 55 46 54 ...
## $ sex : int 1 1 1 1 0 0 1 1 1 1 ...
## $ cp : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trestbps: int 125 140 145 148 138 100 114 160 120 122 ...
## $ chol : int 212 203 174 203 294 248 318 289 249 286 ...
## $ fbs : int 0 1 0 0 1 0 0 0 0 0 ...
## $ restecg : int 1 0 1 1 1 0 2 0 0 0 ...
## $ thalach : int 168 155 125 161 106 122 140 145 144 116 ...
## $ exang : int 0 1 1 0 0 0 0 1 0 1 ...
## $ oldpeak : num 1 3.1 2.6 0 1.9 1 4.4 0.8 0.8 3.2 ...
## $ slope : int 2 0 0 2 1 1 0 1 2 1 ...
## $ ca : int 2 0 0 1 3 0 3 1 0 2 ...
## $ thal : int 3 3 3 3 2 2 1 3 3 2 ...
## $ target : int 0 0 0 0 0 1 0 0 0 0 ...
# Per-column summary statistics (min, quartiles, mean, max).
summary(heart)
## age sex cp trestbps
## Min. :29.00 Min. :0.0000 Min. :0.0000 Min. : 94.0
## 1st Qu.:48.00 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:120.0
## Median :56.00 Median :1.0000 Median :1.0000 Median :130.0
## Mean :54.43 Mean :0.6956 Mean :0.9424 Mean :131.6
## 3rd Qu.:61.00 3rd Qu.:1.0000 3rd Qu.:2.0000 3rd Qu.:140.0
## Max. :77.00 Max. :1.0000 Max. :3.0000 Max. :200.0
## chol fbs restecg thalach
## Min. :126 Min. :0.0000 Min. :0.0000 Min. : 71.0
## 1st Qu.:211 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:132.0
## Median :240 Median :0.0000 Median :1.0000 Median :152.0
## Mean :246 Mean :0.1493 Mean :0.5298 Mean :149.1
## 3rd Qu.:275 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:166.0
## Max. :564 Max. :1.0000 Max. :2.0000 Max. :202.0
## exang oldpeak slope ca
## Min. :0.0000 Min. :0.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.000 1st Qu.:0.0000
## Median :0.0000 Median :0.800 Median :1.000 Median :0.0000
## Mean :0.3366 Mean :1.072 Mean :1.385 Mean :0.7541
## 3rd Qu.:1.0000 3rd Qu.:1.800 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :6.200 Max. :2.000 Max. :4.0000
## thal target
## Min. :0.000 Min. :0.0000
## 1st Qu.:2.000 1st Qu.:0.0000
## Median :2.000 Median :1.0000
## Mean :2.324 Mean :0.5132
## 3rd Qu.:3.000 3rd Qu.:1.0000
## Max. :3.000 Max. :1.0000
# NOTE(review): summarise() called without any summary expressions returns an
# empty one-row data frame (see the output below), so this call adds no
# information beyond summary(heart) above.
summarise(heart)
## data frame with 0 columns and 1 row
# Preview the first six rows.
head(heart)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 52 1 0 125 212 0 1 168 0 1.0 2 2 3
## 2 53 1 0 140 203 1 0 155 1 3.1 0 0 3
## 3 70 1 0 145 174 0 1 125 1 2.6 0 0 3
## 4 61 1 0 148 203 0 1 161 0 0.0 2 1 3
## 5 62 0 0 138 294 1 1 106 0 1.9 1 3 2
## 6 58 0 0 100 248 0 0 122 0 1.0 1 0 2
## target
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 1
# Preview only the first three rows.
head(heart,3)
## age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal
## 1 52 1 0 125 212 0 1 168 0 1.0 2 2 3
## 2 53 1 0 140 203 1 0 155 1 3.1 0 0 3
## 3 70 1 0 145 174 0 1 125 1 2.6 0 0 3
## target
## 1 0
## 2 0
## 3 0
# Frequency of each distinct age, returned as a data frame (one row per age).
heart %>% count(age)
## age n
## 1 29 4
## 2 34 6
## 3 35 15
## 4 37 6
## 5 38 12
## 6 39 14
## 7 40 11
## 8 41 32
## 9 42 26
## 10 43 26
## 11 44 36
## 12 45 25
## 13 46 23
## 14 47 18
## 15 48 23
## 16 49 17
## 17 50 21
## 18 51 39
## 19 52 43
## 20 53 26
## 21 54 53
## 22 55 30
## 23 56 39
## 24 57 57
## 25 58 68
## 26 59 46
## 27 60 37
## 28 61 31
## 29 62 37
## 30 63 32
## 31 64 34
## 32 65 27
## 33 66 25
## 34 67 31
## 35 68 12
## 36 69 9
## 37 70 14
## 38 71 11
## 39 74 3
## 40 76 3
## 41 77 3
# Age frequencies again, this time as a base R contingency table.
table(heart$age)
##
## 29 34 35 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 4 6 15 6 12 14 11 32 26 26 36 25 23 18 23 17 21 39 43 26 53 30 39 57 68 46
## 60 61 62 63 64 65 66 67 68 69 70 71 74 76 77
## 37 31 37 32 34 27 25 31 12 9 14 11 3 3 3
# Pairwise correlation matrix of all columns (cor() defaults to Pearson).
cor(heart)
## age sex cp trestbps chol
## age 1.00000000 -0.10324030 -0.07196627 0.27112141 0.21982253
## sex -0.10324030 1.00000000 -0.04111909 -0.07897377 -0.19825787
## cp -0.07196627 -0.04111909 1.00000000 0.03817742 -0.08164102
## trestbps 0.27112141 -0.07897377 0.03817742 1.00000000 0.12797743
## chol 0.21982253 -0.19825787 -0.08164102 0.12797743 1.00000000
## fbs 0.12124348 0.02720046 0.07929359 0.18176662 0.02691716
## restecg -0.13269617 -0.05511721 0.04358061 -0.12379409 -0.14741024
## thalach -0.39022708 -0.04936524 0.30683928 -0.03926407 -0.02177209
## exang 0.08816338 0.13915681 -0.40151271 0.06119697 0.06738223
## oldpeak 0.20813668 0.08468656 -0.17473348 0.18743411 0.06488031
## slope -0.16910511 -0.02666629 0.13163278 -0.12044531 -0.01424787
## ca 0.27155053 0.11172891 -0.17620647 0.10455372 0.07425934
## thal 0.07229745 0.19842425 -0.16334148 0.05927618 0.10024418
## target -0.22932355 -0.27950076 0.43485425 -0.13877173 -0.09996559
## fbs restecg thalach exang oldpeak
## age 0.121243479 -0.13269617 -0.390227075 0.08816338 0.20813668
## sex 0.027200461 -0.05511721 -0.049365243 0.13915681 0.08468656
## cp 0.079293586 0.04358061 0.306839282 -0.40151271 -0.17473348
## trestbps 0.181766624 -0.12379409 -0.039264069 0.06119697 0.18743411
## chol 0.026917164 -0.14741024 -0.021772091 0.06738223 0.06488031
## fbs 1.000000000 -0.10405124 -0.008865857 0.04926057 0.01085948
## restecg -0.104051244 1.00000000 0.048410637 -0.06560553 -0.05011425
## thalach -0.008865857 0.04841064 1.000000000 -0.38028087 -0.34979616
## exang 0.049260570 -0.06560553 -0.380280872 1.00000000 0.31084376
## oldpeak 0.010859481 -0.05011425 -0.349796163 0.31084376 1.00000000
## slope -0.061902374 0.08608609 0.395307843 -0.26733547 -0.57518854
## ca 0.137156259 -0.07807235 -0.207888416 0.10784854 0.22181603
## thal -0.042177320 -0.02050406 -0.098068165 0.19720104 0.20267203
## target -0.041163547 0.13446821 0.422895496 -0.43802855 -0.43844127
## slope ca thal target
## age -0.16910511 0.27155053 0.07229745 -0.22932355
## sex -0.02666629 0.11172891 0.19842425 -0.27950076
## cp 0.13163278 -0.17620647 -0.16334148 0.43485425
## trestbps -0.12044531 0.10455372 0.05927618 -0.13877173
## chol -0.01424787 0.07425934 0.10024418 -0.09996559
## fbs -0.06190237 0.13715626 -0.04217732 -0.04116355
## restecg 0.08608609 -0.07807235 -0.02050406 0.13446821
## thalach 0.39530784 -0.20788842 -0.09806817 0.42289550
## exang -0.26733547 0.10784854 0.19720104 -0.43802855
## oldpeak -0.57518854 0.22181603 0.20267203 -0.43844127
## slope 1.00000000 -0.07344041 -0.09409006 0.34551175
## ca -0.07344041 1.00000000 0.14901387 -0.38208529
## thal -0.09409006 0.14901387 1.00000000 -0.33783815
## target 0.34551175 -0.38208529 -0.33783815 1.00000000
# Draw the upper triangle of the correlation matrix twice: first with the
# coefficients printed as numbers, then encoded as circles.
invisible(lapply(
  c("number", "circle"),
  function(plot_method) {
    corrplot(cor(heart), type = "upper", method = plot_method)
  }
))
# Shapiro-Wilk test of the null hypothesis that age is normally distributed.
shapiro.test(heart$age)
##
## Shapiro-Wilk normality test
##
## data: heart$age
## W = 0.98436, p-value = 5.039e-09
# p is far below 0.05, so normality is rejected: age is not normally distributed
# Drop the chol, restecg and fbs columns; every other column is kept as is.
heart <- heart[, !(names(heart) %in% c("chol", "restecg", "fbs")), drop = FALSE]
library(DT)
## Warning: package 'DT' was built under R version 4.0.5
library(dplyr)
# Add readable label columns (suffix "1") alongside the numeric codes.
# A code that matches none of the listed conditions is left as NA.
heart <- heart %>%
  mutate(
    sex1 = case_when(
      sex == 0 ~ "female",
      sex == 1 ~ "male"
    ),
    cp1 = case_when(
      cp == 0 ~ "typical angina",
      cp == 1 ~ "atypical angina",
      cp == 2 ~ "non angina pain",
      cp == 3 ~ "asymptomatic"
    ),
    exang1 = case_when(
      exang == 0 ~ "no",
      exang == 1 ~ "yes"
    ),
    slope1 = case_when(
      slope == 0 ~ "upsloping",
      slope == 1 ~ "flat",
      slope == 2 ~ "downsloping"
    ),
    thal1 = case_when(
      thal == 1 ~ "normal",
      thal == 2 ~ "fixed defect",
      thal == 3 ~ "reversible defect"
    ),
    target1 = case_when(
      target == 0 ~ "no heart disease",
      target == 1 ~ "heart disease"
    )
  )

# Show the first six rows, including the new label columns, as an
# interactive table.
datatable(head(heart))
# Recode sex as a numeric indicator (1 = male, 0 = female). The comparison
# uses the label column sex1: `sex` itself holds 0/1 codes, so testing it
# against a text label would match nothing and set every row to 0.
heart$sex <- ifelse(heart$sex1 == "male", 1, 0)
# Class balance of the outcome label, first as a data frame of counts.
count(heart, target1)
## target1 n
## 1 heart disease 526
## 2 no heart disease 499
# The same counts as a base R table.
table(heart[["target1"]])
##
## heart disease no heart disease
## 526 499
# Counts converted to proportions of all rows.
prop.table(table(heart[["target1"]]))
##
## heart disease no heart disease
## 0.5131707 0.4868293
# Proportions rounded to two decimal places.
round(prop.table(table(heart[["target1"]])), digits = 2)
##
## heart disease no heart disease
## 0.51 0.49
# Histogram of age with the default binning and an empty title.
heart %>%
  ggplot(aes(x = age)) +
  geom_histogram(color = "red", fill = "#7A9B57") +
  labs(title = "")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Histogram of age with the default binning, titled "ABC".
ggplot(data = heart, mapping = aes(x = age)) +
  geom_histogram(color = "red", fill = "#7A9B57") +
  labs(title = "ABC")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Number of rows per sex label as a base R table.
table(heart[["sex1"]])
##
## female male
## 312 713
# The same counts as a data frame.
count(heart, sex1)
## sex1 n
## 1 female 312
## 2 male 713
# Frequency of each distinct age as a data frame.
# NOTE(review): repeats the age count printed earlier in the script; the age
# column has not been modified in between, so the output is identical.
heart %>% count(age)
## age n
## 1 29 4
## 2 34 6
## 3 35 15
## 4 37 6
## 5 38 12
## 6 39 14
## 7 40 11
## 8 41 32
## 9 42 26
## 10 43 26
## 11 44 36
## 12 45 25
## 13 46 23
## 14 47 18
## 15 48 23
## 16 49 17
## 17 50 21
## 18 51 39
## 19 52 43
## 20 53 26
## 21 54 53
## 22 55 30
## 23 56 39
## 24 57 57
## 25 58 68
## 26 59 46
## 27 60 37
## 28 61 31
## 29 62 37
## 30 63 32
## 31 64 34
## 32 65 27
## 33 66 25
## 34 67 31
## 35 68 12
## 36 69 9
## 37 70 14
## 38 71 11
## 39 74 3
## 40 76 3
## 41 77 3
# Age frequencies as a base R table.
# NOTE(review): repeats the table printed earlier in the script with
# identical output.
table(heart$age)
##
## 29 34 35 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
## 4 6 15 6 12 14 11 32 26 26 36 25 23 18 23 17 21 39 43 26 53 30 39 57 68 46
## 60 61 62 63 64 65 66 67 68 69 70 71 74 76 77
## 37 31 37 32 34 27 25 31 12 9 14 11 3 3 3
# Bin age into 4-year, right-closed intervals: (25,29], (29,33], ..., (73,77].
heart$age_grp1 <- cut(heart$age, breaks = seq(from = 25, to = 77, by = 4))
# Rows per age bin; count() lists only the bins that actually occur.
count(heart, age_grp1)
## age_grp1 n
## 1 (25,29] 4
## 2 (33,37] 27
## 3 (37,41] 69
## 4 (41,45] 113
## 5 (45,49] 81
## 6 (49,53] 129
## 7 (53,57] 179
## 8 (57,61] 182
## 9 (61,65] 130
## 10 (65,69] 77
## 11 (69,73] 25
## 12 (73,77] 9
# Rows per age bin as a base R table; unlike count(), table() also reports
# empty bins such as (29,33].
table(heart$age_grp1)
##
## (25,29] (29,33] (33,37] (37,41] (41,45] (45,49] (49,53] (53,57] (57,61] (61,65]
## 4 0 27 69 113 81 129 179 182 130
## (65,69] (69,73] (73,77]
## 77 25 9
# Bin age into 5-year, right-closed intervals. The breaks run to 80 rather
# than 77: seq(25, 77, 5) stops at 75, which would leave ages 76 and 77
# outside every interval and turn them into NA.
heart$age_grp <- cut(heart$age, breaks = seq(25, 80, 5))

# Number of heart-disease cases (target == 1) in each age group.
target_by_age <- heart %>%
  group_by(age_grp) %>%
  summarise(heart_disease = sum(target))
target_by_age
## # A tibble: 11 x 2
## age_grp heart_disease
## <fct> <int>
## 1 (25,30] 4
## 2 (30,35] 14
## 3 (35,40] 27
## 4 (40,45] 111
## 5 (45,50] 55
## 6 (50,55] 117
## 7 (55,60] 87
## 8 (60,65] 56
## 9 (65,70] 38
## 10 (70,75] 14
## 11 (75,80] 3
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.0.5
# Bar chart of the number of heart-disease cases per age group.
ggplot(target_by_age, aes(x = age_grp, y = heart_disease)) +
  geom_col(fill = "#f68060", alpha = 0.6, width = 0.4) +
  labs(
    x = "Age group",
    y = "No. of people with heart disease",
    title = "No. of heart disease in age group"
  ) +
  theme_economist()
# Bin age into 5-year, right-closed intervals. The breaks extend to 80 so
# that ages 76 and 77 get their own bin; seq(25, 77, 5) stops at 75 and
# would turn those ages into NA.
heart$age_grp <- cut(heart$age, breaks = seq(25, 80, 5))

# Share of people with heart disease in each age group. target is coded 0/1,
# so its mean within a group is the proportion of cases.
target_by_age1 <- heart %>%
  group_by(age_grp) %>%
  summarise(heart_disease = mean(target))
target_by_age1
## # A tibble: 11 x 2
## age_grp heart_disease
## <fct> <dbl>
## 1 (25,30] 1
## 2 (30,35] 0.667
## 3 (35,40] 0.628
## 4 (40,45] 0.766
## 5 (45,50] 0.539
## 6 (50,55] 0.613
## 7 (55,60] 0.352
## 8 (60,65] 0.348
## 9 (65,70] 0.418
## 10 (70,75] 1
## 11 (75,80] 0.5
library(ggthemes)
# Bar chart of the share of people with heart disease per age group. The
# y values are proportions (0 to 1), so the axis label and title say
# "Proportion" rather than "No. of".
target_by_age1 %>%
  ggplot(aes(x = age_grp, y = heart_disease)) +
  geom_bar(stat = "identity", fill = "#f68060", alpha = .6, width = .4) +
  xlab("Age group") +
  ylab("Proportion of people with heart disease") +
  ggtitle("Proportion of heart disease in age group") +
  theme_economist()
# Load the `women` data set and list its column names.
data("women")
names(women)
## [1] "height" "weight"
# Print the compact structure of the data frame (column types, first values).
str(women)
## 'data.frame': 15 obs. of 2 variables:
## $ height: num 58 59 60 61 62 63 64 65 66 67 ...
## $ weight: num 115 117 120 123 126 129 132 135 139 142 ...
# Show the first six rows as an interactive table.
datatable(head(women))
# Correlation matrix of the columns.
cor(x = women)
## height weight
## height 1.0000000 0.9954948
## weight 0.9954948 1.0000000
# Draw the upper triangle of the correlation matrix twice: first with the
# coefficients printed as numbers, then encoded as circles.
invisible(lapply(
  c("number", "circle"),
  function(plot_method) {
    corrplot(cor(women), type = "upper", method = plot_method)
  }
))
# Simple linear regression of weight on height, followed by the coefficient
# table, residual summary and R-squared.
fit <- lm(weight ~ height, data = women)
summary(fit)
##
## Call:
## lm(formula = weight ~ height, data = women)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.7333 -1.1333 -0.3833 0.7417 3.1167
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -87.51667 5.93694 -14.74 1.71e-09 ***
## height 3.45000 0.09114 37.85 1.09e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.525 on 13 degrees of freedom
## Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
## F-statistic: 1433 on 1 and 13 DF, p-value: 1.091e-14
# First six fitted weights from the model.
fitted(fit) %>% head()
## 1 2 3 4 5 6
## 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333
# Store the fitted values on the data frame.
women$prediction_value <- fitted(fit)
# NOTE(review): despite its name, prediction_value1 holds the model residuals
# returned by residuals(fit), not predictions.
women$prediction_value1 <- residuals(fit)
# Print the residuals.
women$prediction_value1
## [1] 2.41666667 0.96666667 0.51666667 0.06666667 -0.38333333 -0.83333333
## [7] -1.28333333 -1.73333333 -1.18333333 -1.63333333 -1.08333333 -0.53333333
## [13] 0.01666667 1.56666667 3.11666667
# Print the fitted values.
women$prediction_value
## [1] 112.5833 116.0333 119.4833 122.9333 126.3833 129.8333 133.2833 136.7333
## [9] 140.1833 143.6333 147.0833 150.5333 153.9833 157.4333 160.8833