2.1 Import necessary libraries and dataset.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tidyr)
library(ggplot2)
# Load the survey extract by its full path rather than calling setwd():
# changing the working directory is a hidden, machine-specific side effect,
# and in a knitted document it does not persist beyond the current chunk.
alldata <- read.csv(file.path("~/作业/wqd7004/groupwork", "data3.csv"))
2.2 Data Overview
# Work on a copy so the imported data frame itself is left untouched.
data1 <- alldata

# Remember the graphics settings once, before any of them are changed, so
# they can be restored after the overview plots are drawn.
opar <- par(no.readonly = TRUE)

# Each entry is one page of histograms: the panel grid and the columns to
# draw on it. Columns are looked up by name in `data1`, so nothing has to be
# attach()ed to the search path.
histogram_pages <- list(
  list(
    grid = c(3, 3),
    columns = c(
      "HeartDiseaseorAttack", "HighBP", "HighChol",
      "CholCheck", "BMI", "Smoker",
      "Stroke", "Diabetes", "PhysActivity"
    )
  ),
  list(
    grid = c(3, 3),
    columns = c(
      "Fruits", "Veggies", "HvyAlcoholConsump",
      "AnyHealthcare", "NoDocbcCost", "GenHlth",
      "MentHlth", "PhysHlth", "DiffWalk"
    )
  ),
  list(
    grid = c(2, 2),
    columns = c("Sex", "Age", "Education", "Income")
  )
)

for (page in histogram_pages) {
  par(mfrow = page$grid)
  for (column in page$columns) {
    # xlab is set explicitly so the axis shows the column name instead of
    # the deparsed expression `data1[[column]]`.
    hist(data1[[column]], main = column, xlab = column)
  }
}

# Put the graphics device back into the state it had before the overview.
par(opar)

2.3 Data Description
Contains 22 variables (before cleaning process) and 253680 rows
Consists of a single data type: numerical (all columns are stored as integers)
# Show the storage type and the first few values of every column.
str(alldata)
## 'data.frame': 100638 obs. of 22 variables:
## $ HeartDiseaseorAttack: int 0 0 0 0 0 0 0 0 1 0 ...
## $ HighBP : int 1 0 1 1 1 1 1 1 1 0 ...
## $ HighChol : int 1 0 1 0 1 1 0 1 1 0 ...
## $ CholCheck : int 1 0 1 1 1 1 1 1 1 1 ...
## $ BMI : int 40 25 28 27 24 25 30 25 30 24 ...
## $ Smoker : int 1 1 0 0 0 1 1 1 1 0 ...
## $ Stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diabetes : int 0 0 0 0 0 0 0 0 2 0 ...
## $ PhysActivity : int 0 1 0 1 1 1 0 1 0 0 ...
## $ Fruits : int 0 0 1 1 1 1 0 0 1 0 ...
## $ Veggies : int 1 0 0 1 1 1 0 1 1 1 ...
## $ HvyAlcoholConsump : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AnyHealthcare : int 1 0 1 1 1 1 1 1 1 1 ...
## $ NoDocbcCost : int 0 1 1 0 0 0 0 0 0 0 ...
## $ GenHlth : int 5 3 5 2 2 2 3 3 5 2 ...
## $ MentHlth : int 18 0 30 0 3 0 0 0 30 0 ...
## $ PhysHlth : int 15 0 30 0 0 2 14 0 30 0 ...
## $ DiffWalk : int 1 0 1 0 0 0 0 1 1 0 ...
## $ Sex : int 0 0 0 0 0 1 0 0 0 1 ...
## $ Age : int 9 7 9 11 11 10 9 11 9 8 ...
## $ Education : int 4 6 4 3 5 6 6 4 5 4 ...
## $ Income : int 3 1 8 6 4 8 7 4 1 3 ...
# Per-column minimum, quartiles, median, mean and maximum.
summary(alldata)
## HeartDiseaseorAttack HighBP HighChol CholCheck
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.2374 Mean :0.4812 Mean :0.4724 Mean :0.9673
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## BMI Smoker Stroke Diabetes
## Min. :12.00 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:24.00 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :27.00 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :28.62 Mean :0.4702 Mean :0.06043 Mean :0.3589
## 3rd Qu.:31.00 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :98.00 Max. :1.0000 Max. :1.00000 Max. :2.0000
## PhysActivity Fruits Veggies HvyAlcoholConsump
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.00000
## Median :1.0000 Median :1.0000 Median :1.0000 Median :0.00000
## Mean :0.7433 Mean :0.6276 Mean :0.8048 Mean :0.05312
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## AnyHealthcare NoDocbcCost GenHlth MentHlth
## Min. :0.0000 Min. :0.00000 Min. :1.000 Min. : 0.000
## 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:2.000 1st Qu.: 0.000
## Median :1.0000 Median :0.00000 Median :3.000 Median : 0.000
## Mean :0.9514 Mean :0.08857 Mean :2.648 Mean : 3.392
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:3.000 3rd Qu.: 2.000
## Max. :1.0000 Max. :1.00000 Max. :5.000 Max. :30.000
## PhysHlth DiffWalk Sex Age
## Min. : 0.000 Min. :0.0000 Min. :0.0000 Min. : 1.000
## 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 6.000
## Median : 0.000 Median :0.0000 Median :0.0000 Median : 9.000
## Mean : 5.026 Mean :0.2077 Mean :0.4627 Mean : 8.393
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:11.000
## Max. :30.000 Max. :1.0000 Max. :1.0000 Max. :13.000
## Education Income
## Min. :1.000 Min. :1.000
## 1st Qu.:4.000 1st Qu.:5.000
## Median :5.000 Median :7.000
## Mean :5.017 Mean :5.937
## 3rd Qu.:6.000 3rd Qu.:8.000
## Max. :6.000 Max. :8.000
# Preview the first six rows of the data frame.
head(alldata)
## HeartDiseaseorAttack HighBP HighChol CholCheck BMI Smoker Stroke Diabetes
## 1 0 1 1 1 40 1 0 0
## 2 0 0 0 0 25 1 0 0
## 3 0 1 1 1 28 0 0 0
## 4 0 1 0 1 27 0 0 0
## 5 0 1 1 1 24 0 0 0
## 6 0 1 1 1 25 1 0 0
## PhysActivity Fruits Veggies HvyAlcoholConsump AnyHealthcare NoDocbcCost
## 1 0 0 1 0 1 0
## 2 1 0 0 0 0 1
## 3 0 1 0 0 1 1
## 4 1 1 1 0 1 0
## 5 1 1 1 0 1 0
## 6 1 1 1 0 1 0
## GenHlth MentHlth PhysHlth DiffWalk Sex Age Education Income
## 1 5 18 15 1 0 9 4 3
## 2 3 0 0 0 0 7 6 1
## 3 5 30 30 1 0 9 4 8
## 4 2 0 0 0 0 11 3 6
## 5 2 3 0 0 0 11 5 4
## 6 2 0 2 0 1 10 6 8
# List the column names. Note that ls() returns them in alphabetical order,
# not in the order in which the columns appear in the data frame.
ls(alldata)
## [1] "Age" "AnyHealthcare" "BMI"
## [4] "CholCheck" "Diabetes" "DiffWalk"
## [7] "Education" "Fruits" "GenHlth"
## [10] "HeartDiseaseorAttack" "HighBP" "HighChol"
## [13] "HvyAlcoholConsump" "Income" "MentHlth"
## [16] "NoDocbcCost" "PhysActivity" "PhysHlth"
## [19] "Sex" "Smoker" "Stroke"
## [22] "Veggies"
# Number of rows and number of columns.
dim(alldata)
## [1] 100638 22
#### 3.1 Descriptive analysis. First, in this section, we
computed the most basic descriptive statistics; the results
obtained are shown in the output below.
# Re-inspect column types and leading values (same call as in section 2.3).
str(alldata)
## 'data.frame': 100638 obs. of 22 variables:
## $ HeartDiseaseorAttack: int 0 0 0 0 0 0 0 0 1 0 ...
## $ HighBP : int 1 0 1 1 1 1 1 1 1 0 ...
## $ HighChol : int 1 0 1 0 1 1 0 1 1 0 ...
## $ CholCheck : int 1 0 1 1 1 1 1 1 1 1 ...
## $ BMI : int 40 25 28 27 24 25 30 25 30 24 ...
## $ Smoker : int 1 1 0 0 0 1 1 1 1 0 ...
## $ Stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Diabetes : int 0 0 0 0 0 0 0 0 2 0 ...
## $ PhysActivity : int 0 1 0 1 1 1 0 1 0 0 ...
## $ Fruits : int 0 0 1 1 1 1 0 0 1 0 ...
## $ Veggies : int 1 0 0 1 1 1 0 1 1 1 ...
## $ HvyAlcoholConsump : int 0 0 0 0 0 0 0 0 0 0 ...
## $ AnyHealthcare : int 1 0 1 1 1 1 1 1 1 1 ...
## $ NoDocbcCost : int 0 1 1 0 0 0 0 0 0 0 ...
## $ GenHlth : int 5 3 5 2 2 2 3 3 5 2 ...
## $ MentHlth : int 18 0 30 0 3 0 0 0 30 0 ...
## $ PhysHlth : int 15 0 30 0 0 2 14 0 30 0 ...
## $ DiffWalk : int 1 0 1 0 0 0 0 1 1 0 ...
## $ Sex : int 0 0 0 0 0 1 0 0 0 1 ...
## $ Age : int 9 7 9 11 11 10 9 11 9 8 ...
## $ Education : int 4 6 4 3 5 6 6 4 5 4 ...
## $ Income : int 3 1 8 6 4 8 7 4 1 3 ...
#### 1. Descriptive analysis. First, in this section, we
computed the most basic descriptive statistics; the results
obtained are shown in the output below.
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# psych::describe(): per-column n, mean, sd, median, trimmed mean, mad,
# min, max, range, skew, kurtosis and standard error.
describe(alldata)
## vars n mean sd median trimmed mad min max range
## HeartDiseaseorAttack 1 100638 0.24 0.43 0 0.17 0.00 0 1 1
## HighBP 2 100638 0.48 0.50 0 0.48 0.00 0 1 1
## HighChol 3 100638 0.47 0.50 0 0.47 0.00 0 1 1
## CholCheck 4 100638 0.97 0.18 1 1.00 0.00 0 1 1
## BMI 5 100638 28.62 7.23 27 27.79 5.93 12 98 86
## Smoker 6 100638 0.47 0.50 0 0.46 0.00 0 1 1
## Stroke 7 100638 0.06 0.24 0 0.00 0.00 0 1 1
## Diabetes 8 100638 0.36 0.75 0 0.20 0.00 0 2 2
## PhysActivity 9 100638 0.74 0.44 1 0.80 0.00 0 1 1
## Fruits 10 100638 0.63 0.48 1 0.66 0.00 0 1 1
## Veggies 11 100638 0.80 0.40 1 0.88 0.00 0 1 1
## HvyAlcoholConsump 12 100638 0.05 0.22 0 0.00 0.00 0 1 1
## AnyHealthcare 13 100638 0.95 0.21 1 1.00 0.00 0 1 1
## NoDocbcCost 14 100638 0.09 0.28 0 0.00 0.00 0 1 1
## GenHlth 15 100638 2.65 1.12 3 2.60 1.48 1 5 4
## MentHlth 16 100638 3.39 7.68 0 1.18 0.00 0 30 30
## PhysHlth 17 100638 5.03 9.45 0 2.55 0.00 0 30 30
## DiffWalk 18 100638 0.21 0.41 0 0.13 0.00 0 1 1
## Sex 19 100638 0.46 0.50 0 0.45 0.00 0 1 1
## Age 20 100638 8.39 3.05 9 8.57 2.97 1 13 12
## Education 21 100638 5.02 1.01 5 5.13 1.48 1 6 5
## Income 22 100638 5.94 2.11 7 6.22 1.48 1 8 7
## skew kurtosis se
## HeartDiseaseorAttack 1.23 -0.48 0.00
## HighBP 0.08 -1.99 0.00
## HighChol 0.11 -1.99 0.00
## CholCheck -5.26 25.66 0.00
## BMI 2.70 15.20 0.02
## Smoker 0.12 -1.99 0.00
## Stroke 3.69 11.61 0.00
## Diabetes 1.67 0.85 0.00
## PhysActivity -1.11 -0.76 0.00
## Fruits -0.53 -1.72 0.00
## Veggies -1.54 0.37 0.00
## HvyAlcoholConsump 3.99 13.88 0.00
## AnyHealthcare -4.20 15.64 0.00
## NoDocbcCost 2.90 6.39 0.00
## GenHlth 0.33 -0.57 0.00
## MentHlth 2.59 5.66 0.02
## PhysHlth 1.91 2.14 0.03
## DiffWalk 1.44 0.08 0.00
## Sex 0.15 -1.98 0.00
## Age -0.47 -0.47 0.01
## Education -0.79 0.06 0.00
## Income -0.80 -0.48 0.01
This gives us the basic summary statistics for every attribute.
Next, we look into the dataset in more detail.
####3.2 Some simple analysis and visualisation
- First, we look at a continuous variable, BMI.
#BMI
#have the distribution of BMI
library(ggplot2)
# Pull the BMI column out of the data frame as a plain vector.
alldata.BMI <- alldata$BMI
# So we can see the distribution of BMI. Next, we divide it into 6 groups:
# underweight (bmi < 18.5), normal (18.5 <= bmi < 25), overweight (25 <= bmi < 30), mildly obese (30 <= bmi < 35), moderately obese (35 <= bmi < 40), severely obese (bmi >= 40)
# underweight-0, normal-1, overweight-2, mildly obese-3, moderately obese-4, severely obese-5
# Keep only the rows that have a BMI value.
data <- subset(alldata, !is.na(BMI))
#data$BMI
#data$Stroke

# Label every row with its BMI category in one vectorised step. cut() works
# on the BMI column directly, so the data frame is not coerced to a matrix
# and no R function is called once per row.
#
# Intervals are closed on the left and open on the right (right = FALSE):
#   [-Inf, 18.5) Underweight     [18.5, 25) Normal
#   [25, 30)     Overweight      [30, 35)   Mildly obese
#   [35, 40)     Moderately obese [40, Inf] Severely obese
bmi_breaks <- c(-Inf, 18.5, 25, 30, 35, 40, Inf)
bmi_labels <- c(
  "Underweight", "Normal", "Overweight",
  "Mildly obese", "Moderately obese", "Severely obese"
)
# Stored as character (not factor) so downstream tables and group summaries
# keep ordering the categories alphabetically.
data$category <- as.character(
  cut(
    data$BMI,
    breaks = bmi_breaks,
    labels = bmi_labels,
    right = FALSE,
    include.lowest = TRUE
  )
)
#data$category
# Cross-tabulate BMI category (rows) against stroke status (columns).
stroke_counts <- table(data[["category"]], data[["Stroke"]])
# Divide each row by its total: within every BMI category, the share of
# respondents without (0) and with (1) a stroke.
stroke_proportions <- prop.table(stroke_counts, margin = 1)
# Show the row-wise proportions.
stroke_proportions
##
## 0 1
## Mildly obese 0.93547934 0.06452066
## Moderately obese 0.92803527 0.07196473
## Normal 0.94903936 0.05096064
## Overweight 0.94109332 0.05890668
## Severely obese 0.92705526 0.07294474
## Underweight 0.90104167 0.09895833
# Distribution of BMI. BMI is recorded in whole numbers, so a bin width of 1
# gives every integer value its own bar; the default of 30 bins spreads an
# uneven number of integer values across the bars and triggers the
# "Pick better value with `binwidth`" message.
ggplot(alldata, aes(x = BMI)) +
  geom_histogram(binwidth = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#After that , we try to find the relationship between the BMI and the possibility of getting heart disease
# Share of respondents with heart disease or attack in each BMI category:
# number of positive cases divided by the group size.
heart_disease_BMI <- summarise(
  group_by(data, category),
  HeartDiseaseorAttack = sum(HeartDiseaseorAttack) / n()
)
heart_disease_BMI
## # A tibble: 6 × 2
## category HeartDiseaseorAttack
## <chr> <dbl>
## 1 Mildly obese 0.278
## 2 Moderately obese 0.299
## 3 Normal 0.176
## 4 Overweight 0.237
## 5 Severely obese 0.284
## 6 Underweight 0.246
# One bar per BMI category; bar height is the heart-disease share and the
# fill colour repeats the category.
ggplot(
  data = heart_disease_BMI,
  mapping = aes(x = category, y = HeartDiseaseorAttack, fill = category)
) +
  geom_col()

# In the bar chart above, we can clearly see that people with a normal BMI are the least likely to have heart disease.
## b. Secondly, we look into some categorical data.
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Share of respondents with heart disease or attack within each HighBP group.
heart_disease_HighBP <- data %>%
  group_by(HighBP) %>%
  summarize(HeartDiseaseorAttack = sum(HeartDiseaseorAttack) / n())
#heart_disease_HighBP
# HighBP is a 0/1 indicator stored as an integer. Mapping it as a factor
# gives two discrete bars and a two-entry legend instead of a continuous
# x axis and a colour gradient; labs() keeps the plain column name as label.
ggplot(
  heart_disease_HighBP,
  aes(x = factor(HighBP), y = HeartDiseaseorAttack, fill = factor(HighBP))
) +
  geom_col() +
  labs(x = "HighBP", fill = "HighBP")

#So in this bar chart , we can see that people with HighBP are more likely to have heart disease .
# Share of respondents with heart disease or attack for each value of Sex:
# number of positive cases divided by the group size.
heart_disease_Sex <- summarize(
  group_by(data, Sex),
  HeartDiseaseorAttack = sum(HeartDiseaseorAttack) / n()
)
heart_disease_Sex
## # A tibble: 2 × 2
## Sex HeartDiseaseorAttack
## <int> <dbl>
## 1 0 0.189
## 2 1 0.294
# Sex is a 0/1 indicator stored as an integer. Mapping it as a factor gives
# two discrete bars and a two-entry legend instead of a continuous x axis
# and a colour gradient; labs() keeps the plain column name as label.
ggplot(
  heart_disease_Sex,
  aes(x = factor(Sex), y = HeartDiseaseorAttack, fill = factor(Sex))
) +
  geom_col() +
  labs(x = "Sex", fill = "Sex")

# So, from the bar chart above, we can see that the likelihood of heart disease differs by gender.
# Males are more likely to have heart disease, at close to 30%.
3. Find the relations between different attributes
# Pairwise correlations between all columns (cor() defaults to Pearson).
data.cor <- cor(alldata)
library(corrplot)
## corrplot 0.92 loaded
# corrplot() draws with zero outer margins by default, which clips the
# title; `mar` reserves room at the top so "Corrplot" is fully visible.
corrplot(
  data.cor,
  title = "Corrplot",
  method = "color",
  mar = c(0, 0, 2, 0)
)

#{r setup, include=FALSE} #knitr::opts_chunk$set(echo = TRUE) #