October 12, 2023

Dataset: heart

# Preview the first rows of the heart dataset to see its columns
head(heart)
  Patient.ID Age    Sex Cholesterol Blood.Pressure Heart.Rate Diabetes
1    BMW7812  67   Male         208         158/88         72        0
2    CZE1114  21   Male         389         165/93         98        1
3    BNI9906  21 Female         324         174/99         72        1
4    JLN3497  84   Male         383        163/100         73        1
5    GFO8847  66   Male         318          91/88         93        1
6    ZOO7941  54 Female         297         172/86         48        1
  Family.History Smoking Obesity Alcohol.Consumption Exercise.Hours.Per.Week
1              0       1       0                   0                4.168189
2              1       1       1                   1                1.813242
3              0       0       0                   0                2.078353
4              1       1       0                   1                9.828130
5              1       1       1                   0                5.804299
6              1       1       0                   1                0.625008
       Diet Previous.Heart.Problems Medication.Use Stress.Level
1   Average                       0              0            9
2 Unhealthy                       1              0            1
3   Healthy                       1              1            9
4   Average                       1              0            9
5 Unhealthy                       1              0            6
6 Unhealthy                       1              1            2
  Sedentary.Hours.Per.Day Income      BMI Triglycerides
1                6.615001 261404 31.25123           286
2                4.963459 285768 27.19497           235
3                9.463426 235282 28.17657           587
4                7.648981 125640 36.46470           378
5                1.514821 160555 21.80914           231
6                7.798752 241339 20.14684           795
  Physical.Activity.Days.Per.Week Sleep.Hours.Per.Day   Country     Continent
1                               0                   6 Argentina South America
2                               1                   7    Canada North America
3                               4                   4    France        Europe
4                               3                   4    Canada North America
5                               1                   5  Thailand          Asia
6                               5                  10   Germany        Europe
           Hemisphere Heart.Attack.Risk
1 Southern Hemisphere                 0
2 Northern Hemisphere                 0
3 Northern Hemisphere                 0
4 Northern Hemisphere                 0
5 Northern Hemisphere                 0
6 Northern Hemisphere                 1

Which continent has the most people at risk of a heart attack?

# Number of patients on one continent whose Heart.Attack.Risk flag is 1.
# Counting only flagged patients is what makes the chart answer "which
# continent has the most people at risk"; counting every row would only show
# how many patients were sampled per continent.
# Rows with a missing continent or risk flag are not counted.
count_at_risk <- function(continent_name) {
  sum(
    heart$Continent == continent_name & heart$Heart.Attack.Risk == 1,
    na.rm = TRUE
  )
}

Asia <- count_at_risk("Asia")
NorthAmer <- count_at_risk("North America")
SouthAmer <- count_at_risk("South America")
Europe <- count_at_risk("Europe")
Africa <- count_at_risk("Africa")
Australia <- count_at_risk("Australia")

# Keep counts and labels in the same order so each bar gets the right label
numheart <- c(Asia, NorthAmer, SouthAmer, Europe, Africa, Australia)
continent <- c(
  "Asia", "North America", "South America", "Europe", "Africa", "Australia"
)
new_df <- data.frame(continent, numheart)

# One bar per continent, bar height = number of at-risk patients
ggplot(new_df, aes(x = continent, y = numheart, fill = continent)) +
  geom_bar(stat = "identity")

Plot how many people in Asia aged above 60 are, and are not, at risk of a heart attack

# Patients in Asia older than 60, split by their heart-attack-risk flag
asia_over_60 <- filter(heart, Continent == "Asia", Age > 60)
yes <- nrow(filter(asia_over_60, Heart.Attack.Risk == 1))
no <- nrow(filter(asia_over_60, Heart.Attack.Risk == 0))

# Counts and labels are kept in matching order (at risk first, then not)
attack_number <- c(yes, no)
attack_label <- c("Have heart attack risk", "Don't have heart attack risk")
df2 <- data.frame(attack_label, attack_number)

# Counts go on the x axis and the two labels on the y axis
ggplot(
  df2,
  aes(x = attack_number, y = attack_label, fill = attack_label)
) +
  geom_bar(stat = "identity")

Gender pie chart for the people in the dataset who are in Asia and above 60

# Percentage of men and of women among patients in Asia older than 60
seniors_in_asia <- filter(heart, Age > 60, Continent == "Asia")
n_seniors <- nrow(seniors_in_asia)
man_ratio <- nrow(filter(seniors_in_asia, Sex == "Male")) / n_seniors * 100
woman_ratio <- nrow(filter(seniors_in_asia, Sex == "Female")) / n_seniors * 100

df <- data.frame(
  group = c("Man above 60 in Asia", "Woman above 60 in Asia"),
  value = c(man_ratio, woman_ratio)
)

# A single stacked bar drawn in polar coordinates produces the pie chart
w <- ggplot(df, aes(x = "", y = value, fill = group)) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  theme_void()
w

Create a 3D plot: the x-axis is BMI, the y-axis is age, and the z-axis is heart attack risk

# Men in Asia older than 60; this subset is also the data for the regressions
asian_man <- filter(heart, Age > 60, Continent == "Asia", Sex == "Male")

# 3D scatter of markers: BMI on x, age on y, heart-attack-risk flag on z
plot_ly(
  x = asian_man$BMI,
  y = asian_man$Age,
  z = asian_man$Heart.Attack.Risk,
  type = "scatter3d",
  mode = "markers"
)

Find the multiple linear regression of heart attack risk on BMI and age (heart attack risk = 0.003 × BMI + 0.002 × Age + 0.134)

model: \(\text{Heart.Attack.Risk} = \beta_0+\beta_1\cdot\text{BMI} +\beta_2\cdot\text{Age}+ \varepsilon; \hspace{1 cm} \varepsilon \sim N(0; \sigma^2)\)

\(\beta_0 = 0.134; \hspace{1 cm}\beta_1 = 0.003; \hspace{1 cm}\beta_2 = 0.002\;\)

The linear regression function: \(\text{Heart.Attack.Risk} = \beta_0+\beta_1\cdot\text{BMI}+\beta_2\cdot\text{Age}\)

Cont.

# Least-squares fit of the heart-attack-risk flag on BMI and age,
# using the asian_man subset; summary() prints coefficients and fit statistics
model <- lm(Heart.Attack.Risk ~ BMI + Age, data = asian_man)
summary(model)
Call:
lm(formula = Heart.Attack.Risk ~ BMI + Age, data = asian_man)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.4303 -0.3806 -0.3514  0.6109  0.6804 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.134379   0.178037   0.755    0.451
BMI         0.002828   0.002909   0.972    0.331
Age         0.002079   0.002012   1.033    0.302

Residual standard error: 0.4843 on 740 degrees of freedom
Multiple R-squared:  0.002612,  Adjusted R-squared:  -8.337e-05 
F-statistic: 0.9691 on 2 and 740 DF,  p-value: 0.3799

Linear regression of heart attack risk on exercise hours and sedentary hours for Asian men above 60 (heart attack risk = 0.0016 × Exercise.Hours.Per.Week − 0.0024 × Sedentary.Hours.Per.Day + 0.3724)

model: \(\text{Heart.Attack.Risk} = \beta_0+\beta_1\cdot\text{Exercise.Hours.Per.Week} +\beta_2\cdot\text{Sedentary.Hours.Per.Day}+ \varepsilon; \hspace{1 cm} \varepsilon \sim N(0; \sigma^2)\)

\(\beta_0 = 0.3724; \hspace{1 cm}\beta_1 = 0.0016; \hspace{1 cm}\beta_2 = -0.0024\;\)

The linear regression function: \(\text{Heart.Attack.Risk} = \beta_0+\beta_1\cdot\text{Exercise.Hours.Per.Week}+\beta_2\cdot\text{Sedentary.Hours.Per.Day}\)

Cont.

# Least-squares fit of the heart-attack-risk flag on weekly exercise hours and
# daily sedentary hours, using the asian_man subset
model2 <- lm(
  Heart.Attack.Risk ~ Exercise.Hours.Per.Week + Sedentary.Hours.Per.Day,
  data = asian_man
)
summary(model2)
Call:
lm(formula = Heart.Attack.Risk ~ Exercise.Hours.Per.Week + Sedentary.Hours.Per.Day, 
    data = asian_man)

Residuals:
    Min      1Q  Median      3Q     Max 
-0.4037 -0.3770 -0.3631  0.6207  0.6515 

Coefficients:
                         Estimate Std. Error t value Pr(>|t|)    
(Intercept)              0.372434   0.045830   8.126 1.85e-15 ***
Exercise.Hours.Per.Week  0.001604   0.003163   0.507    0.612    
Sedentary.Hours.Per.Day -0.002364   0.005126  -0.461    0.645    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.4847 on 740 degrees of freedom
Multiple R-squared:  0.0005812, Adjusted R-squared:  -0.00212 
F-statistic: 0.2152 on 2 and 740 DF,  p-value: 0.8065