Tidyverse Create Assignment Vignette

Vignette Info

This vignette demonstrates how to use some functions from the dplyr and ggplot2 packages in the tidyverse suite of packages.

Data

The data used to demonstrate the usage of the dplyr and ggplot2 packages was obtained from Kaggle. See the link below:
Heart Failure Dataset

Load Libraries

library(tidyverse)

Read the data

# Location of the raw heart-failure CSV file hosted on GitHub
url <- "https://raw.githubusercontent.com/chinedu2301/DATA607-Data-Acquisition-and-Management/main/heart.csv"

# Download the file and parse it into a tibble
heart_failure <- read_csv(file = url)

# Preview the first rows to see which columns are present
heart_failure %>%
  head()

## # A tibble: 6 x 12
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <chr> <chr>             <dbl>       <dbl>     <dbl> <chr>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    49 F     NAP                 160         180         0 Normal       156
## 3    37 M     ATA                 130         283         0 ST            98
## 4    48 F     ASY                 138         214         0 Normal       108
## 5    54 M     NAP                 150         195         0 Normal       122
## 6    39 M     NAP                 120         339         0 Normal       170
## # ... with 4 more variables: ExerciseAngina <chr>, Oldpeak <dbl>,
## #   ST_Slope <chr>, HeartDisease <dbl>

dplyr package

glimpse

Q: What are the variables and data types of the variables in my dataset?
A: Use dplyr’s glimpse function.

# List every column with its type and its first few values
heart_failure %>%
  glimpse()

## Rows: 918
## Columns: 12
## $ Age            <dbl> 40, 49, 37, 48, 54, 39, 45, 54, 37, 48, 37, 58, 39, 49,~
## $ Sex            <chr> "M", "F", "M", "F", "M", "M", "F", "M", "M", "F", "F", ~
## $ ChestPainType  <chr> "ATA", "NAP", "ATA", "ASY", "NAP", "NAP", "ATA", "ATA",~
## $ RestingBP      <dbl> 140, 160, 130, 138, 150, 120, 130, 110, 140, 120, 130, ~
## $ Cholesterol    <dbl> 289, 180, 283, 214, 195, 339, 237, 208, 207, 284, 211, ~
## $ FastingBS      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ RestingECG     <chr> "Normal", "Normal", "ST", "Normal", "Normal", "Normal",~
## $ MaxHR          <dbl> 172, 156, 98, 108, 122, 170, 170, 142, 130, 120, 142, 9~
## $ ExerciseAngina <chr> "N", "N", "N", "Y", "N", "N", "N", "N", "Y", "N", "N", ~
## $ Oldpeak        <dbl> 0.0, 1.0, 0.0, 1.5, 0.0, 0.0, 0.0, 0.0, 1.5, 0.0, 0.0, ~
## $ ST_Slope       <chr> "Up", "Flat", "Up", "Flat", "Up", "Up", "Up", "Up", "Fl~
## $ HeartDisease   <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1~

select

Q: I do not need all the columns. How do I select only the columns that I need for my analysis?
A: Use dplyr’s select function.

# Drop Oldpeak and ST_Slope, keeping every other column
heart_failure_sub <- select(heart_failure, -Oldpeak, -ST_Slope)

# Keep only the four named columns: Age, Sex, RestingBP, and HeartDisease
heart_failure_sub1 <- select(heart_failure, Age, Sex, RestingBP, HeartDisease)

# Keep the contiguous range of columns from Age through ExerciseAngina
heart_failure_sub2 <- select(heart_failure, Age:ExerciseAngina)

View the head of heart_failure_sub

# Preview the subset that excludes Oldpeak and ST_Slope
heart_failure_sub %>%
  head()

## # A tibble: 6 x 10
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <chr> <chr>             <dbl>       <dbl>     <dbl> <chr>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    49 F     NAP                 160         180         0 Normal       156
## 3    37 M     ATA                 130         283         0 ST            98
## 4    48 F     ASY                 138         214         0 Normal       108
## 5    54 M     NAP                 150         195         0 Normal       122
## 6    39 M     NAP                 120         339         0 Normal       170
## # ... with 2 more variables: ExerciseAngina <chr>, HeartDisease <dbl>

View the head of heart_failure_sub1

# Preview the four-column subset (Age, Sex, RestingBP, HeartDisease)
heart_failure_sub1 %>%
  head()

## # A tibble: 6 x 4
##     Age Sex   RestingBP HeartDisease
##   <dbl> <chr>     <dbl>        <dbl>
## 1    40 M           140            0
## 2    49 F           160            1
## 3    37 M           130            0
## 4    48 F           138            1
## 5    54 M           150            0
## 6    39 M           120            0

View the head of heart_failure_sub2

# Preview the subset spanning columns Age through ExerciseAngina
heart_failure_sub2 %>%
  head()

## # A tibble: 6 x 9
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <chr> <chr>             <dbl>       <dbl>     <dbl> <chr>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    49 F     NAP                 160         180         0 Normal       156
## 3    37 M     ATA                 130         283         0 ST            98
## 4    48 F     ASY                 138         214         0 Normal       108
## 5    54 M     NAP                 150         195         0 Normal       122
## 6    39 M     NAP                 120         339         0 Normal       170
## # ... with 1 more variable: ExerciseAngina <chr>

filter

Q: I want only observations where Sex is M
A: Use dplyr’s filter function.

# Keep only the observations whose Sex value is "M"
heart_failure_male <- filter(heart_failure, Sex == "M")

# Preview the filtered rows
heart_failure_male %>%
  head()

## # A tibble: 6 x 12
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <chr> <chr>             <dbl>       <dbl>     <dbl> <chr>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    37 M     ATA                 130         283         0 ST            98
## 3    54 M     NAP                 150         195         0 Normal       122
## 4    39 M     NAP                 120         339         0 Normal       170
## 5    54 M     ATA                 110         208         0 Normal       142
## 6    37 M     ASY                 140         207         0 Normal       130
## # ... with 4 more variables: ExerciseAngina <chr>, Oldpeak <dbl>,
## #   ST_Slope <chr>, HeartDisease <dbl>

Q: I want only observations where RestingECG is Normal
A: Use dplyr’s filter function.

# Keep only the observations whose RestingECG value is "Normal".
# NOTE(review): despite its name, heart_failure_disease is filtered on
# RestingECG, not on the HeartDisease column -- consider a clearer name.
heart_failure_disease <- filter(heart_failure, RestingECG == "Normal")

# Preview the filtered rows
heart_failure_disease %>%
  head()

## # A tibble: 6 x 12
##     Age Sex   ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <chr> <chr>             <dbl>       <dbl>     <dbl> <chr>      <dbl>
## 1    40 M     ATA                 140         289         0 Normal       172
## 2    49 F     NAP                 160         180         0 Normal       156
## 3    48 F     ASY                 138         214         0 Normal       108
## 4    54 M     NAP                 150         195         0 Normal       122
## 5    39 M     NAP                 120         339         0 Normal       170
## 6    45 F     ATA                 130         237         0 Normal       170
## # ... with 4 more variables: ExerciseAngina <chr>, Oldpeak <dbl>,
## #   ST_Slope <chr>, HeartDisease <dbl>

rename

Q: How do I rename a column?
A: Use dplyr’s rename function.

# Rename the Sex column to Gender (new_name = old_name). The result
# overwrites heart_failure, so the column is called Gender from here on.
heart_failure <- rename(heart_failure, Gender = Sex)

# Preview the data to confirm the new column name
heart_failure %>%
  head()

## # A tibble: 6 x 12
##     Age Gender ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
##   <dbl> <chr>  <chr>             <dbl>       <dbl>     <dbl> <chr>      <dbl>
## 1    40 M      ATA                 140         289         0 Normal       172
## 2    49 F      NAP                 160         180         0 Normal       156
## 3    37 M      ATA                 130         283         0 ST            98
## 4    48 F      ASY                 138         214         0 Normal       108
## 5    54 M      NAP                 150         195         0 Normal       122
## 6    39 M      NAP                 120         339         0 Normal       170
## # ... with 4 more variables: ExerciseAngina <chr>, Oldpeak <dbl>,
## #   ST_Slope <chr>, HeartDisease <dbl>

ggplot2 package

Scatterplot

Q: I want to plot a scatter plot of RestingBP vs. Cholesterol
A: Use the ggplot function and the geom_point layer

# Scatter plot with Cholesterol on the x-axis and RestingBP on the y-axis
ggplot(heart_failure, aes(x = Cholesterol, y = RestingBP)) +
  geom_point()

Add title, transparency, and theme

# Same scatter plot with half-transparent points, a title, and the
# black-and-white theme; one layer per line
ggplot(heart_failure, aes(x = Cholesterol, y = RestingBP)) +
  geom_point(alpha = 0.5) +
  labs(title = "RestingBP vs. Cholesterol") +
  theme_bw()

Barplot

Q: I want to plot a bar chart of the count of males and females in the dataset
A: Use the ggplot function and the geom_bar layer

# Bar chart of the number of rows per Gender value; geom_bar counts the
# rows itself, so only the x aesthetic is mapped
ggplot(heart_failure, aes(x = Gender)) +
  geom_bar(fill = "brown") +
  labs(title = "Bar chart for count of Sex") +
  theme_bw()

Q: How do I flip the coordinates?
A: Add the coord_flip layer

# Same bar chart with the axes swapped by coord_flip(), so the bars
# run horizontally
ggplot(heart_failure, aes(x = Gender)) +
  geom_bar(fill = "brown") +
  labs(title = "Bar chart for count of Sex") +
  theme_bw() +
  coord_flip()