Title: CUNY SPS MDS DATA607_Tidyverse Extend

Author: Charles Ugiagbe

Date: 11/13/2021

Part 1: Tidyverse create

The Part 1 Tidyverse Create Assignment was done by “Jiho Kim”.

The Part 2: Tidyverse Extend was done by me. I added some more code to the Tidyverse create using the same dataset heart.csv

Source: Kaggle data

library(tidyverse)

Retrieve Data using utils::read.csv

I chose the heart failure dataset from Kaggle and read the CSV file using the read.csv function.

# Download the heart-failure CSV from GitHub into a data frame, pass it to
# view() (which opens the data viewer in interactive sessions), and preview
# the first six rows.
heartdata <- read.csv(
  file = "https://raw.githubusercontent.com/jihokim97/FALL2021TIDYVERSE/Tidyverse/heart.csv"
)
view(heartdata)
head(heartdata, n = 6L)
##   Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## 1  40   M           ATA       140         289         0     Normal   172
## 2  49   F           NAP       160         180         0     Normal   156
## 3  37   M           ATA       130         283         0         ST    98
## 4  48   F           ASY       138         214         0     Normal   108
## 5  54   M           NAP       150         195         0     Normal   122
## 6  39   M           NAP       120         339         0     Normal   170
##   ExerciseAngina Oldpeak ST_Slope HeartDisease
## 1              N     0.0       Up            0
## 2              N     1.0     Flat            1
## 3              N     0.0       Up            0
## 4              Y     1.5     Flat            1
## 5              N     0.0       Up            0
## 6              N     0.0       Up            0

Dplyr

I used the dplyr package to demonstrate different capabilities.

Filter

We can use the filter function to keep only the desired rows of the data frame.

# Keep only the rows whose Cholesterol value exceeds 200, then preview them.
filteredheart <- filter(heartdata, Cholesterol > 200)
head(filteredheart, n = 6L)
##   Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## 1  40   M           ATA       140         289         0     Normal   172
## 2  37   M           ATA       130         283         0         ST    98
## 3  48   F           ASY       138         214         0     Normal   108
## 4  39   M           NAP       120         339         0     Normal   170
## 5  45   F           ATA       130         237         0     Normal   170
## 6  54   M           ATA       110         208         0     Normal   142
##   ExerciseAngina Oldpeak ST_Slope HeartDisease
## 1              N     0.0       Up            0
## 2              N     0.0       Up            0
## 3              Y     1.5     Flat            1
## 4              N     0.0       Up            0
## 5              N     0.0       Up            0
## 6              N     0.0       Up            0

Select

We can select certain desired columns using the select function.

# Keep just the Age, MaxHR and HeartDisease columns, in that order.
selectheart <- heartdata %>%
  select(Age, MaxHR, HeartDisease)
head(selectheart, n = 6L)
##   Age MaxHR HeartDisease
## 1  40   172            0
## 2  49   156            1
## 3  37    98            0
## 4  48   108            1
## 5  54   122            0
## 6  39   170            0

group_by and Summarise

We can group a data frame by one or more columns and collapse each group into a single summary row using summarise.

# Count the rows for each value of Sex: group_by() defines the groups and
# summarise() collapses every group to one row holding its size, n().
bysex <- heartdata %>%
  group_by(Sex) %>%
  summarise(count = n())

# Print the per-group counts.
bysex
## # A tibble: 2 x 2
##   Sex   count
##   <chr> <int>
## 1 F       193
## 2 M       725

Part 2: Tidyverse Extend

dplyr:: select

Q: How do I select a range of columns (from one column to another) in a table?

A: By using the dplyr select function

# Take the contiguous run of columns from Age through Cholesterol and
# append HeartDisease after them.
heartdata2 <- select(heartdata, Age:Cholesterol, HeartDisease)
head(heartdata2, n = 6L)
##   Age Sex ChestPainType RestingBP Cholesterol HeartDisease
## 1  40   M           ATA       140         289            0
## 2  49   F           NAP       160         180            1
## 3  37   M           ATA       130         283            0
## 4  48   F           ASY       138         214            1
## 5  54   M           NAP       150         195            0
## 6  39   M           NAP       120         339            0

dplyr::rename

Q: How do I rename a variable in a table?

A: I use dplyr rename function

# Rename the Sex column to Gender; rename() takes new_name = old_name.
heartdata2 <- heartdata2 %>%
  rename(Gender = Sex)
head(heartdata2, n = 6L)
##   Age Gender ChestPainType RestingBP Cholesterol HeartDisease
## 1  40      M           ATA       140         289            0
## 2  49      F           NAP       160         180            1
## 3  37      M           ATA       130         283            0
## 4  48      F           ASY       138         214            1
## 5  54      M           NAP       150         195            0
## 6  39      M           NAP       120         339            0

dplyr::case_when()

Q: How do I vectorise multiple if-else statements?

A: By Using the case_when() function

# Add a Heart_condition label derived from Cholesterol: values of 300 or
# more are "Severe"; the TRUE branch is the catch-all for every other row.
heartdata3 <- mutate(
  heartdata2,
  Heart_condition = case_when(
    Cholesterol >= 300 ~ "Severe",
    TRUE ~ "Not_severe"
  )
)
head(heartdata3, n = 6L)
##   Age Gender ChestPainType RestingBP Cholesterol HeartDisease Heart_condition
## 1  40      M           ATA       140         289            0      Not_severe
## 2  49      F           NAP       160         180            1      Not_severe
## 3  37      M           ATA       130         283            0      Not_severe
## 4  48      F           ASY       138         214            1      Not_severe
## 5  54      M           NAP       150         195            0      Not_severe
## 6  39      M           NAP       120         339            0          Severe

ggplot2::Barplot

Q: I want to plot a bar chart of the count of males and females in the dataset

A: Use the ggplot function and the geom_bar layer

# Bar chart of the number of rows for each Gender value; geom_bar() counts
# the rows itself, so only the x aesthetic is mapped.
ggplot(heartdata2, aes(x = Gender)) +
  geom_bar(fill = "blue") +
  labs(title = "Bar chart for count of Sex") +
  theme_bw()

coord_flip()

Q: How do I flip co-ordinates?

A: Add the coord_flip layer

# Same Gender bar chart, with coord_flip() swapping the x and y axes so the
# bars run horizontally.
ggplot(heartdata2, mapping = aes(x = Gender)) +
  geom_bar(fill = "blue") +
  labs(title = "Bar chart for count of Sex") +
  theme_bw() +
  coord_flip()

ggplot: Histogram

# Histogram of Cholesterol level, using bins 30 units wide and no y-axis
# title (y = NULL removes the label).
ggplot(heartdata2, aes(x = Cholesterol)) +
  geom_histogram(binwidth = 30, fill = "purple") +
  labs(title = "Distribution of Cholesterol level", y = NULL)