library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'tibble' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Load the heart-disease data set from heart.csv (path is relative to the
# working directory) into a tibble.
disease <- read_csv(file = "heart.csv")
## Parsed with column specification:
## cols(
## age = col_double(),
## sex = col_double(),
## cp = col_double(),
## trestbps = col_double(),
## chol = col_double(),
## fbs = col_double(),
## restecg = col_double(),
## thalach = col_double(),
## exang = col_double(),
## oldpeak = col_double(),
## slope = col_double(),
## ca = col_double(),
## thal = col_double(),
## target = col_double()
## )
# Preview the first six rows of the imported data.
head(disease, n = 6L)
## # A tibble: 6 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63 1 3 145 233 1 0 150 0 2.3
## 2 37 1 2 130 250 0 1 187 0 3.5
## 3 41 0 1 130 204 0 0 172 0 1.4
## 4 56 1 1 120 236 0 1 178 0 0.8
## 5 57 0 0 120 354 0 1 163 1 0.6
## 6 57 1 0 140 192 0 1 148 0 0.4
## # ... with 4 more variables: slope <dbl>, ca <dbl>, thal <dbl>,
## # target <dbl>
The package I have selected to demonstrate is dplyr.
Using filter we can select rows of the data frame matching conditions.
filter(data) ### Example
To select the people older than 20 and younger than 65, we pass the data frame disease and the condition age > 20 and age < 65 to the function. It returns the rows of the heart disease data that match the condition.
# Keep only the rows where age is strictly between 20 and 65; comma-separated
# conditions in filter() are combined with a logical AND.
disease %>%
  filter(age > 20, age < 65)
## # A tibble: 262 x 14
## age sex cp trestbps chol fbs restecg thalach exang oldpeak
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 63 1 3 145 233 1 0 150 0 2.3
## 2 37 1 2 130 250 0 1 187 0 3.5
## 3 41 0 1 130 204 0 0 172 0 1.4
## 4 56 1 1 120 236 0 1 178 0 0.8
## 5 57 0 0 120 354 0 1 163 1 0.6
## 6 57 1 0 140 192 0 1 148 0 0.4
## 7 56 0 1 140 294 0 0 153 0 1.3
## 8 44 1 1 120 263 0 1 173 0 0
## 9 52 1 2 172 199 1 1 162 0 0.5
## 10 57 1 2 150 168 0 1 174 0 1.6
## # ... with 252 more rows, and 4 more variables: slope <dbl>, ca <dbl>,
## # thal <dbl>, target <dbl>
Using select we can keep the selected variables
select(data, …)
To keep only the age, sex and cp variables, we pass the data frame disease and the column names age, sex and cp to the function.
# Keep only the age, sex and cp columns, then preview the first six rows.
df <- disease %>%
  select(age, sex, cp)
head(df, n = 6L)
## # A tibble: 6 x 3
## age sex cp
## <dbl> <dbl> <dbl>
## 1 63 1 3
## 2 37 1 2
## 3 41 0 1
## 4 56 1 1
## 5 57 0 0
## 6 57 1 0
Part 2 of the Tidyverse recipe cleans the data further so that the data set can be analysed. The following steps are taken to meet that goal: (1) change the column names, (2) change the data types of some of the columns, and (3) analyse the data set.
# Change the column names ----
# Work on a renamed copy so the raw `disease` tibble stays untouched. Names are
# assigned by position, so this vector must keep one entry per column, in the
# same order as the columns of `disease`.
# NOTE(review): several names are spelled unusually ("Cholestoral",
# "Excercise", "flourosopy") and " thal" starts with a space. They are kept
# verbatim because code outside this section may refer to them -- confirm
# before renaming.
chest_pain <- setNames(
  disease,
  c(
    "Age",
    "Sex",
    "Chest Pain Type",
    "Resting Blood Pressure",
    "Serum Cholestoral",
    "Fasting Blood Sugar",
    "Resting CardioGraphic results",
    "Maximum Heart Rate",
    "Excercise Induced angina",
    "oldpeak",
    "the slope of the peak exercise ST segment",
    "number of major vessels (0-3) colored by flourosopy",
    " thal",
    "Target"
  )
)

# Change the values and data type of the Sex column ----
# Map the numeric codes directly to labels (0 -> "Female", 1 -> "Male")
# instead of overwriting values in place, which would turn the column into
# character half-way through and rely on comparing strings with numbers.
# Any code other than 0 or 1 becomes NA rather than a stray factor level.
chest_pain$Sex <- factor(
  chest_pain$Sex,
  levels = c(0, 1),
  labels = c("Female", "Male")
)

# Target and chest pain type hold category codes, so store them as factors
# to make ggplot treat them as discrete variables.
chest_pain$Target <- as.factor(chest_pain$Target)
chest_pain$`Chest Pain Type` <- as.factor(chest_pain$`Chest Pain Type`)
# Data Analysis ----
# Distribution of Maximum heart rate (histogram).
# The column is referenced by its bare (back-ticked) name so that aes() looks
# it up in the `data` argument; using `chest_pain$...` inside aes() bypasses
# that lookup and also leaks the full expression into the axis label.
# theme_classic() is a complete theme, so a preceding theme_bw() would be
# overridden entirely and is omitted.
ggplot(chest_pain, aes(x = `Maximum Heart Rate`)) +
  geom_histogram(fill = "dodgerblue1", alpha = 0.5) +
  theme_classic()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Distribution of Maximum heart rate (kernel density estimate).
# theme_classic() is a complete theme, so a preceding theme_bw() would be
# overridden entirely and is omitted.
ggplot(chest_pain, aes(x = `Maximum Heart Rate`)) +
  geom_density(fill = "red", alpha = 0.5) +
  theme_classic()

# Base-graphics notched box plot of the same variable.
# Use TRUE rather than T: T is an ordinary variable that can be reassigned.
boxplot(
  chest_pain$`Maximum Heart Rate`,
  col = "lightblue",
  notch = TRUE,
  main = "boxplot of the maximum heart rate"
)
# Bar chart of the number of records per Sex level, with the count printed
# as a label on each bar.
# ..count.. is the variable computed by stat = "count"; it is kept in this
# form because the ggplot2 version attached by this file (3.1.0) predates
# after_stat().
ggplot(chest_pain, aes(x = Sex, fill = Sex)) +
  geom_bar(stat = "count") +
  geom_label(stat = "count", aes(label = ..count..), size = 7) +
  # Title typo fixed: "aliments" -> "ailments".
  labs(x = "Gender", y = "Count", title = "# Heart ailments by Gender") +
  theme_grey(base_size = 20) +
  theme(panel.background = element_rect(fill = "white"))
# Bar chart of the number of records per Target level, with the count printed
# as a label on each bar.
target_mapping <- aes(x = Target, fill = Target)
ggplot(chest_pain, target_mapping) +
  geom_bar(stat = "count") +
  labs(title = "Count of Target", x = "Target", y = "Count") +
  geom_label(aes(label = ..count..), stat = "count", size = 7) +
  theme_grey(base_size = 20) +
  theme(panel.background = element_rect(fill = "white"))