Importing Libraries
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following object is masked from 'package:purrr':
##
## compact
## Warning: package 'plotly' was built under R version 4.1.2
##
## Attaching package: 'plotly'
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Warning: package 'corrplot' was built under R version 4.1.2
## corrplot 0.92 loaded
## Warning: package 'ggcorrplot' was built under R version 4.1.2
Importing Data as csv File
Proj_Data <- read.csv("C:\\Users\\zulfi\\Desktop\\Dataanalysis\\heart.csv", header=TRUE, stringsAsFactors=FALSE)
Observing overall structure, like int, characters, number and factor etc
str(Proj_Data)
## 'data.frame': 918 obs. of 12 variables:
## $ Age : int 40 49 37 48 54 39 45 54 37 48 ...
## $ Sex : chr "M" "F" "M" "F" ...
## $ ChestPainType : chr "ATA" "NAP" "ATA" "ASY" ...
## $ RestingBP : int 140 160 130 138 150 120 130 110 140 120 ...
## $ Cholesterol : int 289 180 283 214 195 339 237 208 207 284 ...
## $ FastingBS : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RestingECG : chr "Normal" "Normal" "ST" "Normal" ...
## $ MaxHR : int 172 156 98 108 122 170 170 142 130 120 ...
## $ ExerciseAngina: chr "N" "N" "N" "Y" ...
## $ Oldpeak : num 0 1 0 1.5 0 0 0 0 1.5 0 ...
## $ ST_Slope : chr "Up" "Flat" "Up" "Flat" ...
## $ HeartDisease : int 0 1 0 1 0 0 0 0 1 0 ...
Observing overall structure, like feilds, features, variables
head(Proj_Data)
## Age Sex ChestPainType RestingBP Cholesterol FastingBS RestingECG MaxHR
## 1 40 M ATA 140 289 0 Normal 172
## 2 49 F NAP 160 180 0 Normal 156
## 3 37 M ATA 130 283 0 ST 98
## 4 48 F ASY 138 214 0 Normal 108
## 5 54 M NAP 150 195 0 Normal 122
## 6 39 M NAP 120 339 0 Normal 170
## ExerciseAngina Oldpeak ST_Slope HeartDisease
## 1 N 0.0 Up 0
## 2 N 1.0 Flat 1
## 3 N 0.0 Up 0
## 4 Y 1.5 Flat 1
## 5 N 0.0 Up 0
## 6 N 0.0 Up 0
Deleting unnecessary column and converting character into factors
# Deleting not related variables
Proj_Data = subset(Proj_Data, select = c(-ExerciseAngina))
# Coverting the categorical data to factor
Proj_Data$Sex <- as.factor(Proj_Data$Sex)
Proj_Data$RestingECG <- as.factor(Proj_Data$RestingECG)
# Summary after pre-processing the data
summary(Proj_Data)
## Age Sex ChestPainType RestingBP Cholesterol
## Min. :28.00 F:193 Length:918 Min. : 0.0 Min. : 0.0
## 1st Qu.:47.00 M:725 Class :character 1st Qu.:120.0 1st Qu.:173.2
## Median :54.00 Mode :character Median :130.0 Median :223.0
## Mean :53.51 Mean :132.4 Mean :198.8
## 3rd Qu.:60.00 3rd Qu.:140.0 3rd Qu.:267.0
## Max. :77.00 Max. :200.0 Max. :603.0
## FastingBS RestingECG MaxHR Oldpeak
## Min. :0.0000 LVH :188 Min. : 60.0 Min. :-2.6000
## 1st Qu.:0.0000 Normal:552 1st Qu.:120.0 1st Qu.: 0.0000
## Median :0.0000 ST :178 Median :138.0 Median : 0.6000
## Mean :0.2331 Mean :136.8 Mean : 0.8874
## 3rd Qu.:0.0000 3rd Qu.:156.0 3rd Qu.: 1.5000
## Max. :1.0000 Max. :202.0 Max. : 6.2000
## ST_Slope HeartDisease
## Length:918 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :1.0000
## Mean :0.5534
## 3rd Qu.:1.0000
## Max. :1.0000
Proj_Data$ChestPainType <- as.factor(Proj_Data$ChestPainType)
Proj_Data$RestingECG <- as.factor(Proj_Data$RestingECG)
Proj_Data$ST_Slope <- as.factor(Proj_Data$ST_Slope)
Proj_Data$HeartDisease <- as.factor(Proj_Data$HeartDisease)
Observing Null value, if its present which will create problem in advanced stages.
is.null(Proj_Data)
## [1] FALSE
structure of new manipulated dataset
summary(Proj_Data)
## Age Sex ChestPainType RestingBP Cholesterol
## Min. :28.00 F:193 ASY:496 Min. : 0.0 Min. : 0.0
## 1st Qu.:47.00 M:725 ATA:173 1st Qu.:120.0 1st Qu.:173.2
## Median :54.00 NAP:203 Median :130.0 Median :223.0
## Mean :53.51 TA : 46 Mean :132.4 Mean :198.8
## 3rd Qu.:60.00 3rd Qu.:140.0 3rd Qu.:267.0
## Max. :77.00 Max. :200.0 Max. :603.0
## FastingBS RestingECG MaxHR Oldpeak ST_Slope
## Min. :0.0000 LVH :188 Min. : 60.0 Min. :-2.6000 Down: 63
## 1st Qu.:0.0000 Normal:552 1st Qu.:120.0 1st Qu.: 0.0000 Flat:460
## Median :0.0000 ST :178 Median :138.0 Median : 0.6000 Up :395
## Mean :0.2331 Mean :136.8 Mean : 0.8874
## 3rd Qu.:0.0000 3rd Qu.:156.0 3rd Qu.: 1.5000
## Max. :1.0000 Max. :202.0 Max. : 6.2000
## HeartDisease
## 0:410
## 1:508
##
##
##
##
checking class and level of varable Sex and ChestPainType
class(Proj_Data$Sex)
## [1] "factor"
class(Proj_Data$ChestPainType)
## [1] "factor"
levels(Proj_Data$Sex)
## [1] "F" "M"
levels(Proj_Data$ChestPainType)
## [1] "ASY" "ATA" "NAP" "TA"
Converting Age varibale into three groups for further dig down into data and to figure out the answer of the question which age is more are more prone to heart diseases.
young <- Proj_Data[which((Proj_Data$Age<45)), ]
middle <- Proj_Data[which((Proj_Data$Age>=45)&(Proj_Data$Age<55)), ]
elderly <- Proj_Data[which(Proj_Data$Age>55), ]
groups <- data.frame(Age_group = c("young","middle","elderly"), group_count = c(NROW(young$Age), NROW(middle$Age), NROW(elderly$Age)))
pl<- ggplot(groups, aes(x=groups$Age_group, y=groups$group_count, fill=groups$Age_group)) +
ggtitle("Age Analysis") +
xlab("Age Group") +
ylab("Group Count") +
geom_col(stat="identity") +
scale_fill_discrete(name = "Age Group", labels = c("Elderly", "Middle", "Young"))
## Warning: Ignoring unknown parameters: stat
ggplotly(pl)
## Warning: Use of `groups$Age_group` is discouraged. Use `Age_group` instead.
## Warning: Use of `groups$group_count` is discouraged. Use `group_count` instead.
## Warning: Use of `groups$Age_group` is discouraged. Use `Age_group` instead.
The Age Analysis plot shows the grouping of the age between three groups.
Observing the answer of the question heart deseases are most prevalent in male with compare to females
ggplot(Proj_Data, aes(x= Proj_Data$Sex, fill=Proj_Data$HeartDisease))+
geom_bar()+
xlab("Gender")+
ylab("Gender Count")+
ggtitle("Analysis of Gender")+
scale_fill_discrete(name = "Heart disease",labels = c("No", "Yes"))
## Warning: Use of `Proj_Data$Sex` is discouraged. Use `Sex` instead.
## Warning: Use of `Proj_Data$HeartDisease` is discouraged. Use `HeartDisease`
## instead.
0 indicates no and 1 indicates yes in Gender Graphs.Data of Graphs shows that 458 male out of 726 are having heart diseases while 0nly 50 female out of 193 is affected by heart diseases. Males are more prone to heart diseases.
Observing chestpain variable
plt<-ggplot(Proj_Data, aes(x= ChestPainType, fill=ChestPainType)) +
geom_bar() +
xlab("Chest Pain Type") +
ylab("Count") +
ggtitle("Analysis of Chest Pain Experienced") +
scale_fill_discrete(name = "Chest Pain Type")
ggplotly(plt)
By looking bar graphs we can easily say that most of the people in our sample has ASY type of chest pain.
plo<- ggplot(Proj_Data, aes(x= ChestPainType, fill=HeartDisease)) +
geom_bar() +
xlab("Chest Pain Type") +
ylab("Count") +
ggtitle("Analysis of Chest Pain Experienced") +
scale_fill_discrete(name = "Heart disease", labels = c("No", "Yes"))
ggplotly(plo)
From the Analysis of chest pain plot we can conclude people that experienced ASY type of pain are higly affected by heart diseases other all types are also positively correlated with diseases, so chest pain that can be of any type can cause the heart diseases. Our conclusion here is that chest pain always prone to heart diseases.
ggplot(Proj_Data, aes(x= RestingECG, fill=RestingECG)) +
geom_bar() +
xlab("Resting ECG Types") +
ylab("Count") +
ggtitle("Resting ECG Types") +
theme(legend.position="none")
## Now we analyse the reading of different ECGs and will see the relationship of that ECGs with the disease.
ggplot(Proj_Data, aes(x= RestingECG, fill=HeartDisease)) +
geom_bar(position = 'dodge') +
xlab("ECG Types") +
ylab("Count") +
ggtitle("Analysis of Resting ECG Types") +
scale_fill_discrete(name = "Heart disease", labels = c("No", "Yes"))
Analysis of Resting ECG Types graphs shows that the ST reading when comparing with other two LVH and Normal reading is more positively correlated with the disease,However LVH and Normal also have possitive correlation with disease
ggplot(Proj_Data, aes(x= RestingECG, fill=HeartDisease)) +
geom_bar(position = 'dodge') +
xlab("ECG Types") +
ylab("Count") +
ggtitle("Analysis of Resting ECG Types") +
scale_fill_discrete(name = "Heart disease", labels = c("No", "Yes"))
Now we will analyse the relationship between ST_Slope and heart diseases
plott<-ggplot(Proj_Data, aes(x = ST_Slope, fill=ST_Slope))+
geom_bar()+
xlab("ECG Types") +
ylab("Count") +
ggtitle("Analysis of ST Slop Types")
ggplotly(plott)
Analysis Of ST Slop
plotst <- ggplot(Proj_Data, aes(x= ST_Slope, fill=HeartDisease)) +
geom_bar(position = 'dodge') +
xlab("ECG Types") +
ylab("Count") +
ggtitle("Analysis of ST Slop Types") +
scale_fill_discrete(name = "Heart disease", labels = c("No", "Yes"))
ggplotly(plotst)
Analysis of ST Slop Types show that st slop Flat type is more positive with heart diseases, out of 460 people 381 have heart disease with st slop of type Flat. St slope type UP reading shows a very little but positive correlation with heart disease. Similarly the st slop down is also positively correlated with desease. 49 0ut 0f 63 have heart disease with st slop down.
ggplot(Proj_Data, aes(x= FastingBS, fill = FastingBS ))+
geom_bar()+
xlab("Fasting BS") +
ylab("Count") +
ggtitle("Analysis of Fasting BS ")
Analysis of relationship of Fasting BS and heart disease.
rrp<- ggplot(Proj_Data, aes(x= FastingBS, fill = HeartDisease ))+
geom_bar()+
xlab("Fasting BS") +
ylab("Count") +
ggtitle("Analysis of Fasting BS ")
ggplotly(rrp)
Analysis of Fasting BS shows that 366 out of 704 people have heart disease with fasting bs valu 0 and 170 out of 214 people have heart disease with fasting bs value 1, so we concluded that as fasting bs value increases the the heart disease increased. If we want to control heart disease we must control fasting bs value.
boxplot(Age~HeartDisease, data = Proj_Data)
Performing T Test
t.test(Age~HeartDisease, data = Proj_Data, mu=0, conf= 0.95)
##
## Welch Two Sample t-test
##
## data: Age by HeartDisease
## t = -8.8225, df = 843.69, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -6.538260 -4.158513
## sample estimates:
## mean in group 0 mean in group 1
## 50.55122 55.89961
Our null hypotheses was that heart disease does not depend on age but we reject our null hypotheses and accept alternate hypotheses sine t value is greater than P value we can conclude that heart desease depends on age, elder people are more prone to heart disease Plating the box plot
boxplot(FastingBS~HeartDisease, data = Proj_Data)
Performing the Welch two sample test
t.test(FastingBS~HeartDisease, data = Proj_Data, mu=0, conf= 0.95)
##
## Welch Two Sample t-test
##
## data: FastingBS by HeartDisease
## t = -8.7603, df = 881.28, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.2782594 -0.1763978
## sample estimates:
## mean in group 0 mean in group 1
## 0.1073171 0.3346457
Testing the chest pain and heart diseases
boxplot(ChestPainType~HeartDisease, data = Proj_Data)
Performing T test on chest pain and heart diseases
t.test(Age~HeartDisease, data = Proj_Data, mu=0, conf= 0.95)
##
## Welch Two Sample t-test
##
## data: Age by HeartDisease
## t = -8.8225, df = 843.69, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -6.538260 -4.158513
## sample estimates:
## mean in group 0 mean in group 1
## 50.55122 55.89961
Here we also reject our null hypotheses and accept alternate hypotheses that p value is less than T value and we can say with 95% confidence that chest pain is positive effect on heart disease.
Note that the echo = FALSE
parameter was added to the code chunk to prevent printing of the R code that generated the plot.