# Switch the session to the course folder and load the stroke data set from it.
# NOTE(review): setwd() changes the working directory for the rest of the
# session; consider building the path with file.path() once nothing else in the
# script relies on the working directory.
setwd(dir = "~/Data 101")
stroke_df <- read.csv(file = "stroke.csv", header = TRUE)
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.1.3
library(ggplot2)
library(psych)
## Warning: package 'psych' was built under R version 4.1.3
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## corrplot 0.92 loaded
library(RColorBrewer)
library(dslabs)
## Warning: package 'dslabs' was built under R version 4.1.3
library(highcharter)
## Warning: package 'highcharter' was built under R version 4.1.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'highcharter'
## The following object is masked from 'package:dslabs':
##
## stars
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.1.3
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.3
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
### This dataset describes patients' internal and external characteristics and their stroke status. It contains 5110 observations with 12 attributes. It is used to predict whether a patient is likely to have a stroke based on input parameters such as gender, age, various diseases, and smoking status. Each row in the data provides relevant information about one patient.
# Recode blank strings as NA *before* summarising missingness, so that blank
# fields are included in the counts below instead of being silently skipped.
stroke_df[stroke_df == ""] <- NA

# Total number of missing cells in the data frame.
sum(is.na(stroke_df))
# NOTE(review): the outputs recorded below were captured before the blank ->
# NA recode was moved up; they are unchanged unless the CSV has blank fields.
## [1] 201
# Share of all cells that are missing.
mean(is.na(stroke_df))
## [1] 0.003277886

# Open the data viewer for a manual look at the cleaned data.
view(stroke_df)
# Build a plotting copy of the data in which the three 0/1 indicator columns
# are relabelled: 0 becomes "No" and any other value becomes "Yes".
stroke1_df <- stroke_df %>%
  mutate(
    across(
      c(hypertension, heart_disease, stroke),
      function(flag) ifelse(flag == 0, "No", "Yes")
    )
  )
# Bar charts of patient counts per stroke status, with dodged bars coloured by
# gender (left panel) and by marital status (right panel).
Gender <- ggplot(stroke1_df, aes(x = stroke)) +
  geom_bar(mapping = aes(fill = gender), position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Stroke", y = "Count", title = "Stroke by Gender") +
  theme_minimal()

Marriage <- ggplot(stroke1_df, aes(x = stroke)) +
  geom_bar(mapping = aes(fill = ever_married), position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(x = "Stroke", y = "Count", title = "Stroke by Marriage Status") +
  theme_minimal()

# Show both charts in a single figure.
ggarrange(Gender, Marriage)
## Based on the graph above, we can observe that:
# Bar chart of patient counts per stroke status, with dodged bars coloured by
# smoking status.
Smoke <- ggplot(stroke1_df, aes(x = stroke)) +
  geom_bar(mapping = aes(fill = smoking_status), position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(x = "Stroke", y = "Count", title = "Stroke by Smoking Status") +
  theme_minimal()

ggarrange(Smoke)
# Bar charts of patient counts per stroke status, coloured by work type (f)
# and by residence type (g), stacked vertically in one figure.
f <- ggplot(stroke1_df, aes(x = stroke)) +
  geom_bar(mapping = aes(fill = work_type), position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(x = "Stroke", y = "Count", title = "Stroke by Work Type") +
  theme_minimal()

g <- ggplot(stroke1_df, aes(x = stroke)) +
  geom_bar(mapping = aes(fill = Residence_type), position = "dodge") +
  scale_fill_brewer(palette = "Set2") +
  labs(x = "Stroke", y = "Count", title = "Stroke by Residence Type") +
  theme_minimal()

ggarrange(f, g, ncol = 1)
# Keep only the rows where stroke == 1 (stroke_df still holds the 0/1 coding),
# then draw semi-transparent density curves per gender for age and for average
# glucose level.
withstroke <- filter(stroke_df, stroke == 1)

ggplot(withstroke, aes(x = age, fill = gender)) +
  geom_density(alpha = 0.2) +
  labs(title = "Stroke by Age in Male and Female")

ggplot(withstroke, aes(x = avg_glucose_level, fill = gender)) +
  geom_density(alpha = 0.2) +
  labs(title = "Stroke and Glucose Level by Gender")
# ---- BMI by gender ----
# Missing BMI values are deliberately left as NA. Overwriting them with 0 (as
# this script used to do) injects an impossible BMI into the data, which pulls
# the group means down, distorts the boxplots and biases every BMI correlation.
# t.test() and boxplot() drop NA values themselves.
Male <- subset(stroke1_df, gender == "Male")
Female <- subset(stroke1_df, gender == "Female")

# One-sided Welch two-sample t-test; the alternative hypothesis is that the
# mean BMI of male patients is lower than that of female patients.
t.test(Male$bmi, Female$bmi, alternative = "less", conf.level = 0.95)
# NOTE(review): the output recorded below was produced while missing BMI was
# coded as 0; re-run the script to refresh these statistics.
##
## Welch Two Sample t-test
##
## data: Male$bmi and Female$bmi
## t = -3.268, df = 4532.5, p-value = 0.0005455
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -0.4393942
## sample estimates:
## mean of x mean of y
## 27.23924 28.12408

# BMI distribution per gender, with outlier points hidden.
boxplot(bmi ~ gender, data = stroke1_df, outline = FALSE, col = "blue")

# ---- Numeric variables by stroke status ----
# Boxplots of each numeric variable split by stroke status; coord_flip() turns
# them so the measured variable runs along the vertical axis.
x <- ggplot(stroke1_df, aes(x = age)) +
  geom_boxplot(aes(fill = stroke)) +
  theme_minimal() +
  coord_flip() +
  ggtitle("Age")
y <- ggplot(stroke1_df, aes(x = avg_glucose_level)) +
  geom_boxplot(aes(fill = stroke)) +
  theme_minimal() +
  coord_flip() +
  ggtitle("Average Glucose Level")
# na.rm = TRUE drops the missing BMI values without a warning.
z <- ggplot(stroke1_df, aes(x = bmi)) +
  geom_boxplot(aes(fill = stroke), na.rm = TRUE) +
  theme_minimal() +
  coord_flip() +
  ggtitle("Body Mass Index")
ggarrange(x, y, z, common.legend = TRUE)

# ---- Correlation matrix ----
# data.matrix() converts every column to numbers; character columns become
# integer codes of their factor levels.
# NOTE(review): those codes are arbitrary for unordered categories such as
# work_type and smoking_status, and `id` is included too; read those
# correlations with care.
df <- data.frame(data.matrix(stroke1_df))
vapply(stroke1_df, class, character(1))
## id gender age hypertension
## "integer" "character" "numeric" "character"
## heart_disease ever_married work_type Residence_type
## "character" "character" "character" "character"
## avg_glucose_level bmi smoking_status stroke
## "numeric" "numeric" "character" "character"

# Pairwise deletion: each correlation uses the rows where both variables are
# present, so the missing BMI values only affect the bmi row and column.
res <- cor(df, use = "pairwise.complete.obs")
round(res, 2)
# NOTE(review): the matrix recorded below was produced while missing BMI was
# coded as 0, so its `bmi` row and column are stale; re-run to refresh them.
## id gender age hypertension heart_disease ever_married
## id 1.00 0.00 0.00 0.00 0.00 0.01
## gender 0.00 1.00 -0.03 0.02 0.09 -0.03
## age 0.00 -0.03 1.00 0.28 0.26 0.68
## hypertension 0.00 0.02 0.28 1.00 0.11 0.16
## heart_disease 0.00 0.09 0.26 0.11 1.00 0.11
## ever_married 0.01 -0.03 0.68 0.16 0.11 1.00
## work_type 0.01 -0.07 0.54 0.13 0.10 0.43
## Residence_type 0.00 -0.01 0.01 -0.01 0.00 0.01
## avg_glucose_level 0.00 0.06 0.24 0.17 0.16 0.16
## bmi 0.08 -0.05 0.22 0.07 -0.03 0.25
## smoking_status -0.02 0.04 -0.38 -0.13 -0.06 -0.30
## stroke 0.01 0.01 0.25 0.13 0.13 0.11
## work_type Residence_type avg_glucose_level bmi
## id 0.01 0.00 0.00 0.08
## gender -0.07 -0.01 0.06 -0.05
## age 0.54 0.01 0.24 0.22
## hypertension 0.13 -0.01 0.17 0.07
## heart_disease 0.10 0.00 0.16 -0.03
## ever_married 0.43 0.01 0.16 0.25
## work_type 1.00 0.00 0.09 0.25
## Residence_type 0.00 1.00 0.00 0.00
## avg_glucose_level 0.09 0.00 1.00 0.08
## bmi 0.25 0.00 0.08 1.00
## smoking_status -0.34 0.00 -0.10 -0.19
## stroke 0.08 0.02 0.13 -0.05
## smoking_status stroke
## id -0.02 0.01
## gender 0.04 0.01
## age -0.38 0.25
## hypertension -0.13 0.13
## heart_disease -0.06 0.13
## ever_married -0.30 0.11
## work_type -0.34 0.08
## Residence_type 0.00 0.02
## avg_glucose_level -0.10 0.13
## bmi -0.19 -0.05
## smoking_status 1.00 -0.07
## stroke -0.07 1.00

# Upper-triangle correlation plot with variables ordered by hierarchical
# clustering.
corrplot(res, type = "upper", order = "hclust", tl.col = "black", tl.srt = 45)
# Chi-squared test of association between hypertension and stroke. Rows of the
# contingency table are hypertension status, columns are stroke status.
hypertension_table <- table(stroke1_df$hypertension, stroke1_df$stroke)
# The table used to be called `heart_table` even though it is built from the
# hypertension column; the old name is kept as an alias for existing references.
heart_table <- hypertension_table
hypertension_table
##
## No Yes
## No 4429 183
## Yes 432 66
result <- chisq.test(hypertension_table)
result
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hypertension_table
## X-squared = 81.605, df = 1, p-value < 2.2e-16