library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:pastecs':
##
## first, last
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:pastecs':
##
## first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
library(lessR)
##
## lessR 4.2.4 feedback: gerbing@pdx.edu
## --------------------------------------------------------------
## > d <- Read("") Read text, Excel, SPSS, SAS, or R data file
## d is default data frame, data= in analysis routines optional
##
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables.
## Enter: browseVignettes("lessR")
##
## View changes in this and recent versions of lessR.
## Enter: news(package="lessR")
##
## **New Feature**: Interactive analysis of your data
## Enter: interact()
##
## Attaching package: 'lessR'
## The following objects are masked from 'package:dplyr':
##
## recode, rename
## The following object is masked from 'package:data.table':
##
## set
# Five-colour "Set2" palette reused by the charts below
myPalette <- brewer.pal(n = 5, name = "Set2")
# Load the stroke records from the CSV file (read.csv returns a data frame)
df <- read.csv(file = "brain_stroke.csv")
# Preview the first rows of the data set
head(df)
## gender age hypertension heart_disease ever_married work_type
## 1 Male 67 0 1 Yes Private
## 2 Male 80 0 1 Yes Private
## 3 Female 49 0 0 Yes Private
## 4 Female 79 1 0 Yes Self-employed
## 5 Male 81 0 0 Yes Private
## 6 Male 74 1 1 Yes Private
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1 Urban 228.69 36.6 formerly smoked 1
## 2 Rural 105.92 32.5 never smoked 1
## 3 Urban 171.23 34.4 smokes 1
## 4 Rural 174.12 24.0 never smoked 1
## 5 Urban 186.21 29.0 formerly smoked 1
## 6 Rural 70.09 27.4 never smoked 1
# Summary statistics (min, quartiles, mean, max) of the age column
summary(df[["age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.08 25.00 45.00 43.42 61.00 82.00
A minimum age of 0.08 doesn’t seem right; let’s investigate further to see why.
# Show the rows whose age equals the suspicious minimum of 0.08
subset(df, age == 0.08)
## gender age hypertension heart_disease ever_married work_type
## 1488 Female 0.08 0 0 No children
## 3058 Male 0.08 0 0 No children
## Residence_type avg_glucose_level bmi smoking_status stroke
## 1488 Urban 139.67 14.1 Unknown 0
## 3058 Rural 70.33 16.9 Unknown 0
After investigating, we see that these rows are infants whose ages are recorded as fractions of a year (i.e. in months), so the values are valid and there is no need to remove them.
# Count the missing (NA) values across every cell of the data set.
# is.null(df) only tests whether the object itself is NULL, so for a data
# frame it is a single FALSE and the sum is always 0 regardless of the data;
# is.na(df) tests each cell instead.
sum(is.na(df))
## [1] 0
# Inspect the structure of the data frame: column names, types, first values
str(object = df)
## 'data.frame': 4981 obs. of 11 variables:
## $ gender : chr "Male" "Male" "Female" "Female" ...
## $ age : num 67 80 49 79 81 74 69 78 81 61 ...
## $ hypertension : int 0 0 0 1 0 1 0 0 1 0 ...
## $ heart_disease : int 1 1 0 0 0 1 0 0 0 1 ...
## $ ever_married : chr "Yes" "Yes" "Yes" "Yes" ...
## $ work_type : chr "Private" "Private" "Private" "Self-employed" ...
## $ Residence_type : chr "Urban" "Rural" "Urban" "Rural" ...
## $ avg_glucose_level: num 229 106 171 174 186 ...
## $ bmi : num 36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
## $ smoking_status : chr "formerly smoked" "never smoked" "smokes" "never smoked" ...
## $ stroke : int 1 1 1 1 1 1 1 1 1 1 ...
All data types look fine; no cleaning or type conversion is needed.
# Draw a pie chart whose slice labels carry a rounded percentage.
#   var1    - counts per slice (e.g. the result of table())
#   colname - vector the counts were taken from; its length is the denominator
#   main1   - chart title
#   names1  - slice names shown in front of the percentage
#   border1 - colour of the slice borders
#   color1  - fill colours of the slices
pie_plot <- function(var1, colname, main1, names1, border1, color1) {
  total <- length(colname)
  share <- round(var1 / total * 100)
  # Label format: "<name> <percent>%"
  slice_labels <- paste0(paste(names1, share), "%")
  pie(
    var1,
    labels = slice_labels,
    main = main1,
    border = border1,
    radius = 1,
    col = color1
  )
}
# Evaluating the bare name prints the definition of barplot (see output below)
barplot
## function (height, ...)
## UseMethod("barplot")
## <bytecode: 0x000001ffe6940e90>
## <environment: namespace:graphics>
# Thin wrapper around barplot() that takes every setting explicitly.
#   height - bar heights (numeric vector or table)
#   width  - bar width(s)
#   space  - gap left before each bar, as a fraction of the bar width
#   names  - labels written under the bars
#   color  - bar fill colour(s)
#   main, xlab, ylab - chart title and axis titles
# Returns whatever barplot() returns (the bar midpoints, invisibly).
bar_chart <- function(height, width, space, names, color, main, xlab, ylab) {
  barplot(
    height = height,
    width = width,
    space = space,
    names.arg = names,
    col = color,
    main = main,
    xlab = xlab,
    ylab = ylab
  )
}
# Frequency of each gender in the data set
sex <- table(df$gender)
sex
##
## Female Male
## 2907 2074
# Pie charts: share of each gender, then share of each stroke outcome
pie_plot(
  var1 = sex,
  colname = df$gender,
  main1 = "Percentage of Gender",
  names1 = names(sex),
  border1 = "white",
  color1 = myPalette
)
pie_plot(
  var1 = table(df$stroke),
  colname = df$stroke,
  main1 = "Percentage of patients with stroke",
  names1 = c("Negative", "Positive"),
  border1 = "white",
  color1 = myPalette
)
# Keep only the patients who had a stroke, then count them per gender
stroke_patients <- filter(df, stroke == 1)
gender_stroke <- table(stroke_patients$gender)
gender_stroke
##
## Female Male
## 140 108
It looks like females have more strokes than males, but as we have seen, the data set contains considerably more women than men. So let’s investigate further by computing the proportion of people with strokes within each gender.
# Stroke proportion per gender: stroke cases of a gender divided by the
# total number of people of that gender
gender_totals <- table(df$gender)
gender_filter <- table(stroke_patients$gender)
gender_portion_male <- gender_filter["Male"] / gender_totals["Male"]
gender_portion_female <- gender_filter["Female"] / gender_totals["Female"]
bar_chart(
  height = c(gender_portion_male, gender_portion_female),
  width = 1,
  space = 0.1,
  names = c("Male", "Female"),
  color = myPalette,
  main = "Brain Strokes and Gender",
  xlab = "Gender",
  ylab = "Mean of Brain Stroke"
)
After dividing by the number of people of each gender, we see the true picture: because the data set contains more women than men, women naturally account for more stroke cases in absolute terms, but the proportion of people with a stroke turns out to be slightly higher among men.
Job Type
# Number of stroke patients per work type
job_stroke <- table(stroke_patients$work_type)
job_stroke
##
## children Govt_job Private Self-employed
## 2 33 148 65
# Bar labels are listed in the same order as the table printed above.
# The first label was misspelled "Childern"; corrected to "Children".
bar_chart(
  height = job_stroke,
  width = 1.5,
  space = 0.3,
  names = c("Children", "Government", "Private", "Self Employed"),
  color = myPalette,
  main = "People with Stroke",
  xlab = "Type of Job",
  ylab = "Number of People"
)
Hypertension
# Stroke patients split by hypertension flag (0 = Negative, 1 = Positive)
hyper_stroke <- table(stroke_patients$hypertension)
bar_chart(
  height = hyper_stroke,
  width = 1.5,
  space = 0.3,
  names = c("Negative", "Positive"),
  color = myPalette,
  main = "Brain Strokes and Hypertension",
  xlab = "Hypertension State",
  ylab = "Number of People"
)
Heart Disease
# Stroke patients split by heart-disease flag (0 = none, 1 = has had it)
heart_stroke <- table(stroke_patients$heart_disease)
heart_stroke
##
## 0 1
## 201 47
bar_chart(
  height = heart_stroke,
  width = 1.5,
  space = 0.3,
  names = c("No heart disease", "Have had heart disease"),
  color = myPalette,
  main = "Brain Strokes and Heart Diseases",
  xlab = "Heart Disease",
  ylab = "Number of People"
)
Smoking
# Stroke patients split by smoking status
smoke_stroke <- table(stroke_patients$smoking_status)
bar_chart(
  height = smoke_stroke,
  width = 1.5,
  space = 0.3,
  names = c("Formerly Smoked", "Never Smoked", "Smokes", "Unknown"),
  color = myPalette,
  main = "Brain Stroke and Smoking",
  xlab = "Smoking Status",
  ylab = "Number of People"
)
First, we see the distribution of people with hypertension and its relation with age.
# Select the people with hypertension, tabulate their work types for later,
# and plot how many of them there are at each age
hyper_patient <- filter(df, hypertension == 1)
work_hyper <- table(hyper_patient$work_type)
plot(
  table(hyper_patient$age),
  main = "Number of People with HyperTension",
  ylab = "Count",
  xlab = "Ages",
  col = "darkolivegreen",
  lwd = 5
)
Older people have higher rates of hypertension than people of young age.
# Number of people with hypertension per job type, drawn with points and lines.
# The original call ended with a stray trailing comma, which passed an empty
# positional argument to plot(); it is removed so only the intended arguments
# are supplied.
plot(
  work_hyper,
  xlab = "Job Type",
  ylab = "Number of People",
  type = "o",
  frame.plot = TRUE,
  col = myPalette
)
People with private jobs account for far more hypertension cases than their peers who work in government jobs or are self-employed.
# Line chart of the number of people with hypertension per smoking status
smoke_hyper <- table(hyper_patient$smoking_status)
plot(
  smoke_hyper,
  type = "o",
  ylab = "Number of People",
  ylim = c(0, 250)
)
It’s a bit surprising that people who have never smoked have the highest number of hypertension cases.
# Line chart of the number of people with heart disease per smoking status
heart_patient <- filter(df, heart_disease == 1)
heart_smoke <- table(heart_patient$smoking_status)
plot(
  heart_smoke,
  type = "o",
  ylab = "Number of People",
  ylim = c(0, 120)
)
Another surprising finding is that people who have never smoked also show more heart disease cases than their peers who currently smoke.
# Treat an average glucose level of 126 or more as diabetic, then count how
# many of those people have not (0) or have (1) had a stroke
diabetec <- filter(df, avg_glucose_level >= 126)
diabetec_stroke <- table(diabetec$stroke)
diabetec_stroke
##
## 0 1
## 848 99
bar_chart(
  height = diabetec_stroke,
  width = 1,
  space = 0.1,
  names = c("No-Stroke", "Have had Stroke"),
  color = "darkolivegreen",
  main = "People who are diabetic and have had strokes",
  xlab = "Diabetes",
  ylab = "Number of People"
)
First, we selected the people whose average glucose level is above the normal range (126 or more), i.e. those we treat as diabetic.
People who are not diabetic and have had strokes are much more than those who are diabetic.
# Rural/urban split of the self-employed
self <- filter(df, work_type == "Self-employed")
place_self <- table(self$Residence_type)
PieChart(
  place_self,
  main = "Percentage of where People with Self-Employment Live"
)
## >>> Note: place_self is not in a data frame (table)
## >>> Note: place_self is not in a data frame (table)
## >>> suggestions
## piechart(place_self, hole=0) # traditional pie chart
## piechart(place_self, values="%") # display %'s on the chart
## piechart(place_self) # bar chart
## plot(place_self) # bubble plot
## plot(place_self, values="count") # lollipop plot
##
## --- place_self ---
##
## Rural Urban Total
## Frequencies: 383 421 804
## Proportions: 0.476 0.524 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 1.796, df = 1, p-value = 0.180
# Rural/urban split of people working in the private sector
private <- filter(df, work_type == "Private")
place_private <- table(private$Residence_type)
PieChart(
  place_private,
  main = "Percentage of where People with Private work Live"
)
## >>> Note: place_private is not in a data frame (table)
## >>> Note: place_private is not in a data frame (table)
## >>> suggestions
## piechart(place_private, hole=0) # traditional pie chart
## piechart(place_private, values="%") # display %'s on the chart
## piechart(place_private) # bar chart
## plot(place_private) # bubble plot
## plot(place_private, values="count") # lollipop plot
##
## --- place_private ---
##
## Rural Urban Total
## Frequencies: 1426 1434 2860
## Proportions: 0.499 0.501 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 0.022, df = 1, p-value = 0.881
# Rural/urban split of people with a government job
gov <- filter(df, work_type == "Govt_job")
place_gov <- table(gov$Residence_type)
PieChart(
  place_gov,
  main = "Percentage of where People with Government job Live"
)
## >>> Note: place_gov is not in a data frame (table)
## >>> Note: place_gov is not in a data frame (table)
## >>> suggestions
## piechart(place_gov, hole=0) # traditional pie chart
## piechart(place_gov, values="%") # display %'s on the chart
## piechart(place_gov) # bar chart
## plot(place_gov) # bubble plot
## plot(place_gov, values="count") # lollipop plot
##
## --- place_gov ---
##
## Rural Urban Total
## Frequencies: 305 339 644
## Proportions: 0.474 0.526 1.000
##
## Chi-squared test of null hypothesis of equal probabilities
## Chisq = 1.795, df = 1, p-value = 0.180
Q1) Which portion have had strokes more than the other?
Males have a slightly higher stroke rate than females: about 5.2% (108 of 2,074) versus 4.8% (140 of 2,907), a difference of roughly 0.4 percentage points.
People with private work have more strokes than people with other types of work.
There is no notable observation for heart disease or hypertension; in the smoking section, however, we noticed a strange result: people who have never smoked have the highest number of brain strokes.
Q2) Does Work Type increases the chances of getting Hypertension?
Older people have much higher percent of hypertension as expected.
As with brain strokes, people with private jobs have far more hypertension cases than their peers.
Q3) Does smoking affects Hypertension or heart disease?
This is where the surprises start: people who have never smoked actually have hypertension more often than those who formerly smoked or currently smoke, and the same holds for heart disease.
Q4) Does Average Glucose Level affects brain strokes?
About 10% of the people classified as diabetic (99 of 947) have had a stroke, which is roughly double the overall stroke rate of about 5% (248 of 4,981); hence diabetic people appear more likely to have a brain stroke.
Q5) Does the type of job affects where you live ?
People with private work are split almost evenly between rural and urban areas (49.9% vs 50.1%).
People who are self-employed live slightly more often in urban areas (52.4%), although the difference is not statistically significant (p = 0.18).
People who work for the government live slightly more often in urban areas (52.6%), although the difference is not statistically significant (p = 0.18).