Data Wrangling

library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:pastecs':
## 
##     first, last
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, first, last
## The following objects are masked from 'package:pastecs':
## 
##     first, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(RColorBrewer)
library(lessR)
## 
## lessR 4.2.4                         feedback: gerbing@pdx.edu 
## --------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables.
##   Enter:  browseVignettes("lessR")
## 
## View changes in this and recent versions of lessR.
##   Enter: news(package="lessR")
## 
## **New Feature**: Interactive analysis of your data
##   Enter: interact()
## 
## Attaching package: 'lessR'
## The following objects are masked from 'package:dplyr':
## 
##     recode, rename
## The following object is masked from 'package:data.table':
## 
##     set
# Five-colour qualitative palette shared by the charts below
myPalette <- brewer.pal(n = 5, name = "Set2")
# Load the stroke data set into a data frame
df <- read.csv(file = "brain_stroke.csv")
# Explore the data set: show the first six rows
head(df, n = 6L)
##   gender age hypertension heart_disease ever_married     work_type
## 1   Male  67            0             1          Yes       Private
## 2   Male  80            0             1          Yes       Private
## 3 Female  49            0             0          Yes       Private
## 4 Female  79            1             0          Yes Self-employed
## 5   Male  81            0             0          Yes       Private
## 6   Male  74            1             1          Yes       Private
##   Residence_type avg_glucose_level  bmi  smoking_status stroke
## 1          Urban            228.69 36.6 formerly smoked      1
## 2          Rural            105.92 32.5    never smoked      1
## 3          Urban            171.23 34.4          smokes      1
## 4          Rural            174.12 24.0    never smoked      1
## 5          Urban            186.21 29.0 formerly smoked      1
## 6          Rural             70.09 27.4    never smoked      1
# Check the distribution of the age values
summary(df[["age"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.08   25.00   45.00   43.42   61.00   82.00

A minimum age of 0.08 doesn’t seem right, so let’s investigate further to see why.

# Show the rows that carry the smallest recorded age
subset(df, age == 0.08)
##      gender  age hypertension heart_disease ever_married work_type
## 1488 Female 0.08            0             0           No  children
## 3058   Male 0.08            0             0           No  children
##      Residence_type avg_glucose_level  bmi smoking_status stroke
## 1488          Urban            139.67 14.1        Unknown      0
## 3058          Rural             70.33 16.9        Unknown      0

The investigation shows that these rows are children whose ages are recorded in months (as a fraction of a year), so the values are valid and there is no need to remove them.

# Check if there are any missing values in our dataset.
# is.null(df) is a single FALSE for any data frame, so summing it can never
# reveal missing values; is.na(df) flags each missing cell and sum() counts
# them.
sum(is.na(df))
## [1] 0
# Inspect the structure of the data frame: column names, types, sample values
str(object = df)
## 'data.frame':    4981 obs. of  11 variables:
##  $ gender           : chr  "Male" "Male" "Female" "Female" ...
##  $ age              : num  67 80 49 79 81 74 69 78 81 61 ...
##  $ hypertension     : int  0 0 0 1 0 1 0 0 1 0 ...
##  $ heart_disease    : int  1 1 0 0 0 1 0 0 0 1 ...
##  $ ever_married     : chr  "Yes" "Yes" "Yes" "Yes" ...
##  $ work_type        : chr  "Private" "Private" "Private" "Self-employed" ...
##  $ Residence_type   : chr  "Urban" "Rural" "Urban" "Rural" ...
##  $ avg_glucose_level: num  229 106 171 174 186 ...
##  $ bmi              : num  36.6 32.5 34.4 24 29 27.4 22.8 24.2 29.7 36.8 ...
##  $ smoking_status   : chr  "formerly smoked" "never smoked" "smokes" "never smoked" ...
##  $ stroke           : int  1 1 1 1 1 1 1 1 1 1 ...

All data types look fine, so no cleaning or conversion is needed.

Exploratory Data Analysis

# Define a function to help with plots
#' Draw a pie chart whose slice labels carry rounded percentages
#'
#' @param var1 Counts per slice (e.g. the result of `table()`).
#' @param colname Vector whose length is used as the percentage denominator.
#' @param main1 Chart title.
#' @param names1 Slice names, in the same order as `var1`.
#' @param border1 Border colour of the slices.
#' @param color1 Fill colours of the slices.
pie_plot <- function(var1, colname, main1, names1, border1, color1) {
  n_obs <- length(colname)
  # Whole-number share of each slice relative to the number of observations
  pct <- round(var1 / n_obs * 100)
  # Label = "<name> <pct>%"
  slice_labels <- paste0(paste(names1, pct), "%")
  pie(
    var1,
    labels = slice_labels,
    main = main1,
    border = border1,
    radius = 1,
    col = color1
  )
}
# Print the definition of barplot to inspect it (no chart is drawn here)
barplot
## function (height, ...) 
## UseMethod("barplot")
## <bytecode: 0x000001ffe6940e90>
## <environment: namespace:graphics>
# Define a function for bar chart
#' Thin wrapper around `barplot()` with the arguments used in this report
#'
#' @param height Bar heights (numeric vector or one-dimensional table).
#' @param width Bar width(s).
#' @param space Gap left before each bar, as a fraction of the bar width.
#' @param names Labels drawn under the bars.
#' @param color Fill colour(s) of the bars.
#' @param main Chart title.
#' @param xlab Label of the x axis.
#' @param ylab Label of the y axis.
#' @return Whatever `barplot()` returns, passed through unchanged.
bar_chart <- function(height, width, space, names, color, main, xlab, ylab) {
  barplot(
    height = height,
    width = width,
    space = space,
    names.arg = names,
    col = color,
    main = main,
    xlab = xlab,
    ylab = ylab
  )
}
# Number of records per gender
sex <- table(df[["gender"]])
sex
## 
## Female   Male 
##   2907   2074
# Visualize the share of each gender in the data set
pie_plot(
  var1 = sex,
  colname = df$gender,
  main1 = "Percentage of Gender",
  names1 = names(table(df$gender)),
  border1 = "white",
  color1 = myPalette
)

# Visualize the share of patients with and without a stroke
pie_plot(
  var1 = table(df$stroke),
  colname = df$stroke,
  main1 = "Percentage of patients with stroke",
  names1 = c("Negative", "Positive"),
  border1 = "white",
  color1 = myPalette
)

Q1) Which gender has had more strokes than the other?

# Keep only the patients who had a stroke, then count them by gender
stroke_patients <- filter(df, stroke == 1)
gender_stroke <- table(stroke_patients[["gender"]])
gender_stroke
## 
## Female   Male 
##    140    108

It looks like females have more strokes than males, but as we have seen, the data set contains a much higher proportion of women than men. So let’s investigate further by taking the mean (proportion) of people with strokes within each gender.

# Stroke rate per gender = stroke cases of that gender / all records of it
gender_totals <- table(df[["gender"]])
gender_filter <- table(stroke_patients[["gender"]])

gender_portion_male <- gender_filter["Male"] / gender_totals["Male"]
gender_portion_female <- gender_filter["Female"] / gender_totals["Female"]

bar_chart(
  height = c(gender_portion_male, gender_portion_female),
  width = 1,
  space = 0.1,
  names = c("Male", "Female"),
  color = myPalette,
  main = "Brain Strokes and Gender",
  xlab = "Gender",
  ylab = "Mean of Brain Stroke"
)

Now that the counts are divided by the size of each group, we see the true result: because the data set contains more women than men, women naturally account for more stroke cases in absolute numbers. After taking the mean, it turns out that men have a slightly higher stroke rate.

Job Type

# Number of stroke patients per work type
job_stroke <- table(stroke_patients[["work_type"]])
job_stroke
## 
##      children      Govt_job       Private Self-employed 
##             2            33           148            65
# Bar labels are positional: they must follow the order of job_stroke's
# categories (children, Govt_job, Private, Self-employed).
# Fixed the misspelled "Childern" label.
bar_chart(
  job_stroke, 1.5, 0.3,
  c("Children", "Government", "Private", "Self Employed"),
  myPalette, "People with Stroke", "Type of Job", "Number of People"
)

Hypertension

# Stroke patients split by hypertension status (0 = negative, 1 = positive)
hyper_stroke <- table(stroke_patients[["hypertension"]])
bar_chart(
  height = hyper_stroke,
  width = 1.5,
  space = 0.3,
  names = c("Negative", "Positive"),
  color = myPalette,
  main = "Brain Strokes and Hypertension",
  xlab = "Hypertension State",
  ylab = "Number of People"
)

Heart Disease

# Stroke patients split by heart-disease status
heart_stroke <- table(stroke_patients[["heart_disease"]])
heart_stroke
## 
##   0   1 
## 201  47
# Bar labels are positional and follow the 0/1 order of heart_stroke
bar_chart(
  height = heart_stroke,
  width = 1.5,
  space = 0.3,
  names = c("No heart disease", "Have had heart disease"),
  color = myPalette,
  main = "Brain Strokes and Heart Diseases",
  xlab = "Heart Disease",
  ylab = "Number of People"
)

Smoking

# Stroke patients split by smoking status
smoke_stroke <- table(stroke_patients[["smoking_status"]])
bar_chart(
  height = smoke_stroke,
  width = 1.5,
  space = 0.3,
  names = c("Formerly Smoked", "Never Smoked", "Smokes", "Unknown"),
  color = myPalette,
  main = "Brain Stroke and Smoking",
  xlab = "Smoking Status",
  ylab = "Number of People"
)

Q2) Does work type increase the chances of getting Hypertension?

First, we see the distribution of people with hypertension and its relation with age.

# Select the people with hypertension, count them per work type, and plot
# how many of them there are at each age
hyper_patient <- filter(df, hypertension == 1)
work_hyper <- table(hyper_patient[["work_type"]])
plot(
  table(hyper_patient[["age"]]),
  main = "Number of People with HyperTension",
  xlab = "Ages",
  ylab = "Count",
  col = "darkolivegreen",
  lwd = 5
)

Older people have higher rates of hypertension than younger people.

# Number of people with hypertension per work type (raw counts).
# The stray trailing comma, which passed an empty positional argument,
# has been removed.
plot(
  work_hyper,
  xlab = "Job Type",
  ylab = "Number of People",
  type = "o",
  frame.plot = TRUE,
  col = myPalette
)

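People with private jobs account for a much larger number of hypertension cases than their peers who work in government jobs or are self-employed. Note that the plot shows raw counts, and private employment is also the largest work-type group in the data set, so this alone does not show a higher percentage.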

Q3) Does smoking affect hypertension or heart disease?

# Plot a line chart of hypertension cases per smoking status
smoke_hyper <- table(hyper_patient[["smoking_status"]])
plot(
  smoke_hyper,
  type = "o",
  ylab = "Number of People",
  ylim = c(0, 250)
)

It’s a bit surprising that people who have never smoked make up the largest number of hypertension cases. These are raw counts, though, not rates within each smoking group.

# Heart-disease cases per smoking status
heart_patient <- filter(df, heart_disease == 1)
heart_smoke <- table(heart_patient[["smoking_status"]])
plot(
  heart_smoke,
  type = "o",
  ylab = "Number of People",
  ylim = c(0, 120)
)

Another surprising finding is that people who never smoked also account for more heart disease cases than their peers who currently smoke.

Q4) Does average glucose level affect brain strokes?

# Determine who is diabetic: here, an average glucose level of at least 126
diabetec <- filter(df, avg_glucose_level >= 126)
# Stroke outcome (0 / 1) among the diabetic group
diabetec_stroke <- table(diabetec[["stroke"]])
diabetec_stroke
## 
##   0   1 
## 848  99
# Bar labels are positional and follow the 0/1 order of diabetec_stroke
bar_chart(
  height = diabetec_stroke,
  width = 1,
  space = 0.1,
  names = c("No-Stroke", "Have had Stroke"),
  color = "darkolivegreen",
  main = "People who are diabetic and have had strokes",
  xlab = "Diabetes",
  ylab = "Number of People"
)

First, we selected the people whose glucose level is above the normal range, i.e. those who are diabetic.

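In absolute numbers, more people who are not diabetic have had strokes than people who are diabetic. Note, however, that the non-diabetic group is much larger, so raw counts alone do not show that diabetes lowers the risk of stroke.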

Q5) Does the type of job affect where you live?

# Residence type (rural / urban) of self-employed people
self <- filter(df, work_type == "Self-employed")
place_self <- table(self[["Residence_type"]])
PieChart(
  place_self,
  main = "Percentage of where People with Self-Employment Live"
)
## >>> Note: place_self is not in a data frame (table)
## >>> Note: place_self is not in a data frame (table)

## >>> suggestions
## piechart(place_self, hole=0)  # traditional pie chart
## piechart(place_self, values="%")  # display %'s on the chart
## piechart(place_self)  # bar chart
## plot(place_self)  # bubble plot
## plot(place_self, values="count")  # lollipop plot 
## 
## --- place_self --- 
## 
##                Rural  Urban     Total 
## Frequencies:     383    421       804 
## Proportions:   0.476  0.524     1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 1.796, df = 1, p-value = 0.180
# Residence type (rural / urban) of people working in the private sector
private <- filter(df, work_type == "Private")
place_private <- table(private[["Residence_type"]])
PieChart(
  place_private,
  main = "Percentage of where People with Private work Live"
)
## >>> Note: place_private is not in a data frame (table)
## >>> Note: place_private is not in a data frame (table)

## >>> suggestions
## piechart(place_private, hole=0)  # traditional pie chart
## piechart(place_private, values="%")  # display %'s on the chart
## piechart(place_private)  # bar chart
## plot(place_private)  # bubble plot
## plot(place_private, values="count")  # lollipop plot 
## 
## --- place_private --- 
## 
##                Rural  Urban      Total 
## Frequencies:    1426   1434       2860 
## Proportions:   0.499  0.501      1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 0.022, df = 1, p-value = 0.881
# Residence type (rural / urban) of people with a government job
gov <- filter(df, work_type == "Govt_job")
place_gov <- table(gov[["Residence_type"]])
PieChart(
  place_gov,
  main = "Percentage of where People with Government job Live"
)
## >>> Note: place_gov is not in a data frame (table)
## >>> Note: place_gov is not in a data frame (table)

## >>> suggestions
## piechart(place_gov, hole=0)  # traditional pie chart
## piechart(place_gov, values="%")  # display %'s on the chart
## piechart(place_gov)  # bar chart
## plot(place_gov)  # bubble plot
## plot(place_gov, values="count")  # lollipop plot 
## 
## --- place_gov --- 
## 
##                Rural  Urban     Total 
## Frequencies:     305    339       644 
## Proportions:   0.474  0.526     1.000 
## 
## Chi-squared test of null hypothesis of equal probabilities 
##   Chisq = 1.795, df = 1, p-value = 0.180

Conclusions