Dataset: Framingham Heart Study
Description: The Framingham Heart Study is a long-term, ongoing cardiovascular cohort study of residents of the city of Framingham, Massachusetts.
Observations: 11,627 (the dirty CSV used below has 11,632 rows; 5 duplicates are removed during cleaning)
Source: Source Link [nhlbi] | Documentation Link [nhlbi]
Dataset Variables Description:
Variable | Description | Units | Range |
---|---|---|---|
SEX | Gender of the participant | 1 = Men, 2 = Women | |
AGE | Age at exam | years | 32-81 |
SYSBP | Systolic blood pressure | mmHg | 83.5-295 |
DIABP | Diastolic blood pressure | mmHg | 30-150 |
CURSMOKE | Current cigarette smoking at exam | 0 = Not smoker, 1 = Smoker | |
CIGPDAY | Number of cigarettes smoked each day | cigarettes/day | |
TOTCHOL | Serum total cholesterol | mg/dL | 107-696 |
BMI | Body mass index (weight in kg / height in m squared) | kg/m² | 14.43-56.8 |
GLUCOSE | Casual serum glucose | mg/dL | 39-478 |
DIABETES | Diabetic according to criteria (casual glucose of 200 mg/dL or more) | 0 = Not a diabetic, 1 = Diabetic | |
HEARTRTE | Heart rate (ventricular rate) | beats/min | 37-220 |
ANYCHD | The participant suffered from any type of coronary heart disease | 0 = Free of disease, 1 = Prevalent disease | |
STROKE | The participant suffered from any type of stroke | 0 = Free of disease, 1 = Prevalent disease | |
HYPERTEN | The participant is hypertensive | 0 = Free of disease, 1 = Prevalent disease | |
HOSPMI | The participant suffered a hospitalized myocardial infarction | 0 = Free of disease, 1 = Prevalent disease | |
DEATH | Death from any cause | 0 = Alive, 1 = Died | |
# install.packages(c("tidyverse", "hrbrthemes", "plotly", "outliers", "highcharter", "lvplot", "devtools", "xgboost"))
# devtools::install_url('https://github.com/catboost/catboost/releases/download/v0.20/catboost-R-Windows-0.20.tgz', INSTALL_opts = c("--no-multiarch", "--no-test-load"))
library(data.table)
library(GGally)
library(PerformanceAnalytics)
library(splines)
library(rcompanion)
library(Hmisc)
library(cowplot)
library(WVPlots)
library(glmnet)
library(randomForest)
library(ggcorrplot)
library(psych)
library(caTools)
library(mlbench)
library(relaimpo)
library(tidyverse)
library(hrbrthemes)
library(plotly)
library(mlr)
library(outliers)
library(highcharter)
library(lvplot)
library(catboost)
library(xgboost)
library(caret)
library(parallel)
library(parallelMap)
library(ROSE)
library(DMwR)
library(pROC)
library(ggplot2)
library(corrplot)
library(Amelia)
library(plot3D)
library(tidymodels)
library(kableExtra)
library(modelsummary)
df <- read.csv("./Data/frmgham2_dirty.csv")
df <- df %>% select("SEX", "AGE", "SYSBP", "DIABP",
"CURSMOKE", "CIGPDAY", "TOTCHOL", "BMI", "GLUCOSE",
"DIABETES", "HEARTRTE", "ANYCHD", "STROKE", "HYPERTEN", "HOSPMI", "DEATH")
dim(df)
## [1] 11632 16
df %>% glimpse()
## Rows: 11,632
## Columns: 16
## $ SEX <chr> "1", "1", "2", "2", "2", "1", "1", "2", "2", "2", "2", "2", "~
## $ AGE <int> 39, 52, 46, 52, 58, 48, 54, 61, 67, 46, 51, 58, 43, 49, 55, 6~
## $ SYSBP <dbl> 106.0, 121.0, 121.0, 105.0, 108.0, 127.5, 141.0, 150.0, 183.0~
## $ DIABP <dbl> 70.0, 66.0, 81.0, 69.5, 66.0, 80.0, 89.0, 95.0, 109.0, 84.0, ~
## $ CURSMOKE <chr> "No", "No", "No", "No", "No", "Yes", "Yes", "Yes", "Yes", "Ye~
## $ CIGPDAY <int> 0, 0, 0, 0, 0, 20, 30, 30, 20, 23, 30, 30, 0, 0, 0, 0, 0, 20,~
## $ TOTCHOL <int> 195, 209, 250, 260, 237, 245, 283, 225, 232, 285, 343, NA, 22~
## $ BMI <dbl> 26.97, NA, 28.73, 29.43, 28.50, 25.34, 25.34, 28.58, 30.18, 2~
## $ GLUCOSE <int> 77, 92, 76, 86, 71, 70, 87, 103, 89, 85, 72, NA, 99, 86, 81, ~
## $ DIABETES <chr> "No", "No", "No", "No", "No", "No", "No", "No", "No", "No", "~
## $ HEARTRTE <int> 80, 69, 95, 80, 80, 75, 75, 65, 60, 85, 90, 74, 77, 120, 86, ~
## $ ANYCHD <chr> "Yes", "Yes", "No", "No", "No", "No", "No", "No", "No", "No",~
## $ STROKE <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ HYPERTEN <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1~
## $ HOSPMI <int> 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ DEATH <int> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
head(df, 3)
## SEX AGE SYSBP DIABP CURSMOKE CIGPDAY TOTCHOL BMI GLUCOSE DIABETES HEARTRTE
## 1 1 39 106 70 No 0 195 26.97 77 No 80
## 2 1 52 121 66 No 0 209 NA 92 No 69
## 3 2 46 121 81 No 0 250 28.73 76 No 95
## ANYCHD STROKE HYPERTEN HOSPMI DEATH
## 1 Yes 0 0 1 0
## 2 Yes 0 0 1 0
## 3 No 0 0 0 0
tail(df, 2)
## SEX AGE SYSBP DIABP CURSMOKE CIGPDAY TOTCHOL BMI GLUCOSE DIABETES
## 11631 2 45 120 75 No 0 210 27.20 58 No
## 11632 2 54 130 85 No 0 218 20.55 85 No
## HEARTRTE ANYCHD STROKE HYPERTEN HOSPMI DEATH
## 11631 72 No 0 1 0 0
## 11632 72 No 0 0 0 0
summary(df)
## SEX AGE SYSBP DIABP
## Length:11632 Min. :32.00 Min. : 83.5 Min. : 30.00
## Class :character 1st Qu.:48.00 1st Qu.:120.0 1st Qu.: 75.00
## Mode :character Median :54.00 Median :132.0 Median : 82.00
## Mean :54.79 Mean :136.3 Mean : 83.04
## 3rd Qu.:62.00 3rd Qu.:149.0 3rd Qu.: 90.00
## Max. :81.00 Max. :295.0 Max. :150.00
##
## CURSMOKE CIGPDAY TOTCHOL BMI
## Length:11632 Min. : 0.000 Min. :107.0 Min. :14.43
## Class :character 1st Qu.: 0.000 1st Qu.:210.0 1st Qu.:23.09
## Mode :character Median : 0.000 Median :238.0 Median :25.48
## Mean : 8.247 Mean :241.2 Mean :25.88
## 3rd Qu.:20.000 3rd Qu.:268.0 3rd Qu.:28.07
## Max. :90.000 Max. :696.0 Max. :56.80
## NA's :79 NA's :409 NA's :52
## GLUCOSE DIABETES HEARTRTE ANYCHD
## Min. : 39.00 Length:11632 Min. : 37.00 Length:11632
## 1st Qu.: 72.00 Class :character 1st Qu.: 69.00 Class :character
## Median : 80.00 Mode :character Median : 75.00 Mode :character
## Mean : 84.12 Mean : 76.78
## 3rd Qu.: 89.00 3rd Qu.: 85.00
## Max. :478.00 Max. :220.00
## NA's :1441 NA's :6
## STROKE HYPERTEN HOSPMI DEATH
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.09121 Mean :0.7433 Mean :0.09921 Mean :0.3033
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
##
colnames(df) <- tolower(colnames(df))
df <- df %>% rename(gender = sex, smoker = cursmoke, heartrate = heartrte, chd = anychd, diabetic = diabetes)
df %>% head(2)
## gender age sysbp diabp smoker cigpday totchol bmi glucose diabetic
## 1 1 39 106 70 No 0 195 26.97 77 No
## 2 1 52 121 66 No 0 209 NA 92 No
## heartrate chd stroke hyperten hospmi death
## 1 80 Yes 0 0 1 0
## 2 69 Yes 0 0 1 0
unique(df$gender) %>% t()
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "1" "2" "M" "F" "Female" "Male"
Remarks: The gender variable is encoded inconsistently ("1", "2", "M", "F", "Male", "Female"); we standardize it to 1 = male, 2 = female.
maleList = c("Male", "male", "M", 1)
femaleList = c("Female", "female", "F", 2)
correctGender <- function(data, maleList, femaleList) {
n = length(data)
for (i in 1:n) {
genderType = data[i]
if (genderType %in% femaleList) {
data[i] = 2
} else if (genderType %in% maleList) {
data[i] = 1
} else {
data[i] = NA
}
}
data <- as.numeric(data)
return(data)
}
df <- df %>% mutate(gender = correctGender(df$gender, maleList, femaleList))
unique(df$gender)
## [1] 1 2
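For reference, the same cleanup can be written as a single vectorized dplyr pipeline instead of a loop; a sketch, equivalent under the same maleList/femaleList definitions (the loop above is what was actually used):
# Vectorized alternative to correctGender() (sketch, not used above)
df <- df %>%
  mutate(gender = case_when(
    gender %in% femaleList ~ 2,
    gender %in% maleList ~ 1,
    TRUE ~ NA_real_
  ))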
cat("No. of Duplicated Records:", sum(duplicated(df)))
## No. of Duplicated Records: 5
df <- df %>% distinct()
cat("No. of Duplicated Records:", sum(duplicated(df)))
## No. of Duplicated Records: 0
Remarks: Most statistical analyses and machine-learning algorithms require categorical variables in numeric form, so we recode the remaining Yes/No columns to 0/1.
df <- df %>% mutate(smoker = ifelse(smoker == "No", 0, 1),
diabetic = ifelse(diabetic == "No", 0, 1),
chd = ifelse(chd == "No", 0, 1))
colSums(is.na(df)) %>%
as.data.frame() %>%
rename(count = ".") %>%
filter(count != 0) %>%
arrange(desc(count)) %>%
mutate("missing %" = round((count / nrow(df) * 100), 2))
## count missing %
## glucose 1440 12.38
## totchol 409 3.52
## cigpday 79 0.68
## bmi 52 0.45
## heartrate 6 0.05
df %>% group_by(smoker) %>% summarise(cigpday_median = median(cigpday, na.rm=TRUE))
## # A tibble: 2 x 2
## smoker cigpday_median
## <dbl> <dbl>
## 1 0 0
## 2 1 20
Remarks: By definition, non-smokers smoke zero cigarettes per day. Hence, we group by smoker status and impute missing cigpday values with the group median.
df <- df %>%
group_by(smoker) %>%
mutate(cigpday = ifelse(is.na(cigpday),
median(cigpday, na.rm = TRUE),
cigpday))
df %>% ungroup() -> df
df <- df %>% drop_na(heartrate) #from tidyr library
Remarks: Only 6 observations are missing heartrate, so we drop them.
df %>% group_by(gender) %>% summarise(mean = mean(bmi, na.rm=T), median = median(bmi, na.rm=T))
## # A tibble: 2 x 3
## gender mean median
## <dbl> <dbl> <dbl>
## 1 1 26.2 26.1
## 2 2 25.6 24.8
Remarks: There is a slight difference in bmi between the genders, so we group by gender and impute with the group median.
df <- df %>%
group_by(gender) %>%
mutate(bmi = ifelse(is.na(bmi),
median(bmi, na.rm = TRUE),
bmi))
df %>% ungroup() -> df
denplot <- df %>%
keep(is.numeric) %>%
select(glucose, totchol) %>%
drop_na() %>%
gather() %>%
ggplot(aes(value)) +
facet_wrap(~ key, scales = "free") +
geom_density(color="#a1bca5", fill="#caeccf") + theme_bw() +
labs(title = "Distribution of Glucose & Total Cholesterol", x = "", y = "") +
theme(plot.title = element_text(hjust = 0.5), axis.text.y = element_blank(), axis.ticks.y = element_blank())
ggplotly(denplot)
Remarks: Both distributions are right-skewed; to prevent outliers from skewing the imputation, we use the median.
df %>% group_by(diabetic) %>% summarise(glucose_median = median(glucose, na.rm=TRUE),
glucose_mean = mean(glucose, na.rm=TRUE))
## # A tibble: 2 x 3
## diabetic glucose_median glucose_mean
## <dbl> <int> <dbl>
## 1 0 79 81.4
## 2 1 121 144.
Remarks: There is a large difference in both the median and mean of glucose between diabetics and non-diabetics. Thus, we group by diabetic and impute with the group median.
df <- df %>%
group_by(diabetic) %>%
mutate(glucose = ifelse(is.na(glucose),
median(glucose, na.rm = TRUE),
glucose))
df %>% ungroup() -> df
df %>% group_by(chd) %>% summarise(totchol_median = median(totchol, na.rm=TRUE),
totchol_mean = mean(totchol, na.rm=TRUE))
## # A tibble: 2 x 3
## chd totchol_median totchol_mean
## <dbl> <dbl> <dbl>
## 1 0 236 238.
## 2 1 245 249.
Remarks: There is only a minor difference in the median and mean of totchol between participants with and without heart disease. We nevertheless group by chd and impute with the group median.
df <- df %>%
group_by(chd) %>%
mutate(totchol = ifelse(is.na(totchol),
median(totchol, na.rm = TRUE),
totchol))
df %>% ungroup() -> df
is.na(df) %>% sum()
## [1] 0
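The same group-then-impute-median pattern was applied four times above (cigpday, bmi, glucose, totchol). A small helper using dplyr's curly-curly tidy evaluation could factor out the repetition; this is a sketch, not code used in the analysis:
# Sketch: impute NAs in `var` with the median of `var` within each level of `group`
impute_median_by <- function(data, group, var) {
  data %>%
    group_by({{ group }}) %>%
    mutate({{ var }} := ifelse(is.na({{ var }}),
                               median({{ var }}, na.rm = TRUE),
                               {{ var }})) %>%
    ungroup()
}
# e.g. df <- impute_median_by(df, smoker, cigpday)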
df_outliers <- df %>% select(glucose, totchol, bmi, cigpday, heartrate)
par(mfrow=c(1,5))
LVboxplot(df_outliers$glucose, horizontal = F, col = "#E41A1C", xlab = "Glucose", ylab = "Count", bg = "white", width.method = "height")
LVboxplot(df_outliers$totchol, horizontal = F, col = "#377EB8", xlab = "Total Cholesterol", ylab = "", bg = "white", width.method = "height")
LVboxplot(df_outliers$bmi, horizontal = F, col = "#4DAF4A", xlab = "BMI", ylab = "", bg = "white", width.method = "height")
LVboxplot(df_outliers$cigpday, horizontal = F, col = "#984EA3", xlab = "Cigpday", ylab = "", bg = "white", width.method = "height")
LVboxplot(df_outliers$heartrate, horizontal = F, col = "#FF7F00", xlab = "Heartrate", ylab = "", bg = "white", width.method = "height")
Remarks: We can observe visible outliers in glucose, totchol, and heartrate.
Remarks: glucose and heartrate each contain a single extreme outlier, so we drop those records.
max_glucose = df['glucose'] %>% max()
max_heartrate = df['heartrate'] %>% max()
cat("Max. value for glucose:", max_glucose, "\nMax. value for heartrate:", max_heartrate)
## Max. value for glucose: 478
## Max. value for heartrate: 220
df <- df[which(df$glucose < 478), ]
df <- df[with(df, heartrate < 220), ]
z.scores <- scores(tidyr::drop_na(df["totchol"]), type = "z")
z.scores <- which(abs(z.scores) > 5.1)
p <- plot_ly(data = df, x = ~age, y = ~totchol, type = "scatter", mode = "markers", name = "Distribution", marker = list(color = '#0d7cc8'))
l <- list(
font = list(
family = "Trebuchet MS",
size = 13,
color = "#000"),
bgcolor = "#efefef",
bordercolor = "#FFFFFF",
borderwidth = 2)
p <- p %>% add_trace(x = df$age[z.scores], y = df$totchol[z.scores], marker = list(color = '#c80d1e'), name = "Outliers")
p <- p %>% layout(legend = l, title = 'Distribution of Total Cholesterol over Age', plot_bgcolor = "#FFF", xaxis = list(title = 'Age'),
yaxis = list(title = 'Total Cholesterol'))
p
df %>% filter(totchol < min(df$totchol[z.scores])) -> df
z.scores <- scores(tidyr::drop_na(df["totchol"]), type = "z")
z.scores <- which(abs(z.scores) > 5.2)
p <- plot_ly(data = df, x = ~age, y = ~totchol, type = "scatter", mode = "markers", name = "Distribution", marker = list(color = '#0d7cc8'))
l <- list(
font = list(
family = "Trebuchet MS",
size = 13,
color = "#000"),
bgcolor = "#efefef",
bordercolor = "#FFFFFF",
borderwidth = 2)
p <- p %>% add_trace(x = df$age[z.scores], y = df$totchol[z.scores], marker = list(color = '#c80d1e'), name = "Outliers")
p <- p %>% layout(legend = l, title = 'Distribution of Total Cholesterol over Age', plot_bgcolor = "#FFF", xaxis = list(title = 'Age'),
yaxis = list(title = 'Total Cholesterol'))
p
head(df)
## # A tibble: 6 x 16
## gender age sysbp diabp smoker cigpday totchol bmi glucose diabetic
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 1 39 106 70 0 0 195 27.0 77 0
## 2 1 52 121 66 0 0 209 26.1 92 0
## 3 2 46 121 81 0 0 250 28.7 76 0
## 4 2 52 105 69.5 0 0 260 29.4 86 0
## 5 2 58 108 66 0 0 237 28.5 71 0
## 6 1 48 128. 80 1 20 245 25.3 70 0
## # ... with 6 more variables: heartrate <int>, chd <dbl>, stroke <int>,
## # hyperten <int>, hospmi <int>, death <int>
# Obtaining the correlation matrix with the Pearson's method
df_cor <- cor(df[, sapply(df, is.numeric)], method = 'pearson')
# Visualizing with a heatmap the correlation matrix
as.matrix(data.frame(df_cor)) %>%
round(2) %>% #round
hchart() %>%
hc_add_theme(hc_theme_google()) %>%
hc_title(text = "Pearson's Correlation Coefficients", align = "center") %>%
hc_legend(align = "center") %>%
hc_colorAxis(stops = color_stops(colors = viridis::inferno(10))) %>%
hc_plotOptions(
series = list(
borderWidth = 0,
dataLabels = list(enabled = TRUE)))
- cigpday & smoker are highly positively correlated (0.78)
- diabp & sysbp are highly positively correlated (0.71)
- chd & hospmi are moderately positively correlated (0.54)
- diabetic & glucose are moderately positively correlated (0.49)
- sysbp & diabp are moderately positively correlated with hyperten (0.46 and 0.42, respectively)
df %>% select(chd, gender) %>%
group_by(chd, gender) %>%
mutate(gender = ifelse(gender == 1,
"Male",
"Female"),
chd = ifelse(chd == 0,
"No Heart Disease",
"Heart Disease")) %>%
summarize(count = n(), .groups = 'drop') %>%
plot_ly(x = ~gender, y = ~count, color = ~factor(chd), type = 'bar', colors = "Set1",
text = ~paste0(count, " (", scales::percent(count/sum(count)), ")"),
textposition = "outside",
textfont = list(size = 11, color = "black"),
showlegend = TRUE) %>%
layout(title = "Heart Disease by Gender", yaxis = list(title = "", showgrid = FALSE, showticklabels = FALSE), xaxis = list(title = "", showgrid = FALSE), barmode = "group")
df_eda %>% select(chd, age_groups) %>%
group_by(chd, age_groups) %>%
mutate(chd = ifelse(chd == 0,
"No Heart Disease",
"Heart Disease")) %>%
summarize(count = n(), .groups = 'drop') %>%
plot_ly(y = ~age_groups, x = ~count, color = ~factor(chd), type = 'bar', colors = "Set1", orientation = 'h',
text = ~paste0(count, " (", scales::percent(count/sum(count)), ")"),
textposition = "outside",
textfont = list(size = 11, color = "black"),
showlegend = TRUE) %>%
layout(title = "Heart Disease by Age Groups", yaxis = list(title = "", showgrid = FALSE), xaxis = list(title = "", showgrid = FALSE, showticklabels = FALSE), barmode = "group")
plot_ly(
ids = sunburst_ids, labels = sunburst_labels, parents = sunburst_parents, values = values,
type = 'sunburst', branchvalues = 'total', maxdepth = 3, marker = list(colors = sunburst_colors),
) %>% layout(title = "Death and Heart Disease by Age Groups")
df_eda %>% mutate(chd = ifelse(chd == 0,
"No Heart Disease",
"Heart Disease")) %>%
plot_ly() %>%
add_trace(x=~age_groups,
y=~totchol,
color = ~factor(chd),
type = "box",
colors = "Set1") %>%
layout(boxmode = "group",
title = "Total Cholesterol Count by Age Group & Gender",
xaxis = list(title = "",
zeroline = FALSE, showgrid = FALSE),
yaxis = list(title = "Total Cholesterol",
zeroline = FALSE, showgrid = FALSE))
#Viewing histograms of the numeric variables
numvars <- c('age','glucose','bmi')
for (i in numvars){
g <- df%>%
ggplot(aes_string(x = i))+geom_histogram(color = 'black',fill = 'hotpink',binwidth = 3
)+labs(title = i,y = 'frequency')
plot(g)
}
#Various exploratory histograms
df%>%
filter(stroke == '1')%>%
group_by(bmi)%>%
count(stroke)%>%
ggplot(aes(x = bmi,weight = n))+geom_histogram(fill = 'hotpink',color = 'black')+labs(
title = 'BMI Distribution of Patients Who Had Strokes',
y = "Frequency"
)
df%>%
filter(stroke == '1')%>%
group_by(glucose)%>%
count(stroke)%>%
ggplot(aes(x = glucose,weight = n))+geom_histogram(fill = 'hotpink',color = 'black')+labs(
title = 'Glucose Distribution of Patients Who Had Strokes',
y = "Frequency"
)
# Stroke Distribution
df%>%
ggplot(aes(x = stroke))+geom_bar(fill = 'hotpink')+labs(
x = 'Had Stroke?',
y = 'Count'
)
#Proportions
df%>%
group_by(stroke)%>%
count(stroke)
## # A tibble: 2 x 2
## # Groups: stroke [2]
## stroke n
## <int> <int>
## 1 0 10555
## 2 1 1055
plot(df$age, df$bmi, col = ifelse(df$gender == 1, "blue", "pink"), pch = 19, xlab = "age", ylab = "bmi")
abline(h = 190, col = "green")
legend("topleft",
inset = 0.05,
legend = c("male", "female"),
col = c("blue", "pink"),
bg = "gray",
lwd = c(6, 6))
#let's see the corrplot
corrplot(cor(df[,-5]),method="circle")
pairs(df[,-5],col=df$diabetic)
#lets plot some graphs and see relations or patterns
ggplot(df,aes(x=heartrate,y=bmi)) + geom_smooth() + geom_point(aes(color=factor(diabetic)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(df,aes(x=diabp,y=bmi)) + geom_smooth() + geom_point(aes(color=factor(diabetic)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(df,aes(x=age,y=bmi)) + geom_smooth() + geom_point(aes(color=factor(diabetic)))
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
ggplot(df,aes(x=diabp,y=glucose,color=bmi,size=age)) + geom_point(alpha=0.5) + facet_wrap(diabetic~.) + ggtitle("Glucose vs Blood Pressure")
Objective: To predict whether an individual has heart disease based on risk factors.
# parallelization
parallelStartSocket(cpus = detectCores())
## Starting parallelization in mode=socket with cpus=12.
df_model_perf = data.frame(Model = character(0), Accuracy = numeric(0), BalancedAccuracy = numeric(0))
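Throughout this section, accuracy is taken from cm$overall[1] and balanced accuracy from cm$byClass[11]. Indexing caret's metrics by name is less fragile than by position; a small helper (a sketch, not used below) would be:
# Sketch: name-based extraction of the two metrics stored in df_model_perf
get_perf <- function(cm) {
  c(Accuracy = unname(cm$overall["Accuracy"]),
    BalancedAccuracy = unname(cm$byClass["Balanced Accuracy"]))
}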
df %>% select(-stroke, -hyperten, -hospmi) -> df_chd
set.seed(669)
split = 0.8
trainIndex = createDataPartition(df_chd$chd, p = split, list = F)
train_set <- df_chd[trainIndex,]
test_set <- df_chd[-trainIndex,]
train_set[["chd"]] = factor(train_set[["chd"]])
test_set[["chd"]] = factor(test_set[["chd"]])
Remarks: We split the data into 80% training and 20% testing sets.
tc <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
Remarks: We use repeated k-fold cross-validation with 10 folds and 3 repeats for model tuning.
set.seed(670)
log_reg <- glm(chd ~.,family=binomial(link='logit'), data = train_set)
predictLogReg <- predict(log_reg, newdata = test_set, type="response")
log_reg_cm <- confusionMatrix(data = factor(as.integer(predictLogReg>0.5)), reference = test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("Logistic Regression",
as.numeric(log_reg_cm$overall[1]),
as.numeric(log_reg_cm$byClass[11]))
draw_confusion_matrix(log_reg_cm, "Logistic Regression")
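draw_confusion_matrix() is a custom plotting helper whose definition is not shown in this section; a minimal stand-in that assumes only a caret confusionMatrix object might look like this:
# Hypothetical minimal replacement for the unshown draw_confusion_matrix() helper
draw_confusion_matrix <- function(cm, title) {
  fourfoldplot(cm$table, color = c("#CC6666", "#99CC99"),
               conf.level = 0, margin = 1, main = title)
}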
knn <- train(chd ~., data = train_set, trControl = tc, method = "knn", tuneGrid = data.frame(k = seq(1, 27)))
predictKNN <- predict(knn, newdata = test_set)
knn_cm <- confusionMatrix(predictKNN, test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("KNN",
as.numeric(knn_cm$overall[1]),
as.numeric(knn_cm$byClass[11]))
draw_confusion_matrix(knn_cm, "KNN")
rf <- train(chd ~., data = train_set[sample(nrow(train_set), 1000), ], trControl = tc, method = "rf")
predictRF <- predict(rf, newdata = test_set)
rf_cm <- confusionMatrix(predictRF, test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("Random Forest",
as.numeric(rf_cm$overall[1]),
as.numeric(rf_cm$byClass[11]))
draw_confusion_matrix(rf_cm, "Random Forest")
naive <- train(chd ~., data = train_set, trControl = tc, method = "nb")
predictNaive <- predict(naive, newdata = test_set)
naive_cm <- confusionMatrix(predictNaive, test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("Naive Bayes",
as.numeric(naive_cm$overall[1]),
as.numeric(naive_cm$byClass[11]))
draw_confusion_matrix(naive_cm, "Naive Bayes")
svm_lin <- train(chd ~.,
data = train_set[sample(nrow(train_set), 1000), ],
method = "svmLinear",
trControl = tc,
preProcess = c("center","scale"),
tuneGrid = expand.grid(C = seq(0, 2, length = 20)))
predictSVMLin <- predict(svm_lin, newdata = test_set)
svm_lin_cm <- confusionMatrix(predictSVMLin, test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("Linear SVM",
as.numeric(svm_lin_cm$overall[1]),
as.numeric(svm_lin_cm$byClass[11]))
draw_confusion_matrix(svm_lin_cm, "Linear SVM")
svm_rad <- train(chd ~., data = train_set[sample(nrow(train_set), 1000), ], method = "svmRadial", trControl = tc, preProcess = c("center","scale"), tuneLength = 10)
predictRad <- predict(svm_rad, newdata = test_set)
svm_rad_cm <- confusionMatrix(predictRad, test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("Radial SVM",
as.numeric(svm_rad_cm$overall[1]),
as.numeric(svm_rad_cm$byClass[11]))
draw_confusion_matrix(svm_rad_cm, "Radial SVM")
svm_poly <- train(chd ~., data = train_set[sample(nrow(train_set), 100), ], method = "svmPoly", trControl = tc, preProcess = c("center","scale"), tuneLength = 4)
predictPoly <- predict(svm_poly, newdata = test_set)
svm_poly_cm <- confusionMatrix(predictPoly, test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("Polynomial SVM",
as.numeric(svm_poly_cm$overall[1]),
as.numeric(svm_poly_cm$byClass[11]))
draw_confusion_matrix(svm_poly_cm, "Polynomial SVM")
labels <- train_set$chd
labels_ts <- test_set$chd
train_set_xgb <- model.matrix( ~ . + 0, data = train_set %>% select(-chd))
test_set_xgb <- model.matrix( ~ . + 0, data = test_set %>% select(-chd))
labels <- as.numeric(labels) - 1
labels_ts <- as.numeric(labels_ts) - 1
train_set_dmatrix <- xgb.DMatrix(data = train_set_xgb, label = labels)
test_set_dmatrix <- xgb.DMatrix(data = test_set_xgb, label = labels_ts)
params <- list(booster = "gbtree", objective = "binary:logistic",
eta=0.3, gamma=0, max_depth=6, min_child_weight=1,
subsample=1, colsample_bytree=1)
xgbcv <- xgb.cv(params = params, data = train_set_dmatrix, nrounds = 100,
nfold = 5, showsd = T, stratified = T, print_every_n = 10,
early_stopping_rounds = 20, maximize = F)
xgb_model <- xgb.train(params = params, data = train_set_dmatrix, nrounds = 79,
watchlist = list(val=test_set_dmatrix,train=train_set_dmatrix),
print_every_n = 10, early_stopping_rounds = 10, maximize = F ,
eval_metric = "error")
xgbpred <- predict(xgb_model, test_set_dmatrix)
xgboost_cm <- confusionMatrix(data = factor(as.integer(xgbpred>0.5)), reference = factor(labels_ts))
df_model_perf[nrow(df_model_perf) + 1, ] <- c("XGBoost",
as.numeric(xgboost_cm$overall[1]),
as.numeric(xgboost_cm$byClass[11]))
draw_confusion_matrix(xgboost_cm, "XGBoost")
set.seed(673)
traintask <- mlr::makeClassifTask(data = as.data.frame(train_set), target = "chd")
testtask <- mlr::makeClassifTask(data = as.data.frame(test_set), target = "chd")
lrn <- makeLearner("classif.xgboost",predict.type = "response")
lrn$par.vals <- list( objective="binary:logistic", eval_metric="error", nrounds=100L, eta=0.1)
params <- makeParamSet(makeDiscreteParam("booster",values = c("gbtree","gblinear")),
makeIntegerParam("max_depth",lower = 3L,upper = 10L),
makeNumericParam("min_child_weight",lower = 1L,upper = 10L),
makeNumericParam("subsample",lower = 0.5,upper = 1),
makeNumericParam("colsample_bytree",lower = 0.5,upper = 1))
rdesc <- makeResampleDesc("CV", stratify = T, iters=5L)
ctrl <- makeTuneControlRandom(maxit = 10L)
mytune <- tuneParams(learner = lrn, task = traintask, resampling = rdesc,
measures = acc, par.set = params, control = ctrl, show.info = T)
lrn_tune <- setHyperPars(lrn,par.vals = mytune$x)
xgmodel <- mlr::train(learner = lrn_tune,task = traintask)
xgpred <- predict(xgmodel,testtask)
xgboost_cmb <- confusionMatrix(xgpred$data$response,xgpred$data$truth)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("XGBoost Tuned",
as.numeric(xgboost_cmb$overall[1]),
as.numeric(xgboost_cmb$byClass[11]))
#https://www.hackerearth.com/practice/machine-learning/machine-learning-algorithms/beginners-tutorial-on-xgboost-parameter-tuning-r/tutorial/
draw_confusion_matrix(xgboost_cmb, "XGBoost Tuned")
set.seed(674)
train_cboost <- catboost.load_pool(data = train_set[-12], label = as.integer(train_set$chd) - 1)
cboost <- catboost.train(train_cboost, NULL,
params = list(loss_function = 'Logloss',
iterations = 100, metric_period=10))
predictCBoost <- catboost.predict(cboost, catboost.load_pool(test_set %>% select(-chd)), prediction_type = "Probability")
cboost_cm <- confusionMatrix(data = factor(as.integer(predictCBoost>0.5)), reference = test_set$chd)
df_model_perf[nrow(df_model_perf) + 1, ] <- c("CatBoost",
as.numeric(cboost_cm$overall[1]),
as.numeric(cboost_cm$byClass[11]))
draw_confusion_matrix(cboost_cm, "CatBoost")
mat <- xgb.importance (feature_names = colnames(train_set_xgb), model = xgb_model)
gg <- xgb.ggplot.importance(mat, measure = "Frequency", rel_to_first = TRUE)
gg <- gg + ggplot2::ylab("Importance") +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(), axis.line = element_line(colour = "black"),
axis.title.y=element_blank()) +
labs(title = "XGBoost Feature Importance")
ggplotly(gg)
df_model_perf$Accuracy <- as.numeric(df_model_perf$Accuracy) * 100
df_model_perf$BalancedAccuracy <- as.numeric(df_model_perf$BalancedAccuracy) * 100
df_model_perfa <- df_model_perf %>% arrange(Accuracy)
plot_ly(data = df_model_perfa,
x = ~Model,
y = ~Accuracy,
name = "Accuracy",
type = "bar",
text = ~round(Accuracy, 2),
textfont = list(color = "white"),
textposition = "inside",
marker = list(color = c(rep('grey', 9), 'rgba(222,45,38,0.8)'))
) %>% layout(title = "Accuracy Comparison of Models (%)",
xaxis = list(categoryorder = "array", categoryarray = ~Model, title = ""),
yaxis = list(title = "", showticklabels=F, showgrid=F))
df_model_perfb <- df_model_perf %>% arrange(BalancedAccuracy)
plot_ly(data = df_model_perfb,
x = ~Model,
y = ~BalancedAccuracy,
name = "Balanced Accuracy",
type = "bar",
text = ~round(BalancedAccuracy, 2),
textfont = list(color = "white"),
textposition = "inside",
marker = list(color = c(rep('grey', 9), 'rgba(222,45,38,0.8)'))
) %>% layout(title = "Balanced Accuracy Comparison of Models (%)",
xaxis = list(categoryorder = "array", categoryarray = ~Model, title = ""),
yaxis = list(title = "", showticklabels=F, showgrid=F))
set.seed(671)
df_model_samp = data.frame(Model = character(0), Accuracy = numeric(0), BalancedAccuracy = numeric(0))
df_imb <- df %>% group_by(chd) %>% summarise(count = n())
df_imb$chd <- as.factor(df_imb$chd)
df_imb <- df_imb %>% mutate(label = ifelse(chd == 0,
"No Heart Disease",
"Heart Disease"))
df_imb <- df_imb %>%
arrange(desc(chd)) %>%
mutate(prop = count / sum(df_imb$count) *100) %>%
mutate(ypos = cumsum(prop)- 0.5*prop )
ggplot(df_imb, aes(x="", y=prop, fill=chd)) +
geom_bar(stat="identity", width=1, color="white") +
coord_polar("y", start=0) +
theme_void() +
theme(legend.position="none") +
geom_text(aes(y = ypos, label = label), color = "white", size=5) +
scale_fill_manual(values = c("steelblue", "red"))
set.seed(672)
df_over <- ovun.sample(chd ~ ., data = train_set, method = "over", N = 10000)$data
log_reg_over <- glm(chd ~.,family=binomial(link='logit'), data = df_over)
predictLogRegOver <- predict(log_reg_over, newdata = test_set, type="response")
over_roc <- roc(test_set$chd, predictLogRegOver)
over_roc_text <- paste("Over-Sampling: ", round(over_roc$auc[1], 3), sep = "")
log_reg_over_cm <- confusionMatrix(data = factor(as.integer(predictLogRegOver>0.5)), reference = test_set$chd)
df_model_samp[nrow(df_model_samp) + 1, ] <- c("Over-Sampling",
as.numeric(log_reg_over_cm$overall[1]),
as.numeric(log_reg_over_cm$byClass[11]))
df_under <- ovun.sample(chd ~ ., data = train_set, method = "under", N = 8000)$data
log_reg_under <- glm(chd ~.,family=binomial(link='logit'), data = df_under)
predictLogRegUnder <- predict(log_reg_under, newdata = test_set, type="response")
under_roc <- roc(test_set$chd, predictLogRegUnder)
under_roc_text <- paste("Under-Sampling: ", round(under_roc$auc[1], 3), sep = "")
log_reg_under_cm <- confusionMatrix(data = factor(as.integer(predictLogRegUnder>0.5)), reference = test_set$chd)
df_model_samp[nrow(df_model_samp) + 1, ] <- c("Under-Sampling",
as.numeric(log_reg_under_cm$overall[1]),
as.numeric(log_reg_under_cm$byClass[11]))
df_both <- ovun.sample(chd ~ ., data = train_set, method = "both", p = 0.5)$data
log_reg_both <- glm(chd ~.,family=binomial(link='logit'), data = df_both)
predictLogRegBoth <- predict(log_reg_both, newdata = test_set, type="response")
both_roc <- roc(test_set$chd, predictLogRegBoth)
both_roc_text <- paste("Under & Over-Sampling: ", round(both_roc$auc[1], 3), sep = "")
log_reg_both_cm <- confusionMatrix(data = factor(as.integer(predictLogRegBoth>0.5)), reference = test_set$chd)
df_model_samp[nrow(df_model_samp) + 1, ] <- c("Under- & Over-Sampling",
as.numeric(log_reg_both_cm$overall[1]),
as.numeric(log_reg_both_cm$byClass[11]))
df_rose <- ROSE(chd ~ ., data = train_set)$data
log_reg_rose <- glm(chd ~.,family=binomial(link='logit'), data = df_rose)
predictLogRegRose <- predict(log_reg_rose, newdata = test_set, type="response")
rose_roc <- roc(test_set$chd, predictLogRegRose)
rose_roc_text <- paste("ROSE: ", round(rose_roc$auc[1], 3), sep = "")
log_reg_rose_cm <- confusionMatrix(data = factor(as.integer(predictLogRegRose>0.5)), reference = test_set$chd)
df_model_samp[nrow(df_model_samp) + 1, ] <- c("ROSE",
as.numeric(log_reg_rose_cm$overall[1]),
as.numeric(log_reg_rose_cm$byClass[11]))
df_smote <- SMOTE(chd ~ ., as.data.frame(train_set), perc.over = 100, perc.under = 200)
log_reg_smote <- glm(chd ~.,family=binomial(link='logit'), data = df_smote)
predictLogRegSmote <- predict(log_reg_smote, newdata = test_set, type="response")
smote_roc <- roc(test_set$chd, predictLogRegSmote)
smote_roc_text <- paste("SMOTE: ", round(smote_roc$auc[1], 3), sep = "")
log_reg_smote_cm <- confusionMatrix(data = factor(as.integer(predictLogRegSmote>0.5)), reference = test_set$chd)
df_model_samp[nrow(df_model_samp) + 1, ] <- c("SMOTE",
as.numeric(log_reg_smote_cm$overall[1]),
as.numeric(log_reg_smote_cm$byClass[11]))
df_model_samp[nrow(df_model_samp) + 1, ] <- c("Original",
as.numeric(log_reg_cm$overall[1]),
as.numeric(log_reg_cm$byClass[11]))
orig_roc <- roc(test_set$chd, predictLogReg)
orig_roc_text <- paste("Original: ", round(orig_roc$auc[1], 3), sep = "")
plot.new()
plot(over_roc)
plot(under_roc, add=TRUE, col='red')
plot(both_roc, add=TRUE, col='blue')
plot(smote_roc, add=TRUE, col='green')
plot(rose_roc, add=TRUE, col='purple')
plot(orig_roc, add=TRUE, col='yellow')
legend(x = "bottomright", legend=c(over_roc_text, under_roc_text, both_roc_text, rose_roc_text, smote_roc_text, orig_roc_text), col=c("red", "blue", "green", "purple", "yellow"), lty=1:2, cex=0.8)
df_model_samp$Accuracy <- as.numeric(df_model_samp$Accuracy) * 100
df_model_samp$BalancedAccuracy <- as.numeric(df_model_samp$BalancedAccuracy) * 100
df_model_sampa <- df_model_samp %>% arrange(Accuracy)
plot_ly(data = df_model_sampa,
x = ~Model,
y = ~Accuracy,
name = "Accuracy",
type = "bar",
text = ~round(Accuracy, 2),
textfont = list(color = "white"),
textposition = "inside",
marker = list(color = c(rep('grey', 5), 'rgba(222,45,38,0.8)'))
) %>% layout(title = "Logistic Regression\nAccuracy Comparison of Models (%)",
xaxis = list(categoryorder = "array", categoryarray = ~Model, title = ""),
yaxis = list(title = "", showticklabels=F, showgrid=F))
df_model_sampb <- df_model_samp %>% arrange(BalancedAccuracy)
plot_ly(data = df_model_sampb,
x = ~Model,
y = ~BalancedAccuracy,
name = "Balanced Accuracy",
type = "bar",
text = ~round(BalancedAccuracy, 2),
textfont = list(color = "white"),
textposition = "inside",
marker = list(color = c(rep('grey', 5), 'rgba(222,45,38,0.8)'))
) %>% layout(title = "Logistic Regression\nBalanced Accuracy Comparison of Models (%)",
xaxis = list(categoryorder = "array", categoryarray = ~Model, title = ""),
yaxis = list(title = "", showticklabels=F, showgrid=F))
Remarks: We choose logistic regression with the over-sampling method as our final model for this objective.
df_predict <- data.frame(
list(gender = c(1, 2, 1),
age = c(40, 60, 80),
sysbp = c(170, 140, 180),
diabp = c(90, 100, 120),
smoker = c(0, 1, 1),
cigpday = c(0, 10, 5),
totchol = c(230, 250, 280),
bmi = c(23, 25, 28),
glucose = c(80, 90, 110),
diabetic = c(0, 1, 0),
heartrate = c(80, 90, 75),
death = c(0, 0, 1))
)
custom_predict <- predict(log_reg_over, newdata = df_predict, type = "response")
factor(as.integer(custom_predict>0.5))
## [1] 0 0 1
## Levels: 0 1
Objective: To predict whether an individual will suffer a stroke.
#Splits and folds
train_ind <- as.vector(createDataPartition(y = df$stroke, times = 1, p = 0.8))
df_train <- df[train_ind[[1]], ]
df_test <- df[-train_ind[[1]], ]
#numfolds <- 10
#folds <- createFolds(df_train$stroke, k = numfolds)
set.seed(19970507)
# Summary
summary(df_train)
## gender age sysbp diabp
## Min. :1.000 Min. :32.00 Min. : 83.5 Min. : 30.00
## 1st Qu.:1.000 1st Qu.:48.00 1st Qu.:120.0 1st Qu.: 75.00
## Median :2.000 Median :54.00 Median :132.0 Median : 82.00
## Mean :1.566 Mean :54.76 Mean :136.2 Mean : 83.04
## 3rd Qu.:2.000 3rd Qu.:62.00 3rd Qu.:149.0 3rd Qu.: 90.00
## Max. :2.000 Max. :81.00 Max. :295.0 Max. :150.00
## smoker cigpday totchol bmi
## Min. :0.0000 Min. : 0.000 Min. :112.0 Min. :15.16
## 1st Qu.:0.0000 1st Qu.: 0.000 1st Qu.:211.0 1st Qu.:23.09
## Median :0.0000 Median : 0.000 Median :238.0 Median :25.45
## Mean :0.4334 Mean : 8.408 Mean :240.9 Mean :25.84
## 3rd Qu.:1.0000 3rd Qu.:20.000 3rd Qu.:267.0 3rd Qu.:28.04
## Max. :1.0000 Max. :90.000 Max. :462.0 Max. :56.80
## glucose diabetic heartrate chd
## Min. : 39.00 Min. :0.00000 Min. : 37.00 Min. :0.0000
## 1st Qu.: 73.00 1st Qu.:0.00000 1st Qu.: 69.00 1st Qu.:0.0000
## Median : 79.00 Median :0.00000 Median : 75.00 Median :0.0000
## Mean : 83.84 Mean :0.04404 Mean : 76.78 Mean :0.2715
## 3rd Qu.: 88.00 3rd Qu.:0.00000 3rd Qu.: 85.00 3rd Qu.:1.0000
## Max. :423.00 Max. :1.00000 Max. :150.00 Max. :1.0000
## stroke hyperten hospmi death
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.000
## Median :0.00000 Median :1.0000 Median :0.00000 Median :0.000
## Mean :0.09044 Mean :0.7446 Mean :0.09765 Mean :0.302
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.000
# Categorical feature table
df_traincategorical <- df_train %>% select(c(gender,
hyperten,
stroke,
cigpday,
bmi,
hospmi))
#datasummary_skim(data = df_traincategorical %>% rename(hyperten = hyperten) %>%
#rename(stroke = stroke) %>%
#rename(hospmi = hospmi),
# type = "categorical") %>%
#kable_classic()
# Numeric Feature Table
n <- function(x){length(na.omit(x))}
emptycol = function(x){" "}
plt_list <- df_train %>% select(c(age,
glucose,
bmi))
plt_list <- lapply(plt_list, na.omit)
plt_list <- lapply(plt_list, scale)
datasummary(age +
glucose +
bmi ~
n +
Mean*Arguments(na.rm = TRUE) +
Median*Arguments(na.rm = TRUE) +
SD*Arguments(na.rm = TRUE) +
Heading("Boxplot") * emptycol +
Heading("Histogram") * emptycol,
data = df_train %>%
rename(glucose = glucose),
title = "Characteristics") %>%
column_spec(column = 6, image = spec_boxplot(plt_list)) %>%
column_spec(column = 7, image = spec_hist(plt_list)) %>%
kable_classic()
 | n | Mean | Median | SD | Boxplot | Histogram |
---|---|---|---|---|---|---|
age | 9288.00 | 54.76 | 54.00 | 9.55 | (inline boxplot) | (inline histogram) |
glucose | 9288.00 | 83.84 | 79.00 | 23.58 | (inline boxplot) | (inline histogram) |
bmi | 9288.00 | 25.84 | 25.45 | 4.09 | (inline boxplot) | (inline histogram) |
#Partition training and testing data
set.seed(7)
sample_index <- sample(nrow(df),nrow(df)*0.8)
data_train <- df[sample_index,]
data_test <- df[-sample_index,]
#Modeling
#Below we fit five GLM specifications, distinguished by their family and link function
split <- sample.split(df$stroke, SplitRatio = 0.8)
train <- subset(df, split == TRUE)
test <- subset(df, split == FALSE)
gaussian.identity <- glm(stroke ~ gender + age + smoker + cigpday + diabetic + totchol + sysbp + diabp + bmi + heartrate + glucose, train, family = gaussian(link = "identity"))
binomial.logit <- glm(stroke ~ gender + age + smoker + cigpday + diabetic + totchol + sysbp + diabp + bmi + heartrate + glucose, train, family = binomial(link = "logit"))
binomial.probit <- glm(stroke ~ gender + age + smoker + cigpday + diabetic + totchol + sysbp + diabp + bmi + heartrate + glucose, train, family = binomial(link = "probit"))
binomial.cloglog <- glm(stroke ~ gender + age + smoker + cigpday + diabetic + totchol + sysbp + diabp + bmi + heartrate + glucose, train, family = binomial(link = "cloglog"))
poisson.log <- glm(stroke ~ gender + age + smoker + cigpday + diabetic + totchol + sysbp + diabp + bmi + heartrate + glucose, train, family = poisson(link = "log"))
keyAn <- test$stroke
head(keyAn)
## [1] 0 0 0 0 0 0
#Comparing the accuracy of the five GLM specifications
#Gaussian identity accuracy rate
prediction.gaussian.identity <- ifelse(predict(gaussian.identity, newdata = test, type = "response") >= 0.5, 1, 0)
table.gaussian.identity <- table(keyAn, prediction.gaussian.identity)
accuracy.gaussian.identity <- sum(diag(table.gaussian.identity)) / sum(table.gaussian.identity) * 100
head(prediction.gaussian.identity)
## 1 2 3 4 5 6
## 0 0 0 0 0 0
print(table.gaussian.identity)
## prediction.gaussian.identity
## keyAn 0
## 0 2111
## 1 211
print(paste("Accuracy rate using Gaussian identity model is", accuracy.gaussian.identity))
## [1] "Accuracy rate using Gaussian identity model is 90.9130060292851"
prediction.binomial.logit <- ifelse(predict(binomial.logit, newdata = test, type = "response") >= 0.5, 1, 0)
table.binomial.logit <- table(keyAn, prediction.binomial.logit)
accuracy.binomial.logit <- sum(diag(table.binomial.logit)) / sum(table.binomial.logit) *100
head(prediction.binomial.logit)
## 1 2 3 4 5 6
## 0 0 0 0 0 0
print(table.binomial.logit)
## prediction.binomial.logit
## keyAn 0 1
## 0 2107 4
## 1 207 4
print(paste("Accuracy rate using binomial model is", accuracy.binomial.logit))
## [1] "Accuracy rate using binomial model is 90.9130060292851"
prediction.binomial.probit <- ifelse(predict(binomial.probit, newdata = test, type = "response") >= 0.5, 1, 0)
table.binomial.probit <- table(keyAn, prediction.binomial.probit)
accuracy.binomial.probit <- sum(diag(table.binomial.probit)) / sum(table.binomial.probit) *100
head(prediction.binomial.probit)
## 1 2 3 4 5 6
## 0 0 0 0 0 0
print(table.binomial.probit)
## prediction.binomial.probit
## keyAn 0 1
## 0 2108 3
## 1 209 2
print(paste("Accuracy rate using binomial probit is", accuracy.binomial.probit))
## [1] "Accuracy rate using binomial probit is 90.869939707149"
prediction.binomial.cloglog <- ifelse(predict(binomial.cloglog, newdata = test, type = "response") >= 0.5, 1, 0)
table.binomial.cloglog <- table(keyAn, prediction.binomial.cloglog)
accuracy.binomial.cloglog <- sum(diag(table.binomial.cloglog)) / sum(table.binomial.cloglog) * 100
head(prediction.binomial.cloglog)
## 1 2 3 4 5 6
## 0 0 0 0 0 0
print(table.binomial.cloglog)
## prediction.binomial.cloglog
## keyAn 0 1
## 0 2107 4
## 1 207 4
print(paste("Accuracy rate using binomial cloglog is", accuracy.binomial.cloglog))
## [1] "Accuracy rate using binomial cloglog is 90.9130060292851"
prediction.poisson.log <- ifelse(predict(poisson.log, newdata = test, type = "response") >= 0.5, 1, 0)
table.poisson.log <- table(keyAn, prediction.poisson.log)
accuracy.poisson.log <- sum(diag(table.poisson.log)) / sum(table.poisson.log) * 100
head(prediction.poisson.log)
## 1 2 3 4 5 6
## 0 0 0 0 0 0
print(table.poisson.log)
## prediction.poisson.log
## keyAn 0 1
## 0 2104 7
## 1 206 5
print(paste("Acuraccy rate using poisson log is", accuracy.poisson.log))
## [1] "Acuraccy rate using poisson log is 90.8268733850129"
Objective: To predict whether an individual has diabetes.
#test train split
ind <- sample(2,nrow(df),replace=T,prob=c(0.75,0.25))
train <- df[ind==1,]
test <- df[ind==2,]
#Building a model
my_model <- glm(diabetic~.,data=train,family="binomial")
summary(my_model)
##
## Call:
## glm(formula = diabetic ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2871 -0.2162 -0.1378 -0.0940 3.7534
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.400449 0.921729 -14.538 < 2e-16 ***
## gender -0.126725 0.146835 -0.863 0.388114
## age 0.034561 0.008678 3.983 6.81e-05 ***
## sysbp 0.014197 0.004087 3.474 0.000513 ***
## diabp -0.025018 0.007722 -3.240 0.001195 **
## smoker 0.093754 0.228787 0.410 0.681963
## cigpday -0.002258 0.009606 -0.235 0.814196
## totchol -0.001523 0.001574 -0.968 0.333232
## bmi 0.060893 0.014991 4.062 4.86e-05 ***
## glucose 0.060825 0.002778 21.892 < 2e-16 ***
## heartrate 0.010700 0.005109 2.095 0.036213 *
## chd 0.585561 0.162717 3.599 0.000320 ***
## stroke 0.571623 0.174775 3.271 0.001073 **
## hyperten 0.219072 0.220962 0.991 0.321467
## hospmi 0.375971 0.196493 1.913 0.055696 .
## death 0.256521 0.151674 1.691 0.090786 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 3186.4 on 8658 degrees of freedom
## Residual deviance: 1844.4 on 8643 degrees of freedom
## AIC: 1876.4
##
## Number of Fisher Scoring iterations: 7
#Evaluation
p1 <- predict(my_model,test,type="response")
p1 <- ifelse(p1>0.5,1,0)
tab1 <- table(predicted=p1,actual=test$diabetic)
tab1
## actual
## predicted 0 1
## 0 2799 86
## 1 18 48
acc <- (sum(diag(tab1))/sum(tab1))*100
q <- paste("Accuracy is ",acc)
print(q)
## [1] "Accuracy is 96.4757709251101"
Objective: To predict the cholesterol levels of an individual based on other attributes.
df_cor <- cor(df[, sapply(df, is.numeric)], method = 'pearson')
as.matrix(data.frame(df_cor)) %>%
round(2) %>% #round
hchart() %>%
hc_add_theme(hc_theme_google()) %>%
hc_title(text = "Pearson's Correlation Coefficients", align = "center") %>%
hc_legend(align = "center") %>%
hc_colorAxis(stops = color_stops(colors = viridis::inferno(10))) %>%
hc_plotOptions(
series = list(
borderWidth = 0,
dataLabels = list(enabled = TRUE)))
df_cor <- cor(df[, sapply(df, is.numeric)], method = 'spearman')
as.matrix(data.frame(df_cor)) %>%
round(2) %>% #round
hchart() %>%
hc_add_theme(hc_theme_google()) %>%
hc_title(text = "Spearman's Correlation Coefficients", align = "center") %>%
hc_legend(align = "center") %>%
hc_colorAxis(stops = color_stops(colors = viridis::inferno(10))) %>%
hc_plotOptions(
series = list(
borderWidth = 0,
dataLabels = list(enabled = TRUE)))
set.seed(123)
training.samples <- df$totchol %>% createDataPartition(p = 0.8, list = FALSE)
train.data <- df[training.samples, ] %>% select(-hyperten, -hospmi)
test.data <- df[-training.samples, ] %>% select(-hyperten, -hospmi)
model <- lm(totchol ~., data = train.data)
predictRegTotchol <- predict(model, newdata = data.frame(gender=2, age=50, sysbp = 147,
diabp = 96, smoker = 1, cigpday =10,
bmi = 24.2, glucose = 79, diabetic = 0,
heartrate = 94, chd = 0, stroke = 0,
death = 0))
predictRegTotchol <- round(predictRegTotchol, digits = 2)
actualTotchol <- df[df$sysbp == 147 & df$diabp == 96 & df$age == 50,]
actualTotchol <- actualTotchol[['totchol']]
fig <- plot_ly(
x = c("Predicted (Linear Regression)", "Actual"),
y = c(predictRegTotchol, actualTotchol),
name = "Actual vs Predicted value",
type = "bar",
text = c(predictRegTotchol, actualTotchol),
textposition = 'auto',
marker = list(color = c('rgba(255,255,0,0.5)', 'rgba(222,45,38,0.5)'), line = list(color = 'rgb(8,48,107)', width = 1.5))
)
fig <- fig %>% layout(title = "Actual vs Predicted Value")
fig
predictionsLinReg <- model %>% predict(test.data)
LinRegR2 = R2(predictionsLinReg, test.data$totchol)
LinRegR2
## [1] 0.07681202
y <- df$totchol
x <- data.matrix(df[, c('gender', 'age', 'sysbp', 'diabp', 'smoker',
'cigpday', 'bmi', 'glucose','diabetic',
'heartrate', 'chd', 'stroke', 'death')])
cv_model <- cv.glmnet(x, y, alpha = 1)
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 0.05211864
best_model <- glmnet(x, y, alpha = 1, lambda = best_lambda)
coef(best_model)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 130.137150767
## gender 13.426457700
## age 0.680328554
## sysbp -0.005668957
## diabp 0.372748709
## smoker -0.161017328
## cigpday 0.144538382
## bmi 0.472600467
## glucose -0.040653261
## diabetic -3.861942462
## heartrate 0.142188947
## chd 10.659486040
## stroke -5.262332629
## death -3.060634676
new = matrix(c(2,50, 147,96, 1, 10,24.2, 79, 0,94, 0, 0,0), nrow=1, ncol=13)
predictLassoTotchol <- predict(best_model, s = best_lambda, newx = new)
predictLassoTotchol <- round(predictLassoTotchol, digits = 2)
fig <- plot_ly(
x = c("Predicted (Lasso Regression)", "Actual"),
y = c(predictLassoTotchol, actualTotchol),
name = "Actual vs Predicted value",
type = "bar",
text = c(predictLassoTotchol, actualTotchol),
textposition = 'auto',
marker = list(color = c('rgba(255,255,0,0.5)', 'rgba(222,45,38,0.5)'),
line = list(color = 'rgb(8,48,107)', width = 1.5))
)
fig <- fig %>% layout(title = "Actual vs Predicted Value")
fig
y_predicted <- predict(best_model, s = best_lambda, newx = x)
sst <- sum((y - mean(y))^2)
sse <- sum((y_predicted - y)^2)
R2Lasso <- 1 - sse/sst
R2Lasso
## [1] 0.07257004
y <- df$totchol
x <- data.matrix(df[, c('gender', 'age', 'sysbp', 'diabp', 'smoker',
'cigpday', 'bmi', 'glucose','diabetic',
'heartrate', 'chd', 'stroke', 'death')])
model <- glmnet(x, y, alpha = 0)
summary(model)
## Length Class Mode
## a0 100 -none- numeric
## beta 1300 dgCMatrix S4
## df 100 -none- numeric
## dim 2 -none- numeric
## lambda 100 -none- numeric
## dev.ratio 100 -none- numeric
## nulldev 1 -none- numeric
## npasses 1 -none- numeric
## jerr 1 -none- numeric
## offset 1 -none- logical
## call 4 -none- call
## nobs 1 -none- numeric
cv_model <- cv.glmnet(x, y, alpha = 0)
best_lambda <- cv_model$lambda.min
best_lambda
## [1] 0.6576626
best_model <- glmnet(x, y, alpha = 0, lambda = best_lambda)
coef(best_model)
## 14 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 130.480958643
## gender 13.338631973
## age 0.677387520
## sysbp -0.007995614
## diabp 0.373169119
## smoker -0.526714474
## cigpday 0.156036738
## bmi 0.477900088
## glucose -0.041612206
## diabetic -3.964512058
## heartrate 0.145597999
## chd 10.640198206
## stroke -5.297510380
## death -3.080083619
new = matrix(c(2,50, 147,96, 1, 10,24.2, 79, 0,94, 0, 0,0), nrow=1, ncol=13)
predictRidgeTotchol <- predict(model, s = best_lambda, newx = new)
predictRidgeTotchol <- round(predictRidgeTotchol, digits = 2)
fig <- plot_ly(
x = c("Predicted (Ridge Regression)", "Actual"),
y = c(predictRidgeTotchol, actualTotchol),
name = "Actual vs Predicted value",
type = "bar",
text = c(predictRidgeTotchol, actualTotchol),
textposition = 'auto',
marker = list(color = c('rgba(255,255,0,0.5)', 'rgba(222,45,38,0.5)'), line = list(color = 'rgb(8,48,107)', width = 1.5))
)
fig <- fig %>% layout(title = "Actual vs Predicted Value")
fig
y_predicted <- predict(model, s = best_lambda, newx = x)
sst <- sum((y - mean(y))^2)
sse <- sum((y_predicted - y)^2)
R2Ridge <- 1 - sse/sst
R2Ridge
## [1] 0.07258213
fig <- plot_ly(
x = c("Linear Regression", "Lasso Regression", "Ridge Regression"),
y = c(LinRegR2, R2Lasso, R2Ridge),
name = "",
type = "bar",
text = c(LinRegR2, R2Lasso, R2Ridge),
textposition = 'auto',
marker = list(color = c('rgba(255,255,0,0.5)', 'rgba(222,45,38,0.5)', 'rgba(0,255,255,0.5)'), line = list(color = 'rgb(8,48,107)', width = 1.5))
)
fig <- fig %>% layout(title = "R-squared values of the models")
fig
fig <- plot_ly(
x = c("Actual", "Linear Regression","Lasso Regression","Ridge Regression"),
y = c(actualTotchol, predictRegTotchol, predictLassoTotchol, predictRidgeTotchol),
name = "",
type = "bar",
text = c(actualTotchol, predictRegTotchol, predictLassoTotchol, predictRidgeTotchol),
textposition = 'auto',
marker = list(color = c('rgba(255,255,0,0.5)', 'rgba(222,45,38,0.5)', 'rgba(0,255,255,0.5)', 'rgba(255,255,255,0.5)'), line = list(color = 'rgb(8,48,107)', width = 1.5))
)
fig <- fig %>% layout(title = "Actual vs Predicted values by the models")
fig
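Remarks: All three models explain less than 8% of the variance in totchol (R-squared of roughly 0.073 to 0.077), so these risk factors alone are weak predictors of cholesterol.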
Objective: To predict the glucose levels of an individual based on other attributes.
###Predicting Glucose level based on the variables
##Ranking Features by Importance
#Ensuring Repeatability of the results
set.seed(100)
#preparing training scheme
g_df<-df
class(g_df$glucose)
## [1] "integer"
g_df$glucose<-as.numeric(g_df$glucose)
#train the model to compute variable importance
glu_model<-train(glucose~.,data = g_df,method= "rpart")
gluImp<-varImp(glu_model)
print(gluImp)
## rpart variable importance
##
## Overall
## diabetic 100.000
## age 6.241
## sysbp 5.916
## death 5.306
## heartrate 5.103
## smoker 0.000
## totchol 0.000
## chd 0.000
## cigpday 0.000
## bmi 0.000
## diabp 0.000
## stroke 0.000
## hyperten 0.000
## gender 0.000
## hospmi 0.000
plot(gluImp,top = 5,main="Variable Importance")
#Calculating relative importance from linear regression
model_formula = glucose ~ diabetic+age+sysbp+death+heartrate
lmMod <- lm(model_formula, data=g_df)
relImp<-calc.relimp(lmMod,type="lmg",rela=F)
cat("Relative Importances:\n")
## Relative Importances:
sort(round(relImp$lmg,3),decreasing = TRUE)
## diabetic heartrate age sysbp death
## 0.251 0.011 0.010 0.009 0.005
bootsub<-boot.relimp(glucose~diabetic+age+sysbp+death+heartrate,data=g_df,b=100,type="lmg",rank=TRUE,diff=TRUE)
plot(booteval.relimp(bootsub,level=0.95))
#new data frame
glu_df<-g_df[c("glucose","age","sysbp","diabetic","heartrate","death")]
#correlation check in new data frame
M<-cor(glu_df)
corrplot(M,method = "number")
#checking for outliers
glu_outliers <- glu_df[c("glucose","age","sysbp","diabetic","heartrate","death")]
par(mfrow=c(1,6))
LVboxplot(glu_outliers$glucose, horizontal = F, col = "#E41A1C", xlab = "Glucose", ylab = "Count", bg = "white", width.method = "height")
LVboxplot(glu_outliers$age, horizontal = F, col = "#377EB8", xlab = "Age", ylab = "", bg = "white", width.method = "height")
LVboxplot(glu_outliers$sysbp, horizontal = F, col = "#4DAF4A", xlab = "SysBP", ylab = "", bg = "white", width.method = "height")
LVboxplot(glu_outliers$death, horizontal = F, col = "#984EA3", xlab = "Death", ylab = "", bg = "white", width.method = "height")
LVboxplot(glu_outliers$heartrate, horizontal = F, col = "#FF7F00", xlab = "Heartrate", ylab = "", bg = "white", width.method = "height")
LVboxplot(glu_outliers$diabetic, horizontal = F, col = "#FF7F00", xlab = "Diabetic", ylab = "", bg = "white", width.method = "height")
#dropping outliers
max_glucose = glu_df['glucose'] %>% max()
max_heartrate = glu_df['heartrate'] %>% max()
max_sysbp = glu_df['sysbp'] %>% max()
max_death = glu_df['death'] %>% max()
max_age = glu_df['age'] %>% max()
max_diabetic = glu_df['diabetic'] %>% max()
glu_df <- glu_df[which(glu_df$glucose < max_glucose), ]
glu_df <- glu_df[which(glu_df$heartrate < max_heartrate), ]
glu_df <- glu_df[which(glu_df$sysbp < max_sysbp), ]
glu_df <- glu_df[which(glu_df$death < max_death), ]
glu_df <- glu_df[which(glu_df$age < max_age), ]
glu_df <- glu_df[which(glu_df$diabetic < max_diabetic), ]
#Feature Scaling-Normalization
set.seed(123)
glu.pre<-preProcess(glu_df,method="range")
glu.data<-predict(glu.pre,glu_df)
#Glucose distribution
ggplot(glu.data,aes(x = glucose))+geom_bar(color="red",fill = 'light blue')+labs(
x = 'Glucose Level',
y = 'Count'
)
dev.off()
## null device
## 1
#Glucose Level distribution by Age
ggplot(glu.data,aes(x=glucose,fill=age))+
geom_histogram(color="black",fill="light green",binwidth = .01)+
ggtitle("Glucose Level Distribution by Age")+
ylab("Distribution")+
xlab("Glucose")+scale_x_continuous(breaks=seq(40,207,1))
#Glucose Level distribution by Heartrate
ggplot(glu.data,aes(x=glucose,fill=heartrate))+
geom_histogram(color="black",fill="light pink",binwidth = .05)+
ggtitle("Glucose Level Distribution by Heartrate")+
ylab("Distribution")+
xlab("Glucose")+scale_x_continuous(breaks=seq(40,143,1))
#Train the datasets
glusplit=0.7
glu_TIdx<-createDataPartition(glu.data$glucose,p=glusplit,list=FALSE)
glu_train<-glu.data[glu_TIdx,]
glu_test<-glu.data[-glu_TIdx,]
#Building a Predictive Linear Regression Model and Evaluation
set.seed(1111)
glu_model<-train(glucose~.,data=glu_train,method="lm",metric="RMSE")
summary(glu_model)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.21978 -0.04675 -0.01059 0.03296 0.75231
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.189439 0.004224 44.848 < 2e-16 ***
## age 0.055936 0.006045 9.253 < 2e-16 ***
## sysbp 0.036235 0.009278 3.906 9.51e-05 ***
## diabetic NA NA NA NA
## heartrate 0.056977 0.009283 6.138 8.95e-10 ***
## death NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08015 on 5521 degrees of freedom
## Multiple R-squared: 0.03492, Adjusted R-squared: 0.03439
## F-statistic: 66.59 on 3 and 5521 DF, p-value: < 2.2e-16
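Remarks: diabetic and death have NA coefficients because the outlier-dropping step above removed every record with diabetic = 1 or death = 1, leaving both columns constant (zero variance) in the training data.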
glu_pred<-predict(glu_model,newdata = glu_test)
print(glu_model)
## Linear Regression
##
## 5525 samples
## 5 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 5525, 5525, 5525, 5525, 5525, 5525, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.08001886 0.03444807 0.0567544
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
#Building a Predictive KNN Model and Evaluation
tc<-trainControl(method = "cv",number = 10)
glu_cv<-train(glucose~.,data = glu_train,trControl=tc,method="knn")
glu_pred_knn<-predict(glu_cv,newdata = glu_test)
print(glu_cv)
## k-Nearest Neighbors
##
## 5525 samples
## 5 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 4972, 4972, 4972, 4972, 4973, 4971, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 5 0.08786224 0.008067678 0.06421378
## 7 0.08596389 0.008693716 0.06242289
## 9 0.08449386 0.009242703 0.06122042
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.
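Remarks: The linear model attains a slightly lower resampled RMSE (0.080, bootstrap) than the best KNN (k = 9, RMSE 0.084, 10-fold CV), though both explain very little of the variance in glucose.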
In this Heart Study project, we set out to answer five questions:
1. How to predict whether an individual has heart disease?
2. How to predict whether an individual will suffer a stroke?
3. How to predict whether an individual has diabetes?
4. How to predict the glucose levels of an individual?
5. How to predict the cholesterol levels of an individual?