library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(readr)
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.3.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ purrr 1.0.1
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(reshape2)
##
## Attaching package: 'reshape2'
##
## The following object is masked from 'package:tidyr':
##
## smiths
library(countrycode)
## Warning: package 'countrycode' was built under R version 4.3.2
library(superml)
## Warning: package 'superml' was built under R version 4.3.2
## Loading required package: R6
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.2
## corrplot 0.92 loaded
library(caTools)
## Warning: package 'caTools' was built under R version 4.3.2
library(dplyr)
set.seed(123)
Data for the machine learning model was gathered from kaggle (https://www.kaggle.com/datasets/aravindpcoder/diabetes-dataset). This data was sourced by the Iraqi society from the Medical City Hospital, and the Center for Endocrinology and Diabetes-Al-Kindy Teaching Hospital. The map data was sourced from https://ourworldindata.org/grapher/death-rate-from-diabetes, and they aquired the dataset from the International Diabetes Federation.
db_predictors <- read_csv("https://raw.githubusercontent.com/sphill12/data607_final_project/main/Dataset%20of%20Diabetes_ml.csv")
## Rows: 1000 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Gender, CLASS
## dbl (12): ID, No_Pation, AGE, Urea, Cr, HbA1c, Chol, TG, HDL, LDL, VLDL, BMI
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
db_map <- read_csv("https://raw.githubusercontent.com/sphill12/data607_final_project/main/diabetes-prevalence_map.csv")
## Rows: 463 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): Entity, Code
## dbl (2): Year, Diabetes prevalence (% of population ages 20 to 79)
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(db_predictors)
## ID No_Pation Gender AGE
## Min. : 1.0 Min. : 123 Length:1000 Min. :20.00
## 1st Qu.:125.8 1st Qu.: 24064 Class :character 1st Qu.:51.00
## Median :300.5 Median : 34396 Mode :character Median :55.00
## Mean :340.5 Mean : 270551 Mean :53.53
## 3rd Qu.:550.2 3rd Qu.: 45384 3rd Qu.:59.00
## Max. :800.0 Max. :75435657 Max. :79.00
## Urea Cr HbA1c Chol
## Min. : 0.500 Min. : 6.00 Min. : 0.900 Min. : 0.000
## 1st Qu.: 3.700 1st Qu.: 48.00 1st Qu.: 6.500 1st Qu.: 4.000
## Median : 4.600 Median : 60.00 Median : 8.000 Median : 4.800
## Mean : 5.125 Mean : 68.94 Mean : 8.281 Mean : 4.863
## 3rd Qu.: 5.700 3rd Qu.: 73.00 3rd Qu.:10.200 3rd Qu.: 5.600
## Max. :38.900 Max. :800.00 Max. :16.000 Max. :10.300
## TG HDL LDL VLDL
## Min. : 0.30 Min. :0.200 Min. :0.30 Min. : 0.100
## 1st Qu.: 1.50 1st Qu.:0.900 1st Qu.:1.80 1st Qu.: 0.700
## Median : 2.00 Median :1.100 Median :2.50 Median : 0.900
## Mean : 2.35 Mean :1.205 Mean :2.61 Mean : 1.855
## 3rd Qu.: 2.90 3rd Qu.:1.300 3rd Qu.:3.30 3rd Qu.: 1.500
## Max. :13.80 Max. :9.900 Max. :9.90 Max. :35.000
## BMI CLASS
## Min. :19.00 Length:1000
## 1st Qu.:26.00 Class :character
## Median :30.00 Mode :character
## Mean :29.58
## 3rd Qu.:33.00
## Max. :47.75
str(db_predictors)
## spc_tbl_ [1,000 × 14] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ ID : num [1:1000] 502 735 420 680 504 634 721 421 670 759 ...
## $ No_Pation: num [1:1000] 17975 34221 47975 87656 34223 ...
## $ Gender : chr [1:1000] "F" "M" "F" "F" ...
## $ AGE : num [1:1000] 50 26 50 50 33 45 50 48 43 32 ...
## $ Urea : num [1:1000] 4.7 4.5 4.7 4.7 7.1 2.3 2 4.7 2.6 3.6 ...
## $ Cr : num [1:1000] 46 62 46 46 46 24 50 47 67 28 ...
## $ HbA1c : num [1:1000] 4.9 4.9 4.9 4.9 4.9 4 4 4 4 4 ...
## $ Chol : num [1:1000] 4.2 3.7 4.2 4.2 4.9 2.9 3.6 2.9 3.8 3.8 ...
## $ TG : num [1:1000] 0.9 1.4 0.9 0.9 1 1 1.3 0.8 0.9 2 ...
## $ HDL : num [1:1000] 2.4 1.1 2.4 2.4 0.8 1 0.9 0.9 2.4 2.4 ...
## $ LDL : num [1:1000] 1.4 2.1 1.4 1.4 2 1.5 2.1 1.6 3.7 3.8 ...
## $ VLDL : num [1:1000] 0.5 0.6 0.5 0.5 0.4 0.4 0.6 0.4 1 1 ...
## $ BMI : num [1:1000] 24 23 24 24 21 21 24 24 21 24 ...
## $ CLASS : chr [1:1000] "N" "N" "N" "N" ...
## - attr(*, "spec")=
## .. cols(
## .. ID = col_double(),
## .. No_Pation = col_double(),
## .. Gender = col_character(),
## .. AGE = col_double(),
## .. Urea = col_double(),
## .. Cr = col_double(),
## .. HbA1c = col_double(),
## .. Chol = col_double(),
## .. TG = col_double(),
## .. HDL = col_double(),
## .. LDL = col_double(),
## .. VLDL = col_double(),
## .. BMI = col_double(),
## .. CLASS = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
str(db_map)
## spc_tbl_ [463 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Entity : chr [1:463] "Afghanistan" "Afghanistan" "Albania" "Albania" ...
## $ Code : chr [1:463] "AFG" "AFG" "ALB" "ALB" ...
## $ Year : num [1:463] 2011 2021 2011 2021 2011 ...
## $ Diabetes prevalence (% of population ages 20 to 79): num [1:463] 7.6 10.9 2.8 10.2 7 7.1 20.3 5.4 9.7 2.9 ...
## - attr(*, "spec")=
## .. cols(
## .. Entity = col_character(),
## .. Code = col_character(),
## .. Year = col_double(),
## .. `Diabetes prevalence (% of population ages 20 to 79)` = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
sapply(db_predictors,function(x) sum(is.na(x)))
## ID No_Pation Gender AGE Urea Cr HbA1c Chol
## 0 0 0 0 0 0 0 0
## TG HDL LDL VLDL BMI CLASS
## 0 0 0 0 0 0
sapply(db_map,function(x) sum(is.na(x)))
## Entity
## 0
## Code
## 27
## Year
## 0
## Diabetes prevalence (% of population ages 20 to 79)
## 0
missing_val <- db_map %>%
filter(is.na(Code) == TRUE)
missing_val %>% arrange(desc(`Diabetes prevalence (% of population ages 20 to 79)`))
## # A tibble: 27 × 4
## Entity Code Year Diabetes prevalence (% of pop…¹
## <chr> <chr> <dbl> <dbl>
## 1 Middle East and North Africa (WB) <NA> 2021 12.9
## 2 South Asia (WB) <NA> 2021 12.3
## 3 Middle East and North Africa (WB) <NA> 2011 12.3
## 4 Lower-middle-income countries <NA> 2021 10.5
## 5 Middle-income countries <NA> 2021 10.4
## 6 North America (WB) <NA> 2021 10.4
## 7 Upper-middle-income countries <NA> 2021 10.3
## 8 Latin America and Caribbean (WB) <NA> 2011 10.3
## 9 Latin America and Caribbean (WB) <NA> 2021 9.87
## 10 East Asia and Pacific (WB) <NA> 2021 9.87
## # ℹ 17 more rows
## # ℹ abbreviated name: ¹`Diabetes prevalence (% of population ages 20 to 79)`
db_map <- db_map %>% filter(is.na(Code) == FALSE)
sapply(db_map,function(x) sum(is.na(x)))
## Entity
## 0
## Code
## 0
## Year
## 0
## Diabetes prevalence (% of population ages 20 to 79)
## 0
colnames(db_predictors) <- c("id", "num_patient", "gender", "age", "urea", "creatinine_ratio", "hba1c" , "chol", "triglycerides", "hdl", "ldl", "vldl", "bmi", "class")
colnames(db_map) <- c("country", "code", "year", "db_prevalence")
db_predictors <- subset(db_predictors, select = -c(id, num_patient))
head(db_predictors)
## # A tibble: 6 × 12
## gender age urea creatinine_ratio hba1c chol triglycerides hdl ldl
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 F 50 4.7 46 4.9 4.2 0.9 2.4 1.4
## 2 M 26 4.5 62 4.9 3.7 1.4 1.1 2.1
## 3 F 50 4.7 46 4.9 4.2 0.9 2.4 1.4
## 4 F 50 4.7 46 4.9 4.2 0.9 2.4 1.4
## 5 M 33 7.1 46 4.9 4.9 1 0.8 2
## 6 F 45 2.3 24 4 2.9 1 1 1.5
## # ℹ 3 more variables: vldl <dbl>, bmi <dbl>, class <chr>
ggplot(data = db_predictors, aes(x = gender)) + geom_bar()
ggplot(data = db_predictors, aes(x = class)) + geom_bar()
db_predictors$gender <- gsub("f", "F", db_predictors$gender)
gender: Diabetes is more common in men than women, so this may be a useful predictor
Urea: A waste product that is excreted by kidneys. Urea is often higher in diabetes patients
Creatinine ratio: Creatinine is a waste product in urine that comes from energy-producing processes in muscles. Higher creatinine is an indicator of diabetes
hba1c: An HbA1c test is a blood test that shows average blood sugar levels over the previous 3 months. A high blood sugar level is a good indicator of diabetes, and this test is normally used to screen for it
cholesterol: Diabetes will often increase bad cholesterol levels (LDL cholesterol), while decreasing good cholesterol (hdl cholesterol) levels
triglycerides: Triglycerides are a type of fat in the blood. High triglycerides are common in diabetes patients
VLDL Test: Stands for very low density lipoprotein test. Lipoproteins are made up of cholesterol, triglycerides and proteins. They move these substances around the body. VLDL contain a high amount of triglycerides and are considered a bad cholesterol
BMI: Body Mass Index. A higher BMI is associated with an increased probability of diabetes, especially if the individual is at obese levels
hist_grid_predictors <- melt(db_predictors[,(1:12)])
## Using gender, class as id variables
head(hist_grid_predictors)
## gender class variable value
## 1 F N age 50
## 2 M N age 26
## 3 F N age 50
## 4 F N age 50
## 5 M N age 33
## 6 F N age 45
hist_grid_predictors1 <- melt(db_predictors[,c(2,3,12)])
## Using class as id variables
hist_grid_predictors2 <- melt(db_predictors[,c(4,5,12)])
## Using class as id variables
hist_grid_predictors3 <- melt(db_predictors[,c(6,7,12)])
## Using class as id variables
hist_grid_predictors4 <- melt(db_predictors[,c(8,9,12)])
## Using class as id variables
hist_grid_predictors5 <- melt(db_predictors[,c(10,11,12)])
## Using class as id variables
head(hist_grid_predictors1)
## class variable value
## 1 N age 50
## 2 N age 26
## 3 N age 50
## 4 N age 50
## 5 N age 33
## 6 N age 45
ggplot(hist_grid_predictors1, aes(x = value,color = hist_grid_predictors1$class)) +
facet_wrap(~variable, scales = "free_x") + geom_histogram(fill = "white")
## Warning: Use of `hist_grid_predictors1$class` is discouraged.
## ℹ Use `class` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hist_grid_predictors2, aes(x = value,color = hist_grid_predictors2$class)) +
facet_wrap(~variable, scales = "free_x") + geom_histogram(fill = "white")
## Warning: Use of `hist_grid_predictors2$class` is discouraged.
## ℹ Use `class` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hist_grid_predictors3, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_histogram(fill = "white")
## Warning: Use of `hist_grid_predictors3$class` is discouraged.
## ℹ Use `class` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hist_grid_predictors4, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_histogram(fill = "white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hist_grid_predictors5, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_histogram(fill = "white")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(hist_grid_predictors1, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_boxplot(fill = "white")
ggplot(hist_grid_predictors2, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_boxplot(fill = "white")
ggplot(hist_grid_predictors3, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_boxplot(fill = "white")
## Warning: Use of `hist_grid_predictors3$class` is discouraged.
## ℹ Use `class` instead.
ggplot(hist_grid_predictors4, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_boxplot(fill = "white")
ggplot(hist_grid_predictors5, aes(x = value,color = hist_grid_predictors3$class)) +
facet_wrap(~variable, scales = "free_x") + geom_boxplot(fill = "white")
db_map %>%
ggplot(aes(x = db_prevalence)) +
geom_histogram(fill = "white", col = I("blue"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
db_map %>%
ggplot(aes(y = db_prevalence)) + geom_boxplot()
corrplot(cor(db_predictors[,2:11]), method = "color", order = "alphabet")
high_db <- db_map %>%
filter(db_prevalence >20)
high_db[order(high_db$db_prevalence, decreasing = TRUE),]
## # A tibble: 16 × 4
## country code year db_prevalence
## <chr> <chr> <dbl> <dbl>
## 1 Pakistan PAK 2021 30.8
## 2 Kiribati KIR 2011 25.3
## 3 French Polynesia PYF 2021 25.2
## 4 Kuwait KWT 2021 24.9
## 5 Nauru NRU 2021 23.4
## 6 New Caledonia NCL 2021 23.4
## 7 Northern Mariana Islands MNP 2021 23.4
## 8 Marshall Islands MHL 2021 23
## 9 Mauritius MUS 2021 22.6
## 10 Kiribati KIR 2021 22.1
## 11 Marshall Islands MHL 2011 21.8
## 12 Egypt EGY 2021 20.9
## 13 Kuwait KWT 2011 20.7
## 14 Nauru NRU 2011 20.4
## 15 American Samoa ASM 2021 20.3
## 16 Tuvalu TUV 2021 20.3
db_map %>%
filter(year != 2000) %>%
arrange(db_prevalence)
## # A tibble: 418 × 4
## country code year db_prevalence
## <chr> <chr> <dbl> <dbl>
## 1 Benin BEN 2021 1.1
## 2 Gambia GMB 2021 1.9
## 3 Mali MLI 2011 1.9
## 4 Benin BEN 2011 2
## 5 Gambia GMB 2011 2
## 6 Burkina Faso BFA 2021 2.1
## 7 Cape Verde CPV 2021 2.1
## 8 Cote d'Ivoire CIV 2021 2.1
## 9 Guinea GIN 2021 2.1
## 10 Guinea-Bissau GNB 2021 2.1
## # ℹ 408 more rows
wide_db_map <- db_map %>%
filter(year != 2000) %>%
pivot_wider(names_from = year, values_from = c(db_prevalence))
colnames(wide_db_map)[c(3,4)] <- c("year11", "year21")
wide_db_map$percent_change <-wide_db_map$year21 - wide_db_map$year11
wide_db_map %>% filter(wide_db_map$percent_change > 5) %>% arrange(desc(percent_change))
## # A tibble: 17 × 5
## country code year11 year21 percent_change
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Pakistan PAK 7.9 30.8 22.9
## 2 French Polynesia PYF 8.6 25.2 16.6
## 3 New Caledonia NCL 8.6 23.4 14.8
## 4 Guam GUM 8.6 19.1 10.5
## 5 Sudan SDN 8.6 18.9 10.3
## 6 Tanzania TZA 2.8 12.3 9.5
## 7 Papua New Guinea PNG 7.6 16.7 9.1
## 8 Mauritius MUS 14.8 22.6 7.8
## 9 Saint Kitts and Nevis KNA 8.5 16.1 7.6
## 10 Albania ALB 2.8 10.2 7.4
## 11 Zambia ZMB 4.8 11.9 7.1
## 12 Malaysia MYS 12.1 19 6.9
## 13 Fiji FJI 10.9 17.7 6.8
## 14 Turkey TUR 7.9 14.5 6.6
## 15 Palau PLW 11.1 17 5.9
## 16 Indonesia IDN 5.1 10.6 5.5
## 17 Nepal NPL 3.6 8.7 5.1
wide_db_map %>% arrange(percent_change)
## # A tibble: 212 × 5
## country code year11 year21 percent_change
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Lebanon LBN 19.6 8 -11.6
## 2 Bahrain BHR 19.5 11.3 -8.2
## 3 Aruba ABW 12.4 4.3 -8.1
## 4 Zimbabwe ZWE 9.8 2.1 -7.7
## 5 Botswana BWA 10.8 5.2 -5.6
## 6 Libya LBY 13.8 8.7 -5.1
## 7 Guyana GUY 16.7 11.7 -5
## 8 Gabon GAB 10.3 5.5 -4.8
## 9 Jamaica JAM 15.7 11.1 -4.6
## 10 Yemen YEM 9.6 5.4 -4.2
## # ℹ 202 more rows
db_map21 <- db_map %>%
filter(year == 2021)
country_names <- countrycode(db_map21$code, "iso3c", "country.name")
## Warning: Some values were not matched unambiguously: OWID_CIS, OWID_WRL
head(country_names)
## [1] "Afghanistan" "Albania" "Algeria" "American Samoa"
## [5] "Andorra" "Angola"
db_map21 <- cbind(db_map21, map_country = country_names)
db_map21$map_country <- gsub("United States", "USA", db_map21$map_country)
db_map21$map_country <- gsub("Congo - Kinshasa", "Democratic Republic of the Congo", db_map21$map_country)
db_map21$map_country <- gsub("Congo - Brazzaville", "Republic of Congo", db_map21$map_country)
db_map21$map_country[db_map21$code == "CIV"] <- "Ivory Coast"
db_map21$map_country[db_map21$code == "MMR"] <- "Myanmar"
db_map21$map_country[db_map21$code == "CZE"] <- "Czech Republic"
world_map <- map_data("world")
world_map <- subset(world_map, region != "Antarctica")
ggplot(db_map21) +
geom_map(dat = world_map, map = world_map, aes(map_id = region,),
fill = "white", color = "#7f7f7f", size = 0.25
) +
geom_map(map = world_map, aes(map_id = db_map21$map_country, fill = db_map21$db_prevalence), size = 0.25) +
scale_fill_gradient(low = "white", high = "red", name = "diabetes") +
expand_limits(x = world_map$long, y = world_map$lat)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Use of `db_map21$map_country` is discouraged.
## ℹ Use `map_country` instead.
## Warning: Use of `db_map21$db_prevalence` is discouraged.
## ℹ Use `db_prevalence` instead.
db_map21 <- merge(x = db_map21, y = wide_db_map[,c("country", "percent_change")], by = "country")
world_map2 <- map_data("world")
world_map2 <- subset(world_map, region != "Antarctica")
ggplot(db_map21) +
geom_map(dat = world_map, map = world_map, aes(map_id = region,),
fill = "white", color = "#7f7f7f", size = 0.25
) +
geom_map(map = world_map, aes(map_id = db_map21$map_country, fill = db_map21$percent_change), size = 0.25) +
scale_fill_gradient2(low = "blue", high = "red",mid = "white", name = "diabetes") +
expand_limits(x = world_map$long, y = world_map$lat)
## Warning: Use of `db_map21$map_country` is discouraged.
## ℹ Use `map_country` instead.
## Warning: Use of `db_map21$percent_change` is discouraged.
## ℹ Use `percent_change` instead.
ml_data <- db_predictors %>% filter(class != "P")
ggplot(data = ml_data, aes(x = class)) + geom_bar()
### For the machine learning model to process the data, character values
must be converted to a numeric format. 1’s refer to datapoints with
positive diabetes, and data from men
ml_data$class <- ifelse(ml_data$class == "N",0,1)
ml_data$gender <- ifelse(ml_data$gender == "F", 0, 1)
split_index <- sample(2, nrow(ml_data), replace = T, prob = c(0.7, 0.3))
split_index
## [1] 1 2 1 2 2 1 1 2 1 1 2 1 1 1 1 2 1 1 1 2 2 1 1 2 1 2 1 1 1 1 2 2 1 2 1 1 2
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2 2 1 1 1 1 1 2 1 2 2 2 1 2 1 2 1
## [75] 1 1 1 1 1 1 1 1 1 2 1 1 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 2 1 2 2 1 1 1 2
## [112] 1 1 2 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 2 1 2 1 1 2 2 2 1 1 1 1 1 2 1 1 1
## [149] 1 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1
## [186] 1 1 1 2 2 1 1 2 1 2 1 1 1 1 1 1 2 1 1 1 2 1 1 1 1 1 1 1 1 1 2 1 1 2 2 1 2
## [223] 2 1 1 1 1 1 1 2 1 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 2 2 2 1 1 1 1 1 2 1 1 1
## [260] 2 2 2 1 2 1 1 1 1 1 1 2 1 1 1 2 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2
## [297] 2 1 1 2 2 1 2 2 1 1 1 1 1 1 1 1 2 1 1 2 2 1 1 2 2 1 1 2 1 1 2 1 1 2 1 1 2
## [334] 2 1 1 1 1 1 2 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2 1 1 2 1 1 1 1
## [371] 1 1 2 1 1 2 2 1 1 2 1 2 1 2 1 2 1 1 1 1 2 1 2 2 1 1 1 1 1 2 2 1 2 1 1 1 2
## [408] 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1
## [445] 2 2 2 1 1 1 1 1 1 1 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1 1 1 2 1
## [482] 2 1 1 2 1 1 1 1 2 2 1 1 2 1 2 1 1 1 2 1 1 1 1 1 1 1 1 2 1 1 1 2 1 2 1 1 2
## [519] 1 2 1 1 1 1 1 1 2 1 2 1 2 1 1 2 1 2 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1 1 1 2 1
## [556] 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 2 2 2 1 2 1 1 1 2 1 1 1
## [593] 2 1 1 2 1 1 2 1 1 1 1 1 1 2 1 2 1 1 1 1 1 2 1 2 2 2 2 1 2 2 2 1 1 1 1 2 1
## [630] 1 2 2 1 2 1 1 2 2 2 1 1 2 2 1 1 1 2 1 1 1 2 2 1 1 2 2 2 1 1 1 2 2 1 2 1 1
## [667] 2 1 1 1 1 1 1 1 1 1 2 1 1 1 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1
## [704] 2 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 2 2 2 1 2 2 1 2 1 1 2 1 2 1 1 2 2 1 1
## [741] 2 1 1 1 1 1 2 1 1 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2 1 1 1
## [778] 1 1 2 1 2 1 1 2 1 1 1 1 2 2 1 1 1 2 1 1 1 1 1 1 1 1 1 1 2 1 1 2 1 1 1 1 2
## [815] 2 1 1 1 1 1 1 1 1 2 2 1 1 1 1 2 1 2 1 2 1 1 2 1 1 1 2 2 1 1 1 1 2 1 1 1 1
## [852] 2 2 2 1 1 2 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 1 1 1 2 1 1 2 1 1 2 2 1 1
## [889] 1 1 1 2 1 1 2 2 1 1 1 2 2 1 2 1 1 1 2 2 2 1 1 1 2 1 1 2 1 1 1 1 2 1 1 2 1
## [926] 1 1 1 1 2 2 1 2 1 2 1 1 2 2 1 2 1 1 1 1 1 1
train <- ml_data[split_index == 1,]
test <- ml_data[split_index == 2,]
important terms:
mtry: The number of variables that are randomly sampled for each split. How many input features a decision tree has available at one time.
ntree: The amount of decision trees that are used in total
improve: The degree that the model has to improve for the process to keep running
train$class <- as.factor(train$class)
tuneRF(train[-12], train$class, ntreeTry = 2001, stepFactor = 1.5, improve = 0.01, trace = TRUE, plot = TRUE)
## mtry = 3 OOB error = 0.9%
## Searching left ...
## mtry = 2 OOB error = 1.05%
## -0.1666667 0.01
## Searching right ...
## mtry = 4 OOB error = 0.75%
## 0.1666667 0.01
## mtry = 6 OOB error = 0.75%
## 0 0.01
## mtry OOBError
## 2.OOB 2 0.010526316
## 3.OOB 3 0.009022556
## 4.OOB 4 0.007518797
## 6.OOB 6 0.007518797
rf_model <- randomForest(class ~ ., data = train, ntree = 2001, replace= TRUE, mtry = 4, importance = TRUE)
plot(rf_model, ylim=c(0,0.36))
legend('topright', colnames(rf_model$err.rate), col=1:3, fill=1:3)
### Extract the variable importance from our random forest model
importance <- importance(rf_model)
var_imporance <- data.frame(Variables = row.names(importance), importance = importance[,"MeanDecreaseGini"])
var_importance <- var_imporance[order(-var_imporance$importance),]
var_importance[,1:2]
## Variables importance
## hba1c hba1c 48.8271051
## bmi bmi 37.1867750
## age age 12.5633920
## chol chol 11.8895067
## vldl vldl 6.6013081
## triglycerides triglycerides 6.3555604
## ldl ldl 2.7515150
## urea urea 2.0132168
## creatinine_ratio creatinine_ratio 1.5558246
## hdl hdl 1.2905552
## gender gender 0.3433249
ggplot(var_imporance, aes(x = reorder(Variables, importance), y = importance, fill = importance)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 22.5))
### hba1c is the most important term for evaluating whether or not a
patient has diabetes.This makes sense as hba1c tests are used
specifically to test for diabetes. Our model has a predicted error rate
of 0.75%, with a higher rate of false positives than false negatives.
Certain variables such as gender, creatinine ratio , hdl, urea, and ldl
are extremely unimportant to the model
rf_model
##
## Call:
## randomForest(formula = class ~ ., data = train, ntree = 2001, replace = TRUE, mtry = 4, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2001
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 1.05%
## Confusion matrix:
## 0 1 class.error
## 0 72 2 0.027027027
## 1 5 586 0.008460237
predicting1 <- data.frame(test$class, predict(rf_model, test))
sum(predicting1$predict.rf_model..test. == predicting1$test.class, na.rm = T)/ nrow(predicting1) * 100
## [1] 99.64539
tuneRF(train[,c(1:4,6:11)], train$class, ntreeTry = 2001, stepFactor = 1.5, improve = 0.01, trace = TRUE, plot = TRUE)
## mtry = 3 OOB error = 2.26%
## Searching left ...
## mtry = 2 OOB error = 2.71%
## -0.2 0.01
## Searching right ...
## mtry = 4 OOB error = 2.11%
## 0.06666667 0.01
## mtry = 6 OOB error = 1.95%
## 0.07142857 0.01
## mtry = 9 OOB error = 1.95%
## 0 0.01
## mtry OOBError
## 2.OOB 2 0.02706767
## 3.OOB 3 0.02255639
## 4.OOB 4 0.02105263
## 6.OOB 6 0.01954887
## 9.OOB 9 0.01954887
model2 <- randomForest(class ~ ., data = train[,c(1:4,6:12)], ntree = 2001, replace= TRUE, mtry = 3, importance = TRUE)
plot(model2, ylim=c(0,0.36))
legend('topright', colnames(rf_model$err.rate), col=1:3, fill=1:3)
### Our expected error rate is 2.26%
model2
##
## Call:
## randomForest(formula = class ~ ., data = train[, c(1:4, 6:12)], ntree = 2001, replace = TRUE, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2001
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 2.26%
## Confusion matrix:
## 0 1 class.error
## 0 67 7 0.09459459
## 1 8 583 0.01353638
model_2_predict <- predict(model2, test)
predicting2 <- data.frame(test$class, predict(model2, test))
sum(predicting2$test.class == predicting2$predict.model2..test., na.rm = T)/ nrow(predicting2) * 100
## [1] 99.29078
importance_model2 <- importance(model2)
var_imporance_model_2 <- data.frame(Variables = row.names(importance_model2), importance = importance_model2[,"MeanDecreaseGini"])
var_importance_model_2 <- var_imporance_model_2[order(-var_imporance_model_2$importance),]
var_importance_model_2 [,1:2]
## Variables importance
## bmi bmi 55.999370
## age age 25.035208
## chol chol 13.965903
## triglycerides triglycerides 9.802536
## vldl vldl 8.514498
## urea urea 4.657078
## ldl ldl 4.255268
## creatinine_ratio creatinine_ratio 4.215448
## hdl hdl 3.310978
## gender gender 1.166141
ggplot(var_importance_model_2, aes(x = reorder(Variables, importance), y = importance, fill = importance)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 22.5))
model_result <- data.frame(test$class, predict(model2, test[,1:11], type = "response"))
model3 <- randomForest(class ~ ., data = train[,c(1,2,11,12)], ntree = 2001, replace= TRUE, mtry = 3, importance = TRUE)
The error rate for non-diabetic predictions once again increases,while the diabetic data is relatively unaffected.
plot(model3, ylim=c(0,0.36))
legend('topright', colnames(model3$err.rate), col=1:3, fill=1:3)
### The estimate for error is about 7%. Even with just simple data such
as bmi, age, and gender the model is still predicted to be extremely
accurate
model3
##
## Call:
## randomForest(formula = class ~ ., data = train[, c(1, 2, 11, 12)], ntree = 2001, replace = TRUE, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 2001
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 6.92%
## Confusion matrix:
## 0 1 class.error
## 0 53 21 0.28378378
## 1 25 566 0.04230118
importance_model3 <- importance(model3)
var_imporance_model_3 <- data.frame(Variables = row.names(importance_model3), importance = importance_model3[,"MeanDecreaseGini"])
var_importance_model_3 <- var_imporance_model_3[order(-var_imporance_model_3$importance),]
var_importance_model_3 [,1:2]
## Variables importance
## bmi bmi 94.911731
## age age 25.267639
## gender gender 4.386675
ggplot(var_importance_model_3, aes(x = reorder(Variables, importance), y = importance, fill = importance)) + geom_bar(stat = "identity") + theme(axis.text.x = element_text(angle = 22.5))
model_3_predict <- predict(model3, test)
predicting3 <- data.frame(test$class, predict(model3, test))
sum(predicting3$test.class == predicting3$predict.model3..test., na.rm = T)/ nrow(predicting2) * 100
## [1] 95.03546
smote