#Haris Javed
#Assignment # 3
#loading packages
library(tidyverse)
## Registered S3 methods overwritten by 'ggplot2':
## method from
## [.quosures rlang
## c.quosures rlang
## print.quosures rlang
## Registered S3 method overwritten by 'rvest':
## method from
## read_xml.response xml2
## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.1 v dplyr 0.8.0.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(fastDummies)
## Warning: package 'fastDummies' was built under R version 3.6.2
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.2
## corrplot 0.84 loaded
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(haven)
## Warning: package 'haven' was built under R version 3.6.2
library(BaylorEdPsych)
library(mvnmle)
library(naniar)
## Warning: package 'naniar' was built under R version 3.6.2
# Delay before the next step (the script pauses between steps throughout).
Sys.sleep(4)
# Read the Framingham CSV from the working directory; its first row supplies
# the column names.
framingham <- read.csv(file = "framingham_week3.csv", header = TRUE)
Sys.sleep(4)
Sys.sleep(4)
# Compact overview of the imported data: one line per column with its type
# and leading values.
glimpse(framingham)
## Observations: 4,240
## Variables: 16
## $ gender <fct> Male, Female, Male, Female, Female, Female, Fe...
## $ age <int> 39, 46, 48, 61, 46, 43, 63, 45, 52, 43, 50, 43...
## $ education <int> 4, 2, 1, 3, 3, 2, 1, 2, 1, 1, 1, 2, 1, 3, 2, 2...
## $ currentSmoker <int> 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1...
## $ cigsPerDay <int> 0, 0, 20, 30, 23, 0, 0, 20, 0, 30, 0, 0, 15, 0...
## $ BPMeds <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ prevalentStroke <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ prevalentHyp <int> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1...
## $ diabetes <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ totChol <int> 195, 250, 245, 225, 285, 228, 205, 313, 260, 2...
## $ sysBP <dbl> 106.0, 121.0, 127.5, 150.0, 130.0, 180.0, 138....
## $ diaBP <dbl> 70.0, 81.0, 80.0, 95.0, 84.0, 110.0, 71.0, 71....
## $ BMI <dbl> 26.97, 28.73, 25.34, 28.58, 23.10, 30.30, 33.1...
## $ heartRate <int> 80, 95, 75, 65, 85, 77, 60, 79, 76, 93, 75, 72...
## $ glucose <int> 77, 76, 70, 103, 85, 99, 85, 78, 79, 88, 76, 6...
## $ TenYearCHD <int> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1...
Sys.sleep(4)
# Count the missing cells over the whole data frame and report the total as
# a single message string.
sprintf("%d NA's found on your dataframe", sum(is.na(framingham)))
## [1] "645 NA's found on your dataframe"
Sys.sleep(4)
# Reshape the six selected measurement columns to long format: one row per
# (variable, value) pair, with the column name stored in `variable`.
# NOTE(review): the counts printed below add up to 540 of the 645 NAs
# reported for the whole data frame, so some missing values sit in columns
# that are not selected here -- confirm that leaving them out is intended.
framingham_long <- framingham %>%
  select(glucose, BPMeds, totChol, cigsPerDay, BMI, heartRate) %>%
  gather(key = "variable", value = "value")

# Per variable: number of observations, number of missing values, and the
# percentage of values that are missing (rounded to two decimals).
framingham_Prop <- framingham_long %>%
  group_by(variable) %>%
  summarise(
    NumObs = length(value),
    MissingValues = sum(is.na(value)),
    Prop_missing = round(MissingValues / NumObs * 100, 2)
  )

# Sort from most to least missing. desc() accepts a single vector, so each
# sort key gets its own desc() call; Prop_missing only breaks ties.
framingham_Prop_sorted <- as.data.frame(
  arrange(framingham_Prop, desc(MissingValues), desc(Prop_missing))
)
framingham_Prop_sorted
## variable NumObs MissingValues Prop_missing
## 1 glucose 4240 388 9.15
## 2 BPMeds 4240 53 1.25
## 3 totChol 4240 50 1.18
## 4 cigsPerDay 4240 29 0.68
## 5 BMI 4240 19 0.45
## 6 heartRate 4240 1 0.02
# Bar chart: one bar per variable, bar height = number of missing values.
bar_graph_comp <- add_trace(
  plot_ly(framingham_Prop),
  name = "Missing_counts",
  x = ~variable,
  y = ~MissingValues,
  type = "bar"
)
bar_graph_comp
# Sleep timer to slow down the visualisations.
Sys.sleep(4)
# Donut chart: each slice is a variable's share of the missing values counted
# above (for example, glucose accounts for about 72% of them).
donut_graph_comp <- framingham_Prop %>%
  plot_ly(labels = ~variable, values = ~MissingValues) %>%
  add_pie(hole = 0.6) %>%
  layout(title = "% of missing values by variable")
Sys.sleep(4)
donut_graph_comp
Sys.sleep(4)
Sys.sleep(4)
# Missing-data patterns: restrict the data to the six columns examined above
# for missing values.
framingham_missing <- select(
  framingham,
  glucose, BPMeds, totChol, cigsPerDay, BMI, heartRate
)
Sys.sleep(4)
# Upset plot showing which variables are missing on their own and which are
# missing together. Check the plot pane if R does not display it.
graph_miss <- gg_miss_upset(framingham_missing)
Sys.sleep(4)
# Display the co-missingness plot. This one graph covers what the earlier
# charts show, so some of the code above could be cleaned up.
graph_miss

Sys.sleep(4)
# Turn framingham_missing into 0/1 missingness indicators
# (1 = value is NA, 0 = value is present).
framingham_NAS_variables <- framingham_missing %>%
  is.na() %>%
  abs() %>%
  as.data.frame()
Sys.sleep(4)
# Pairwise correlations between the missingness indicators: a larger value
# means the two variables tend to be missing on the same rows.
cor_matrix_for_NAS <- as.data.frame(
  cor(framingham_NAS_variables, use = "pairwise.complete")
)
cor_matrix_for_NAS
## glucose BPMeds totChol cigsPerDay
## glucose 1.000000000 -0.020982745 0.268422500 0.013360827
## BPMeds -0.020982745 1.000000000 0.007374213 -0.009336688
## totChol 0.268422500 0.007374213 1.000000000 -0.009065346
## cigsPerDay 0.013360827 -0.009336688 -0.009065346 1.000000000
## BMI 0.039940720 -0.007548412 0.025381624 -0.005567696
## heartRate -0.004874623 -0.001728044 -0.001677824 -0.001274602
## BMI heartRate
## glucose 0.039940720 -0.004874623
## BPMeds -0.007548412 -0.001728044
## totChol 0.025381624 -0.001677824
## cigsPerDay -0.005567696 -0.001274602
## BMI 1.000000000 -0.001030475
## heartRate -0.001030475 1.000000000
Sys.sleep(4)
# The only pair whose missingness shows a noticeable relationship is glucose
# and total cholesterol (r of about 0.27).
Sys.sleep(4)
# Recode the categorical columns as factors so that dummy variables can be
# coded for them. All seven conversions are done in a single mutate() call.
framingham <- mutate(
  framingham,
  gender = as.factor(gender),
  education = as.factor(education),
  BPMeds = as.factor(BPMeds),
  prevalentHyp = as.factor(prevalentHyp),
  prevalentStroke = as.factor(prevalentStroke),
  diabetes = as.factor(diabetes),
  TenYearCHD = as.factor(TenYearCHD)
)
Sys.sleep(4)
# Confirm the new column types.
glimpse(framingham)
## Observations: 4,240
## Variables: 16
## $ gender <fct> Male, Female, Male, Female, Female, Female, Fe...
## $ age <int> 39, 46, 48, 61, 46, 43, 63, 45, 52, 43, 50, 43...
## $ education <fct> 4, 2, 1, 3, 3, 2, 1, 2, 1, 1, 1, 2, 1, 3, 2, 2...
## $ currentSmoker <int> 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1...
## $ cigsPerDay <int> 0, 0, 20, 30, 23, 0, 0, 20, 0, 30, 0, 0, 15, 0...
## $ BPMeds <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0...
## $ prevalentStroke <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ prevalentHyp <fct> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1...
## $ diabetes <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
## $ totChol <int> 195, 250, 245, 225, 285, 228, 205, 313, 260, 2...
## $ sysBP <dbl> 106.0, 121.0, 127.5, 150.0, 130.0, 180.0, 138....
## $ diaBP <dbl> 70.0, 81.0, 80.0, 95.0, 84.0, 110.0, 71.0, 71....
## $ BMI <dbl> 26.97, 28.73, 25.34, 28.58, 23.10, 30.30, 33.1...
## $ heartRate <int> 80, 95, 75, 65, 85, 77, 60, 79, 76, 93, 75, 72...
## $ glucose <int> 77, 76, 70, 103, 85, 99, 85, 78, 79, 88, 76, 6...
## $ TenYearCHD <fct> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1...
Sys.sleep(5)