#Haris Javed 
#Assignment # 3 
#loading packages 
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.2.1     v purrr   0.3.3
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   1.0.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0
## -- Conflicts --------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(fastDummies)
library(corrplot)
## corrplot 0.84 loaded
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(haven)
library(BaylorEdPsych)
library(mvnmle)
library(naniar)
Sys.sleep(time = 4)
# Import the Framingham week-3 CSV; its first row supplies the column names.
framingham <- read.csv(file = "framingham_week3.csv", header = TRUE)
Sys.sleep(time = 4)
Sys.sleep(time = 4)
# Print a compact column-by-column overview of the imported data frame.
glimpse(framingham)
## Observations: 4,240
## Variables: 16
## $ gender          <fct> Male, Female, Male, Female, Female, Female, Female,...
## $ age             <int> 39, 46, 48, 61, 46, 43, 63, 45, 52, 43, 50, 43, 46,...
## $ education       <int> 4, 2, 1, 3, 3, 2, 1, 2, 1, 1, 1, 2, 1, 3, 2, 2, 3, ...
## $ currentSmoker   <int> 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, ...
## $ cigsPerDay      <int> 0, 0, 20, 30, 23, 0, 0, 20, 0, 30, 0, 0, 15, 0, 9, ...
## $ BPMeds          <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ prevalentStroke <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ prevalentHyp    <int> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, ...
## $ diabetes        <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ totChol         <int> 195, 250, 245, 225, 285, 228, 205, 313, 260, 225, 2...
## $ sysBP           <dbl> 106.0, 121.0, 127.5, 150.0, 130.0, 180.0, 138.0, 10...
## $ diaBP           <dbl> 70.0, 81.0, 80.0, 95.0, 84.0, 110.0, 71.0, 71.0, 89...
## $ BMI             <dbl> 26.97, 28.73, 25.34, 28.58, 23.10, 30.30, 33.11, 21...
## $ heartRate       <int> 80, 95, 75, 65, 85, 77, 60, 79, 76, 93, 75, 72, 98,...
## $ glucose         <int> 77, 76, 70, 103, 85, 99, 85, 78, 79, 88, 76, 61, 64...
## $ TenYearCHD      <int> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
Sys.sleep(time = 4)
# Quick NA check: count every missing cell in the data frame and report the total.
sprintf("%d NA's found on your dataframe", sum(is.na(framingham)))
## [1] "645 NA's found on your dataframe"
Sys.sleep(4)
# Reshape the six columns of interest into long format: one row per
# (variable, value) pair. pivot_longer() replaces the superseded gather();
# only the row order of the long table differs, and the grouped summary
# below does not depend on row order.
framingham_long <- framingham %>%
  select(glucose, BPMeds, totChol, cigsPerDay, BMI, heartRate) %>%
  pivot_longer(cols = everything(), names_to = "variable", values_to = "value")
# Per variable: number of observations, number of missing values, and the
# percentage of values that are missing (rounded to two decimals).
framingham_Prop <- framingham_long %>%
  group_by(variable) %>%
  summarise(
    NumObs = length(value),
    MissingValues = sum(is.na(value)),
    Prop_missing = round(MissingValues / NumObs * 100, 2)
  )
# Sort from most to least missing. desc() accepts a single vector, so each
# sort key gets its own desc() call; passing two columns to one desc() call
# fails with "unused argument" on current dplyr releases.
framingham_Prop_sorted <- as.data.frame(
  arrange(framingham_Prop, desc(MissingValues), desc(Prop_missing))
)
framingham_Prop_sorted
##     variable NumObs MissingValues Prop_missing
## 1    glucose   4240           388         9.15
## 2     BPMeds   4240            53         1.25
## 3    totChol   4240            50         1.18
## 4 cigsPerDay   4240            29         0.68
## 5        BMI   4240            19         0.45
## 6  heartRate   4240             1         0.02
# Bar chart with two traces per variable: total observation count and missing-value count.
bar_graph_comp <- plot_ly(framingham_Prop) %>%
  add_trace(x = ~variable, y = ~NumObs, type = 'bar', name = "Total_counts") %>%
  add_trace(x = ~variable, y = ~MissingValues, type = 'bar', name = "Missing_counts")
bar_graph_comp
# Sleep timer to slow down the visualisations.
Sys.sleep(time = 4)
# Donut chart of the missing-value counts: it shows what share of all missing values
# belongs to each variable (for example, glucose accounts for about 72% of them).
donut_graph_comp <- plot_ly(framingham_Prop, labels = ~variable, values = ~MissingValues) %>%
  add_pie(hole = 0.6)
Sys.sleep(time = 4)
donut_graph_comp
Sys.sleep(time = 4)
Sys.sleep(time = 4)
# Missing-data patterns: restrict the data to the columns that contain missing values.
framingham_missing <- select(framingham, glucose, BPMeds, totChol, cigsPerDay, BMI, heartRate)
Sys.sleep(time = 4)
# Upset plot of missingness: shows which variables are missing, and which are missing together.
# (If R does not render the plot, please refer to the graph itself.)
graph_miss <- framingham_missing %>% gg_miss_upset()
Sys.sleep(time = 4)
# Display the co-missingness plot. Author's note: this is the only graph that was
# really needed, so some of the earlier code could be cleaned up.
graph_miss

Sys.sleep(time = 4)
# Turn framingham_missing into a table of missingness indicators (1 = NA, 0 = observed).
framingham_NAS_variables <- framingham_missing %>%
  is.na() %>%
  abs() %>%
  as.data.frame()
Sys.sleep(time = 4)
# Correlate the missingness indicators with each other to see which variables tend to be missing together.
cor_matrix_for_NAS <- framingham_NAS_variables %>%
  cor(use = 'pairwise.complete') %>%
  as.data.frame()
cor_matrix_for_NAS
##                 glucose       BPMeds      totChol   cigsPerDay          BMI
## glucose     1.000000000 -0.020982745  0.268422500  0.013360827  0.039940720
## BPMeds     -0.020982745  1.000000000  0.007374213 -0.009336688 -0.007548412
## totChol     0.268422500  0.007374213  1.000000000 -0.009065346  0.025381624
## cigsPerDay  0.013360827 -0.009336688 -0.009065346  1.000000000 -0.005567696
## BMI         0.039940720 -0.007548412  0.025381624 -0.005567696  1.000000000
## heartRate  -0.004874623 -0.001728044 -0.001677824 -0.001274602 -0.001030475
##               heartRate
## glucose    -0.004874623
## BPMeds     -0.001728044
## totChol    -0.001677824
## cigsPerDay -0.001274602
## BMI        -0.001030475
## heartRate   1.000000000
Sys.sleep(time = 4)
# Reading the matrix above: the only missingness indicators with a noticeable
# correlation are glucose and total cholesterol.
Sys.sleep(time = 4)
# Recode the categorical columns as factors so that dummy variables can be coded for them.
# All seven columns are converted in a single pass over their names.
factor_columns <- c(
  "gender", "education", "BPMeds", "prevalentHyp",
  "prevalentStroke", "diabetes", "TenYearCHD"
)
framingham[factor_columns] <- lapply(framingham[factor_columns], as.factor)
Sys.sleep(time = 4)
# Confirm the new column types.
glimpse(framingham)
## Observations: 4,240
## Variables: 16
## $ gender          <fct> Male, Female, Male, Female, Female, Female, Female,...
## $ age             <int> 39, 46, 48, 61, 46, 43, 63, 45, 52, 43, 50, 43, 46,...
## $ education       <fct> 4, 2, 1, 3, 3, 2, 1, 2, 1, 1, 1, 2, 1, 3, 2, 2, 3, ...
## $ currentSmoker   <int> 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, ...
## $ cigsPerDay      <int> 0, 0, 20, 30, 23, 0, 0, 20, 0, 30, 0, 0, 15, 0, 9, ...
## $ BPMeds          <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ prevalentStroke <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ prevalentHyp    <fct> 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, ...
## $ diabetes        <fct> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ totChol         <int> 195, 250, 245, 225, 285, 228, 205, 313, 260, 225, 2...
## $ sysBP           <dbl> 106.0, 121.0, 127.5, 150.0, 130.0, 180.0, 138.0, 10...
## $ diaBP           <dbl> 70.0, 81.0, 80.0, 95.0, 84.0, 110.0, 71.0, 71.0, 89...
## $ BMI             <dbl> 26.97, 28.73, 25.34, 28.58, 23.10, 30.30, 33.11, 21...
## $ heartRate       <int> 80, 95, 75, 65, 85, 77, 60, 79, 76, 93, 75, 72, 98,...
## $ glucose         <int> 77, 76, 70, 103, 85, 99, 85, 78, 79, 88, 76, 61, 64...
## $ TenYearCHD      <fct> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...
# Pause for five seconds after printing the recoded data.
Sys.sleep(time = 5)