# Load the raw medical dataset into `mdata`; every later step in this script
# reads from or writes to this data frame.
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs unchanged on the original author's computer.
mdata <- read.csv(
  file = "C:/Users/nahur/Desktop/WGU/Final Projects/D206 Data Cleaning/medical_raw_data.csv"
)
library(ggplot2)
library(plyr)
library(modeest)
library(visdat)
library(tidyverse)
library(factoextra)
Identifying Duplicates and Missing Values
# Duplicates: total rows minus distinct rows (0 means every record is unique)
nrow(mdata) - nrow(unique(mdata))
## [1] 0
# Missing values: number of NA entries in each column
vapply(mdata, function(column) sum(is.na(column)), integer(1))
## X CaseOrder Customer_id Interaction
## 0 0 0 0
## UID City State County
## 0 0 0 0
## Zip Lat Lng Population
## 0 0 0 0
## Area Timezone Job Children
## 0 0 0 2588
## Age Education Employment Income
## 2414 0 0 2464
## Marital Gender ReAdmis VitD_levels
## 0 0 0 0
## Doc_visits Full_meals_eaten VitD_supp Soft_drink
## 0 0 0 2467
## Initial_admin HighBlood Stroke Complication_risk
## 0 0 0 0
## Overweight Arthritis Diabetes Hyperlipidemia
## 982 0 0 0
## BackPain Anxiety Allergic_rhinitis Reflux_esophagitis
## 0 984 0 0
## Asthma Services Initial_days TotalCharge
## 0 0 1056 0
## Additional_charges Item1 Item2 Item3
## 0 0 0 0
## Item4 Item5 Item6 Item7
## 0 0 0 0
## Item8
## 0
Data Imputation
# Children
# Missing entries are filled with the column median. The mean is printed
# before and after so the effect of the imputation can be compared with the
# two histograms.
hist(mdata$Children, main = "Children (before imputation)")

mean(mdata$Children, na.rm = TRUE)
## [1] 2.098219
mdata$Children <- replace(
  mdata$Children,
  is.na(mdata$Children),
  median(mdata$Children, na.rm = TRUE)
)
mean(mdata$Children, na.rm = TRUE)
## [1] 1.814
hist(mdata$Children, main = "Children (after imputation)")

# Age
# Missing entries are filled with the column median. The mean is printed
# before and after so the effect of the imputation can be compared with the
# two histograms.
hist(mdata$Age, main = "Age (before imputation)")

mean(mdata$Age, na.rm = TRUE)
## [1] 53.29568
mdata$Age <- replace(
  mdata$Age,
  is.na(mdata$Age),
  median(mdata$Age, na.rm = TRUE)
)
mean(mdata$Age, na.rm = TRUE)
## [1] 53.2243
hist(mdata$Age, main = "Age (after imputation)")

# Income
# Missing entries are filled with the column median. The mean is printed
# before and after so the effect of the imputation can be compared with the
# two histograms.
hist(mdata$Income, main = "Income (before imputation)")

mean(mdata$Income, na.rm = TRUE)
## [1] 40484.44
mdata$Income[is.na(mdata$Income)] <- median(mdata$Income, na.rm = TRUE)
mean(mdata$Income, na.rm = TRUE)
## [1] 38872.45
hist(mdata$Income, main = "Income (after imputation)")

# Overweight
# Missing entries are filled with the column median. For a 0/1 flag the
# median is whichever value occurs more often, so this acts as a mode fill.
hist(mdata$Overweight, main = "Overweight (before imputation)")

mean(mdata$Overweight, na.rm = TRUE)
## [1] 0.7091373
mdata$Overweight <- replace(
  mdata$Overweight,
  is.na(mdata$Overweight),
  median(mdata$Overweight, na.rm = TRUE)
)
mean(mdata$Overweight, na.rm = TRUE)
## [1] 0.7377
hist(mdata$Overweight, main = "Overweight (after imputation)")

# Anxiety
# Missing entries are filled with the column median.
# NOTE(review): presumably a 0/1 flag, in which case the median fill behaves
# like a mode fill -- confirm against the data dictionary.
hist(mdata$Anxiety, main = "Anxiety (before imputation)")

mean(mdata$Anxiety, na.rm = TRUE)
## [1] 0.3223159
mdata$Anxiety <- replace(
  mdata$Anxiety,
  is.na(mdata$Anxiety),
  median(mdata$Anxiety, na.rm = TRUE)
)
mean(mdata$Anxiety, na.rm = TRUE)
## [1] 0.2906
hist(mdata$Anxiety, main = "Anxiety (after imputation)")

# Initial_days
# Missing entries are filled with the column median. The mean is printed
# before and after so the effect of the imputation can be compared with the
# two histograms.
hist(mdata$Initial_days, main = "Initial_days (before imputation)")

mean(mdata$Initial_days, na.rm = TRUE)
## [1] 34.43208
mdata$Initial_days <- replace(
  mdata$Initial_days,
  is.na(mdata$Initial_days),
  median(mdata$Initial_days, na.rm = TRUE)
)
mean(mdata$Initial_days, na.rm = TRUE)
## [1] 34.43365
hist(mdata$Initial_days, main = "Initial_days (after imputation)")

# Soft_drink
# Categorical ("No"/"Yes") column, so missing answers are filled with the
# most frequent value. mfv() returns every tied mode; taking the first one
# guarantees a single fill value that is never recycled across the NA rows.
mdata$Soft_drink[is.na(mdata$Soft_drink)] <- mfv(mdata$Soft_drink, na_rm = TRUE)[1]
head(mdata$Soft_drink)
## [1] "No" "No" "No" "No" "Yes" "No"
unique(mdata$Soft_drink)
## [1] "No" "Yes"
# Recode to 0/1. revalue() is documented to take a named character vector,
# so the codes are given as strings and converted to numeric afterwards.
Soft_drink.num <- revalue(mdata$Soft_drink, replace = c("No" = "0", "Yes" = "1"))
mdata$Soft_drink <- as.numeric(Soft_drink.num)
hist(mdata$Soft_drink, main = "Soft_drink (after imputation)")

Check for Missing Values
# Re-count NA entries per column to confirm that imputation left none behind
vapply(mdata, function(column) sum(is.na(column)), integer(1))
## X CaseOrder Customer_id Interaction
## 0 0 0 0
## UID City State County
## 0 0 0 0
## Zip Lat Lng Population
## 0 0 0 0
## Area Timezone Job Children
## 0 0 0 0
## Age Education Employment Income
## 0 0 0 0
## Marital Gender ReAdmis VitD_levels
## 0 0 0 0
## Doc_visits Full_meals_eaten VitD_supp Soft_drink
## 0 0 0 0
## Initial_admin HighBlood Stroke Complication_risk
## 0 0 0 0
## Overweight Arthritis Diabetes Hyperlipidemia
## 0 0 0 0
## BackPain Anxiety Allergic_rhinitis Reflux_esophagitis
## 0 0 0 0
## Asthma Services Initial_days TotalCharge
## 0 0 0 0
## Additional_charges Item1 Item2 Item3
## 0 0 0 0
## Item4 Item5 Item6 Item7
## 0 0 0 0
## Item8
## 0
Ordinal Encoding
# Education
# Recode the twelve education labels to consecutive ordinal ranks 0-11
# (lowest to highest attainment) and store them as numeric.
unique(mdata$Education)
## [1] "Some College, Less than 1 Year"
## [2] "Some College, 1 or More Years, No Degree"
## [3] "GED or Alternative Credential"
## [4] "Regular High School Diploma"
## [5] "Bachelor's Degree"
## [6] "Master's Degree"
## [7] "Nursery School to 8th Grade"
## [8] "9th Grade to 12th Grade, No Diploma"
## [9] "Doctorate Degree"
## [10] "Associate's Degree"
## [11] "Professional School Degree"
## [12] "No Schooling Completed"
# The ranks are contiguous: the earlier mapping jumped from 1 to 3, which
# left an artificial gap in the scale. Values are given as strings because
# revalue() is documented to take a named character vector.
# NOTE(review): "Professional School Degree" is ranked below "Associate's
# Degree" here, as in the original mapping -- confirm this ordering is intended.
education.num <- revalue(
  x = mdata$Education,
  replace = c(
    "No Schooling Completed" = "0",
    "Nursery School to 8th Grade" = "1",
    "9th Grade to 12th Grade, No Diploma" = "2",
    "GED or Alternative Credential" = "3",
    "Regular High School Diploma" = "4",
    "Some College, Less than 1 Year" = "5",
    "Some College, 1 or More Years, No Degree" = "6",
    "Professional School Degree" = "7",
    "Associate's Degree" = "8",
    "Bachelor's Degree" = "9",
    "Master's Degree" = "10",
    "Doctorate Degree" = "11"
  )
)
mdata$Education <- as.numeric(education.num)
str(mdata$Education)
## num [1:10000] 5 6 6 3 4 4 9 10 4 6 ...
# Each Yes/No column below is handled the same way: list its distinct values,
# map "No" -> 0 and "Yes" -> 1, then store the result as numeric.

# ReAdmis
unique(mdata$ReAdmis)
## [1] "No" "Yes"
ReAdmis.num <- mapvalues(mdata$ReAdmis, from = c("No", "Yes"), to = c(0, 1))
mdata$ReAdmis <- as.numeric(ReAdmis.num)

# HighBlood
unique(mdata$HighBlood)
## [1] "Yes" "No"
HighBlood.num <- mapvalues(mdata$HighBlood, from = c("No", "Yes"), to = c(0, 1))
mdata$HighBlood <- as.numeric(HighBlood.num)

# Stroke
unique(mdata$Stroke)
## [1] "No" "Yes"
Stroke.num <- mapvalues(mdata$Stroke, from = c("No", "Yes"), to = c(0, 1))
mdata$Stroke <- as.numeric(Stroke.num)

# Overweight (already numeric 0/1, so it is only inspected)
unique(mdata$Overweight)
## [1] 0 1
str(mdata$Overweight)
## num [1:10000] 0 1 1 0 0 1 1 1 1 1 ...

# Arthritis
unique(mdata$Arthritis)
## [1] "Yes" "No"
Arthritis.num <- mapvalues(mdata$Arthritis, from = c("No", "Yes"), to = c(0, 1))
mdata$Arthritis <- as.numeric(Arthritis.num)

# Diabetes
unique(mdata$Diabetes)
## [1] "Yes" "No"
Diabetes.num <- mapvalues(mdata$Diabetes, from = c("No", "Yes"), to = c(0, 1))
mdata$Diabetes <- as.numeric(Diabetes.num)

# Hyperlipidemia
unique(mdata$Hyperlipidemia)
## [1] "No" "Yes"
Hyperlipidemia.num <- mapvalues(mdata$Hyperlipidemia, from = c("No", "Yes"), to = c(0, 1))
mdata$Hyperlipidemia <- as.numeric(Hyperlipidemia.num)

# BackPain
unique(mdata$BackPain)
## [1] "Yes" "No"
BackPain.num <- mapvalues(mdata$BackPain, from = c("No", "Yes"), to = c(0, 1))
mdata$BackPain <- as.numeric(BackPain.num)

# Allergic_rhinitis
unique(mdata$Allergic_rhinitis)
## [1] "Yes" "No"
Allergic_rhinitis.num <- mapvalues(mdata$Allergic_rhinitis, from = c("No", "Yes"), to = c(0, 1))
mdata$Allergic_rhinitis <- as.numeric(Allergic_rhinitis.num)

# Reflux_esophagitis
unique(mdata$Reflux_esophagitis)
## [1] "No" "Yes"
Reflux_esophagitis.num <- mapvalues(mdata$Reflux_esophagitis, from = c("No", "Yes"), to = c(0, 1))
mdata$Reflux_esophagitis <- as.numeric(Reflux_esophagitis.num)

# Asthma
unique(mdata$Asthma)
## [1] "Yes" "No"
Asthma.num <- mapvalues(mdata$Asthma, from = c("No", "Yes"), to = c(0, 1))
mdata$Asthma <- as.numeric(Asthma.num)
Boxplots for All Quantitative Variables
# One boxplot per quantitative variable. Each call draws the plot and also
# keeps boxplot()'s returned summary (including the outlier values in `$out`)
# under a matching name for later inspection.
children_boxplot          <- boxplot(mdata[["Children"]], main = "Children")

age_boxplot               <- boxplot(mdata[["Age"]], main = "Age")

income_boxplot            <- boxplot(mdata[["Income"]], main = "Income")

vit_d_levels_boxplot      <- boxplot(mdata[["VitD_levels"]], main = "VitD_levels")

vit_d_supp_boxplot        <- boxplot(mdata[["VitD_supp"]], main = "VitD_Supp")

doc_visits_boxplot        <- boxplot(mdata[["Doc_visits"]], main = "Doc_visits")

full_meals_boxplot        <- boxplot(mdata[["Full_meals_eaten"]], main = "Full_meals_eaten")

initial_days_boxplot      <- boxplot(mdata[["Initial_days"]], main = "Initial_days")

total_charge_boxplot      <- boxplot(mdata[["TotalCharge"]], main = "TotalCharge")

additional_charge_boxplot <- boxplot(mdata[["Additional_charges"]], main = "Additional_charges")

Principal Component Analysis
# Create dataframe
# The ten quantitative variables are selected by name rather than by column
# position, so the selection still holds if columns are added or reordered.
pca_vars <- c(
  "Children", "Age", "Income",
  "VitD_levels", "Doc_visits", "Full_meals_eaten", "VitD_supp",
  "Initial_days", "TotalCharge", "Additional_charges"
)
pca_input <- mdata[, pca_vars]
# Normalize the data and run the PCA: prcomp() centers each variable and
# scales it to unit variance before extracting the components. The argument
# is spelled `scale.` in full instead of relying on partial matching.
mdata_pca <- prcomp(pca_input, center = TRUE, scale. = TRUE)
# run "mdata_pca$rotation" in console to get loadings
# Selecting PCs using Kaiser Rule
fviz_eig(mdata_pca, choice = "eigenvalue", addlabels = TRUE)
