# Print the dataset, then show its structure.
print(heart)
str(heart)

# Using either "names" or "colnames", both list the 14 variables of the
# dataset.
names(heart)
colnames(heart)

# The first 15 rows are mostly in the same age range (50s), with some
# outliers (one aged 34 and two in their 40s).
head(heart, n = 15)
# Map a single coded sex value to a readable label.
#
# x: one value (length one), because `if` needs a single TRUE/FALSE condition.
# Returns "Unknown" for NA, "Male" for 1, "Female" for 0, and "Other" for
# anything else.
gender <- function(x) {
  # NA is tested first: `NA == 1` evaluates to NA, which `if` cannot use.
  if (is.na(x)) {
    return("Unknown")
  }
  if (x == 1) {
    return("Male")
  }
  if (x == 0) {
    return("Female")
  }
  "Other"
}
# The "gender" function identifies whether the individual is "male", "female"
# or "other". Using the sex column, the generated sample shows that row 2 is
# "male" while row 6 is "female".
gender(heart$sex[2])
gender(heart$sex[6])
# dplyr is loaded first; filter() is then applied with the condition that the
# "chol" variable must be more than 240. This resulted in 503 rows (432 rows
# were omitted in the printed values).
library(dplyr)
filter(heart, chol > 240)
# In the heart dataset, only "target" is the dependent variable. The remaining
# variables are independent ones, subdivided into four data frames:
# demographics, symptoms, vitals and diagnostics.

# Demographics: age and sex.
demographics <- as.data.frame(cbind(heart$age, heart$sex))
names(demographics) <- c("Age", "Sex")
print(demographics)

# Symptoms: chest pain type and exercise-induced angina.
symptoms <- as.data.frame(cbind(heart$cp, heart$exang))
names(symptoms) <- c("Chest Pain Type", "Exercise Induced Angina")
print(symptoms)

# Vitals: blood pressure, cholesterol, blood sugar, heart rate, ST depression.
vitals <- as.data.frame(
  cbind(heart$trestbps, heart$chol, heart$fbs, heart$thalach, heart$oldpeak)
)
names(vitals) <- c(
  "Resting Blood Pressure",
  "Cholesterol",
  "Fasting Blood Sugar",
  "Max Heart Rate",
  "ST Depression"
)
print(vitals)

# Diagnostics: ECG results, ST slope, major vessels and thalassemia.
diagnostics <- as.data.frame(
  cbind(heart$restecg, heart$slope, heart$ca, heart$thal)
)
names(diagnostics) <- c(
  "Resting ECG Results",
  "ST Segment Slope",
  "No of Major Vessels",
  "Thalassemia"
)
print(diagnostics)

# Dependent variable: target.
dependentvariable <- as.data.frame(cbind(heart$target))
names(dependentvariable) <- "Target"
print(dependentvariable)
# Check every cell for NA, drop incomplete rows into heart1, then check again.
# There are no missing values in the dataset.
is.na(heart)
heart1 <- na.omit(heart)
is.na(heart1)
# Flag and count duplicated rows, keep only the unique rows in cleanheart, and
# count duplicates again on the cleaned data.
# Initially, there were 723 duplicated rows in the dataset. 308 rows remain
# once the duplicates have been removed (new data frame: cleanheart).
# NOTE(review): the 60% training sample is later reported as 181 rows, which
# would imply roughly 302 rows here rather than 308 -- confirm this figure.
duplicated(heart)
sum(duplicated(heart))
cleanheart <- unique(heart)
sum(duplicated(cleanheart))
# Sort by age and then chol, both in descending order (the sorted result is
# printed, not stored). In the dataset, 77 is the oldest age, with 304 as the
# highest cholesterol level at that age.
cleanheart %>%
  arrange(desc(age), desc(chol))
# As the data had some acronyms/codes as column names, those columns were
# renamed (by position) to their actual descriptions. The new names contain
# spaces, so later code must quote them with backticks.
names(cleanheart)[3] <- "chest pain type"
names(cleanheart)[4] <- "resting blood pressure"
names(cleanheart)[6] <- "fasting blood sugar"
names(cleanheart)[8] <- "max heart rate"
names(cleanheart)[9] <- "exercise induced angina"
names(cleanheart)[12] <- "no of major vessels"
names(cleanheart)[13] <- "thalassemia"
print(cleanheart)
# Variables chol_age and cardiacriskscore have been added. chol_age is the
# ratio of cholesterol to age (chol / age), while cardiacriskscore is the
# product of the two (age * chol).
cleanheart <- cleanheart %>%
  mutate(
    chol_age = chol / age,
    cardiacriskscore = age * chol
  )
print(cleanheart)
# Fix the random seed so the sample is reproducible, then draw 60% of the rows
# of cleanheart without replacement as the training data.
# 181 rows were generated as the training data from the cleanheart dataset.
set.seed(1234)
cleanheart_training <- cleanheart %>%
  sample_frac(0.60, replace = FALSE)
print(cleanheart_training)
# Summary statistics of the cleanheart dataset, per variable.
cleanheart %>%
  summary()
# Return the most frequent value (statistical mode) of a vector.
#
# x: a vector of values.
# Returns the value of x that occurs most often. When several values tie for
# the highest count, the one that appears first in x is returned, because
# unique() keeps first-appearance order and which.max() picks the first
# maximum.
get_mode <- function(x) {
  uniqv <- unique(x)
  # match() maps each element to its index in uniqv; tabulate() counts them.
  counts <- tabulate(match(x, uniqv))
  uniqv[which.max(counts)]
}
# Mean, median and mode of cholesterol, age and resting blood pressure.
# Column names containing spaces must be quoted with backticks.

# Mean
mean(cleanheart$chol)
mean(cleanheart$age)
mean(cleanheart$`resting blood pressure`)

# Median
median(cleanheart$chol)
median(cleanheart$age)
median(cleanheart$`resting blood pressure`)

# Mode
get_mode(cleanheart$chol)
get_mode(cleanheart$age)
get_mode(cleanheart$`resting blood pressure`)
# Range (maximum minus minimum, ignoring NA) of cholesterol, resting blood
# pressure and max heart rate. Column names containing spaces must be quoted
# with backticks.
print(
  max(cleanheart$chol, na.rm = TRUE) -
    min(cleanheart$chol, na.rm = TRUE)
)
print(
  max(cleanheart$`resting blood pressure`, na.rm = TRUE) -
    min(cleanheart$`resting blood pressure`, na.rm = TRUE)
)
print(
  max(cleanheart$`max heart rate`, na.rm = TRUE) -
    min(cleanheart$`max heart rate`, na.rm = TRUE)
)
# ggplot() is provided by ggplot2; loading it here is harmless if it was
# already attached earlier.
library(ggplot2)

# Scatter plot of cholesterol against age.
# Data shows that as one ages, cholesterol increases, with the majority in the
# age range of 50 to 60. There is one outlier with age close to 70 and
# cholesterol above 500. This needs to be checked further.
ggplot(data = cleanheart, aes(x = age, y = chol)) +
  geom_point()

# Bar chart of resting blood pressure counts.
# For resting blood pressure, the highest count is at 120 with more than 30
# counts, followed by a value a little over 120 (~128).
ggplot(data = cleanheart, aes(x = `resting blood pressure`)) +
  geom_bar(fill = "lightskyblue")

# Bar chart of max heart rate counts.
# For max heart rate, the highest count is at ~160, with values above 9
# counts.
ggplot(data = cleanheart, aes(x = `max heart rate`)) +
  geom_bar(fill = "red4")
# Pearson correlation between age and cholesterol.
# The correlation value between age and chol is 0.20. As the value is near 0,
# this shows low correlation between the two variables.
corr <- cor(cleanheart$age, cleanheart$chol, method = "pearson")
corr