### Required Packages
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
library(readxl)
- Create Univariate analysis for the variable of your interest (your Y variable). Calculate skewness and kurtosis and describe the results
### Import the data into R
### NOTE(review): this is an absolute, machine-specific path; the script only
### runs where this exact file exists.
BreastCancer <- read_excel("C:/Users/Dhruva/Desktop/GRAD 699/Assignments/BC.xlsx")
### Recode Classification (1 = "Healthy Control", 2 = "Patient") as a labelled
### factor. factor() is called directly so the column is a factor on every R
### version; a data.frame() lookup table only yields a factor when strings are
### converted to factors by default (R < 4.0.0). Codes other than 1 or 2
### become NA.
BreastCancer$Classification <- factor(
  BreastCancer$Classification,
  levels = c(1, 2),
  labels = c("Healthy Control", "Patient")
)
### Inspect the column types to confirm the recode
str(BreastCancer)
## tibble [116 x 10] (S3: tbl_df/tbl/data.frame)
## $ Age : num [1:116] 48 83 82 68 86 49 89 76 73 75 ...
## $ BMI : num [1:116] 23.5 20.7 23.1 21.4 21.1 ...
## $ Glucose : num [1:116] 70 92 91 77 92 92 77 118 97 83 ...
## $ Insulin : num [1:116] 2.71 3.12 4.5 3.23 3.55 ...
## $ HOMA : num [1:116] 0.467 0.707 1.01 0.613 0.805 ...
## $ Leptin : num [1:116] 8.81 8.84 17.94 9.88 6.7 ...
## $ Adiponectin : num [1:116] 9.7 5.43 22.43 7.17 4.82 ...
## $ Resistin : num [1:116] 8 4.06 9.28 12.77 10.58 ...
## $ MCP.1 : num [1:116] 417 469 555 928 774 ...
## $ Classification: Factor w/ 2 levels "Healthy Control",..: 1 1 1 1 1 1 1 1 1 1 ...
### Frequency table of the two Classification groups
classcount <- table(BreastCancer[["Classification"]])
print(classcount)
##
## Healthy Control Patient
## 52 64
### Bar plot of the group counts, with a title and axis labels so the figure
### can be read on its own, like the other figures in this report
barplot(
  classcount,
  main = "Number of Healthy Controls and Patients",
  xlab = "Classification",
  ylab = "Count"
)

## Because Classification (the Y variable) is categorical, skewness and kurtosis cannot be calculated for it. The bar plot and the frequency count both show that there are more Patients (64) than Healthy Controls (52).
- Create Bivariate plot Box Plot for your Y variable and one of other important metrics (your X). Describe figure.
### Box plot of Glucose (x-axis) for each Classification group (y-axis),
### with the boxes filled by group
ggplot(
  BreastCancer,
  aes(x = Glucose, y = Classification, fill = Classification)
) +
  geom_boxplot() +
  labs(
    title = "Glucose levels of Healthy Controls and Patients",
    x = "Glucose Level (mg/dL)"
  )

## A box plot was constructed with Glucose level on the x-axis and Classification on the y-axis. The plot shows that Glucose levels are higher in Breast Cancer patients than in Healthy Controls; in particular, the median Glucose level is higher in Breast Cancer patients than in Healthy Controls.
- If your variables are continuous - Create a scatter plot between your Y and your X. If your variables are categorical - Create a bar plot. Describe figure.
### Point plot of Insulin (x-axis) for each Classification group (y-axis),
### with the points coloured by group
ggplot(BreastCancer, aes(x = Insulin, y = Classification)) +
  geom_point(mapping = aes(colour = Classification), size = 2) +
  labs(
    title = "Insulin Levels of Healthy Controls and Patients",
    x = "Insulin (µU/mL)"
  )

## A scatter plot was constructed with Insulin level on the x-axis and Classification on the y-axis. The plot shows that Insulin levels are higher in Breast Cancer patients than in Healthy Controls.
- Create a multivariate plot - Use the same plot as in 3 but add another important variable using colored symbols. Describe Figure. Make sure to add legend.
### Reshape to long format: stack the Insulin and Resistin columns so that
### "Biomarker" holds the marker name and "Result" holds its measured value,
### keeping Classification as the identifier column. The output column names
### are set directly in melt() rather than renamed by position afterwards.
BreastCancer2 <- melt(
  BreastCancer,
  id.vars = "Classification",
  measure.vars = c("Insulin", "Resistin"),
  variable.name = "Biomarker",
  value.name = "Result"
)
### Plot both biomarkers on a shared x-axis, one colour per biomarker, with a
### legend that spells out each biomarker's unit
ggplot(BreastCancer2, aes(x = Result, y = Classification)) +
  geom_point(mapping = aes(colour = Biomarker), size = 2) +
  labs(
    title = "Insulin and Resistin Levels of Healthy Controls and Patients",
    x = "Level"
  ) +
  scale_colour_manual(
    name = "Biomarker",
    labels = c("Insulin (µU/mL)", "Resistin (ng/mL)"),
    values = c("Insulin" = "red", "Resistin" = "blue")
  )

## A scatter plot was constructed with the Insulin and Resistin levels on the x-axis and Classification on the y-axis. The plot shows that Insulin and Resistin levels are higher in Breast Cancer patients than in Healthy Controls.