Exploratory Data Analysis in R. Choose an interesting dataset and use R graphics to describe the data. You may use base R graphics or a graphics package of your choice. You should include at least one example of each of the following: a histogram, a boxplot, and a scatterplot.
library(RCurl)
## Loading required package: bitops
library(ggplot2)
library(plyr)
library(reshape2)
# install.packages("extrafont")
library(extrafont)
## Registering fonts with R
Read in the .csv file and rename the data columns
# Download the weight-loss CSV from GitHub and parse it into a data frame.
# stringsAsFactors = TRUE keeps `group` a factor on R >= 4.0 (where the default
# became FALSE), which is what the recorded str()/summary() output expects.
weightLoss.data <- read.csv(
  text = getURL("https://raw.githubusercontent.com/ann2014/CUNY/master/WeightLoss.csv"),
  stringsAsFactors = TRUE
)
# Preview the first rows to confirm the file was read correctly.
head(weightLoss.data)
## X group wl1 wl2 wl3 se1 se2 se3
## 1 1 Control 4 3 3 14 13 15
## 2 2 Control 4 4 3 13 14 17
## 3 3 Control 4 3 1 17 12 16
## 4 4 Control 3 2 1 11 11 12
## 5 5 Control 5 3 2 16 15 14
## 6 6 Control 6 5 4 17 18 18
# Give the wide-format columns descriptive names: column 1 becomes the id,
# columns 3-5 the monthly weight loss and columns 6-8 the monthly self-esteem.
names(weightLoss.data)[c(1, 3:8)] <- c(
  "id",
  paste0("WeightLoss_month", 1:3),
  paste0("SelfEsteem_month", 1:3)
)

# Melt the weight-loss measurements to long form, keyed by id and group.
wl.data <- melt(
  weightLoss.data[1:5],
  id.vars = c("id", "group"),
  variable.name = "WeightLoss_Month",
  value.name = "WeightLoss"
)

# Melt the self-esteem measurements the same way.
we.data <- melt(
  weightLoss.data[c(1, 2, 6:8)],
  id.vars = c("id", "group"),
  variable.name = "SelfEsteem_Month",
  value.name = "SelfEsteem_Score"
)

# Both long tables come from the same rows in the same order, so they can be
# bound side by side; only the self-esteem columns are taken from we.data to
# avoid duplicating id and group.
data.long <- cbind(wl.data, we.data[c("SelfEsteem_Month", "SelfEsteem_Score")])

# Show the structure of the renamed wide-format data.
str(weightLoss.data)
## 'data.frame': 34 obs. of 8 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ group : Factor w/ 3 levels "Control","Diet",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ WeightLoss_month1: int 4 4 4 3 5 6 6 5 5 3 ...
## $ WeightLoss_month2: int 3 4 3 2 3 5 5 4 4 3 ...
## $ WeightLoss_month3: int 3 3 1 1 2 4 4 1 1 2 ...
## $ SelfEsteem_month1: int 14 13 17 11 16 17 17 13 14 14 ...
## $ SelfEsteem_month2: int 13 14 12 11 15 18 16 15 14 15 ...
## $ SelfEsteem_month3: int 15 17 16 12 14 18 19 15 15 13 ...
# Print per-column summary statistics for the renamed wide-format data.
summary(weightLoss.data)
## id group WeightLoss_month1 WeightLoss_month2
## Min. : 1.00 Control:12 Min. :3.000 Min. :2.000
## 1st Qu.: 9.25 Diet :12 1st Qu.:4.000 1st Qu.:3.000
## Median :17.50 DietEx :10 Median :5.000 Median :4.000
## Mean :17.50 Mean :5.294 Mean :4.353
## 3rd Qu.:25.75 3rd Qu.:6.000 3rd Qu.:5.000
## Max. :34.00 Max. :9.000 Max. :9.000
## WeightLoss_month3 SelfEsteem_month1 SelfEsteem_month2 SelfEsteem_month3
## Min. :1.000 Min. :11.00 Min. :11.00 Min. :11.00
## 1st Qu.:1.000 1st Qu.:13.00 1st Qu.:12.00 1st Qu.:15.00
## Median :2.000 Median :15.00 Median :14.00 Median :17.00
## Mean :2.176 Mean :14.91 Mean :13.82 Mean :16.21
## 3rd Qu.:3.000 3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:18.00
## Max. :4.000 Max. :19.00 Max. :19.00 Max. :19.00
# Treat month-1 weight loss (pounds) as categorical and tabulate its
# frequency within each group.
table(weightLoss.data[["group"]], weightLoss.data[["WeightLoss_month1"]])
##
## 3 4 5 6 7 8 9
## Control 2 4 4 2 0 0 0
## Diet 1 3 2 3 3 0 0
## DietEx 2 1 0 2 2 1 2
Creating Histograms: use histograms to view the distribution of one-dimensional data
# Faceted bar chart of weight-loss counts (facet rows = month, facet columns =
# group) with the self-esteem scores overlaid as lines and points.
a <- ggplot(data.long, aes(x = as.factor(WeightLoss), fill = group)) +
  labs(
    x = "Weight in pounds",
    y = "Count",
    title = "Weight Loss by Group within 3 months"
  ) +
  geom_bar() +
  facet_grid(WeightLoss_Month ~ group) +
  # geom_line() has no fill aesthetic, so only y is mapped in this layer;
  # mapping fill here merely triggered an "unknown aesthetics" warning.
  geom_line(aes(y = SelfEsteem_Score)) +
  # A constant colour must be set outside aes(). Inside aes() the string
  # "blue" is treated as data, which draws the points in the default hue and
  # adds a spurious "blue" legend entry.
  geom_point(aes(y = SelfEsteem_Score), colour = "blue") +
  theme(
    legend.position = "bottom",
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    legend.key.size = unit(.5, "cm"),
    axis.ticks.y = element_blank(),
    plot.margin = unit(c(1, 0, 0, 0), units = "lines"),
    plot.title = element_text(
      size = 30, lineheight = .8,
      vjust = 1, family = "Bauhaus 93"
    )
  ) +
  # Title the fill legend through the scale's `name` argument; passing
  # guide_legend() positionally handed the guide object to `name` instead.
  scale_fill_discrete(name = "Group")
a
Create Scatter Plots
# Month 1: weight loss against self-esteem, one panel per group.
ggplot(
  data = weightLoss.data,
  mapping = aes(x = WeightLoss_month1, y = SelfEsteem_month1, colour = group)
) +
  geom_point() +
  facet_wrap(~ group) +
  labs(
    title = "Weight Loss vs. Self-Esteem - Month 1",
    x = "Weight Loss",
    y = "Self-Esteem Score"
  )

# Month 2: same layout as month 1.
ggplot(
  data = weightLoss.data,
  mapping = aes(x = WeightLoss_month2, y = SelfEsteem_month2, colour = group)
) +
  geom_point() +
  facet_wrap(~ group) +
  labs(
    title = "Weight Loss vs. Self-Esteem - Month 2",
    x = "Weight Loss",
    y = "Self-Esteem Score"
  )

# Month 3: same layout as month 1.
ggplot(
  data = weightLoss.data,
  mapping = aes(x = WeightLoss_month3, y = SelfEsteem_month3, colour = group)
) +
  geom_point() +
  facet_wrap(~ group) +
  labs(
    title = "Weight Loss vs. Self-Esteem - Month 3",
    x = "Weight Loss",
    y = "Self-Esteem Score"
  )
Create Boxplots
# Month 1: horizontal boxplots per group; the weight-loss boxes are drawn
# first and the self-esteem boxes (green fill) are layered on the same axis.
ggplot(weightLoss.data, aes(x = group)) +
  geom_boxplot(aes(y = WeightLoss_month1)) +
  geom_boxplot(aes(y = SelfEsteem_month1), fill = "chartreuse4") +
  labs(
    title = "Weight Loss vs. Self-Esteem - Month 1",
    x = "Group",
    y = "Weight Loss Self-Esteem Score"
  ) +
  coord_flip()

# Month 2: same layout as month 1.
ggplot(weightLoss.data, aes(x = group)) +
  geom_boxplot(aes(y = WeightLoss_month2)) +
  geom_boxplot(aes(y = SelfEsteem_month2), fill = "chartreuse4") +
  labs(
    title = "Weight Loss vs. Self-Esteem - Month 2",
    x = "Group",
    y = "Weight Loss Self-Esteem Score"
  ) +
  coord_flip()

# Month 3: same layout as month 1.
ggplot(weightLoss.data, aes(x = group)) +
  geom_boxplot(aes(y = WeightLoss_month3)) +
  geom_boxplot(aes(y = SelfEsteem_month3), fill = "chartreuse4") +
  labs(
    title = "Weight Loss vs. Self-Esteem - Month 3",
    x = "Group",
    y = "Weight Loss Self-Esteem Score"
  ) +
  coord_flip()
The graphical data visualizations provide a clear picture of the relationship between weight loss and self-esteem during the 3-month weight loss program. The three groups in the study also showed different patterns in the self-esteem measurements: - Greater weight loss is associated with higher self-esteem. - The DietEx group lost the most weight and the control group lost the least weight in the first 2 months; the data show the same pattern at month 3. - The longer into the program, the smaller the weight loss.