Visualization is a key area of data analysis and analytics.
library(foreign)
library(ggplot2)
# Import the survey data; chs12.dta is saved in Stata version 12 format
chs <- read.dta(file = "~/chs12.dta")
# Size of the data frame: number of rows followed by number of columns
dim(chs)
## [1] 10337 35
# Show the first 10 columns for the first 10 observations
head(chs[, 1:10], n = 10)
## id region smsa hhsize highbp sex race age agegroup height
## 1 1400 South SMSA Non-City 4 No Male White 54 50-59 174.6
## 2 1401 South SMSA Non-City 6 No Female White 41 40-49 152.3
## 3 1402 South SMSA City 6 No Female Other 21 20-29 164.1
## 4 1404 South SMSA Non-City 9 Yes Female White 63 60-69 162.6
## 5 1405 South SMSA City 3 No Female White 64 60-69 163.1
## 6 1406 South SMSA City 1 Yes Female White 63 60-69 147.1
## 7 1407 South SMSA City 2 No Female White 67 60-69 153.9
## 8 1408 South SMSA Non-City 1 Yes Female White 57 50-59 160.0
## 9 1410 South SMSA Non-City 3 No Female White 68 60-69 164.0
## 10 1411 South SMSA Non-City 4 No Male Black 68 60-69 176.6
A simple scatter plot is created by specifying the variables to appear on the \(x\) and \(y\) axes, respectively, within the aes() mapping.
# Scatter plot of bmi (y axis) against weight (x axis)
p <- ggplot(data = chs, mapping = aes(x = weight, y = bmi)) +
  geom_point()
# Display the stored plot
p
Plot different colors for male and female subjects by mapping the sex variable to the color aesthetic.
# Same scatter plot, with point colour mapped to the sex variable
p <- ggplot(data = chs, mapping = aes(x = weight, y = bmi, colour = sex)) +
  geom_point()
# Display the stored plot
p
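The ggplot2 package has several themes that can be used to change the appearance of the graph. They include theme_gray() (the default), theme_bw(), theme_linedraw(), theme_light(), theme_dark(), theme_minimal(), theme_classic() and theme_void().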
To change the theme, add the theme name, e.g. theme_bw(), to your syntax as shown in the code below.
# Scatter plot coloured by sex, drawn with the built-in black-and-white theme
p <- ggplot(data = chs, mapping = aes(x = weight, y = bmi, colour = sex)) +
  geom_point() +
  theme_bw()
# Display the stored plot
p
In addition to using the built-in ggplot2 themes, we can use the predefined themes from the ggthemes library to modify the aesthetics of plots. Use the syntax below to display these themes.
library(ggthemes)
library(stringr)
# get the names of all objects in the attached ggthemes package
func_all <- ls("package:ggthemes")
# keep only the names that begin with "theme_"; the "^" anchor makes the
# pattern match at the start of the name rather than anywhere inside it
fun_themes <- func_all[str_detect(func_all, "^theme_")]
fun_themes
## [1] "theme_base" "theme_calc" "theme_clean"
## [4] "theme_economist" "theme_economist_white" "theme_excel"
## [7] "theme_excel_new" "theme_few" "theme_fivethirtyeight"
## [10] "theme_foundation" "theme_gdocs" "theme_hc"
## [13] "theme_igray" "theme_map" "theme_pander"
## [16] "theme_par" "theme_solarized" "theme_solarized_2"
## [19] "theme_solid" "theme_stata" "theme_tufte"
## [22] "theme_wsj"
Below we use the theme theme_solarized() and add the dark background by setting light = FALSE.
# Scatter plot using the solarized theme from ggthemes with light = FALSE
ggplot(data = chs, mapping = aes(x = weight, y = bmi, colour = sex)) +
  geom_point() +
  theme_solarized(light = FALSE)
We can use the theme() and element_rect() functions to change the plot panel background color. This takes the syntax below.
# General form only: fill, color, size and linetype are placeholders for the
# values to supply, so this line is not meant to be run as written
theme(panel.background = element_rect(fill, color, size, linetype))
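where fill is the background fill color, color is the border color, size is the border line width, and linetype is the border line type (for example "solid" or "dashed").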
# Sky-blue panel background outlined with a solid blue border
# NOTE: ggplot2 >= 3.4.0 deprecates `size` for borders in favour of
# `linewidth`; `size` is kept so the code also runs on earlier releases
ggplot(data = chs, mapping = aes(x = weight, y = bmi, colour = sex)) +
  geom_point() +
  theme(
    panel.background = element_rect(
      fill = "skyblue",
      colour = "blue",
      size = 0.9,
      linetype = "solid"
    )
  )
The function element_line() can also be used to change the size and appearance of the grid lines using the syntax below.
# General form only: color, size and linetype are placeholders for the values
# to supply, so these lines are not meant to be run as written
theme(panel.grid.major = element_line(color, size, linetype),
panel.grid.minor = element_line(color, size, linetype))
where color, size, and linetype set the color, width, and line type of the grid lines.
# Sky-blue panel with white grid lines; major lines are drawn thicker (0.5)
# than minor lines (0.25)
# NOTE: ggplot2 >= 3.4.0 deprecates `size` for lines in favour of
# `linewidth`; `size` is kept so the code also runs on earlier releases
ggplot(data = chs, mapping = aes(x = weight, y = bmi, colour = sex)) +
  geom_point() +
  theme(
    panel.background = element_rect(
      fill = "skyblue",
      colour = "blue",
      size = 0.9,
      linetype = "solid"
    ),
    panel.grid.major = element_line(
      colour = "white",
      size = 0.5,
      linetype = "solid"
    ),
    panel.grid.minor = element_line(
      colour = "white",
      size = 0.25,
      linetype = "solid"
    )
  )
To remove the grids and the outside border, use the following syntax.
# Blank out the major grid, the minor grid and the panel border
ggplot(data = chs, mapping = aes(x = weight, y = bmi, colour = sex)) +
  geom_point() +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.border = element_blank()
  )
By default, ggplot2 does not add a title to the graph - you have to specify the graph title yourself. This section illustrates how to add and modify a graph title.
# Scatter plot with a main title; the y variable is bmi, so the title reads
# "Body mass index"
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle("Body mass index and weight") +
  theme(plot.title = element_text(hjust = 0.5)) # center the title
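There are several parameters for modifying the title. They include hjust (horizontal justification, where 0.5 centers the title), color, size, and face ("plain", "bold", "italic", or "bold.italic").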
# Scatter plot with a styled title: centered, sky blue, size 15, bold italic.
# The y variable is bmi, so the title reads "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle(label = "Body mass index and weight") +
  theme(
    plot.title = element_text(
      hjust = 0.5,
      color = "skyblue",
      size = 15,
      face = "bold.italic"
    )
  )
Use the character \n to break a title into multiple lines.
# Scatter plot with a two-line title; "\n" inside the label starts the second
# line. The y variable is bmi, so the first line reads "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle(label = "Body mass index and weight\nCommunity Health Survey") +
  theme(
    plot.title = element_text(
      hjust = 0.5,
      color = "skyblue",
      size = 15,
      face = "bold.italic"
    )
  )
The subtitle and caption can be added in a similar manner.
# Scatter plot with a title, a subtitle and a caption, each styled separately.
# The y variable is bmi, so the title reads "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle(
    label = "Body mass index and weight",
    subtitle = "Community Health Survey"
  ) +
  labs(caption = "Source: NHANES") +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )
# Titled scatter plot with the x axis limited to 40-200 via xlim(); rows whose
# weight falls outside the limits are removed, which triggers a warning.
# The y variable is bmi, so the title reads "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  xlim(40, 200) +
  ggtitle(
    label = "Body mass index and weight",
    subtitle = "Community Health Survey"
  ) +
  labs(caption = "Source: NHANES") +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )
## Warning: Removed 21 rows containing missing values (geom_point).
# Titled scatter plot with both axes limited: x to 40-200 via xlim() and y to
# 0-80 via ylim(); rows outside the limits are removed, which triggers a
# warning. The y variable is bmi, so the title reads "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  xlim(40, 200) +
  ylim(0, 80) +
  ggtitle(
    label = "Body mass index and weight",
    subtitle = "Community Health Survey"
  ) +
  labs(caption = "Source: NHANES") +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )
## Warning: Removed 21 rows containing missing values (geom_point).
# Titled scatter plot whose visible region is set with coord_cartesian()
# (x from 0 to 200, y from 0 to 80); this zooms the view rather than
# filtering the data. The y variable is bmi, so the title reads
# "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 80)) +
  ggtitle(
    label = "Body mass index and weight",
    subtitle = "Community Health Survey"
  ) +
  labs(caption = "Source: NHANES") +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )
Axes values and ticks can be removed using the following syntax.
# Titled scatter plot with the axis values and tick marks blanked out on both
# axes. The y variable is bmi, so the title reads "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle(
    label = "Body mass index and weight",
    subtitle = "Community Health Survey"
  ) +
  labs(caption = "Source: NHANES") +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  ) +
  # element_blank() draws nothing for the given theme element
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )
# Titled scatter plot with custom axis labels supplied through labs() and the
# visible region set with coord_cartesian(). The title now agrees with the
# y-axis label "Body mass index".
ggplot(chs, aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 80)) +
  ggtitle(
    label = "Body mass index and weight",
    subtitle = "Community Health Survey"
  ) +
  labs(
    caption = "Source: NHANES",
    x = "Weight (kg)",
    y = "Body mass index"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )
STEM Research
https://stemresearchs.com