Introduction


Visualization is a key area of data analysis and analytics.

Required package


library(foreign)
library(ggplot2)

Import data set to use


# Load the survey data from a Stata file (chs12.dta is saved in Stata version 12)
chs <- read.dta(file = "~/chs12.dta")
# Report the dimensions of the data frame: number of rows, then columns
dim(chs)
## [1] 10337    35
# Show the first 10 observations, restricted to the first 10 columns
head(chs[, 1:10], n = 10)
##      id region          smsa hhsize highbp    sex  race age agegroup height
## 1  1400  South SMSA Non-City      4     No   Male White  54    50-59  174.6
## 2  1401  South SMSA Non-City      6     No Female White  41    40-49  152.3
## 3  1402  South     SMSA City      6     No Female Other  21    20-29  164.1
## 4  1404  South SMSA Non-City      9    Yes Female White  63    60-69  162.6
## 5  1405  South     SMSA City      3     No Female White  64    60-69  163.1
## 6  1406  South     SMSA City      1    Yes Female White  63    60-69  147.1
## 7  1407  South     SMSA City      2     No Female White  67    60-69  153.9
## 8  1408  South SMSA Non-City      1    Yes Female White  57    50-59  160.0
## 9  1410  South SMSA Non-City      3     No Female White  68    60-69  164.0
## 10 1411  South SMSA Non-City      4     No   Male Black  68    60-69  176.6

1. Chart properties


A simple scatter plot: this is done by specifying the variables to appear on the \(x\) and \(y\) axes, respectively, within the aes() mapping.

# Scatter plot with weight on the x axis and bmi on the y axis
p <- ggplot(data = chs, mapping = aes(x = weight, y = bmi)) +
  geom_point()
p

Plot different colors for male and female subjects

# Same scatter plot, with point colour mapped to the sex variable
p <- ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point()
p

1.1. Themes


The ggplot2 package has several themes that can be used to change the appearance of the graph. They include:

  • theme_gray - The default theme, featuring a gray background and white gridlines.
  • theme_bw - A black on white theme.
  • theme_linedraw - A theme with only black lines of various widths on white backgrounds.
  • theme_light - A theme similar to theme_linedraw but with grey lines and axes designed to draw more attention to the data.
  • theme_dark - A theme similar to theme_light, but with a dark background. A useful theme for making thin colored lines stand out.
  • theme_minimal - A theme with no background annotations.
  • theme_classic - A theme with no gridlines.
  • theme_void - A completely empty theme.

To change the theme, add the theme name, e.g. theme_bw(), to your syntax as shown in the code below.

# Scatter plot coloured by sex, drawn with the built-in theme_bw() theme
p <- ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  theme_bw()
p

In addition to using the built-in ggplot2 themes, we can use the predefined themes from the ggthemes library to modify the aesthetics of plots. Use the syntax below to display these themes.

library(ggthemes)
library(stringr)
# List every object name exported by the attached ggthemes package
func_all <- ls("package:ggthemes")
# Keep only the names that begin with "theme_"; the "^" anchor prevents names
# that merely contain "theme_" somewhere else from matching
fun_themes <- func_all[str_detect(func_all, "^theme_")]
fun_themes
##  [1] "theme_base"            "theme_calc"            "theme_clean"          
##  [4] "theme_economist"       "theme_economist_white" "theme_excel"          
##  [7] "theme_excel_new"       "theme_few"             "theme_fivethirtyeight"
## [10] "theme_foundation"      "theme_gdocs"           "theme_hc"             
## [13] "theme_igray"           "theme_map"             "theme_pander"         
## [16] "theme_par"             "theme_solarized"       "theme_solarized_2"    
## [19] "theme_solid"           "theme_stata"           "theme_tufte"          
## [22] "theme_wsj"

Below we use the theme theme_solarized() and apply its dark background by setting light = FALSE.

# Scatter plot coloured by sex, using theme_solarized() with light = FALSE
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  theme_solarized(light = FALSE)

1.2. Modifying specific components of plots


We can use the theme() and element_rect() functions to change the plot panel background color. This takes the syntax below.

# Syntax template only: fill, color, size and linetype are placeholders for the
# values you supply
theme(panel.background = element_rect(fill, color, size, linetype))

where

  • fill: fill color for rectangle
  • color: border color
  • size: border size
  • linetype: line type (“blank”, “solid”, “dashed”, “dotted”, “dotdash”, “longdash”, “twodash”)
# Scatter plot with a sky-blue panel background and a solid blue border
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  theme(
    panel.background = element_rect(
      fill = "skyblue",
      color = "blue",
      size = 0.9,
      linetype = "solid"
    )
  )

The function element_line() can also be used to change the size and appearance of the grid lines using the syntax below.

# Syntax template only: color, size and linetype are placeholders for the
# values you supply; major and minor grid lines are styled separately
theme(panel.grid.major = element_line(color, size, linetype),
panel.grid.minor = element_line(color, size, linetype))

where

  • color: border color
  • size: border size
  • linetype: line type (“blank”, “solid”, “dashed”, “dotted”, “dotdash”, “longdash”, “twodash”)
# Sky-blue panel with a blue border, plus solid white grid lines; the minor
# grid lines are drawn at half the size of the major ones
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  theme(
    panel.background = element_rect(
      fill = "skyblue",
      color = "blue",
      size = 0.9,
      linetype = "solid"
    ),
    panel.grid.major = element_line(
      color = "white",
      size = 0.5,
      linetype = "solid"
    ),
    panel.grid.minor = element_line(
      color = "white",
      size = 0.25,
      linetype = "solid"
    )
  )

To remove the grids and the outside border, use the following syntax.

# Blank out the panel border and both sets of grid lines
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  theme(
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank()
  )

1.2. Titles


By default, ggplot2 does not give the graph a title - you have to specify the graph title yourself. This section illustrates how to add and modify a graph title.

# Scatter plot with a title. The title names the two plotted variables
# (bmi on the y axis, weight on the x axis).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle("Body mass index and weight") +
  # hjust = 0.5 centers the title horizontally
  theme(plot.title = element_text(hjust = 0.5))

There are several parameters for modifying the title. They include:

  • family: font family
  • face: font face. Options include “plain”, “italic”, “bold” and “bold.italic”
  • color: font color
  • size: font size in pts
  • hjust: horizontal justification between 0 and 1
  • vjust: vertical justification between 0 and 1
  • lineheight: line height, i.e. the spacing between lines for multi-line titles
# Scatter plot whose title is centered, sky blue, 15 pt and bold italic.
# The title names the two plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle(label = "Body mass index and weight") +
  theme(
    plot.title = element_text(
      hjust = 0.5,
      color = "skyblue",
      size = 15,
      face = "bold.italic"
    )
  )

Use the character \n to break a title into multiple lines.

# Scatter plot with a two-line title: the "\n" inside the label starts the
# second line. The first line names the plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  ggtitle(label = "Body mass index and weight\nCommunity Health Survey") +
  theme(
    plot.title = element_text(
      hjust = 0.5,
      color = "skyblue",
      size = 15,
      face = "bold.italic"
    )
  )

The subtitle and caption can be added in a similar manner.

# Scatter plot with a title, subtitle and caption, each styled separately.
# The title names the two plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  labs(
    title = "Body mass index and weight",
    subtitle = "Community Health Survey",
    caption = "Source: NHANES"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )

1.3. Axes labels


1.4. Axes limits


Set \(x\) axis
# Restrict the x axis to 40-200 with xlim(); rows outside the limits are
# dropped, which produces the warning shown below.
# The title names the two plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  xlim(40, 200) +
  labs(
    title = "Body mass index and weight",
    subtitle = "Community Health Survey",
    caption = "Source: NHANES"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )
## Warning: Removed 21 rows containing missing values (geom_point).

Set \(y\) axis
# Restrict the x axis to 40-200 and the y axis to 0-80 with xlim() and ylim();
# rows outside the limits are dropped, which produces the warning shown below.
# The title names the two plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  xlim(40, 200) +
  ylim(0, 80) +
  labs(
    title = "Body mass index and weight",
    subtitle = "Community Health Survey",
    caption = "Source: NHANES"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )
## Warning: Removed 21 rows containing missing values (geom_point).

Set both the \(x\) and \(y\) axis limits with coord_cartesian(), which zooms the view without removing any data points

# Set the visible range of both axes in one call with coord_cartesian().
# The title names the two plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 80)) +
  labs(
    title = "Body mass index and weight",
    subtitle = "Community Health Survey",
    caption = "Source: NHANES"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )

Axes values and ticks can be removed using the following syntax.

# Hide the tick labels and tick marks on both axes while keeping the styled
# title, subtitle and caption.
# The title names the two plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  labs(
    title = "Body mass index and weight",
    subtitle = "Community Health Survey",
    caption = "Source: NHANES"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue"),
    # element_blank() removes the corresponding axis component entirely
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

Axes labels

# Give both axes descriptive labels through labs(x = , y = ), alongside the
# title, subtitle and caption.
# The title names the two plotted variables (bmi and weight).
ggplot(data = chs, mapping = aes(x = weight, y = bmi, color = sex)) +
  geom_point() +
  coord_cartesian(xlim = c(0, 200), ylim = c(0, 80)) +
  labs(
    title = "Body mass index and weight",
    subtitle = "Community Health Survey",
    caption = "Source: NHANES",
    x = "Weight (kg)",
    y = "Body mass index"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, color = "skyblue"),
    plot.subtitle = element_text(hjust = 0.5, color = "gold"),
    plot.caption = element_text(color = "skyblue")
  )


STEM Research
https://stemresearchs.com