Data visualization

Components of a plot

The data being plotted.
The geometric objects (circles, lines, etc.) that appear on the plot.
A set of mappings from variables in the data to the aesthetics (appearance) of the geometric objects.
A statistical transformation used to calculate the data values used in the plot.
A position adjustment for locating each geometric object on the plot.
A scale for each aesthetic mapping used.
A coordinate system used to organize the geometric objects.
The facets or groups of data shown in different plots.

library("ggplot2")
library("ggrepel")
library("dplyr")
library("DT")
library("tm") ## Text mining package
library("wordcloud") ## Visualization
#library("ggradar")
library("scales")
library("waffle")
library("plotly")
library("leaflet")

##

GGPlot vs Qplot

The main difference between these two functions is that we use ggplot() when the data to plot is stored in a data frame, and qplot() when the data is stored in other kinds of objects (usually vectors).

# Example data: ten x positions and ten random normal draws as loose vectors.
x <- seq_len(10)
y <- rnorm(n = 10)

# With plain vectors, qplot() is the compact form (the one preferred here) ...
qplot(x = x, y = y, geom = "line")

# ... while ggplot() needs the vectors wrapped in a data frame first (verbose).
ggplot(data = data.frame(x, y), mapping = aes(x = x, y = y)) +
  geom_line()

# The same values stored in a data frame.
d <- data.frame(x, y)

# qplot() can also read its variables from a data frame ...
qplot(x = x, y = y, data = d, geom = "line") +
  theme_classic()

# ... but with a data frame, ggplot() is the form preferred here.
ggplot(data = d, mapping = aes(x = x, y = y)) +
  geom_line() +
  theme_classic()

BarChart

Represent grouped data
Can be used to plot counts by group or averages by group.

# Interactive, Bootstrap-styled table of the raw mtcars data.
datatable(data = mtcars, style = "bootstrap")

# Bar chart counting the cars that share each cylinder count.
# The x axis holds the cylinder count and the y axis the number of cars,
# so the axis titles are assigned to match.
qplot(
  mtcars$cyl,
  geom = "bar",
  colour = I("#000000"),
  fill = I("#000000"),
  xlab = "Cylinders",
  ylab = "Number of Vehicles",
  main = "Vehicles by Cylinders"
) + theme_minimal()

Setting stat = "identity" applies the “identity” transformation, which leaves the data “as is” (the bar heights are taken directly from the y values).

# Fill colours used by the grouped bar charts that follow.
bar_palette <- c("#1FAB89","#FF8080","#FFBA92","#C6F1D6")

# One row per vehicle class, with the number of rows of mpg in column `n`.
class_count <- mpg %>% dplyr::count(class)

# The bar heights are already computed, so geom_col() draws them unchanged.
ggplot(data = class_count, mapping = aes(x = class, y = n)) +
  geom_col(fill = "#1a3e59") +
  theme_classic()

By default, geom_bar uses a position adjustment of "stack", which makes each rectangle’s height proportional to its value and stacks the rectangles on top of each other.

# Shared base for the three charts below: vehicle class on the x axis, bars
# filled by drive (front, rear, 4-wheel) using the custom palette.
class_by_drive <- ggplot(data = mpg, mapping = aes(x = class, fill = drv)) +
  theme_classic() +
  scale_fill_manual(values = bar_palette)

# Default position ("stack"): drive counts piled on top of each other.
class_by_drive +
  geom_bar()

# position = "dodge": one bar per drive, placed next to each other.
class_by_drive +
  geom_bar(position = "dodge")

# position = "fill": every class scaled to full height, y axis in percent.
class_by_drive +
  geom_bar(position = "fill") +
  scale_y_continuous(breaks = seq(0, 1, by = .2), labels = scales::percent)

## Histograms

# Histogram of horsepower in bins 25 wide. The bars are fully transparent
# (alpha = 0) with a black outline, and the x axis is limited to 50-350.
qplot(
  x = mtcars$hp,
  geom = "histogram",
  binwidth = 25,
  alpha = I(0),
  colour = I("black"),
  xlim = c(50, 350),
  main = "Histogram",
  xlab = "Horse Power",
  ylab = "Number of cars"
) +
  theme_classic()

## Warning: Removed 2 rows containing missing values (geom_bar).

Facets

Facets are a way of splitting a plot into multiple subplots, one for each group of the data.

# One panel per vehicle class, arranged as columns of a single row.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(cols = vars(class)) +
  theme_minimal()

# Two categorical variables: year defines the rows, cyl the columns.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(rows = vars(year), cols = vars(cyl))

Pie Charts

ggplot2 has no dedicated pie chart geom, so we first need to create a stacked bar chart and then draw it in polar coordinates.

# Seven-colour palette; the line-plot and regression examples reuse it.
my_palette2 <- c("#394a6d","#3c9d9b","#52de97", "#c0ffb3", "#8105d8",
                 "#e688a1", "#730068")

# Single stacked bar of the carb values, one segment per distinct value.
# carb is converted to a factor so that the fill is discrete and the manual
# palette is actually applied: a colour scale has no effect here because only
# the fill aesthetic is mapped. Columns are referenced by bare name inside
# aes() instead of through mtcars$..., and no pre-sorting is needed because
# the bars are grouped by the factor.
carb_stack <- ggplot(mtcars, aes(x = 1, y = carb, fill = factor(carb))) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = my_palette2) +
  labs(fill = "carb") +
  theme_classic()

# theta = "x": the x position becomes the angle, so the stacked segments are
# drawn as concentric rings. Axis line, angular labels and panel background
# are removed.
carb_stack +
  coord_polar(theta = "x") +
  theme(
    axis.line = element_blank(),
    axis.text.x = element_blank(),
    panel.background = element_blank()
  )

# theta = "y": the stacked heights become angles, which gives the pie chart.
carb_stack +
  coord_polar(theta = "y")

Coordinate System

coord_cartesian The default Cartesian coordinate system, where you specify x and y values.
coord_flip A Cartesian system with x and y flipped.
coord_fixed A Cartesian system with a “fixed” aspect ratio.
coord_polar A plot using polar coordinates.
coord_quickmap A coordinate system that approximates a good aspect ratio for maps.

Scatterplots

# Four-colour palette for the discrete cylinder groups.
my_palette <- c("#512c96","#3c6f9c", "#dd6892", "#f9c6ba")

# Categorical copy of the cylinder count, for discrete shape/colour mappings.
mtcars$cylFactor <- factor(mtcars$cyl)

# Within each vehicle class, keep the row ranked first by descending hwy.
best_in_class <- filter(
  group_by(mpg, class),
  row_number(desc(hwy)) == 1
)

# Mileage relationship (city vs highway), with both axes reversed.
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_point(color = "#E6A400") +
  scale_x_reverse() +
  scale_y_reverse() +
  theme_classic()

# Scatterplot with factor variables: shape and colour both follow cylFactor.
ggplot(
  data = mtcars,
  mapping = aes(x = mpg, y = wt, shape = cylFactor, colour = cylFactor)
) +
  geom_point() +
  scale_color_manual(values = my_palette) +
  labs(title = "Scatterplot", colour = "Cylinders") +
  theme_classic()

# Scatterplot with numeric variables: colour follows the numeric cyl column,
# while shape still follows the factor.
ggplot(
  data = mtcars,
  mapping = aes(x = mpg, y = wt, shape = cylFactor, colour = cyl)
) +
  geom_point() +
  theme_classic()

# Colour palette chosen by name ("Set3"); point size follows cty, and the
# rows of best_in_class are labelled with their model name.
ggplot(
  data = mpg,
  mapping = aes(x = displ, y = hwy, color = class, size = cty)
) +
  geom_point() +
  geom_text_repel(data = best_in_class, aes(label = model)) +
  scale_color_brewer(palette = "Set3") +
  theme_classic() +
  labs(
    title = "Fuel Efficiency by Engine Power",
    subtitle = "Fuel economy data from 1999 and 2008 for 38 popular models of cars",
    x = "Engine power (litres displacement)",
    y = "Fuel Efficiency (miles per gallon)",
    color = "Car Type"
  )

Line plots and linear and Gaussian regression

# EuStockMarkets converted to a tibble; the DAX, FTSE, SMI and CAC columns
# are plotted below.
euStockDF <- as_tibble(EuStockMarkets)

# Every index is drawn as its own line against the row number. The x mapping
# is declared once and inherited by all four layers; each line gets a fixed
# colour picked from my_palette2.
ggplot(data = euStockDF, mapping = aes(x = seq_len(nrow(euStockDF)))) +
  geom_line(aes(y = DAX), size = 0.8, colour = my_palette2[3]) +
  geom_line(aes(y = FTSE), size = 0.8, colour = my_palette2[2]) +
  geom_line(aes(y = SMI), size = 0.8, colour = my_palette2[6]) +
  geom_line(aes(y = CAC), size = 0.8, colour = my_palette2[5]) +
  labs(title = "EU Stocks", x = "Time", y = "Stocks") +
  theme_classic() +
  # Centre the title.
  theme(plot.title = element_text(hjust = 0.5))

## Linear regression


# Weight against mileage, points coloured by cylinder group. geom_smooth()
# adds straight-line fits (method = "lm") without a confidence band, drawn in
# one fixed colour.
ggplot(data = mtcars, mapping = aes(x = mpg, y = wt, color = cylFactor)) +
  geom_point(shape = 19) +
  geom_smooth(method = "lm", se = FALSE, color = my_palette2[7]) +
  scale_color_manual(values = my_palette2) +
  labs(title = "Linear Regression", x = "Miles per Gallon", y = "Weight") +
  theme_minimal()

## Gaussian regression

# Same data with method = "auto" and the confidence band switched on; the
# message printed after this plot reports which smoother "auto" selected.
ggplot(data = mtcars, mapping = aes(x = mpg, y = wt, color = cylFactor)) +
  geom_point(shape = 19) +
  geom_smooth(method = "auto", se = TRUE, color = my_palette[3]) +
  scale_color_manual(values = my_palette2) +
  labs(title = "Gaussian Regression", x = "Miles per Gallon", y = "Weight") +
  theme_classic()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## Stat summary

# Scatter of highway mileage against displacement, with a dashed line that
# joins the mean hwy computed at each distinct displ value.
# `fun` replaces the deprecated `fun.y` argument of stat_summary()
# (ggplot2 >= 3.3.0), which otherwise triggers a deprecation warning.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(color = "#7F78D2") +
  stat_summary(fun = "mean", geom = "line", size = 0.5, linetype = "dashed") +
  theme_minimal()

Word Cloud

# Folder and file that hold the speech text. Both are defined once so that
# the directory check, the download target and the corpus source always agree.
dirPath <- "./WordCloud"
speech_file <- file.path(dirPath, "Churchill_speeches.txt")

if (!dir.exists(dirPath)) {
  dir.create(dirPath)
}

# Download only when the file is missing or empty, so re-running the script
# does not need the network once the text has been fetched.
if (!file.exists(speech_file) || file.size(speech_file) == 0) {
  download.file(
    "https://ibm.box.com/shared/static/cmid70rpa7xe4ocitcga1bve7r0kqnia.txt",
    destfile = speech_file,
    quiet = TRUE
  )
}

# Build a corpus from the files found in the folder.
speech <- Corpus(DirSource(dirPath))

#inspect(speech)

## Convert all letters to lower case
speech <- tm_map(speech, content_transformer(tolower))

## Remove the numbers from the text
speech <- tm_map(speech, removeNumbers)

## Remove common stop words like 'the' or 'we'
speech <- tm_map(speech, removeWords,
                 stopwords("english"))

## If we want to remove our own words...
speech <- tm_map(speech, removeWords,
                 c("floccinaucinihilipification","squirreled"))

## Remove punctuation
speech <- tm_map(speech, removePunctuation)

## Remove unnecessary whitespace
speech <- tm_map(speech, stripWhitespace)

## Create a term-document matrix (terms in rows, documents in columns)
dtm <- TermDocumentMatrix(speech)

## Convert it to a plain matrix
m <- as.matrix(dtm)

# Total count per term, sorted to show the most frequent words first
v <- sort(rowSums(m), decreasing = TRUE)

# Transform to a data frame (tibble) with one row per word
d <- tibble(word = names(v),
            freq = v)

head(d, 10)

## # A tibble: 10 x 2
##    word     freq
##    <chr>   <dbl>
##  1 shall      11
##  2 fight       7
##  3 may         6
##  4 will        6
##  5 europe      5
##  6 upon        5
##  7 victory     5
##  8 war         5
##  9 can         4
## 10 many        4

## Wordcloud visualization

# Draw at most 100 words (every word with frequency >= 1 is eligible),
# placing the most frequent ones first and colouring with six "Dark2" colours.
with(
  d,
  wordcloud(
    words = word,
    freq = freq,
    min.freq = 1,
    max.words = 100,
    random.order = FALSE,
    colors = brewer.pal(n = 6, name = "Dark2")
  )
)

Radar charts

# Data for the radar chart: car names become the `group` column, only the
# first ten columns are kept, every measure is rescaled, and the last three
# cars are retained.
# The columns are selected BEFORE rescaling so that non-numeric columns added
# to mtcars elsewhere in this file (the factor `cylFactor`) never reach
# rescale(), which would fail on them. across() replaces the superseded
# mutate_at().
mtcars_data <- mtcars %>%
  tibble::rownames_to_column(var = "group") %>%
  select(1:10) %>%
  mutate(across(-group, rescale)) %>%
  tail(3)

# NOTE(review): this call is specific to Jupyter's IRkernel (it is commented
# out before the waffle chart) — confirm it is wanted outside a notebook.
IRkernel::set_plot_options(width = 950, height = 600, units = 'px')

# Plot the table built above (the call used to reference an undefined
# `mtcars_radar`). ggradar is called through its namespace because its
# library() line at the top of the file is commented out.
ggradar::ggradar(mtcars_data)

Waffle Charts

# One colour per expense category.
waffle_palette <- c("#c7d4b6", "#a3aabd", "#a0d0de","#97b5cf")

# Yearly expenses in dollars; the names are the legend labels and repeat the
# amount (thousands separator and spelling corrected).
expenses <- c(`Health ($43,212)` = 43212,
              `Education ($113,412)` = 113412,
              `Transportation ($20,231)` = 20231,
              `Entertainment ($28,145)` = 28145)

# Dollars represented by a single square. The caption is derived from this
# value so it cannot disagree with the divisor (it used to state $934 while
# the data was divided by 1235).
dollars_per_square <- 1235

#IRkernel::set_plot_options(width = 950, height = 600, units = 'px')
waffle(expenses / dollars_per_square, rows = 5, size = 0.3,
       colors = waffle_palette, title = "Imaginary Household Expenses Each Year",
       xlab = paste0("1 square = $", format(dollars_per_square, big.mark = ",")))

Boxplots

# Fixed seed so the two random samples are reproducible.
set.seed(1234)

# Two samples of 200 normal draws with different mean and spread.
set_a <- rnorm(n = 200, mean = 1, sd = 2)
set_b <- rnorm(n = 200, mean = 0, sd = 1)

# Long format: a label column ("A"/"B") next to the stacked values.
df <- tibble(
  label = factor(rep(c("A", "B"), each = 200)),
  value = c(set_a, set_b)
)

# Static boxplot, one box per label.
box_plot <- ggplot(data = df, mapping = aes(x = label, y = value)) +
  geom_boxplot() +
  labs(title = "Boxplot") +
  theme_minimal()
box_plot

# Interactive plotly version of the same plot.
ggplotly(box_plot)

## Or with qplot()

qplot(x = factor(cyl), y = mpg, data = mtcars, geom = "boxplot")

Creating maps

## Times Square
# Default tiles plus one marker that shows a popup.
map <- addMarkers(
  addTiles(leaflet()),
  lng = -73.9851,
  lat = 40.7589,
  popup = "Times square"
)

map

## Map with different styles
# Same idea, with a named tile provider instead of the default tiles.
# NOTE(review): confirm "Stamen.Watercolor" is still served under this
# provider name by the installed leaflet / leaflet.providers version.
eiffel_tower <- addMarkers(
  addProviderTiles(leaflet(), "Stamen.Watercolor"),
  lng = 2.2945,
  lat = 48.8584,
  popup = "Eiffel tower"
)

eiffel_tower

### Maps with dataframes

# Local copy of the quakes data set.
quakes <- quakes

# One circle marker per row, positioned by the long/lat columns.
map_quakes <- leaflet(data = quakes) %>%
  addTiles() %>%
  addCircleMarkers(lng = ~long, lat = ~lat)

map_quakes

### To improve the clarity of the map
# Nearby markers are grouped into clusters. No lng/lat is given, so leaflet
# picks the coordinate columns itself and reports its choice:
## Assuming "long" and "lat" are longitude and latitude, respectively
clusterd_map_quakes <- leaflet(data = quakes) %>%
  addTiles() %>%
  addMarkers(clusterOptions = markerClusterOptions())

clusterd_map_quakes

# saveWidget(clusterd_map_quakes, file = "clustered_map.html",
#            selfcontained = FALSE)

## Circle map
# Same points drawn with addCircles() instead of circle markers.
map_circles <- leaflet(data = quakes) %>%
  addTiles() %>%
  addCircles(lng = ~long, lat = ~lat)

map_circles

## Rectangles on the map
# A marker on Mount Everest plus a rectangle given by two opposite corners.
map_rentangle <- leaflet() %>%
  addTiles() %>%
  addMarkers(lng = 86.92, lat = 27.99, popup = "Mount Everest") %>%
  addRectangles(lng1 = 86.9, lat1 = 27.95, lng2 = 87, lat2 = 28.05)

map_rentangle

Other Visualization libraries

ggvis is a library that uses the Grammar of Graphics (similar to ggplot), but for interactive visualizations.
plotly is an open-source library for developing interactive visualizations. It provides a number of “standard” interactions (pop-up labels, drag to pan, select to zoom, etc.) automatically. Moreover, it is possible to take a ggplot2 plot and wrap it in Plotly in order to make it interactive. Plotly has many examples to learn from, though its documentation is less effective.
htmlwidgets provides a way to utilize a number of JavaScript interactive visualization libraries. JavaScript is the programming language used to create interactive websites (HTML files), and so is highly specialized for creating interactive experiences.