## gapminder should already be installed; if it is not found, restart your JupyterHub server first.
### If the issue persists, the backup installation method below works:
#options(download.file.method =  "wget")
#remotes::install_github("jennybc/gapminder")

## Run this code to manage packages and install them as needed.
# Install pacman only when it is missing. requireNamespace() checks for the
# package without attaching it and without the warning that require() emits
# when the package is absent.
if (!requireNamespace("pacman", quietly = TRUE)) {
  # Use the HTTPS mirror so the package is not fetched over plain HTTP.
  install.packages("pacman", repos = "https://cran.us.r-project.org")
}
# Attach pacman explicitly so it is on the search path whether or not it had
# to be installed just now; library() errors loudly if the install failed.
library(pacman)
# p_load() loads each package if installed, or installs and then loads it.
pacman::p_load(tidyverse, gapminder, leaflet)

Introduction

Data visualization and mapping are essential techniques in data analysis, allowing us to uncover patterns, trends, and relationships within datasets. This project will guide us through fundamental data visualization techniques using R, focusing on the gapminder dataset and Seattle Airbnb listings data. We will explore how to create scatterplots, apply scales, use colors effectively, implement faceting, and generate interactive maps.

Agenda

The gapminder dataset provides demographic and economic data from countries worldwide. It includes key variables such as GDP per capita, life expectancy, and population. To get started, install and load the necessary packages:

# One-time setup if the package is missing: install.packages("gapminder")
# Attach the dataset package and the tidyverse toolkit
library("gapminder")
library("tidyverse")
# Compact overview of the dataset: column names, types and first values
utils::str(gapminder)
## tibble [1,704 × 6] (S3: tbl_df/tbl/data.frame)
##  $ country  : Factor w/ 142 levels "Afghanistan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ continent: Factor w/ 5 levels "Africa","Americas",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ year     : int [1:1704] 1952 1957 1962 1967 1972 1977 1982 1987 1992 1997 ...
##  $ lifeExp  : num [1:1704] 28.8 30.3 32 34 36.1 ...
##  $ pop      : int [1:1704] 8425333 9240934 10267083 11537966 13079460 14880372 12881816 13867957 16317921 22227415 ...
##  $ gdpPercap: num [1:1704] 779 821 853 836 740 ...

The dataset contains categorical variables (country, continent) and numeric variables (year, lifeExp, pop, gdpPercap); year and pop are stored as integers, while lifeExp and gdpPercap are continuous. Understanding these data types helps in selecting appropriate visualization techniques.

To focus on a specific country, subset the data for Algeria:

# Keep only the rows whose country is Algeria
algeria_data <- filter(gapminder, country == "Algeria")
# Preview the first rows of the subset
head(algeria_data)
## # A tibble: 6 × 6
##   country continent  year lifeExp      pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Algeria Africa     1952    43.1  9279525     2449.
## 2 Algeria Africa     1957    45.7 10270856     3014.
## 3 Algeria Africa     1962    48.3 11000948     2551.
## 4 Algeria Africa     1967    51.4 12760499     3247.
## 5 Algeria Africa     1972    54.5 14760787     4183.
## 6 Algeria Africa     1977    58.0 17152804     4910.

### 2. Visualizing Covariations with ggplot2

#### Scatterplots

Scatterplots help visualize relationships between two variables and detect outliers. The following example plots GDP per capita against life expectancy:

# Scatterplot: GDP per capita on the x-axis, life expectancy on the y-axis,
# one point per row of gapminder
gapminder %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()

To analyze the relationship between population and life expectancy:

# Scatterplot of life expectancy against population, with an explicit title
# and axis labels and the minimal theme
gapminder %>%
  ggplot(aes(pop, lifeExp)) +
  geom_point() +
  ggtitle("Population vs. Life Expectancy") +
  xlab("Population") +
  ylab("Life Expectancy") +
  theme_minimal()

#### Scales

Applying a logarithmic scale to the x-axis spreads out values that span several orders of magnitude, which makes patterns that are crowded together on a linear axis easier to see:

# Same scatterplot as before, but with GDP per capita drawn on a log10 x-axis
gapminder %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10()

#### Colors

Adding colors enhances data interpretation. Coloring by continent:

# Map the categorical continent column to point color (discrete color scale)
gapminder %>%
  ggplot(aes(gdpPercap, lifeExp, colour = continent)) +
  geom_point() +
  scale_x_log10()

For continuous variables, a gradient color scale can be applied:

# Map log(pop), a numeric value, to point color so ggplot2 uses a gradient;
# the aesthetics are declared on the point layer, the only layer in the plot
ggplot(data = gapminder) +
  geom_point(mapping = aes(x = gdpPercap, y = lifeExp, colour = log(pop))) +
  scale_x_log10()

#### Facets

Faceting creates multiple small plots, breaking down the data by a categorical variable:

# One panel per continent; every panel shows the log-scaled scatterplot
gapminder %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point() +
  scale_x_log10() +
  facet_wrap(vars(continent))

### 3. Analyzing Life Expectancy Trends

#### Line Plots Over Time

To examine life expectancy trends, plot life expectancy over time for each country:

# One semi-transparent line per country (group = country), colored by
# continent and split into one panel per continent
gapminder %>%
  ggplot(aes(year, lifeExp, group = country, colour = continent)) +
  geom_line(alpha = 0.5) +
  facet_wrap(vars(continent)) +
  labs(
    title = "Life Expectancy Over Time by Country",
    x = "Year",
    y = "Life Expectancy",
    color = "Continent"
  ) +
  theme_minimal()

### 4. Interactive Mapping with leaflet

The leaflet package enables interactive map visualization. First, make sure the package is installed and loaded, then read the Seattle Airbnb listings from listings.csv:

# Install leaflet (with its dependencies) only when it is missing.
# requireNamespace() is a pure availability check: unlike require(), it does
# not attach the package or warn when the package is absent. Attaching is
# done once, by the library() call below, which fails loudly on error.
if (!requireNamespace("leaflet", quietly = TRUE)) {
  install.packages("leaflet", dependencies = TRUE)
}
library(leaflet)
library(tidyverse)

# Read the Airbnb listings; the path is relative to the working directory.
airbnb_data <- read_csv("listings.csv")
## Rows: 7785 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (5): name, host_name, neighbourhood_group, neighbourhood, room_type
## dbl  (10): id, host_id, latitude, longitude, price, minimum_nights, number_o...
## date  (1): last_review
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#### Mapping Points and Popups

To create a basic map displaying Airbnb listings:

# Basic interactive map: default base tiles plus one circle per listing.
# No lng/lat are supplied, so leaflet infers them from the column names
# (see the message printed below); the popup shows the listing name.
airbnb_data %>%
  leaflet() %>%
  addTiles() %>%
  addCircles(popup = ~name)
## Assuming "longitude" and "latitude" are longitude and latitude, respectively

#### Filtering Data and Enhancing Visualization

To highlight listings priced over $200:

# Listings whose price is above 200
airbnb_high_price <- filter(airbnb_data, price > 200)

# Draw those listings as red circles; the popup shows the listing name
airbnb_high_price %>%
  leaflet() %>%
  addTiles() %>%
  addCircles(
    lng = ~longitude,
    lat = ~latitude,
    color = "red",
    popup = ~name
  )
# Color every listing by its price and add a matching legend.
# price is a continuous variable, so a numeric gradient (colorNumeric) is
# used. colorFactor would treat each distinct price as its own category,
# which exceeds the 9 colors of "YlOrRd" (triggering RColorBrewer's
# "n too large" warning) and produces one legend entry per distinct price.
# The palette is built once and shared by the circles and the legend so both
# are guaranteed to use the same price-to-color mapping.
price_pal <- colorNumeric(palette = "YlOrRd", domain = airbnb_data$price)

leaflet(airbnb_data) %>%
  addTiles() %>%
  addCircles(
    lng = ~longitude,
    lat = ~latitude,
    color = ~price_pal(price),
    popup = ~paste("Price:", price)
  ) %>%
  addLegend(
    "bottomright",
    pal = price_pal,
    values = airbnb_data$price,
    title = "Price Range"
  )

Conclusion

This project covered essential data visualization techniques in R, from basic scatterplots and faceting to interactive mapping with leaflet. By combining these methods, we can better analyze patterns and trends in datasets like gapminder and Airbnb listings, leading to more effective data-driven insights.

References

Kieran Healy, Data Visualization: A Practical Introduction

Charles Lanfear, Introduction to R for Social Scientists