Assignment 8 Continuous Variables with DS Labs and Highcharter Tutorials

Author

Angel Porter

The echo: false option disables the printing of code (only output is displayed).

#Load the required packages
library(tidyverse) #for creating plots and graphs

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.2.0     ✔ readr     2.1.6
✔ forcats   1.0.1     ✔ stringr   1.6.0
✔ ggplot2   4.0.2     ✔ tibble    3.3.1
✔ lubridate 1.9.5     ✔ tidyr     1.3.2
✔ purrr     1.2.1     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Note: readr, dplyr and tibble are already attached by library(tidyverse)
# above; the explicit calls below are harmless and just make the
# dependencies visible.
library(readr) # reads and writes CSV and other text files
library("dslabs") # supplies the temp_carbon dataset used below
library(dplyr) # processing and manipulating data
library(tibble) # tibbles: data frames with tidier printing
library(viridis) # color palette adjustments

Loading required package: viridisLite

# Show the current working directory (where relative file paths resolve)
getwd()

[1] "C:/Users/angel/OneDrive/Documents"

# List the files and folders in the working directory
list.files()

 [1] "Angel's Notebook.url"             "assignment-7.rmarkdown"          
 [3] "assignment-7_files"               "assignment 7.html"               
 [5] "assignment 7.qmd"                 "assignment 7.rmarkdown"          
 [7] "assignment 7_files"               "coffee"                          
 [9] "coffee line.qmd"                  "coffee.csv"                      
[11] "Custom Office Templates"          "desktop.ini"                     
[13] "FishGills3.csv"                   "My Tableau Repository"           
[15] "nations.csv"                      "OneNote Notebooks"               
[17] "p1.html"                          "p1.qmd"                          
[19] "p1_files"                         "rsconnect"                       
[21] "Study Techniques.url"             "Untitled 1.html"                 
[23] "Untitled 1.qmd"                   "Untitled 1_files"                
[25] "yale 2.pdf"                       "yale diversity preview visit.pdf"
[27] "Zoom"

# Load the temp_carbon dataset (yearly temperature anomalies and carbon
# emissions) from the "dslabs" package
temp_carbon_df <- dslabs::temp_carbon

# Inspect the structure: dimensions, column names, types and first values
str(temp_carbon_df)

'data.frame':   268 obs. of  5 variables:
 $ year            : num  1880 1881 1882 1883 1884 ...
 $ temp_anomaly    : num  -0.11 -0.08 -0.1 -0.18 -0.26 -0.25 -0.24 -0.28 -0.13 -0.09 ...
 $ land_anomaly    : num  -0.48 -0.4 -0.48 -0.66 -0.69 -0.56 -0.51 -0.47 -0.41 -0.31 ...
 $ ocean_anomaly   : num  -0.01 0.01 0 -0.04 -0.14 -0.17 -0.17 -0.23 -0.05 -0.02 ...
 $ carbon_emissions: num  236 243 256 272 275 277 281 295 327 327 ...

# Continuous numerical variables are usually measured, such as height. These variables can take on an infinite number of values within a given range.
# To produce informative graphics that tell a clear story, data journalists often need to turn a continuous variable into a categorical variable by dividing it into bins.

# Preview the first six rows of the data
temp_carbon_df %>% head()

  year temp_anomaly land_anomaly ocean_anomaly carbon_emissions
1 1880        -0.11        -0.48         -0.01              236
2 1881        -0.08        -0.40          0.01              243
3 1882        -0.10        -0.48          0.00              256
4 1883        -0.18        -0.66         -0.04              272
5 1884        -0.26        -0.69         -0.14              275
6 1885        -0.25        -0.56         -0.17              277

# View the data as a tibble. Compared with a plain data frame, a tibble:
# - never converts strings to factors
# - never changes variable names
# - never creates row names
temp_carbon_df %>% as_tibble()

# A tibble: 268 × 5
    year temp_anomaly land_anomaly ocean_anomaly carbon_emissions
   <dbl>        <dbl>        <dbl>         <dbl>            <dbl>
 1  1880        -0.11        -0.48         -0.01              236
 2  1881        -0.08        -0.4           0.01              243
 3  1882        -0.1         -0.48          0                 256
 4  1883        -0.18        -0.66         -0.04              272
 5  1884        -0.26        -0.69         -0.14              275
 6  1885        -0.25        -0.56         -0.17              277
 7  1886        -0.24        -0.51         -0.17              281
 8  1887        -0.28        -0.47         -0.23              295
 9  1888        -0.13        -0.41         -0.05              327
10  1889        -0.09        -0.31         -0.02              327
# ℹ 258 more rows

# Keep the tibble version for the rest of the analysis
temp_tbl <- temp_carbon_df %>% as_tibble()

# A heatmap visualizes a table of numbers by substituting the numbers with colored cells.
# We will be using a cluster heatmap.

# List the column names of the tibble
colnames(temp_tbl)

[1] "year"             "temp_anomaly"     "land_anomaly"     "ocean_anomaly"   
[5] "carbon_emissions"

# Count the missing values (NA) in each column
temp_tbl %>%
  is.na() %>%
  colSums()

            year     temp_anomaly     land_anomaly    ocean_anomaly 
               0              129              129              129 
carbon_emissions 
               4

# Build the heatmap input: one row per (emissions bin, temperature bin) pair
# that occurs in the data, holding the mean year of the observations in it.
heat_data <- temp_tbl %>%
  # Keep only rows where all three variables used below are present
  drop_na(year, temp_anomaly, carbon_emissions) %>%

  # Convert the continuous variables into 10 equal-width bins each.
  # dig.lab = 4 lets the emissions interval labels use four significant
  # digits; with cut()'s default of 3, break points of 1000 or more are
  # printed in scientific notation (e.g. "1.2e+03"), which is hard to read
  # on an axis.
  # NOTE(review): assumes emissions stay below 10,000 - raise dig.lab if not.
  mutate(
    carbon_emissions_bin = cut(carbon_emissions, breaks = 10, dig.lab = 4),
    temp_anomaly_bin = cut(temp_anomaly, breaks = 10)
  ) %>%

  # Group by bin categories
  group_by(carbon_emissions_bin, temp_anomaly_bin) %>%

  # Compute the average year for each bin combination and return an
  # ungrouped tibble
  summarize(avg_year = mean(year, na.rm = TRUE), .groups = "drop")

# Heatmap: emissions bin on x, temperature bin on y, tile shade = average year
ggplot(data = heat_data) +
  # One grey-outlined tile per bin combination present in heat_data
  geom_tile(
    mapping = aes(
      x = carbon_emissions_bin,
      y = temp_anomaly_bin,
      fill = avg_year
    ),
    colour = "grey70"
  ) +

  # Continuous viridis fill scale with the "cividis" option (non-default)
  scale_fill_viridis_c(option = "cividis") +

  # Title, subtitle, axis names, legend name and source caption
  labs(
    x = "Carbon Emissions (binned)",
    y = "Temperature Anomaly (binned)",
    fill = "Average Year",
    title = "Carbon Emissions vs Temperature Anomaly Colored by Year",
    subtitle = "Heatmap showing when combinations of emissions and temperature occurred",
    caption = "DS Labs"
  ) +

  # Dark base theme; slant the x-axis labels so the bin names do not overlap
  theme_dark() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# I decided to take a look at global temperature anomaly and carbon emissions, 1751-2018.
# The heatmap indicates a distinct upward trend in both carbon emissions and temperature anomalies over time.
# Earlier years correspond to lower emissions and temperature anomalies, whereas recent years exhibit higher values for both variables.
# This pattern demonstrates a strong positive relationship between carbon emissions and temperature changes, as both metrics increase concurrently over time.