DS Labs Datasets

Use the package DSLabs (Data Science Labs)

There are a number of datasets in this package to use to practice creating visualizations

# install.packages("dslabs")  # these are data science labs
library("dslabs")
# List every dataset that ships with the dslabs package.
data(package = "dslabs")

# List the files in the package's "script" directory.
dslabs_script_dir <- system.file("script", package = "dslabs")
list.files(dslabs_script_dir)
##  [1] "make-admissions.R"                   
##  [2] "make-brca.R"                         
##  [3] "make-brexit_polls.R"                 
##  [4] "make-death_prob.R"                   
##  [5] "make-divorce_margarine.R"            
##  [6] "make-gapminder-rdas.R"               
##  [7] "make-greenhouse_gases.R"             
##  [8] "make-historic_co2.R"                 
##  [9] "make-mnist_27.R"                     
## [10] "make-movielens.R"                    
## [11] "make-murders-rda.R"                  
## [12] "make-na_example-rda.R"               
## [13] "make-nyc_regents_scores.R"           
## [14] "make-olive.R"                        
## [15] "make-outlier_example.R"              
## [16] "make-polls_2008.R"                   
## [17] "make-polls_us_election_2016.R"       
## [18] "make-reported_heights-rda.R"         
## [19] "make-research_funding_rates.R"       
## [20] "make-stars.R"                        
## [21] "make-temp_carbon.R"                  
## [22] "make-tissue-gene-expression.R"       
## [23] "make-trump_tweets.R"                 
## [24] "make-weekly_us_contagious_diseases.R"
## [25] "save-gapminder-example-csv.R"

Loading library

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.3     ✓ stringr 1.4.0
## ✓ readr   2.0.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggthemes)
library(ggrepel)
library(treemap)
library(dplyr)
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Highcharts (www.highcharts.com) is a Highsoft software product which is
## not free for commercial and Governmental use
## 
## Attaching package: 'highcharter'
## The following object is masked from 'package:dslabs':
## 
##     stars
library(RColorBrewer)
library(readr)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(ggplot2)
library(viridisLite)

I will be using gapminder from dslabs package for this assignment.

# Load the gapminder data set (explicitly from dslabs) and print a
# per-column summary of it.
data("gapminder", package = "dslabs")
summary(object = gapminder)
##                 country           year      infant_mortality life_expectancy
##  Albania            :   57   Min.   :1960   Min.   :  1.50   Min.   :13.20  
##  Algeria            :   57   1st Qu.:1974   1st Qu.: 16.00   1st Qu.:57.50  
##  Angola             :   57   Median :1988   Median : 41.50   Median :67.54  
##  Antigua and Barbuda:   57   Mean   :1988   Mean   : 55.31   Mean   :64.81  
##  Argentina          :   57   3rd Qu.:2002   3rd Qu.: 85.10   3rd Qu.:73.00  
##  Armenia            :   57   Max.   :2016   Max.   :276.90   Max.   :83.90  
##  (Other)            :10203                  NA's   :1453                    
##    fertility       population             gdp               continent   
##  Min.   :0.840   Min.   :3.124e+04   Min.   :4.040e+07   Africa  :2907  
##  1st Qu.:2.200   1st Qu.:1.333e+06   1st Qu.:1.846e+09   Americas:2052  
##  Median :3.750   Median :5.009e+06   Median :7.794e+09   Asia    :2679  
##  Mean   :4.084   Mean   :2.701e+07   Mean   :1.480e+11   Europe  :2223  
##  3rd Qu.:6.000   3rd Qu.:1.523e+07   3rd Qu.:5.540e+10   Oceania : 684  
##  Max.   :9.220   Max.   :1.376e+09   Max.   :1.174e+13                  
##  NA's   :187     NA's   :185         NA's   :2972                       
##              region    
##  Western Asia   :1026  
##  Eastern Africa : 912  
##  Western Africa : 912  
##  Caribbean      : 741  
##  South America  : 684  
##  Southern Europe: 684  
##  (Other)        :5586

Selecting and sorting the data to use

# Keep only the columns needed for the charts, add the population expressed
# in millions, and order the rows chronologically.
test <- gapminder %>%
  dplyr::select(continent, year, population, life_expectancy) %>%
  dplyr::mutate(population_in_millions = population / 1e6) %>%
  dplyr::arrange(year)

Using highcharter to create a graph of life expectancy in each year, grouped by continent

# ColorBrewer "Accent" palette (8 colours) for the continent series.
cols <- brewer.pal(n = 8, name = "Accent")

# Interactive column chart of life expectancy per year, with one series per
# continent and the legend placed in the top-right corner.
highchart() %>%
  hc_add_series(
    data = test,
    type = "column",
    mapping = hcaes(x = year, y = life_expectancy, group = continent)
  ) %>%
  hc_colors(colors = cols) %>%
  hc_title(text = "Life Expectancy by Continent 1960 - 2016 ") %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Life Expectancy")) %>%
  hc_legend(align = "right", verticalAlign = "top")

ColorBrewer

#library(RColorBrewer)
#n <- 60
#qual_col_pals = brewer.pal.info[brewer.pal.info$category == 'qual',]
#col_vector = unlist(mapply(brewer.pal, qual_col_pals$maxcolors, rownames(qual_col_pals)))
#pie(rep(1,n), col=sample(col_vector, n))

Using highcharter to create a graph of population in each year, grouped by region

# ColorBrewer "Paired" palette (12 colours) for the region series. It is
# named `region_palette` rather than `c` so that it does not shadow base::c().
region_palette <- brewer.pal(n = 12, name = "Paired")

# Total population per region and year, expressed in millions so that the
# plotted values agree with the "Population in Millions" axis label.
# Rows with a missing population are excluded so that a single NA cannot
# turn a whole regional total into NA.
region_population <- gapminder %>%
  dplyr::filter(!is.na(population)) %>%
  dplyr::group_by(region, year) %>%
  dplyr::summarise(
    population_in_millions = sum(population) / 10^6,
    .groups = "drop"
  )

# Interactive bar chart with one series per region and one bar per
# region and year (the regional total computed above).
highchart() %>%
  hc_add_series(
    data = region_population,
    type = "bar",
    mapping = hcaes(x = year, y = population_in_millions, group = region)
  ) %>%
  hc_colors(colors = region_palette) %>%
  hc_title(text = "Population by Region From 1960 - 2015 ") %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = " Population in Millions "))

Selecting only the needed data for the next graph

# Regions compared in the next chart.
selected_regions <- c(
  "Eastern Europe",
  "Australia and New Zealand",
  "Western Africa",
  "South America",
  "South-Eastern Asia",
  "Northern America"
)

# Keep only the 1960 and 2016 observations for the selected regions.
region6 <- gapminder %>%
  dplyr::filter(
    year %in% c(1960, 2016),
    region %in% selected_regions
  )
#region6 <- gapminder %>%
# filter(region == "Eastern Europe" | region == "Australia and New Zealand" | region == "Western Africa" | region == "South America" | region== "South-Eastern Asia" #|region=="Northern America" ) %>%
# arrange(year)

Life expectancy comparison of 1960 vs 2016 for selected regions

# Interactive bar chart of life expectancy for the rows in `region6`, with
# one series per region and the legend placed in the top-right corner.
highchart() %>%
  hc_add_series(
    data = region6,
    type = "bar",
    mapping = hcaes(x = year, y = life_expectancy, group = region)
  ) %>%
  hc_title(text = "Life Expectancy by Select Region 1960 vs 2016 ") %>%
  hc_xAxis(title = list(text = "Year")) %>%
  hc_yAxis(title = list(text = "Life Expectancy")) %>%
  hc_legend(align = "right", verticalAlign = "top")

My goal was to explore this dataset using highcharter. I found highcharter to be a great way to create visualizations because of its interactive features. The first graph shows life expectancy by year and continent. Grouped by continent, life expectancy has generally increased since the earliest available data. The second graph shows the population change by region. I chose it because I found the visual aesthetically pleasing, but there are so many regions that selecting a few to compare would be the best use of this chart. I wanted to try the same kind of graph with fewer categories, so the third chart only has data from the years 1960 and 2016 for a few randomly selected regions. It shows that life expectancy has increased since the earliest available data.