DS Labs Homework

Author

Thejitha Rajapakshe

Loading the tidyverse and plotting helpers

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
library(ggrepel)

Loading highcharter and RColorBrewer

library(highcharter)
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
Highcharts (www.highcharts.com) is a Highsoft software product which is
not free for commercial and Governmental use
library(RColorBrewer)

Loading dslabs (Data Science Labs)

library("dslabs")

Attaching package: 'dslabs'
The following object is masked from 'package:highcharter':

    stars
# Show the catalogue of data sets bundled with the dslabs package
data(package = "dslabs")
# Show the file names found in the package's installed "script" directory
list.files(path = system.file("script", package = "dslabs"))
 [1] "make-admissions.R"                   
 [2] "make-brca.R"                         
 [3] "make-brexit_polls.R"                 
 [4] "make-calificaciones.R"               
 [5] "make-death_prob.R"                   
 [6] "make-divorce_margarine.R"            
 [7] "make-gapminder-rdas.R"               
 [8] "make-greenhouse_gases.R"             
 [9] "make-historic_co2.R"                 
[10] "make-mice_weights.R"                 
[11] "make-mnist_127.R"                    
[12] "make-mnist_27.R"                     
[13] "make-movielens.R"                    
[14] "make-murders-rda.R"                  
[15] "make-na_example-rda.R"               
[16] "make-nyc_regents_scores.R"           
[17] "make-olive.R"                        
[18] "make-outlier_example.R"              
[19] "make-polls_2008.R"                   
[20] "make-polls_us_election_2016.R"       
[21] "make-pr_death_counts.R"              
[22] "make-reported_heights-rda.R"         
[23] "make-research_funding_rates.R"       
[24] "make-stars.R"                        
[25] "make-temp_carbon.R"                  
[26] "make-tissue-gene-expression.R"       
[27] "make-trump_tweets.R"                 
[28] "make-weekly_us_contagious_diseases.R"
[29] "save-gapminder-example-csv.R"        
# Load the three dslabs data sets used in this document
data("nyc_regents_scores")
data("greenhouse_gases")
data("us_contagious_diseases")

# Recode a missing `score` value as 101. This replaces the NA in an existing
# row; it does not add a row to the data set.
nyc_regents_scores <- nyc_regents_scores %>%
  mutate(score = ifelse(is.na(score), 101, score))

# Keep only the rows where all five subject columns are present
scores_clean <- nyc_regents_scores |>
  filter(
    !is.na(integrated_algebra) &
      !is.na(global_history) &
      !is.na(living_environment) &
      !is.na(english) &
      !is.na(us_history)
  )

# Store the score column as a character variable
scores_clean <- scores_clean %>%
  mutate(score = as.character(score))

# Drop the first column so that only the subject columns remain
# (presumably the first column is `score` -- confirm the column order)
scores_selected <- scores_clean %>%
  select(-1)

# Row-wise maximum across the five subjects. pmax() is vectorised over the
# columns, so the data frame is not coerced to a matrix as apply() would do.
scores_selected$max <- pmax(
  scores_selected$integrated_algebra,
  scores_selected$global_history,
  scores_selected$living_environment,
  scores_selected$english,
  scores_selected$us_history,
  na.rm = TRUE
)

# Label every row "Student 1", "Student 2", ... The labels are generated from
# the actual row count, so the assignment keeps working if the filter above
# leaves a different number of rows.
# NOTE(review): each row of nyc_regents_scores looks like one score value with
# per-subject counts rather than one student -- confirm these labels are meant.
scores_selected$scores <- paste("Student", seq_len(nrow(scores_selected)))

# Mean of the five subject columns, used to compare against the row maximum
score_final <- scores_selected |>
  mutate(
    Average = (integrated_algebra + global_history + living_environment +
      english + us_history) / 5
  )
# Four-colour qualitative palette taken from RColorBrewer's "Set1"
cols <- brewer.pal(n = 4, name = "Set1")

# Line chart of the row maximum (y) against the five-subject average (x).
# Rows that share the same `max` value are grouped into one series.
highchart() |>
  hc_add_series(
    data = score_final,
    type = "line",
    mapping = hcaes(x = Average, y = max, group = max)
  ) |>
  # Series colours come from the palette above
  hc_colors(colors = cols) |>
  # Axis titles
  hc_xAxis(title = list(text = "Average of all 5 subjects")) |>
  hc_yAxis(title = list(text = "Highest Subject")) |>
  # Hide the legend
  hc_legend(enabled = FALSE) |>
  # Draw every point marker as a circle
  hc_plotOptions(
    series = list(
      marker = list(symbol = "circle")
    )
  )

Concentration of Greenhouse Gases Over 2,000 Years

# Line chart of greenhouse gas concentration by year, one series per gas
highchart() |>
  hc_add_series(
    data = greenhouse_gases,
    type = "line",
    hcaes(
      x = year,
      y = concentration,
      group = gas
    )
  ) |>
  # Series colours; `cols` is the palette defined earlier in the document
  hc_colors(cols) |>
  # x-axis title
  hc_xAxis(title = list(text = "Year")) |>
  # y-axis title; the gases are not all reported in the same unit
  hc_yAxis(
    title = list(text = "Concentration (ppm for CO2; ppb for CH4 and N2O)")
  ) |>
  # Place the legend on the right-hand side. Highcharts' `verticalAlign` only
  # accepts "top", "middle" or "bottom"; horizontal placement is `align`.
  hc_legend(align = "right", verticalAlign = "middle", layout = "vertical") |>
  # Draw every point marker as a circle
  hc_plotOptions(series = list(marker = list(symbol = "circle")))
# Switch to RColorBrewer's "Set2" palette for this chart
cols <- brewer.pal(4, "Set2")

# Dual-axis line chart: `year` is drawn against the first y axis and `count`
# against the second, which is placed on the opposite side of the plot.
# NOTE(review): plotting `year` as a data series with per-row disease labels
# as x categories is unusual -- confirm this is the intended chart.
highchart() |>
  hc_yAxis_multiples(
    # y axis 1 (index 0)
    list(title = list(text = "Year")),
    # y axis 2 (index 1), drawn on the opposite side
    list(
      title = list(text = "count"),
      opposite = TRUE
    )
  ) |>
  hc_add_series(
    data = us_contagious_diseases$year,
    name = "Year",
    type = "line",
    yAxis = 0
  ) |>
  hc_add_series(
    data = us_contagious_diseases$count,
    name = "Case Count",
    type = "line",
    yAxis = 1
  ) |>
  # x-axis categories come from the disease column; label every 5th tick
  hc_xAxis(
    categories = us_contagious_diseases$disease,
    tickInterval = 5
  ) |>
  # Series colours from the palette above
  hc_colors(cols) |>
  # Legend with default settings
  hc_legend() |>
  # Use Arial for the chart text
  hc_chart(style = list(fontFamily = "Arial"))

The rendered markdown document showcases visualizations created using Highcharter for three distinct datasets: NYC Regents scores, greenhouse gas concentrations over 2,000 years, and contagious disease data for US states.

For the NYC Regents scores dataset, I preprocessed the data to calculate the maximum score for each student across multiple subjects. Then, I utilized Highcharter to generate a line chart illustrating the relationship between the average of all subjects and the highest score achieved by each student. This visualization helps in understanding the performance distribution among students.

The greenhouse gases dataset provides a historical perspective on the concentration of greenhouse gases over 2,000 years. Using Highcharter, I crafted a line chart where the x-axis represents the year, the y-axis represents the concentration of each gas, and different colors distinguish the different gases. This visualization aids in observing long-term trends in greenhouse gas levels.

Lastly, the dataset on contagious diseases in US states enabled the creation of a dual-axis chart using Highcharter. This chart draws the year and the case count as two line series, each against its own y-axis, with the disease names supplied as the x-axis categories. The chart is intended to illustrate the patterns in disease outbreaks across various states.