library("dslabs")
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Show the catalogue of data sets that ship with the dslabs package.
data(package = "dslabs")
# List the scripts bundled in the package's "script" directory.
system.file("script", package = "dslabs") %>%
  list.files()
## [1] "make-admissions.R"
## [2] "make-brca.R"
## [3] "make-brexit_polls.R"
## [4] "make-calificaciones.R"
## [5] "make-death_prob.R"
## [6] "make-divorce_margarine.R"
## [7] "make-gapminder-rdas.R"
## [8] "make-greenhouse_gases.R"
## [9] "make-historic_co2.R"
## [10] "make-mice_weights.R"
## [11] "make-mnist_127.R"
## [12] "make-mnist_27.R"
## [13] "make-movielens.R"
## [14] "make-murders-rda.R"
## [15] "make-na_example-rda.R"
## [16] "make-nyc_regents_scores.R"
## [17] "make-olive.R"
## [18] "make-outlier_example.R"
## [19] "make-polls_2008.R"
## [20] "make-polls_us_election_2016.R"
## [21] "make-pr_death_counts.R"
## [22] "make-reported_heights-rda.R"
## [23] "make-research_funding_rates.R"
## [24] "make-stars.R"
## [25] "make-temp_carbon.R"
## [26] "make-tissue-gene-expression.R"
## [27] "make-trump_tweets.R"
## [28] "make-weekly_us_contagious_diseases.R"
## [29] "save-gapminder-example-csv.R"
# Load the NYC Regents scores data set into the workspace.
data(nyc_regents_scores)
# Print a compact summary of its columns and their types.
nyc_regents_scores %>%
  str()
## 'data.frame': 102 obs. of 6 variables:
## $ score : num 0 1 2 3 4 5 6 7 8 9 ...
## $ integrated_algebra: num 56 NA 1 NA 3 2 4 1 24 3 ...
## $ global_history : num 55 8 9 3 15 11 29 37 53 49 ...
## $ living_environment: num 66 3 2 1 1 10 3 2 6 3 ...
## $ english : num 165 69 237 190 109 122 151 175 197 175 ...
## $ us_history : num 65 4 16 10 6 8 7 12 16 28 ...
# Count the missing values across every column of the data set.
nyc_regents_scores %>%
  is.na() %>%
  sum()
## [1] 10
# Line plot of how often each exam score occurred, one line per subject.
# Score is mapped to the x-axis and Frequency to the y-axis so the data
# agree with the axis labels and with the 0-100 limit placed on x.
nyc_regents_scores %>%
  # Keep the score column plus the four subjects of interest, renamed for
  # the legend.
  select(
    Score = score,
    LV_Environment = living_environment,
    English = english,
    History = global_history,
    US_H = us_history
  ) %>%
  # Drop any row with a missing value in one of the selected columns.
  drop_na() %>%
  # Reshape to long format: one row per (Score, Subjects) pair.
  pivot_longer(
    cols = -Score,
    names_to = "Subjects",
    values_to = "Frequency"
  ) %>%
  ggplot(aes(x = Score, y = Frequency, col = Subjects)) +
  # `linewidth` replaces the `size` argument deprecated for lines in
  # ggplot2 3.4.0.
  geom_line(linewidth = 0.9) +
  xlab("Scores from 0 to 100") +
  ylab("Score Frequency") +
  ggtitle("NYC Regents Exam Frequency Plot for Chosen Subjects") +
  # Colours are assigned to the subjects in alphabetical order.
  scale_color_manual(values = c("#03436A", "#64A8D1", "#B0F26D", "#CFA127")) +
  xlim(c(0, 100)) +
  theme_light()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 328 rows containing missing values (`geom_line()`).
The data set I chose from dslabs is called “nyc_regents_scores”. This data set includes scores from the 2010 NYC Regents Exams in Integrated Algebra, Global History, Living Environment, English, and US History. I wanted to make a plot that shows the score frequency for four subjects, and the subjects that seemed most interesting to me were Living Environment, Global History, English, and US History. For this assignment, I decided to create a line graph. I got rid of the NA values because it makes graphing much easier. At first, I thought I would use the “RColorBrewer” package as usual to pick the colors of the lines, but I ended up typing the color codes manually.