Administrative

Please indicate

  • Roughly how much time you spent on this HW so far: 4 hours
  • The URL of your published RPubs document:
  • What gave you the most trouble: The third graph gave me a really hard time. Also, I did not know exactly what the second graph was supposed to look like.
  • Any comments you have:

Question 1:

Use the mlb_teams.csv data set to create an informative data graphic that illustrates the relationship between winning percentage (WPct) and payroll in context.

library(ggthemes)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(ggplot2)

get data

# Read the MLB team data directly from the course's GitHub repository.
MLB <- read.csv(
  file = "https://raw.githubusercontent.com/cmsc205/data/master/mlb_teams.csv"
)

Set up the plot. I think I will start with a scatterplot (geom_point).

# First draft: a bare scatterplot of payroll (in millions of dollars)
# against winning percentage, with no labels or styling yet.
MLBPlot <- ggplot(MLB, aes(WPct, payroll / 1e6)) +
  geom_point()

Now I am going to add labels and titles. Later went back and added color, trendline, and divided payroll by one million to make it look nicer.

# Final payroll plot. The title asks how payroll affects winning percentage,
# so payroll is the explanatory variable and goes on the x-axis; it is scaled
# to millions of dollars to keep the tick labels readable.
#
# The trailing `+ theme_update()` was removed: theme_update() edits the
# session-wide default theme as a side effect and returns the previous theme,
# so adding it to a plot is not a meaningful way to style that plot.
MLBPlot <- ggplot(data = MLB, aes(x = payroll / 1000000, y = WPct)) +
  geom_point(color = "blue") +
  # Smoothed trend line drawn on top of the points.
  geom_smooth(color = "black") +
  labs(
    x = "Payroll in Millions of Dollars",
    y = "Winning Percentage, %",
    title = "MLB: How Payroll affects Winning Percentage"
  )

Print the MLB plot

# Render the finished payroll plot.
print(MLBPlot)
## `geom_smooth()` using method = 'loess'

Question 2:

Using data from the nasaweather R package, use the path geometry (i.e. use a geom_path layer) to plot the path of each tropical storm in the storms data table. Use color to distinguish the storms from one another, and use faceting to plot each year in its own panel.

Hint: Don’t forget to install and load the nasaweather R package!

load package

library(nasaweather)
library(mdsr)
## Loading required package: mosaic
## Loading required package: lattice
## Loading required package: mosaicData
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## 
## The 'mosaic' package masks several functions from core packages in order to add additional features.  
## The original behavior of these functions should not be affected by this.
## 
## Attaching package: 'mosaic'
## The following object is masked from 'package:Matrix':
## 
##     mean
## The following objects are masked from 'package:dplyr':
## 
##     count, do, tally
## The following object is masked from 'package:ggthemes':
## 
##     theme_map
## The following objects are masked from 'package:stats':
## 
##     binom.test, cor, cov, D, fivenum, IQR, median, prop.test,
##     quantile, sd, t.test, var
## The following objects are masked from 'package:base':
## 
##     max, mean, min, prod, range, sample, sum

need storms

# Take the storms table explicitly from nasaweather. A bare `storms` is looked
# up on the search path, and dplyr (attached through tidyverse) ships its own,
# differently structured `storms` dataset, so which one is found would depend
# on the order in which packages were attached.
storms <- nasaweather::storms

Keep only the observations classified as tropical storms

# Keep only the rows whose `type` is exactly "Tropical Storm".
Tropical_Storms <- filter(storms, type == "Tropical Storm")

make plot

# Storm tracks, one panel per year.
# - Longitude is on x and latitude on y so the tracks are oriented like a map.
# - group = name gives every storm its own path; without it geom_path() joins
#   all observations in a panel into a single line that jumps between storms.
ggplot(Tropical_Storms, aes(x = long, y = lat, group = name)) +
  geom_path() +
  facet_wrap(~ year, ncol = 2)

add color to distinguish each storm

# Storm tracks coloured by storm name, one panel per year.
# Longitude is on x and latitude on y so the tracks are oriented like a map.
# Mapping color to the discrete `name` column also splits the data into one
# path per storm. The legend is hidden because there are too many storm names
# for it to be useful.
ggplot(Tropical_Storms, aes(x = long, y = lat)) +
  geom_path(aes(color = name), show.legend = FALSE) +
  facet_wrap(~ year, ncol = 2) +
  labs(x = "Longitude", y = "Latitude", title = "Path of Tropical Storms")

Question 3:

Using the data set Top25CommonFemaleNames.csv, recreate the “Median Ages For Females with the 25 Most Common Names” graphic from FiveThirtyEight (link to graphic; link to full article).

read csv

# Read the table of the 25 most common female names from the course's
# GitHub repository.
FemaleNames <- read.csv(
  file = "https://raw.githubusercontent.com/cmsc205/data/master/Top25CommonFemaleNames.csv"
)

Set up the plot: names on the y-axis and median age (in years) on the x-axis

# First sketch: median age on x, name on y. The line range is only a
# placeholder here (ymin and ymax are both fixed at 0).
Females <- ggplot(
  FemaleNames,
  aes(median_age, name, ymin = 0, ymax = 0)
)
Females <- Females +
  geom_linerange(color = "yellow") +
  geom_point()
Females

Create Linerange

# Names as a factor whose levels run from lowest to highest median age.
# The original call ordered the literal string "median_age" and tried to
# factor the whole data frame, so it never produced a usable ordering.
# NOTE: this value is replaced by the plot assigned to `Females` just below,
# so it does not affect that plot.
Females <- factor(
  as.character(FemaleNames$name),
  levels = as.character(FemaleNames$name)[order(FemaleNames$median_age)]
)

# Interquartile age range per name as a line, median age as a point.
# color and size are fixed values, so they are set outside aes(); inside
# aes() they are treated as data, which ignores the requested colour and
# adds meaningless legends.
Females <- ggplot(
  data = FemaleNames,
  aes(x = name, y = median_age, ymin = q1_age, ymax = q3_age)
) +
  geom_linerange(color = "yellow", size = 1) +
  geom_point() +
  # Flip so the names read horizontally down the y-axis.
  coord_flip()
Females

Need to reorder names so that they are in order by median age value

# Same chart with the names sorted by median age. reorder() on -median_age
# is combined with coord_flip(), which turns the x-axis into the vertical one.
# size = 3 is a fixed value, so it is set outside aes(); inside aes() it is
# treated as data and rescaled instead of giving a width of 3.
Females <- ggplot(
  data = FemaleNames,
  aes(
    x = reorder(name, -median_age),
    y = median_age,
    ymin = q1_age,
    ymax = q3_age
  )
) +
  geom_linerange(size = 3, color = "goldenrod") +
  geom_point(color = "red") +
  coord_flip() +
  labs(
    title = "Median Ages For Females with the 25 Most Common Names",
    subtitle = "Among Americans estimated to be alive as of Jan. 1, 2014",
    x = NULL,
    y = "years old"
  )
Females

editing axes and adding text

# Final FiveThirtyEight-style chart: interquartile age range per name
# (goldenrod bar) with the median age marked by a red point, names sorted by
# median age, and the age axis moved to the opposite side.
#
# Changes from the earlier draft of this chunk:
# - size = 3 is set outside aes(); inside aes() it is treated as data and
#   rescaled instead of giving a width of 3.
# - The fixed labels and the example "median" point use annotate(), so each
#   is drawn once. As geom_text()/geom_point() layers they inherit the plot
#   data and are drawn once per row, stacking 25 copies on top of each other.
Females <- ggplot(
  data = FemaleNames,
  aes(
    x = reorder(name, -median_age),
    y = median_age,
    ymin = q1_age,
    ymax = q3_age
  )
) +
  geom_linerange(size = 3, color = "goldenrod") +
  geom_point(color = "red") +
  # In-plot key: positions are in data units (x = name position, y = age).
  annotate("text", label = "75th percentile", x = 16, y = 50, size = 3) +
  annotate("text", label = "25th", x = 16, y = 28, size = 3) +
  annotate("point", x = 22, y = 63, color = "red") +
  annotate("text", label = "median", x = 22, y = 66, size = 3) +
  coord_flip() +
  scale_y_continuous(breaks = seq(15, 75, 10), position = "right") +
  labs(
    title = "Median Ages For Females with the 25 Most\nCommon Names",
    subtitle = "Among Americans estimated to be alive as of Jan. 1, 2014",
    x = NULL,
    y = NULL
  ) +
  theme_fivethirtyeight() +
  # Applied after the complete theme so these overrides are not reset.
  theme(
    panel.grid.major.y = element_blank(),
    panel.grid.major.x = element_line(linetype = "dotted")
  )
Females