Week2Assignment

library(tidyverse)

## -- Attaching packages -------------------------------------------------------------------------------- tidyverse 1.3.0 --

## ✓ ggplot2 3.2.1     ✓ purrr   0.3.3
## ✓ tibble  2.1.3     ✓ dplyr   0.8.3
## ✓ tidyr   1.0.0     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0

## -- Conflicts ----------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

#Week2 Assignment: Chapter 3.6.1 #Question1: What geom would you use to draw a line chart? #Answer: geom_line() A boxplot? geom_boxplot() A histogram? geom_histogram() An area chart? geom_area()

#Question2: Run this code in your head and predict what the output will look like. #Then, run the code in R and check your predictions.

#Answer: R will draw a scatterplot of hwy against displ with one smooth line per drv category overlaid; points and lines are colored by drv, #and no confidence bands are drawn around the lines because se = FALSE.

# Scatterplot of hwy against displ with a smoother per drv level. Colour is
# mapped at the plot level, so both layers inherit it; se = FALSE hides the
# confidence band.
ggplot(mpg, aes(displ, hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Question3: What does show.legend = FALSE do? What happens if you remove it? #Answer: show.legend = FALSE stops the layer in which it is set from contributing a legend for its mapped aesthetics. If it is #removed, the legend is drawn, because by default a layer is included in the legend whenever it maps an aesthetic. #Question3b: Why do you think I used it earlier in the chapter?

# One smooth line per drv level, coloured by drv, with the legend for this
# layer switched off.
ggplot(mpg, aes(displ, hwy, colour = drv)) +
  geom_smooth(show.legend = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Answer: Because mapping the color aesthetic makes ggplot2 draw a legend by default unless told otherwise, #as in the plot above, where show.legend was set to FALSE.

#Question4: What does the se argument to geom_smooth() do? #Answer: It draws the confidence band (upper and lower confidence limits) around each smooth line; setting se = FALSE hides it.

#Question5: Will these two graphs look different? Why/why not?

# Q5, first form: data and mapping are declared once in ggplot() and are
# inherited by both the point layer and the smooth layer.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point() + 
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Q5, second form: ggplot() is empty, so the same data and mapping are
# repeated explicitly in each layer.
ggplot() + 
  geom_point(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_smooth(data = mpg, mapping = aes(x = displ, y = hwy))

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Answer: No, the two graphs look identical, because both layers display the same x and y variables from the same data. The only difference is in the syntax: the first one avoids duplication by passing the data and mapping to ggplot() once.

#Question6: Recreate the R code necessary to generate the following graphs. #Answer: Graph 1

# Graph 1: points drawn with a heavier stroke plus a single thick smoother
# with no confidence band.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(stroke = 4) +
  geom_smooth(se = FALSE, size = 2)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Answer: Graph 2

# Graph 2: same as Graph 1, but grouping by drv gives one (uncoloured)
# smoother per drive type.
ggplot(mpg, aes(displ, hwy, group = drv)) +
  geom_point(stroke = 4) +
  geom_smooth(se = FALSE, size = 2)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Answer: Graph 3

# Graph 3: points and smoothers are both coloured by drv because the colour
# mapping lives in ggplot() and is inherited by every layer.
ggplot(mpg, aes(displ, hwy, colour = drv)) +
  geom_point(stroke = 3) +
  geom_smooth(se = FALSE, size = 3)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Answer: Graph 4

# Graph 4: only the points are coloured by drv; the smoother has no colour
# mapping, so a single line is fitted through all the data.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(colour = drv), stroke = 3) +
  geom_smooth(se = FALSE, size = 3, show.legend = FALSE)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Answer: Graph 5

# Graph 5: points coloured by drv; smoothers are split by drv through the
# linetype aesthetic instead of colour.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(colour = drv), stroke = 3) +
  geom_smooth(aes(linetype = drv), se = FALSE, size = 3)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Answer: Graph 6

# Graph 6: points coloured by drv, each surrounded by a white outline.
# The outline is produced by first drawing a larger white point underneath
# and then drawing the coloured point on top of it.
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(size = 4, colour = "white") +
  geom_point(mapping = aes(colour = drv))

#Week2 Assignment: Chapter 3.7.1 #What is the default geom associated with ?stat_summary()? Answer: geom_pointrange() #How could you rewrite the previous plot to use that geom function instead of the stat function?

 # Summarise depth within each cut: the median is the plotted value and the
 # minimum and maximum give the ends of the range.
 ggplot(diamonds, aes(x = cut, y = depth)) +
   stat_summary(fun.y = median, fun.ymin = min, fun.ymax = max)

#Answer: The closest I came to, considering the default geom:

 # stat_summary() draws with geom_pointrange() by default, so the same plot
 # is obtained by calling geom_pointrange() and asking it to use the
 # "summary" stat with the same summary functions. geom_pointrange() uses
 # stat = "identity" unless told otherwise, hence the explicit stat argument.
 # The fun.y / fun.ymin / fun.ymax names match the stat_summary() call this
 # rewrites and the ggplot2 version (3.2.1) attached above.
 ggplot(data = diamonds, mapping = aes(x = cut, y = depth)) +
   geom_pointrange(
     stat = "summary",
     fun.ymin = min,
     fun.ymax = max,
     fun.y = median
   )

#Question2: What does geom_col() do? How is it different to geom_bar()? #Answer:geom_col() uses the values in the data to determine the height of the bars #geom_col() uses stat_identity(): it leaves the data as is while #geom_bar() makes the height of the bar proportional to the number of cases in each group #(or if the weight aesthetic is supplied, the sum of the weights). geom_bar() uses stat_count(), #counting the number of cases at each x position

#Question3: Most geoms and stats come in pairs that are almost always used in concert. #Read through the documentation and make a list of all the pairs. What do they have in common?

#Answer: geom_col() uses stat_identity(); geom_bar() uses stat_count(); geom_histogram() uses stat_bin(). #geom_smooth() and stat_smooth();

#Question4: What variables does stat_smooth() compute? What parameters control its behaviour? #Answer: stat_smooth() computes y (predicted value), ymin (lower pointwise confidence interval around the mean) #ymax (upper pointwise confidence interval around the mean), se (standard error)

#Answer: Parameters controlling stat_smooth(): method, formula, se, na.rm, show.legend, span, fullrange, level, method.args

#Question5: In our proportion bar chart, we need to set group = 1. #Why? In other words what is the problem with these two graphs? #If group is not set, each cut is treated as its own group, so the proportion is calculated within that group and is always 100%. For instance, the proportion of Ideal cuts among the Ideal-cut rows is 1.

# The two problem plots from the exercise: ..prop.. is requested without
# overriding the default grouping.
ggplot(diamonds, aes(x = cut, y = ..prop..)) +
  geom_bar()

ggplot(diamonds, aes(x = cut, y = ..prop.., fill = cut)) +
  geom_bar()

#Answer: Setting group = 1 puts all rows into a single group, so the proportion for each cut is calculated with respect to the whole dataset, e.g. the proportion of Ideal cuts among all diamonds.

# group = 1 treats all rows as one group, so ..prop.. is each cut's share of
# the whole dataset.
ggplot(diamonds, aes(x = cut, y = ..prop.., group = 1)) +
  geom_bar()

# With fill = cut, group = 1 cannot be used: fill varies inside the single
# group and is dropped, so the bars are drawn without the fill colours.
# Instead keep the default per-cut grouping and normalise the counts by the
# overall total so each bar is that cut's share of the whole dataset.
ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = cut, fill = cut, y = ..count.. / sum(..count..))
  )

#Week2 Assignment: Chapter 3.8.1

# Bar outlines coloured by cut.
ggplot(diamonds, aes(x = cut, colour = cut)) +
  geom_bar()

# Bar interiors filled by cut.
ggplot(diamonds, aes(x = cut, fill = cut)) +
  geom_bar()

# position = "identity" draws the clarity bars on top of one another;
# a low alpha keeps the overlapping bars visible.
ggplot(diamonds) +
  geom_bar(aes(x = cut, fill = clarity), alpha = 1/5, position = "identity")

# Same overlap, shown with unfilled bars and outlines coloured by clarity.
ggplot(diamonds) +
  geom_bar(aes(x = cut, colour = clarity), fill = NA, position = "identity")

# position = "fill" stacks the clarity bars and scales every stack to the
# same height.
ggplot(diamonds, aes(x = cut, fill = clarity)) +
  geom_bar(position = "fill")

#Question1: What is the problem with this plot? How could you improve it?

# Plain scatterplot of highway against city mileage (the plot the question
# asks about).
ggplot(mpg, aes(cty, hwy)) +
  geom_point()

#Answer: The values of cty and hwy are rounded, so the points fall on a grid and many points overlap each other. #This problem is known as overplotting. This arrangement makes it hard to see where the mass of the data is. #This can be improved by adding random noise with the “jitter” position adjustment, i.e. by using geom_jitter() instead of geom_point()

# Same scatterplot, drawn with geom_jitter() in place of geom_point().
ggplot(mpg, aes(cty, hwy)) +
  geom_jitter()

#Question2: What parameters to geom_jitter() control the amount of jittering? #Answer: width, height

#Question3: Compare and contrast ?geom_jitter() with ?geom_count()

#Answer: Both geom_jitter() and geom_count() address the overplotting of discrete values. #However, while geom_jitter() does so by adding some random noise to each point, #geom_count() counts the number of observations at each location, then maps the count to point area. #Also, geom_jitter() is best suited to smaller datasets

#Question4: What’s the default position adjustment for geom_boxplot()? Answer: position = “dodge2” #Create a visualisation of the mpg dataset that demonstrates it.

# geom_boxplot() uses position = "dodge2" by default. The adjustment only
# becomes visible when a second grouping variable splits each x value, so
# drv is mapped to fill: the boxes for the drive types within one class are
# then placed side by side instead of on top of one another.
ggplot(mpg, aes(x = class, y = hwy, fill = drv)) +
  geom_boxplot()

#Week2 Assignment: Chapter 3.9.1 #Question1: Turn a stacked bar chart into a pie chart using coord_polar().

# A stacked bar chart: x is a constant, so there is a single bar whose
# segments are the counts of each cut. width = 1 makes the bar fill its slot
# so the pie has no hole in the middle. The legend is kept because it is the
# only place the cuts are labelled once the bar becomes a pie.
bar <- ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = factor(1), fill = cut),
    width = 1
  ) +
  theme(aspect.ratio = 1) +
  labs(x = NULL, y = NULL)

# The stacked bar in ordinary Cartesian coordinates.
bar

# Mapping the stacked counts (y) to the angle turns the bar into a pie chart.
bar + coord_polar(theta = "y")

#Question2: What does labs() do? Read the documentation. #Answer: labs() modifies axis, legend and plot labels

#Question3: What’s the difference between coord_quickmap() and coord_map()? #Answer: coord_map projects a portion of the earth, which is approximately spherical, #onto a flat 2D plane using any projection defined by the mapproj package. Map projections do not, #in general, preserve straight lines, so this requires considerable computation while #coord_quickmap is a quick approximation that does preserve straight lines. #It works best for smaller areas closer to the equator.

#Question 4: What does the plot below tell you about the relationship between city and highway mpg? #Answer: Miles per gallon increases in roughly the same proportion in the city and on the highway; the points lie above the reference line, so highway mpg is higher than city mpg

# Highway against city mileage with a reference line, drawn on a fixed
# aspect ratio so both axes use the same scale.
ggplot(mpg, aes(cty, hwy)) +
  geom_point() +
  geom_abline() +
  coord_fixed()

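#Why is coord_fixed() important? #Answer: Because it fixes the aspect ratio so that one unit on the y-axis has the same length as one unit on the x-axis, which makes the reference line appear at 45 degrees #What does geom_abline() do? #Answer: geom_abline() adds a reference line to a plot; with no arguments it uses a slope of 1 and an intercept of 0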

Week2Assignment

Ndisha Mwakala

1/12/2020