Week3-assignment

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.3

## Warning: package 'ggplot2' was built under R version 4.2.3

## Warning: package 'tibble' was built under R version 4.2.3

## Warning: package 'tidyr' was built under R version 4.2.3

## Warning: package 'readr' was built under R version 4.2.3

## Warning: package 'purrr' was built under R version 4.2.3

## Warning: package 'dplyr' was built under R version 4.2.3

## Warning: package 'forcats' was built under R version 4.2.3

## Warning: package 'lubridate' was built under R version 4.2.3

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.1     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.2     v tibble    3.2.1
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)

# A ggplot() call that is given data but no layers.
ggplot(mpg)

# I see a blank graph

# Number of rows (observations) in mpg.
dim(mpg)[1L]

## [1] 234

# Number of columns (variables) in mpg.
dim(mpg)[2L]

## [1] 11

# It has 234 rows and 11 columns

# Open the documentation page for the mpg dataset.
help("mpg")

## starting httpd help server ... done

# drv is the type of drive train: f = front-wheel drive, r = rear-wheel drive, 4 = four-wheel drive (4wd).

# Scatterplot of highway mileage (hwy) against number of cylinders (cyl).
ggplot(mpg, aes(x = cyl, y = hwy)) +
  geom_point()

# The more cylinders a car has, the worse its highway mileage (miles per gallon).

# Scatterplot of drive train (drv) against vehicle class.
ggplot(mpg, aes(x = class, y = drv)) +
  geom_point()

# The plot is not useful because each class can come with different drivetrain layouts, and both variables are categorical with only a few possible values, so many observations are drawn on top of each other.

# The colour is set as a fixed value outside aes(), so it applies to every
# point rather than being mapped to a variable.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(colour = "blue")

# The provided code has color = "blue" inside the aes() mapping, while it should be outside aes(), as in the code above.

# Open the mpg help page, then show the type of every column.
help("mpg")
str(object = mpg)

## tibble [234 x 11] (S3: tbl_df/tbl/data.frame)
##  $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
##  $ model       : chr [1:234] "a4" "a4" "a4" "a4" ...
##  $ displ       : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
##  $ year        : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
##  $ cyl         : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
##  $ trans       : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
##  $ drv         : chr [1:234] "f" "f" "f" "f" ...
##  $ cty         : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
##  $ hwy         : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
##  $ fl          : chr [1:234] "p" "p" "p" "p" ...
##  $ class       : chr [1:234] "compact" "compact" "compact" "compact" ...

# Print the tibble: its first rows together with each column's type.
print(mpg)

## # A tibble: 234 x 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4           1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4           2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4           2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4           2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4           2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4           3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 quattro   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 quattro   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 quattro   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows

# The categorical variables are manufacturer, model, trans, drv, fl and class. The continuous variables are displ, year, cyl, cty and hwy.

# Map the continuous variable displ to colour.
ggplot(mpg, aes(x = cyl, y = cty)) +
  geom_point(aes(colour = displ))

# Map the same continuous variable to point size.
ggplot(mpg, aes(x = cyl, y = cty)) +
  geom_point(aes(size = displ))

# ggplot(data = mpg) +
#   geom_point(mapping = aes(x = cyl, y = cty, shape = displ))

# When a continuous variable is mapped to color, ggplot2 uses a gradient of a single color; when it is mapped to size, the size of each plotted point varies with its value. Finally, a continuous variable cannot be mapped to shape.

# Map one variable (displ) to two aesthetics at once: colour and size.
ggplot(mpg, aes(x = cyl, y = cty)) +
  geom_point(aes(colour = displ, size = displ))

# In this case the plot combines both effects: the size and the color gradient of each point encode the same variable.

# Look up the aesthetics geom_point() understands, then draw shape-21 points
# with a black outline and white fill to see what `stroke` changes.
help("geom_point")
ggplot(mpg, aes(x = cyl, y = cty)) +
  geom_point(
    shape = 21,
    colour = "black",
    fill = "white",
    size = 2,
    stroke = 2
  )

# stroke controls the width of the border (outline) of the plotted points: a larger value gives a thicker border, a smaller value a thinner one.

# Map colour to a logical expression instead of a plain column name.
ggplot(mpg, aes(x = displ, y = cty)) +
  geom_point(aes(colour = displ < 5))

# The condition is evaluated for every observation, which splits the data into two groups, TRUE or FALSE, and a color (or shape) is assigned to each group.

# Facet on drv using the formula form with `.` on the right-hand side.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(drv ~ .)

# Facet on cyl using the formula form with `.` on the left-hand side.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_grid(. ~ cyl)

# The first plot shows highway mileage against displacement, split into rows for the 3 different drivetrains; the second is the same plot split into columns by number of cylinders. The . is a placeholder meaning "no faceting variable" on that side of the formula: with drv ~ . the facets are laid out in rows, and with . ~ cyl they are laid out in columns.

# One panel per vehicle class, wrapped onto two rows.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  facet_wrap(vars(class), nrow = 2)

# The same data in a single panel, with class shown by colour instead.
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(aes(colour = class))

# The difference is that facet_wrap() draws one panel per distinct value of class, while mapping class to color shows everything in a single plot, which makes it easier to compare the classes directly. With a larger dataset, however, the single plot would become too crowded to compare effectively, and faceting would be the better choice; we could also group similar classes together to produce fewer panels that are still easy to compare.

# Open the documentation page for facet_wrap().
help("facet_wrap")

# nrow sets the number of rows and, similarly, ncol sets the number of columns. Other options that control the layout are scales, shrink and as.table. facet_grid() doesn't have nrow and ncol because its numbers of rows and columns are determined by the number of unique values of the faceting variables.

# Points plus one smoothed line per drive train, without the confidence band;
# colour is set globally, so both layers are split by drv.
ggplot(mpg, aes(displ, hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# My predictions were right: FWD cars tend to have better gas mileage than RWD cars, which in turn do better than AWD/4WD cars. Other factors that are not accounted for in this plot might skew the results, such as the type of fuel and whether a car is naturally aspirated or supercharged.

# Version 1: data and mapping are declared once in ggplot() and inherited by
# both layers.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  geom_smooth()

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Version 2: ggplot() is empty and each layer repeats the same data and
# mapping.
ggplot() +
  geom_point(aes(displ, hwy), data = mpg) +
  geom_smooth(aes(displ, hwy), data = mpg)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# The two plots look the same, because in both cases every layer uses the same data and the same mappings.

# Plot 1: points with a single overall smooth.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 2: one smooth per drive train, grouped but not coloured.
ggplot(mpg, aes(displ, hwy)) +
  geom_point() +
  geom_smooth(aes(group = drv), se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 3: colour mapped globally, so points and smooths are both split by drv.
ggplot(mpg, aes(displ, hwy, colour = drv)) +
  geom_point() +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 4: colour mapped only in the point layer; a single overall smooth.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(colour = drv)) +
  geom_smooth(se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 5: coloured points, with smooths distinguished by line type.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(aes(colour = drv)) +
  geom_smooth(aes(linetype = drv), se = FALSE)

## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# Plot 6: larger white points drawn underneath the coloured points.
ggplot(mpg, aes(displ, hwy)) +
  geom_point(colour = "white", size = 4) +
  geom_point(aes(colour = drv))

# Summarise depth for each cut: the point is the median and the range runs
# from the minimum to the maximum. The summary is requested through the
# geom's `stat` argument.
ggplot(diamonds, aes(x = cut, y = depth)) +
  geom_pointrange(
    stat = "summary",
    fun = median,
    fun.min = min,
    fun.max = max
  )

# The default geom of stat_summary() is geom_pointrange().

# Open the documentation pages for geom_col() and geom_bar().
help("geom_col")
help("geom_bar")

# geom_bar() applies a statistical transformation (a count) by default, while geom_col() doesn't and plots the values in the data as they are.

# Bar chart of the computed proportion for each cut.
# NOTE(review): no `group` aesthetic is set here; presumably intentional,
# since these plots illustrate a problem -- confirm before "fixing".
ggplot(diamonds, aes(x = cut, y = after_stat(prop))) +
  geom_bar()

# The same chart with the bars additionally filled by diamond color.
ggplot(diamonds, aes(x = cut, y = after_stat(prop), fill = color)) +
  geom_bar()

# The problem is that the proportions are calculated within each group, so every proportion equals 1 and all the bars end up with the same height; setting group = 1 in aes() makes the proportions relative to the whole dataset.

# Scatterplot of hwy against cty using the "jitter" position adjustment.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point(position = "jitter")

# There is an overplotting issue because many observations have the same values of cty and hwy. To avoid that we can use the jitter position adjustment, which adds a small amount of random noise to each point and gives a better visual, although it slightly decreases accuracy.

# In geom_jitter(), width controls the amount of horizontal jitter and height the amount of vertical jitter.

# geom_jitter(): the same scatterplot drawn with the jitter geom.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_jitter()

# geom_count(): the same x/y mapping drawn with the count geom.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_count()

# It could be a good idea to use both, since geom_count() lets us see where the overlapping happens (point size shows the number of observations at each location) while geom_jitter() spreads the overlapping points apart.

# A single full-width stacked bar, filled by vehicle class.
ggplot(mpg) +
  geom_bar(aes(x = factor(1), fill = class), width = 1)

# The same stacked bar drawn in polar coordinates, with the angle taken
# from y.
ggplot(mpg) +
  geom_bar(aes(x = factor(1), fill = class), width = 1) +
  coord_polar(theta = "y")

# coord_quickmap() is a quick approximation that preserves straight lines and works best for smaller areas closer to the equator, while coord_map() projects a portion of the earth onto a flat 2D plane using the projection methods specified in the mapproj package.

# City vs highway mileage with a reference line from geom_abline() (default
# arguments) and a fixed coordinate ratio from coord_fixed().
ggplot(mpg, aes(cty, hwy)) +
  geom_point() +
  geom_abline() +
  coord_fixed()

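# It shows that there is a positive correlation between the two variables: the higher the city mileage (cty), the higher the highway mileage (hwy), which makes sense if we ignore differences in powertrain (hybrid vs. gas vs. diesel, supercharged or not). coord_fixed() forces a fixed aspect ratio (by default one unit on the x-axis is the same length as one unit on the y-axis), so the reference line is drawn at a true 45-degree angle, while geom_abline() adds a line (by default with intercept 0 and slope 1, i.e. y = x) on top of the data shown in the other layers.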

Week3-assignment

Mehdi Alaoui

2023-04-15