Week 5 - EDA

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.2.3

## Warning: package 'ggplot2' was built under R version 4.2.3

## Warning: package 'tibble' was built under R version 4.2.3

## Warning: package 'tidyr' was built under R version 4.2.3

## Warning: package 'readr' was built under R version 4.2.3

## Warning: package 'purrr' was built under R version 4.2.3

## Warning: package 'dplyr' was built under R version 4.2.3

## Warning: package 'forcats' was built under R version 4.2.3

## Warning: package 'lubridate' was built under R version 4.2.3

## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr     1.1.1     v readr     2.1.4
## v forcats   1.0.0     v stringr   1.5.0
## v ggplot2   3.4.2     v tibble    3.2.1
## v lubridate 1.9.2     v tidyr     1.3.0
## v purrr     1.0.1     
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Preview the first six rows of mpg.
head(mpg, n = 6)

## # A tibble: 6 x 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa~
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa~
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa~
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa~
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa~
## 6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa~

# For this assignment I will use the mpg dataset.

# Bar chart: number of rows in mpg for each value of class.
ggplot(mpg, aes(x = class)) +
  geom_bar()

#in this graph we can see the frequency of each class of cars in the dataset

 # Tabulate the number of rows per class.
 count(mpg, class)

## # A tibble: 7 x 2
##   class          n
##   <chr>      <int>
## 1 2seater        5
## 2 compact       47
## 3 midsize       41
## 4 minivan       11
## 5 pickup        33
## 6 subcompact    35
## 7 suv           62

# In this section, I am counting how many observations we have for each class of cars in the dataset, which matches the bar chart in the section above.

 # Histogram of hwy using bins that are 5 units wide.
 ggplot(mpg, aes(x = hwy)) +
   geom_histogram(binwidth = 5)

#in this section, i am checking the distribution of a continuous variable, highway consumption

 # Count the rows that fall into each 5-unit-wide interval of hwy.
 count(mpg, cut_width(hwy, 5))

## # A tibble: 8 x 2
##   `cut_width(hwy, 5)`     n
##   <fct>               <int>
## 1 [7.5,12.5]              5
## 2 (12.5,17.5]            50
## 3 (17.5,22.5]            43
## 4 (22.5,27.5]            81
## 5 (27.5,32.5]            44
## 6 (32.5,37.5]             8
## 7 (37.5,42.5]             1
## 8 (42.5,47.5]             2

# Similarly, we can do the same thing by counting how many observations we have in each specific range.

 # Keep only the rows of mpg whose hwy value is below 20.
 highcons <- filter(mpg, hwy < 20)

# Histogram of hwy for that subset, with bins 2 units wide.
ggplot(highcons, aes(x = hwy)) +
  geom_histogram(binwidth = 2)

# Here I am subsetting the data to cars with high fuel consumption on the highway (less than 20 mpg) and checking the frequency of observations using the specified bin width.

 # One frequency polygon of hwy per drv value, bins 2 units wide.
 ggplot(highcons, aes(x = hwy)) +
   geom_freqpoly(aes(colour = drv), binwidth = 2)

# In this section, I am checking the high-consumption cars on the highway by drivetrain.

 # Histogram of hwy with one bin per unit, to expose the most common values.
 ggplot(mpg) +
   geom_histogram(aes(x = hwy), binwidth = 1)

#here, i m checking what are the most common values for highway mpg in the dataset.

 # Histogram of cty with bins 0.8 units wide.
 # NOTE(review): cty prints as an <int> column, so a bin width below 1 leaves
 # some bins empty by construction; consider binwidth = 1 if those gaps are
 # not intended.
 ggplot(mpg) +
   geom_histogram(aes(x = cty), binwidth = 0.8)

# In this code we are checking for unusual values of city mpg; the gaps in the histogram reveal a few values above 30 mpg and others below 10 mpg.

 # Histogram of cty (bins 0.8 wide), zoomed so that the y axis shows only
 # counts from 0 to 20; coord_cartesian() zooms without dropping any data.
 ggplot(mpg, aes(x = cty)) +
   geom_histogram(binwidth = 0.8) +
   coord_cartesian(ylim = c(0, 20))

#This is the same visual, just limiting the range of y axis to 20 to make smaller values visible

# Rows whose cty lies outside the closed range [10, 20], reduced to the
# cyl, displ and trans columns and sorted by ascending displ.
unusual <- mpg %>%
  filter(!between(cty, 10, 20)) %>%
  select(cyl, displ, trans) %>%
  arrange(displ)
unusual

## # A tibble: 50 x 3
##      cyl displ trans     
##    <int> <dbl> <chr>     
##  1     4   1.6 manual(m5)
##  2     4   1.6 auto(l4)  
##  3     4   1.6 manual(m5)
##  4     4   1.6 manual(m5)
##  5     4   1.6 auto(l4)  
##  6     4   1.8 manual(m5)
##  7     4   1.8 manual(m5)
##  8     4   1.8 auto(l5)  
##  9     4   1.8 auto(l5)  
## 10     4   1.8 auto(l3)  
## # ... with 40 more rows

# This code extracts the observations with unusual city consumption, defined here as less than 10 mpg or more than 20 mpg, and displays their number of cylinders, displacement and transmission type.

# Subset of mpg restricted to rows with 5 <= cty <= 28.
mpg2 <- filter(mpg, cty >= 5, cty <= 28)
mpg2

## # A tibble: 231 x 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4           1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4           2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4           2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4           2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4           2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4           3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 quattro   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 quattro   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 quattro   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 221 more rows

# In this code we filter the data to rows whose cty value is between 5 and 28; essentially this creates a subset of mpg named mpg2.

# Keep every row of mpg, but blank out (set to NA) any cty value that is
# below 5 or above 28.
mpg2 <- mutate(mpg, cty = replace(cty, cty < 5 | cty > 28, NA))
mpg2

## # A tibble: 234 x 11
##    manufacturer model      displ  year   cyl trans drv     cty   hwy fl    class
##    <chr>        <chr>      <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
##  1 audi         a4           1.8  1999     4 auto~ f        18    29 p     comp~
##  2 audi         a4           1.8  1999     4 manu~ f        21    29 p     comp~
##  3 audi         a4           2    2008     4 manu~ f        20    31 p     comp~
##  4 audi         a4           2    2008     4 auto~ f        21    30 p     comp~
##  5 audi         a4           2.8  1999     6 auto~ f        16    26 p     comp~
##  6 audi         a4           2.8  1999     6 manu~ f        18    26 p     comp~
##  7 audi         a4           3.1  2008     6 auto~ f        18    27 p     comp~
##  8 audi         a4 quattro   1.8  1999     4 manu~ 4        18    26 p     comp~
##  9 audi         a4 quattro   1.8  1999     4 auto~ 4        16    25 p     comp~
## 10 audi         a4 quattro   2    2008     4 manu~ 4        20    28 p     comp~
## # ... with 224 more rows

# This code does the same thing, except that this time it replaces the values outside the defined range with NA instead of dropping the rows.

# Scatter plot of hwy against cty; rows with a missing value are dropped
# with a warning.
ggplot(mpg2) +
  geom_point(aes(x = cty, y = hwy))

## Warning: Removed 3 rows containing missing values (`geom_point()`).

# Same plot again, this time with na.rm = TRUE so the rows with missing values are dropped silently.
# Scatter plot of hwy against cty; na.rm = TRUE drops rows with missing
# values without emitting a warning.
ggplot(mpg2) +
  geom_point(aes(x = cty, y = hwy), na.rm = TRUE)

# Here I am plotting the cty variable against the hwy variable. The first call gives a warning saying how many rows were removed because they contain NA in one of those variables. The second call does not give a warning because we set na.rm to TRUE, which signals that I am aware the dataset has missing values.

# Flag flights without a recorded dep_time as cancelled, convert
# sched_dep_time to decimal hours (hundreds part = hours, remainder =
# minutes), and draw one frequency polygon per cancelled status using
# quarter-hour bins.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_dep_time = sched_dep_time %/% 100 + (sched_dep_time %% 100) / 60
  ) %>%
  ggplot(aes(x = sched_dep_time, colour = cancelled)) +
  geom_freqpoly(binwidth = 1 / 4)

# In this section, I am reusing the code from the textbook, because it is not applicable to the dataset I picked. Here a missing value in dep_time is used to flag a flight as cancelled, and we compare the distribution of scheduled departure times for cancelled and non-cancelled flights. We can see from the plot that the non-cancelled flights have far higher counts than the cancelled ones, which makes the two distributions hard to compare on a raw-count scale.

# One frequency polygon of cty per drv value, bins 2 units wide.
ggplot(mpg, aes(x = cty, colour = drv)) +
  geom_freqpoly(binwidth = 2)

#here, i am plotting the cty consumption by drivetrain to see the difference of distributions

# Compare the shape of the cty distribution across drv values. Mapping y to
# the computed density instead of the default count puts groups of different
# sizes on a comparable scale. after_stat(density) replaces the `..density..`
# dot-dot notation, which is deprecated as of ggplot2 3.4.0.
ggplot(data = mpg, mapping = aes(x = cty, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = drv), binwidth = 2)

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## i Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Box plot of cty for each drv value.
ggplot(mpg) +
  geom_boxplot(aes(x = drv, y = cty))

# here we are plotting a box plot of consumption in city vs drivetrain

# Box plot of cty for each drv value, with the drv levels ordered by their
# median cty.
ggplot(mpg) +
  geom_boxplot(aes(x = reorder(drv, cty, FUN = median), y = cty))

# Here we draw the same box plot of city consumption by drivetrain as before, but this time the drivetrains are ordered by their median city consumption.

# Median-ordered box plot of cty by drv, drawn horizontally via coord_flip().
ggplot(mpg, aes(x = reorder(drv, cty, FUN = median), y = cty)) +
  geom_boxplot() +
  coord_flip()

# coord_flip() lets us flip the visual: it puts x on the y axis and vice versa.

# Point size encodes how many rows share each drv/class combination.
ggplot(mpg, aes(x = drv, y = class)) +
  geom_count()

#here i m plotting two categorical variables, drivetrain and class to see how many times each combination happens

# Tabulate the number of rows for each drv/class combination.
count(mpg, drv, class)

## # A tibble: 12 x 3
##    drv   class          n
##    <chr> <chr>      <int>
##  1 4     compact       12
##  2 4     midsize        3
##  3 4     pickup        33
##  4 4     subcompact     4
##  5 4     suv           51
##  6 f     compact       35
##  7 f     midsize       38
##  8 f     minivan       11
##  9 f     subcompact    22
## 10 r     2seater        5
## 11 r     subcompact     9
## 12 r     suv           11

#same thing with a table

# Heat map of the drv/class counts: one tile per combination, filled by n.
ggplot(count(mpg, drv, class), aes(x = drv, y = class, fill = n)) +
  geom_tile()

# Scatter plot of hwy against cty.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()

# here, i m plotting two continuous variable, due to data size i don't have issues of overplotting 

# Rectangular 2-D binning of cty against hwy; fill shows the count per bin.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_bin2d()

library(hexbin)

## Warning: package 'hexbin' was built under R version 4.2.3

# install.packages("hexbin")
# Hexagonal 2-D binning of cty against hwy; fill shows the count per bin.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_hex()

# geom_bin2d() and geom_hex() are very helpful in this situation because they divide the plane into two-dimensional bins and use a fill colour to show how many points fall into each bin; this lets the viewer quickly see the frequency at each location and adds another dimension to the plot.

# Box plots of hwy, one per 1-unit-wide bin of cty.
ggplot(mpg) +
  geom_boxplot(aes(x = cty, y = hwy, group = cut_width(cty, 1)))

# the other option is to bin one continuous variable in this case city mileage to act like a categorical variable vs highway mileage

# Scatter plot of displ against cty.
ggplot(mpg, aes(x = cty, y = displ)) +
  geom_point()

# here i m plotting cty consumption vs displacement which we notice are negatively correlated as expected

library(modelr)

## Warning: package 'modelr' was built under R version 4.2.3

# Linear model of log(cty) on log(displ).
mod <- lm(log(cty) ~ log(displ), data = mpg)

# Attach the model residuals to mpg as `resid`, then back-transform them
# from the log scale with exp().
mpg2 <- mutate(add_residuals(mpg, mod), resid = exp(resid))

# Scatter plot of the back-transformed residuals against cty.
ggplot(mpg2, aes(x = cty, y = resid)) +
  geom_point()

# Here we fit a model of city mileage on displacement and compute its residuals; the residuals show the variation in city mileage that remains once the effect of displacement has been removed.
# Box plots of the back-transformed residuals, one per 1-unit-wide bin of cty.
# cty is continuous, so geom_boxplot() needs an explicit group; without it
# ggplot2 collapses everything into a single box and warns
# "Continuous x aesthetic".
ggplot(data = mpg2) +
  geom_boxplot(mapping = aes(x = cty, y = resid, group = cut_width(cty, 1)))

## Warning: Continuous x aesthetic
## i did you forget `aes(group = ...)`?

# Frequency polygon of cty (bins 2 units wide), written with every argument
# named explicitly.
ggplot(data = mpg) +
  geom_freqpoly(mapping = aes(x = cty), binwidth = 2)

#rewriting same code more concisely

# Frequency polygon of cty (bins 2 units wide), relying on positional
# argument matching for the data, the mapping and the x aesthetic.
ggplot(mpg, aes(cty)) +
  geom_freqpoly(binwidth = 2)

# Heat map of how many rows share each cty/class combination.
ggplot(count(mpg, cty, class), aes(x = cty, y = class)) +
  geom_tile(aes(fill = n))

Week 5 - EDA

Mehdi Alaoui

2023-04-23