library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.1 v readr 2.1.4
## v forcats 1.0.0 v stringr 1.5.0
## v ggplot2 3.4.2 v tibble 3.2.1
## v lubridate 1.9.2 v tidyr 1.3.0
## v purrr 1.0.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Preview the first rows of the mpg data.
mpg %>% head()
## # A tibble: 6 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p compa~
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p compa~
## 3 audi a4 2 2008 4 manual(m6) f 20 31 p compa~
## 4 audi a4 2 2008 4 auto(av) f 21 30 p compa~
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p compa~
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p compa~
# For this assignment I will use the mpg dataset.
# Bar chart of car class.
ggplot(mpg, aes(x = class)) +
  geom_bar()

#in this graph we can see the frequency of each class of cars in the dataset
# Number of rows in each class.
count(mpg, class)
## # A tibble: 7 x 2
## class n
## <chr> <int>
## 1 2seater 5
## 2 compact 47
## 3 midsize 41
## 4 minivan 11
## 5 pickup 33
## 6 subcompact 35
## 7 suv 62
# Here I count how many observations there are for each class of car in the
# dataset; these are the same counts shown by the bar chart above.
# Histogram of highway mileage using bins that are 5 mpg wide.
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 5)

#in this section, i am checking the distribution of a continuous variable, highway consumption
# Tabulate hwy in intervals of width 5.
count(mpg, cut_width(hwy, 5))
## # A tibble: 8 x 2
## `cut_width(hwy, 5)` n
## <fct> <int>
## 1 [7.5,12.5] 5
## 2 (12.5,17.5] 50
## 3 (17.5,22.5] 43
## 4 (22.5,27.5] 81
## 5 (27.5,32.5] 44
## 6 (32.5,37.5] 8
## 7 (37.5,42.5] 1
## 8 (42.5,47.5] 2
# Similarly, we can get the same information as the histogram by counting how
# many observations fall in each range of width 5.
# Keep only the cars with hwy below 20, then draw a histogram of their hwy
# values with a bin width of 2.
highcons <- filter(mpg, hwy < 20)
ggplot(highcons) +
  geom_histogram(aes(x = hwy), binwidth = 2)

# Here I subset the data to cars with high highway fuel consumption (hwy below
# 20 mpg) and plot a histogram of hwy with a bin width of 2.
# Frequency polygons of hwy for the highcons subset, one line per drivetrain.
ggplot(highcons) +
  geom_freqpoly(aes(x = hwy, colour = drv), binwidth = 2)

# In this section I compare the highway mileage of the high-consumption cars by
# drivetrain (drv).
# Histogram of hwy with one bin per unit of mpg.
ggplot(mpg) +
  geom_histogram(aes(x = hwy), binwidth = 1)

# Here I check which values of highway mpg are the most common in the dataset.
# cty is an integer column, so the bin width is a whole number: a fractional
# width such as 0.8 leaves some bins that cannot contain any integer value,
# which shows up as artificial gaps in the histogram.
ggplot(data = mpg, mapping = aes(x = cty)) +
  geom_histogram(binwidth = 1)

# In this plot we are looking for unusual values of city mpg, for example the
# few values above 30 mpg and those below 10 mpg.
ggplot(mpg) +
  geom_histogram(mapping = aes(x = cty), binwidth = 1) +
  coord_cartesian(ylim = c(0, 20))

#This is the same visual, just limiting the range of y axis to 20 to make smaller values visible
# Rows whose cty lies outside the closed range [10, 20], reduced to the
# cylinder, displacement and transmission columns and sorted by displacement.
unusual <- mpg %>%
  filter(!between(cty, 10, 20)) %>%
  select(cyl, displ, trans) %>%
  arrange(displ)
unusual
## # A tibble: 50 x 3
## cyl displ trans
## <int> <dbl> <chr>
## 1 4 1.6 manual(m5)
## 2 4 1.6 auto(l4)
## 3 4 1.6 manual(m5)
## 4 4 1.6 manual(m5)
## 5 4 1.6 auto(l4)
## 6 4 1.8 manual(m5)
## 7 4 1.8 manual(m5)
## 8 4 1.8 auto(l5)
## 9 4 1.8 auto(l5)
## 10 4 1.8 auto(l3)
## # ... with 40 more rows
# This code extracts the cars with unusual city mileage, defined here as less
# than 10 mpg or more than 20 mpg, and displays their number of cylinders,
# displacement and transmission type, sorted by displacement.
# Drop the rows whose cty is below 5 or above 28.
mpg2 <- filter(mpg, cty >= 5, cty <= 28)
mpg2
## # A tibble: 231 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto~ f 18 29 p comp~
## 2 audi a4 1.8 1999 4 manu~ f 21 29 p comp~
## 3 audi a4 2 2008 4 manu~ f 20 31 p comp~
## 4 audi a4 2 2008 4 auto~ f 21 30 p comp~
## 5 audi a4 2.8 1999 6 auto~ f 16 26 p comp~
## 6 audi a4 2.8 1999 6 manu~ f 18 26 p comp~
## 7 audi a4 3.1 2008 6 auto~ f 18 27 p comp~
## 8 audi a4 quattro 1.8 1999 4 manu~ 4 18 26 p comp~
## 9 audi a4 quattro 1.8 1999 4 auto~ 4 16 25 p comp~
## 10 audi a4 quattro 2 2008 4 manu~ 4 20 28 p comp~
## # ... with 221 more rows
# This code keeps only the rows whose cty value is between 5 and 28; the
# resulting subset of mpg is stored as mpg2.
# Keep every row, but blank out (NA) any cty value outside the range 5 to 28.
mpg2 <- mutate(
  mpg,
  cty = if_else(between(cty, 5, 28), cty, NA_integer_)
)
mpg2
## # A tibble: 234 x 11
## manufacturer model displ year cyl trans drv cty hwy fl class
## <chr> <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
## 1 audi a4 1.8 1999 4 auto~ f 18 29 p comp~
## 2 audi a4 1.8 1999 4 manu~ f 21 29 p comp~
## 3 audi a4 2 2008 4 manu~ f 20 31 p comp~
## 4 audi a4 2 2008 4 auto~ f 21 30 p comp~
## 5 audi a4 2.8 1999 6 auto~ f 16 26 p comp~
## 6 audi a4 2.8 1999 6 manu~ f 18 26 p comp~
## 7 audi a4 3.1 2008 6 auto~ f 18 27 p comp~
## 8 audi a4 quattro 1.8 1999 4 manu~ 4 18 26 p comp~
## 9 audi a4 quattro 1.8 1999 4 auto~ 4 16 25 p comp~
## 10 audi a4 quattro 2 2008 4 manu~ 4 20 28 p comp~
## # ... with 224 more rows
# This code uses the same range, but instead of dropping rows it replaces the
# cty values outside the range with NA, so all 234 rows are kept.
# Scatter plot of hwy against cty; rows with a missing value are dropped with a
# warning.
ggplot(mpg2) +
  geom_point(aes(x = cty, y = hwy))
## Warning: Removed 3 rows containing missing values (`geom_point()`).

# Setting na.rm = TRUE suppresses that warning:
# Same scatter plot, with missing values removed explicitly.
ggplot(mpg2) +
  geom_point(aes(x = cty, y = hwy), na.rm = TRUE)

# Here I plot cty against hwy. The first plot gives a warning saying how many
# rows were removed because they contain NA in one of those variables. The
# second plot gives no warning because we set na.rm to TRUE, which signals that
# I am aware the dataset has missing values.
# Scheduled departure time (converted to decimal hours) for flights with and
# without a recorded dep_time; a missing dep_time is flagged as cancelled.
nycflights13::flights %>%
  mutate(
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60,
    cancelled = is.na(dep_time)
  ) %>%
  ggplot(aes(x = sched_dep_time, colour = cancelled)) +
  geom_freqpoly(binwidth = 1 / 4)

# In this section I reuse the code from the textbook, because this step is not
# applicable to the dataset I picked. Here a flight is treated as cancelled when
# its dep_time is missing, and we compare the distribution of scheduled
# departure times for cancelled and non-cancelled flights. The plot shows much
# higher counts for non-cancelled flights than for cancelled ones, which makes
# the two distributions hard to compare on a count scale.
# Counts of cty in bins of width 2, drawn as one line per drivetrain.
ggplot(mpg, aes(x = cty, colour = drv)) +
  geom_freqpoly(binwidth = 2)

#here, i am plotting the cty consumption by drivetrain to see the difference of distributions
# Same comparison on a density scale instead of raw counts, so drivetrains with
# different numbers of cars can be compared by shape.
# after_stat(density) replaces the `..density..` notation, which was deprecated
# in ggplot2 3.4.0.
ggplot(data = mpg, mapping = aes(x = cty, y = after_stat(density))) +
  geom_freqpoly(mapping = aes(colour = drv), binwidth = 2)

# Box plot of cty for each drivetrain.
ggplot(mpg) +
  geom_boxplot(aes(x = drv, y = cty))

# here we are plotting a box plot of consumption in city vs drivetrain
# Box plot of cty per drivetrain, with the drivetrains ordered by median cty.
ggplot(mpg) +
  geom_boxplot(aes(x = reorder(drv, cty, FUN = median), y = cty))

# Same box plot of city mileage by drivetrain as before, but this time the
# drivetrains are ordered by their median city mileage.
# Median-ordered box plot of cty per drivetrain, drawn with flipped axes.
ggplot(mpg, aes(x = reorder(drv, cty, FUN = median), y = cty)) +
  geom_boxplot() +
  coord_flip()

# coord_flip() flips the plot, putting x on the vertical axis and y on the
# horizontal axis.
# Count plot of the drv/class combinations.
ggplot(mpg, aes(x = drv, y = class)) +
  geom_count()

#here i m plotting two categorical variables, drivetrain and class to see how many times each combination happens
# Number of rows for each drv/class combination.
count(mpg, drv, class)
## # A tibble: 12 x 3
## drv class n
## <chr> <chr> <int>
## 1 4 compact 12
## 2 4 midsize 3
## 3 4 pickup 33
## 4 4 subcompact 4
## 5 4 suv 51
## 6 f compact 35
## 7 f midsize 38
## 8 f minivan 11
## 9 f subcompact 22
## 10 r 2seater 5
## 11 r subcompact 9
## 12 r suv 11
#same thing with a table
# Tile plot of the drv/class counts, with the fill showing the count n.
count(mpg, drv, class) %>%
  ggplot(aes(x = drv, y = class, fill = n)) +
  geom_tile()

# Scatter plot of hwy against cty.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_point()

# Here I plot two continuous variables; because the dataset is small, I don't
# have issues with overplotting.
# 2D-binned view of hwy against cty.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_bin2d()

library(hexbin)
## Warning: package 'hexbin' was built under R version 4.2.3
# install.packages("hexbin")
# Hexagonally binned view of hwy against cty.
ggplot(mpg, aes(x = cty, y = hwy)) +
  geom_hex()

# geom_bin2d() and geom_hex() are very helpful in this situation because they
# divide the plane into 2D bins and use a fill colour to show how many points
# fall into each bin; this lets the viewer quickly see the frequency in each
# region and adds another dimension to the plot.
# Box plots of hwy, with cty cut into groups of width 1.
ggplot(mpg) +
  geom_boxplot(aes(x = cty, y = hwy, group = cut_width(cty, 1)))

# the other option is to bin one continuous variable in this case city mileage to act like a categorical variable vs highway mileage
# Scatter plot of displ against cty.
ggplot(mpg, aes(x = cty, y = displ)) +
  geom_point()

# here i m plotting cty consumption vs displacement which we notice are negatively correlated as expected
library(modelr)
## Warning: package 'modelr' was built under R version 4.2.3
# Fit a log-log linear model of cty on displ, attach its residuals to mpg
# (back-transformed from the log scale with exp()), and plot them against cty.
mod <- lm(log(cty) ~ log(displ), data = mpg)
mpg2 <- mutate(add_residuals(mpg, mod), resid = exp(resid))
ggplot(mpg2, aes(x = cty, y = resid)) +
  geom_point()

# Here we use a model to remove a pattern from the data: the model predicts
# city mileage from displacement, and the residuals (resid) show what is left of
# city mileage once the effect of displacement has been removed.
# Box plots of the residuals across cty. cty is continuous, so the boxes are
# grouped explicitly by cutting cty into bins of width 1; without a `group`
# aesthetic ggplot2 warns "Continuous x aesthetic".
ggplot(data = mpg2) +
  geom_boxplot(mapping = aes(x = cty, y = resid, group = cut_width(cty, 1)))

# Frequency polygon of cty, written with every argument named explicitly.
ggplot(data = mpg) +
  geom_freqpoly(mapping = aes(x = cty), binwidth = 2)

#rewriting same code more concisely
# Same plot, relying on positional matching for the data and x arguments.
ggplot(mpg, aes(cty)) +
  geom_freqpoly(binwidth = 2)

# Tile plot of the cty/class counts, with the fill showing the count n.
count(mpg, cty, class) %>%
  ggplot(aes(x = cty, y = class)) +
  geom_tile(aes(fill = n))
