Import Data

# Load the workbook into `data`; the relative path is resolved against the
# current working directory.
data <- read_excel(path = "../00_data/MyData.xlsx")

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: number of rows for each value of Year.
ggplot(data, aes(x = Year)) +
  geom_bar()

# Histogram of Kilometers_Driven using the default binning.
ggplot(data, aes(x = Kilometers_Driven)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of Year, one coloured line per Owner_Type.
ggplot(data) +
  geom_freqpoly(aes(x = Year, colour = Owner_Type))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Distribution of Kilometers_Driven (default binning) to look for typical
# values.
ggplot(data) +
  geom_histogram(aes(x = Kilometers_Driven))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values

# Histogram of Year with narrow (0.5-wide) bins.
ggplot(data, aes(x = Year)) +
  geom_histogram(binwidth = 0.5)

# Histogram of Seats with 0.5-wide bins; the visible y range is restricted to
# 0-85 so that low-count bars can be seen.
ggplot(data, aes(x = Seats)) +
  geom_histogram(binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 85))

Missing Values

# Replace unusual prices (below 3 or above 20) with NA instead of dropping the
# whole row, so the other columns of those observations are kept.
# The result must be written back to Price: assigning it to Brand would
# overwrite the brand names with numbers and leave Price untouched, so the
# plot below would still show the unusual values.
data2 <- data %>%
  mutate(Price = ifelse(Price < 3 | Price > 20, NA, Price))

# Scatterplot of Price against Year. na.rm = TRUE drops the rows whose Price
# was set to NA above without emitting a missing-values warning.
ggplot(data = data2, mapping = aes(x = Year, y = Price)) +
  geom_point(na.rm = TRUE)

Covariation

A categorical and continuous variable

# Boxplot of Price for each Brand.
ggplot(data, aes(x = Brand, y = Price)) +
  geom_boxplot()

Two categorical variables

# Tile plot of the number of rows (n) for every Owner_Type / Brand pair.
ggplot(
  count(data, Owner_Type, Brand),
  aes(x = Owner_Type, y = Brand, fill = n)
) +
  geom_tile()

Two continuous variables

Patterns and models