Import data

# Read the workbook, then force `diagnosed` to numeric. Any entry that does
# not parse as a number becomes NA, which is what raises the coercion warning.
data <- read_excel(path = "../00_data/my_data.xlsx") %>%
  mutate(diagnosed = as.numeric(diagnosed))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `diagnosed = as.numeric(diagnosed)`.
## Caused by warning:
## ! NAs introduced by coercion

Variation

Visualizing distributions

# Bar chart of how many rows fall under each service.
ggplot(data, aes(x = service)) +
  geom_bar()

Typical values

# Bar chart of row counts per service, with a title and axis labels.
# The x axis is mapped to `service`, so it is labelled "Service".
data %>%
  ggplot(aes(x = service)) +
  geom_bar() +
  labs(
    title = "Distribution of Service",
    x = "Service",
    y = "Count"
  )

Missing values

# Show only the rows where `diagnosed` is present. The result is printed,
# not assigned, so `data` itself still contains the NA rows.
filter(data, !is.na(diagnosed))
## # A tibble: 438 × 5
##    service component severity         diagnosed  year
##    <chr>   <chr>     <chr>                <dbl> <dbl>
##  1 Army    Active    Penetrating            189  2006
##  2 Army    Active    Severe                 102  2006
##  3 Army    Active    Moderate               709  2006
##  4 Army    Active    Mild                  5896  2006
##  5 Army    Active    Not Classifiable       122  2006
##  6 Army    Guard     Penetrating             33  2006
##  7 Army    Guard     Severe                  26  2006
##  8 Army    Guard     Moderate               177  2006
##  9 Army    Guard     Mild                  1332  2006
## 10 Army    Guard     Not Classifiable        29  2006
## # ℹ 428 more rows

A categorical and continuous variable

# Box plot of `diagnosed` for each severity level.
data %>%
  ggplot(aes(x = severity, y = diagnosed)) +
  geom_boxplot() +
  labs(title = "Diagnosed Cases by Severity", x = "Severity", y = "Diagnosed")
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

Two categorical variables

# Tally each component/severity pair and draw the tallies as a heatmap
# whose fill encodes the count `n` produced by count().
ggplot(
  count(data, component, severity),
  aes(x = component, y = severity, fill = n)
) +
  geom_tile()

Two numeric variables

# Scatter plot of `diagnosed` against `year`; points are semi-transparent
# so that overlapping observations remain visible.
data %>%
  ggplot(aes(x = year, y = diagnosed)) +
  geom_point(alpha = 0.3) +
  labs(
    x = "Year",
    y = "Diagnosed",
    title = "Relationship Between Year and Diagnosed Cases"
  )
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).