Visualizing & Summarizing Relationships

Load Packages

# Structural overview of the data: row/column counts, then one line per
# column showing its type and first few values.
duke_forest |>
  glimpse()
Rows: 98
Columns: 13
$ address    <chr> "1 Learned Pl, Durham, NC 27705", "1616 Pinecrest Rd, Durha…
$ price      <dbl> 1520000, 1030000, 420000, 680000, 428500, 456000, 1270000, …
$ bed        <dbl> 3, 5, 2, 4, 4, 3, 5, 4, 4, 3, 4, 4, 3, 5, 4, 5, 3, 4, 4, 3,…
$ bath       <dbl> 4.0, 4.0, 3.0, 3.0, 3.0, 3.0, 5.0, 3.0, 5.0, 2.0, 3.0, 3.0,…
$ area       <dbl> 6040, 4475, 1745, 2091, 1772, 1950, 3909, 2841, 3924, 2173,…
$ type       <chr> "Single Family", "Single Family", "Single Family", "Single …
$ year_built <dbl> 1972, 1969, 1959, 1961, 2020, 2014, 1968, 1973, 1972, 1964,…
$ heating    <chr> "Other, Gas", "Forced air, Gas", "Forced air, Gas", "Heat p…
$ cooling    <fct> central, central, central, central, central, central, centr…
$ parking    <chr> "0 spaces", "Carport, Covered", "Garage - Attached, Covered…
$ lot        <dbl> 0.97, 1.38, 0.51, 0.84, 0.16, 0.45, 0.94, 0.79, 0.53, 0.73,…
$ hoa        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ url        <chr> "https://www.zillow.com/homedetails/1-Learned-Pl-Durham-NC-…
# Scatterplot of price (y) against area (x), one point per row.
duke_forest |>
  ggplot(aes(x = area, y = price)) +
  geom_point()

# Correlation between area and price over all rows, returned as a
# one-row tibble with a single column `r`.
summarize(duke_forest, r = cor(area, price))
# A tibble: 1 × 1
      r
  <dbl>
1 0.667
# Histogram of price in bins 100,000 wide, with bars filled by cooling type.
duke_forest |>
  ggplot(aes(x = price, fill = cooling)) +
  geom_histogram(binwidth = 1e5)

# Side-by-side boxplots of price, one box per cooling type
# (price on the x-axis, cooling on the y-axis).
duke_forest |>
  ggplot(aes(x = price, y = cooling)) +
  geom_boxplot()

# Overlaid density curves of price, one per cooling type. Outline and fill
# are both mapped to cooling; alpha = 0.5 keeps overlapping curves visible.
duke_forest |>
  ggplot(aes(x = price, color = cooling, fill = cooling)) +
  geom_density(alpha = 0.5)

# Violin plots of price, one violin per cooling type
# (price on the x-axis, cooling on the y-axis).
duke_forest |>
  ggplot(aes(x = price, y = cooling)) +
  geom_violin()

# Numerical summary of price: center (median), spread (IQR), and the
# extremes (min, max), returned as a one-row tibble.
summarize(
  duke_forest,
  median_price = median(price),
  min_price = min(price),
  iqr_price = IQR(price),
  max_price = max(price)
)
# A tibble: 1 × 4
  median_price min_price iqr_price max_price
         <dbl>     <dbl>     <dbl>     <dbl>
1       540000     95000    193125   1520000
# Add a two-level era label derived from year_built: anything before 1970 is
# "1969 or earlier", everything else is "1970 or later".
duke_forest <- mutate(
  duke_forest,
  year_built_cat = if_else(
    year_built < 1970,
    "1969 or earlier",
    "1970 or later"
  )
)

# Show the original year column next to the new category to check the
# recoding (selects every column whose name contains "year_built").
select(duke_forest, contains("year_built"))
# A tibble: 98 × 2
   year_built year_built_cat 
        <dbl> <chr>          
 1       1972 1970 or later  
 2       1969 1969 or earlier
 3       1959 1969 or earlier
 4       1961 1969 or earlier
 5       2020 1970 or later  
 6       2014 1970 or later  
 7       1968 1969 or earlier
 8       1973 1970 or later  
 9       1972 1970 or later  
10       1964 1969 or earlier
# ℹ 88 more rows
# Bar chart of the number of houses in each build era, with bars filled by
# cooling type.
# The layers must be joined with `+`: without it, ggplot() is drawn with no
# layers and geom_bar() is evaluated on its own, which only prints the bare
# layer object instead of adding bars to the plot.
ggplot(duke_forest, aes(x = year_built_cat, fill = cooling)) +
  geom_bar()

# Share of each cooling type within each build era.
# count() gives one row per (era, cooling) pair; grouping by era makes the
# proportion relative to that era's total, so `prop` sums to 1 within an era.
# The result is left grouped by year_built_cat.
duke_forest |>
  count(year_built_cat, cooling) |>
  group_by(year_built_cat) |>
  mutate(prop = proportions(n))
# A tibble: 4 × 4
# Groups:   year_built_cat [2]
  year_built_cat  cooling     n  prop
  <chr>           <fct>   <int> <dbl>
1 1969 or earlier other      39 0.591
2 1969 or earlier central    27 0.409
3 1970 or later   other      14 0.438
4 1970 or later   central    18 0.562
# Customised scatterplot of price against area, with color and point shape
# both mapped to cooling and one stacked panel per cooling type.
duke_forest |>
  ggplot(aes(x = area, y = price, color = cooling, shape = cooling)) +
  # Large, semi-transparent points so overlapping houses stay visible.
  geom_point(size = 4, alpha = 0.7) +
  # One panel per cooling type, arranged in a single column.
  facet_wrap(vars(cooling), ncol = 1) +
  # Thousands separators on the area axis, dollar formatting on the price axis.
  scale_x_continuous(labels = label_number(big.mark = ",")) +
  scale_y_continuous(labels = label_dollar()) +
  # Explicit color for each cooling type, matched by name.
  scale_color_manual(
    values = c(
      "central" = "darkblue",
      "forced air" = "red",
      "other" = "green"
    )
  ) +
  # Color and shape share one legend title.
  labs(
    title = "Houses in Duke Forest",
    subtitle = "Durham, NC",
    x = "Area (sqft)",
    y = "Price ($)",
    color = "Cooling Type",
    shape = "Cooling Type",
    caption = "Data Source: Zillow"
  )

# Correlation between price and area computed separately for each cooling
# type: one row per cooling level, with the coefficient in `r`.
summarize(
  group_by(duke_forest, cooling),
  r = cor(price, area)
)
# A tibble: 2 × 2
  cooling     r
  <fct>   <dbl>
1 other   0.459
2 central 0.854