Load the data
library(readr)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ dplyr 1.0.7
## ✓ tibble 3.1.3 ✓ stringr 1.4.0
## ✓ tidyr 1.1.3 ✓ forcats 0.5.1
## ✓ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggrepel)
library(ggthemes)
# Read the housing data; the path is relative to the working directory.
housing <- read_csv("landdata-states.csv")
## Rows: 7803 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): State, region
## dbl (9): Date, Home.Value, Structure.Cost, Land.Value, Land.Share..Pct., Hom...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(housing)

# Keep the rows whose Date is 2001.25. Date is a double column, so compare
# with near() (tolerance-based) rather than exact `==`.
hp2001Q1 <- filter(housing, near(Date, 2001.25))

# First look: Structure.Cost against log(Land.Value) for the filtered rows.
ggplot(hp2001Q1, aes(x = log(Land.Value), y = Structure.Cost)) +
  geom_point()

Scatter plot
# Prediction line
# Regress Structure.Cost on log(Land.Value) and store the prediction for every
# row of hp2001Q1. predict.lm() receives the rows to predict through
# `newdata`; an argument called `data` is silently absorbed by `...` and
# ignored, which would break the row alignment as soon as a model variable
# contains an NA.
hp2001Q1$pred.SC <- predict(
  lm(Structure.Cost ~ log(Land.Value), data = hp2001Q1),
  newdata = hp2001Q1
)
#View(hp2001Q1)

# Base plot reused by the scatter-plot examples below.
p1 <- ggplot(hp2001Q1, aes(x = log(Land.Value), y = Structure.Cost))

# Add the prediction line by overriding y inside geom_line() only.
# The colour mapping is declared inside geom_point(), so it affects the points
# and not the line.
p1 +
  geom_point(aes(color = Home.Value)) +
  geom_line(aes(y = pred.SC))

# Points coloured by Home.Value, overlaid with geom_smooth() using its
# defaults (the message below reports which method and formula were chosen).
p1 +
  geom_point(mapping = aes(colour = Home.Value)) +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Text
# Draw the State value of each observation as text in place of a point.
p1 +
  geom_text(mapping = aes(label = State), size = 3)

# Keep the points and add the State labels with ggrepel's geom_text_repel().
p1 +
  geom_point() +
  geom_text_repel(mapping = aes(label = State), size = 3)

# A wrong command: size = 2 is placed inside aes(), so it is treated as a
# mapped variable instead of a fixed point size.
# p1 + geom_point(aes(size = 2), color = "red")
# Correct and better one: map real columns to the colour and shape aesthetics.
p1 +
  geom_point(mapping = aes(colour = Home.Value, shape = region))
## Warning: Removed 1 rows containing missing values (geom_point).

Histogram
#args(geom_histogram)
# Base plot for the histogram examples: Home.Value across all rows of housing.
p2 <- ggplot(data = housing, mapping = aes(x = Home.Value))

# Default binning; the message below reports the number of bins used.
p2 +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram with an explicit bin width of 4000.
p2 +
  geom_histogram(binwidth = 4000)

Bar chart
#args(geom_bar)
# Mean Home.Value for each State.
housing.sum <- aggregate(housing["Home.Value"], housing["State"], FUN = mean)
# The bar heights are already computed, so use geom_col(), which plots the
# supplied y values directly. geom_bar() by default counts rows and expects
# only an x variable.
ggplot(housing.sum, aes(x = State, y = Home.Value)) +
  geom_col()

Scales
# Jitter the points horizontally (no vertical jitter) and draw them
# semi-transparent so that overlapping observations remain visible.
p4 <- ggplot(housing, aes(x = State, y = Home.Price.Index)) +
  theme(legend.position = "top") +
  geom_point(
    aes(color = Date),
    alpha = 0.5,
    size = 1.5,
    position = position_jitter(width = 0.25, height = 0)
  )

# Scale the color
# "low" and "high" apply when the variable is continuous. Specific colors
# should be assigned when it is discrete.
# scale_color_gradient() is called directly because low/high are its own
# arguments; scale_color_continuous() only forwards them when the continuous
# colour type is the default gradient.
p4 +
  scale_x_discrete(name = "State Abbreviations") +
  scale_color_gradient(
    name = "",
    breaks = c(1976, 1994, 2013),
    labels = c("76", "94", "13"),
    low = "blue",
    high = "red"
  )

Faceting
# Base plot for the faceting examples: Home.Value over Date.
p5 <- ggplot(data = housing, mapping = aes(x = Date, y = Home.Value))

# A horrible example: every State drawn as a coloured line in a single panel.
p5 +
  geom_line(mapping = aes(colour = State))

# facet_wrap() applies when there is one variable to look into:
# one panel per State, laid out 10 columns wide.
p5 +
  geom_line() +
  facet_wrap(facets = ~State, ncol = 10) +
  theme_minimal()
