0) Setup
# Se mancano pacchetti: togli il # e lancia una volta
# install.packages(c("readr","dplyr","ggplot2","knitr","lubridate","forcats"))
library(readr) # lettura CSV
library(dplyr) # wrangling
##
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
##
## filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2) # grafici
library(knitr) # tabelle
library(lubridate) # date
##
## Caricamento pacchetto: 'lubridate'
## I seguenti oggetti sono mascherati da 'package:base':
##
## date, intersect, setdiff, union
library(forcats) # fattori ordinati
knitr::opts_chunk$set(message = FALSE, warning = FALSE, fig.width = 7, fig.height = 4.5)
# Gestisce entrambi i possibili nomi del file
csv_path <- if (file.exists("realestate_texas.csv")) "realestate_texas.csv" else
if (file.exists("Real Estate Texas.csv")) "Real Estate Texas.csv" else
stop("CSV non trovato: metti il file nella stessa cartella del .Rmd.")
df <- read_csv(csv_path, show_col_types = FALSE)
# Controlli base
dim(df) # righe, colonne
## [1] 240 8
glimpse(df) # struttura
## Rows: 240
## Columns: 8
## $ city <chr> "Beaumont", "Beaumont", "Beaumont", "Beaumont", "Beau…
## $ year <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,…
## $ month <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5,…
## $ sales <dbl> 83, 108, 182, 200, 202, 189, 164, 174, 124, 150, 150,…
## $ volume <dbl> 14.162, 17.690, 28.701, 26.819, 28.833, 27.219, 22.70…
## $ median_price <dbl> 163800, 138200, 122400, 123200, 123100, 122800, 12430…
## $ listings <dbl> 1533, 1586, 1689, 1708, 1771, 1803, 1857, 1830, 1829,…
## $ months_inventory <dbl> 9.5, 10.0, 10.6, 10.6, 10.9, 11.1, 11.7, 11.6, 11.7, …
summary(df) # riassunto statistico
## city year month sales
## Length:240 Min. :2010 Min. : 1.00 Min. : 79.0
## Class :character 1st Qu.:2011 1st Qu.: 3.75 1st Qu.:127.0
## Mode :character Median :2012 Median : 6.50 Median :175.5
## Mean :2012 Mean : 6.50 Mean :192.3
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:247.0
## Max. :2014 Max. :12.00 Max. :423.0
## volume median_price listings months_inventory
## Min. : 8.166 Min. : 73800 Min. : 743 Min. : 3.400
## 1st Qu.:17.660 1st Qu.:117300 1st Qu.:1026 1st Qu.: 7.800
## Median :27.062 Median :134500 Median :1618 Median : 8.950
## Mean :31.005 Mean :132665 Mean :1738 Mean : 9.193
## 3rd Qu.:40.893 3rd Qu.:150050 3rd Qu.:2056 3rd Qu.:10.950
## Max. :83.547 Max. :180000 Max. :3296 Max. :14.900
df <- df %>%
mutate(
year = as.integer(year),
month = as.integer(month),
date = as.Date(paste(year, month, "01", sep = "-")),
month_lbl = factor(month, levels = 1:12, labels = month.abb, ordered = TRUE)
) %>%
arrange(city, date)
# Valori mancanti per colonna (dovrebbero essere 0)
colSums(is.na(df))
## city year month sales
## 0 0 0 0
## volume median_price listings months_inventory
## 0 0 0 0
## date month_lbl
## 0 0
year_agg <- df %>%
group_by(year) %>%
summarise(
total_sales = sum(sales),
total_volume = sum(volume), # milioni di $
avg_price = mean(median_price),
avg_inventory = mean(months_inventory),
.groups = "drop"
)
kable(year_agg, digits = 2, caption = "KPI annuali aggregati (tutte le città)")
KPI annuali aggregati (tutte le città)
2010 |
8096 |
1232.44 |
130191.7 |
9.97 |
2011 |
7878 |
1207.58 |
127854.2 |
10.90 |
2012 |
8935 |
1404.84 |
130077.1 |
9.88 |
2013 |
10172 |
1687.32 |
135722.9 |
8.15 |
2014 |
11069 |
1909.07 |
139481.2 |
7.06 |
ggplot(year_agg, aes(year, total_sales)) +
geom_line() + geom_point() +
labs(title = "Vendite totali per anno (tutte le città)", x = "Anno", y = "Vendite")

ggplot(year_agg, aes(year, avg_price)) +
geom_line() + geom_point() +
labs(title = "Prezzo mediano medio per anno", x = "Anno", y = "Prezzo mediano ($)")

ggplot(year_agg, aes(year, avg_inventory)) +
geom_line() + geom_point() +
labs(title = "Months of Inventory (media annua)", x = "Anno", y = "Mesi")

seasonality <- df %>%
group_by(month_lbl) %>%
summarise(
avg_sales = mean(sales),
avg_price = mean(median_price),
avg_inv = mean(months_inventory),
.groups = "drop"
)
kable(seasonality, digits = 2, caption = "Stagionalità media per mese")
Stagionalità media per mese
Jan |
127.40 |
124250 |
8.84 |
Feb |
140.85 |
130075 |
9.06 |
Mar |
189.45 |
127415 |
9.40 |
Apr |
211.70 |
131490 |
9.72 |
May |
238.85 |
134485 |
9.68 |
Jun |
243.55 |
137620 |
9.70 |
Jul |
235.75 |
134750 |
9.62 |
Aug |
231.45 |
136675 |
9.39 |
Sep |
182.35 |
134040 |
9.19 |
Oct |
179.90 |
133480 |
8.94 |
Nov |
156.85 |
134305 |
8.66 |
Dec |
169.40 |
133400 |
8.12 |
ggplot(seasonality, aes(month_lbl, avg_sales, group = 1)) +
geom_line() + geom_point() +
labs(title = "Stagionalità delle vendite (media)", x = "Mese", y = "Vendite medie")

last_year <- max(df$year, na.rm = TRUE)
city_latest <- df %>%
filter(year == last_year) %>%
group_by(city) %>%
summarise(
sales = sum(sales),
volume = sum(volume),
price = mean(median_price),
inventory = mean(months_inventory),
.groups = "drop"
) %>%
arrange(desc(sales))
kable(city_latest, digits = 2, caption = paste("Confronto città –", last_year))
Confronto città – 2014
Tyler |
3978 |
715.22 |
150466.7 |
8.76 |
Bryan-College Station |
3123 |
633.75 |
169533.3 |
4.38 |
Beaumont |
2564 |
385.58 |
132250.0 |
7.64 |
Wichita Falls |
1404 |
174.51 |
105675.0 |
7.44 |
ggplot(city_latest, aes(reorder(city, sales), sales)) +
geom_col() +
coord_flip() +
labs(title = paste("Vendite per città –", last_year), x = "Città", y = "Vendite")

# Matrice di correlazione globale
cor_mat <- df %>%
select(sales, volume, median_price, listings, months_inventory) %>%
cor(use = "complete.obs")
kable(round(cor_mat, 2), caption = "Matrice di correlazione (globale)")
Matrice di correlazione (globale)
sales |
1.00 |
0.98 |
0.59 |
0.62 |
0.15 |
volume |
0.98 |
1.00 |
0.70 |
0.57 |
0.06 |
median_price |
0.59 |
0.70 |
1.00 |
0.40 |
-0.04 |
listings |
0.62 |
0.57 |
0.40 |
1.00 |
0.74 |
months_inventory |
0.15 |
0.06 |
-0.04 |
0.74 |
1.00 |
# Prezzo vs Vendite (colore per città)
ggplot(df, aes(median_price, sales, color = city)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Prezzo mediano vs Vendite", x = "Prezzo mediano ($)", y = "Vendite mensili")
## `geom_smooth()` using formula = 'y ~ x'

# Inventory vs Vendite (colore per città)
ggplot(df, aes(months_inventory, sales, color = city)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(title = "Months of Inventory vs Vendite", x = "Mesi di inventory", y = "Vendite mensili")
## `geom_smooth()` using formula = 'y ~ x'
