0) Setup

# Se mancano pacchetti: togli il # e lancia una volta
# install.packages(c("readr","dplyr","ggplot2","knitr","lubridate","forcats"))

library(readr)     # lettura CSV
library(dplyr)     # wrangling
## 
## Caricamento pacchetto: 'dplyr'
## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)   # grafici
library(knitr)     # tabelle
library(lubridate) # date
## 
## Caricamento pacchetto: 'lubridate'
## I seguenti oggetti sono mascherati da 'package:base':
## 
##     date, intersect, setdiff, union
library(forcats)   # fattori ordinati

knitr::opts_chunk$set(message = FALSE, warning = FALSE, fig.width = 7, fig.height = 4.5)
# Gestisce entrambi i possibili nomi del file
csv_path <- if (file.exists("realestate_texas.csv")) "realestate_texas.csv" else
            if (file.exists("Real Estate Texas.csv")) "Real Estate Texas.csv" else
            stop("CSV non trovato: metti il file nella stessa cartella del .Rmd.")

df <- read_csv(csv_path, show_col_types = FALSE)

# Controlli base
dim(df)        # righe, colonne
## [1] 240   8
glimpse(df)    # struttura
## Rows: 240
## Columns: 8
## $ city             <chr> "Beaumont", "Beaumont", "Beaumont", "Beaumont", "Beau…
## $ year             <dbl> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,…
## $ month            <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5,…
## $ sales            <dbl> 83, 108, 182, 200, 202, 189, 164, 174, 124, 150, 150,…
## $ volume           <dbl> 14.162, 17.690, 28.701, 26.819, 28.833, 27.219, 22.70…
## $ median_price     <dbl> 163800, 138200, 122400, 123200, 123100, 122800, 12430…
## $ listings         <dbl> 1533, 1586, 1689, 1708, 1771, 1803, 1857, 1830, 1829,…
## $ months_inventory <dbl> 9.5, 10.0, 10.6, 10.6, 10.9, 11.1, 11.7, 11.6, 11.7, …
summary(df)    # riassunto statistico
##      city                year          month           sales      
##  Length:240         Min.   :2010   Min.   : 1.00   Min.   : 79.0  
##  Class :character   1st Qu.:2011   1st Qu.: 3.75   1st Qu.:127.0  
##  Mode  :character   Median :2012   Median : 6.50   Median :175.5  
##                     Mean   :2012   Mean   : 6.50   Mean   :192.3  
##                     3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:247.0  
##                     Max.   :2014   Max.   :12.00   Max.   :423.0  
##      volume        median_price       listings    months_inventory
##  Min.   : 8.166   Min.   : 73800   Min.   : 743   Min.   : 3.400  
##  1st Qu.:17.660   1st Qu.:117300   1st Qu.:1026   1st Qu.: 7.800  
##  Median :27.062   Median :134500   Median :1618   Median : 8.950  
##  Mean   :31.005   Mean   :132665   Mean   :1738   Mean   : 9.193  
##  3rd Qu.:40.893   3rd Qu.:150050   3rd Qu.:2056   3rd Qu.:10.950  
##  Max.   :83.547   Max.   :180000   Max.   :3296   Max.   :14.900
df <- df %>%
  mutate(
    year      = as.integer(year),
    month     = as.integer(month),
    date      = as.Date(paste(year, month, "01", sep = "-")),
    month_lbl = factor(month, levels = 1:12, labels = month.abb, ordered = TRUE)
  ) %>%
  arrange(city, date)

# Valori mancanti per colonna (dovrebbero essere 0)
colSums(is.na(df))
##             city             year            month            sales 
##                0                0                0                0 
##           volume     median_price         listings months_inventory 
##                0                0                0                0 
##             date        month_lbl 
##                0                0
year_agg <- df %>%
  group_by(year) %>%
  summarise(
    total_sales   = sum(sales),
    total_volume  = sum(volume),          # milioni di $
    avg_price     = mean(median_price),
    avg_inventory = mean(months_inventory),
    .groups = "drop"
  )

kable(year_agg, digits = 2, caption = "KPI annuali aggregati (tutte le città)")
KPI annuali aggregati (tutte le città)
year total_sales total_volume avg_price avg_inventory
2010 8096 1232.44 130191.7 9.97
2011 7878 1207.58 127854.2 10.90
2012 8935 1404.84 130077.1 9.88
2013 10172 1687.32 135722.9 8.15
2014 11069 1909.07 139481.2 7.06
ggplot(year_agg, aes(year, total_sales)) +
  geom_line() + geom_point() +
  labs(title = "Vendite totali per anno (tutte le città)", x = "Anno", y = "Vendite")

ggplot(year_agg, aes(year, avg_price)) +
  geom_line() + geom_point() +
  labs(title = "Prezzo mediano medio per anno", x = "Anno", y = "Prezzo mediano ($)")

ggplot(year_agg, aes(year, avg_inventory)) +
  geom_line() + geom_point() +
  labs(title = "Months of Inventory (media annua)", x = "Anno", y = "Mesi")

seasonality <- df %>%
  group_by(month_lbl) %>%
  summarise(
    avg_sales = mean(sales),
    avg_price = mean(median_price),
    avg_inv   = mean(months_inventory),
    .groups = "drop"
  )

kable(seasonality, digits = 2, caption = "Stagionalità media per mese")
Stagionalità media per mese
month_lbl avg_sales avg_price avg_inv
Jan 127.40 124250 8.84
Feb 140.85 130075 9.06
Mar 189.45 127415 9.40
Apr 211.70 131490 9.72
May 238.85 134485 9.68
Jun 243.55 137620 9.70
Jul 235.75 134750 9.62
Aug 231.45 136675 9.39
Sep 182.35 134040 9.19
Oct 179.90 133480 8.94
Nov 156.85 134305 8.66
Dec 169.40 133400 8.12
ggplot(seasonality, aes(month_lbl, avg_sales, group = 1)) +
  geom_line() + geom_point() +
  labs(title = "Stagionalità delle vendite (media)", x = "Mese", y = "Vendite medie")

last_year <- max(df$year, na.rm = TRUE)

city_latest <- df %>%
  filter(year == last_year) %>%
  group_by(city) %>%
  summarise(
    sales     = sum(sales),
    volume    = sum(volume),
    price     = mean(median_price),
    inventory = mean(months_inventory),
    .groups = "drop"
  ) %>%
  arrange(desc(sales))

kable(city_latest, digits = 2, caption = paste("Confronto città –", last_year))
Confronto città – 2014
city sales volume price inventory
Tyler 3978 715.22 150466.7 8.76
Bryan-College Station 3123 633.75 169533.3 4.38
Beaumont 2564 385.58 132250.0 7.64
Wichita Falls 1404 174.51 105675.0 7.44
ggplot(city_latest, aes(reorder(city, sales), sales)) +
  geom_col() +
  coord_flip() +
  labs(title = paste("Vendite per città –", last_year), x = "Città", y = "Vendite")

# Matrice di correlazione globale
cor_mat <- df %>%
  select(sales, volume, median_price, listings, months_inventory) %>%
  cor(use = "complete.obs")

kable(round(cor_mat, 2), caption = "Matrice di correlazione (globale)")
Matrice di correlazione (globale)
sales volume median_price listings months_inventory
sales 1.00 0.98 0.59 0.62 0.15
volume 0.98 1.00 0.70 0.57 0.06
median_price 0.59 0.70 1.00 0.40 -0.04
listings 0.62 0.57 0.40 1.00 0.74
months_inventory 0.15 0.06 -0.04 0.74 1.00
# Prezzo vs Vendite (colore per città)
ggplot(df, aes(median_price, sales, color = city)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Prezzo mediano vs Vendite", x = "Prezzo mediano ($)", y = "Vendite mensili")
## `geom_smooth()` using formula = 'y ~ x'

# Inventory vs Vendite (colore per città)
ggplot(df, aes(months_inventory, sales, color = city)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE) +
  labs(title = "Months of Inventory vs Vendite", x = "Mesi di inventory", y = "Vendite mensili")
## `geom_smooth()` using formula = 'y ~ x'