library(e1071)

## Warning: package 'e1071' was built under R version 4.3.3

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(knitr)

## Warning: package 'knitr' was built under R version 4.3.3

library(kableExtra)

## 
## Attaching package: 'kableExtra'

## The following object is masked from 'package:dplyr':
## 
##     group_rows

library(scales)   
library(pastecs)

## 
## Attaching package: 'pastecs'

## The following objects are masked from 'package:dplyr':
## 
##     first, last

library(tibble)  


# Print the current working directory so the rendered output records where the
# script was run from.
getwd()

## [1] "/Users/chiaratombolini/Desktop/RSTUDIO"

# Read the semicolon-separated, Latin-1 encoded CSV and coerce every column to
# the type used by the rest of the analysis.
df <- read.csv(
  "/Users/chiaratombolini/Desktop/RSTUDIO/realestate_texas1.csv",
  sep = ";",
  fileEncoding = "latin1",
  stringsAsFactors = FALSE
) %>%
  mutate(
    # Drop a trailing ".00" from the textual value, then parse it as a number.
    volume = as.numeric(sub("\\.00$", "", as.character(volume))),
    # Replace the literal "1,00E+05" with "100000", trim whitespace, then parse.
    median_price = as.numeric(
      trimws(
        gsub("1,00E\\+05", "100000", as.character(median_price))
      )
    ),
    # Categorical columns; month numbers 1..12 become unordered factor levels
    # labelled Jan..Dec.
    city = as.factor(city),
    year = as.factor(year),
    month = factor(
      month,
      levels = 1:12,
      labels = month.abb,
      ordered = FALSE
    )
  )

# Show the first rows as a quick check of the import.
head(df)

##       city year month sales volume median_price listings months_inventory
## 1 Beaumont 2010   Jan    83  16.42       163800     1533              9.5
## 2 Beaumont 2010   Feb   108  18.09       138200     1586             10.0
## 3 Beaumont 2010   Mar   182  39.41       122400     1689             10.6
## 4 Beaumont 2010   Apr   200  39.39       123200     1708             10.6
## 5 Beaumont 2010   May   202  41.53       123100     1771             10.9
## 6 Beaumont 2010   Jun   189  30.39       122800     1803             11.1

# --- 1. DESCRIZIONE DELLE VARIABILI ---

# Lookup table describing each dataset variable: its statistical type and its
# scale of measurement (one row per variable).
var_types <- data.frame(
  Variabile = c(
    "city",
    "year",
    "month",
    "sales",
    "volume",
    "median_price",
    "listings",
    "months_inventory"
  ),
  Tipo = c(
    "Qualitativa nominale",
    "Quantitativa discreta (trattata come qualitativa ordinale)",
    "Qualitativa nominale (ciclica)",
    "Quantitativa discreta",
    "Quantitativa continua",
    "Quantitativa continua",
    "Quantitativa discreta",
    "Quantitativa continua"
  ),
  # The three categorical variables come first; the five numeric ones are all
  # on a ratio scale.
  Scala = rep(
    c("Nominale", "Ordinale", "Nominale", "Rapporti"),
    times = c(1, 1, 1, 5)
  )
)

# Print a structural summary of the lookup table.
summary(var_types)

##   Variabile             Tipo              Scala          
##  Length:8           Length:8           Length:8          
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character

# Render the variable-description table with striped rows, not stretched to
# the full page width.
kable_styling(
  kable(var_types, caption = "Definizione delle variabili e scale di misura"),
  bootstrap_options = "striped",
  full_width = FALSE
)

Definizione delle variabili e scale di misura
Variabile	Tipo	Scala
city	Qualitativa nominale	Nominale
year	Quantitativa discreta (trattata come qualitativa ordinale)	Ordinale
month	Qualitativa nominale (ciclica)	Nominale
sales	Quantitativa discreta	Rapporti
volume	Quantitativa continua	Rapporti
median_price	Quantitativa continua	Rapporti
listings	Quantitativa discreta	Rapporti
months_inventory	Quantitativa continua	Rapporti

# Render the first six rows of the dataset with the same table styling.
kable_styling(
  kable(head(df), caption = "Prime 6 osservazioni del dataset"),
  bootstrap_options = "striped",
  full_width = FALSE
)

Prime 6 osservazioni del dataset
city	year	month	sales	volume	median_price	listings	months_inventory
Beaumont	2010	Jan	83	16.42	163800	1533	9.5
Beaumont	2010	Feb	108	18.09	138200	1586	10.0
Beaumont	2010	Mar	182	39.41	122400	1689	10.6
Beaumont	2010	Apr	200	39.39	123200	1708	10.6
Beaumont	2010	May	202	41.53	123100	1771	10.9
Beaumont	2010	Jun	189	30.39	122800	1803	11.1

# --- 2. INDICI STATISTICI DESCRITTIVI ---

# Keep only the numeric measures that the descriptive statistics apply to.
variabili_numeriche <- df %>%
  select(sales, volume, median_price, listings, months_inventory)

# Descriptive statistics: one row per variable, one column per index.
# vapply() (instead of sapply()) fixes the result at exactly seven numbers per
# column, so a malformed column raises an error instead of silently changing
# the shape of the result.
desc_stats <- variabili_numeriche %>%
  vapply(
    function(x) {
      m <- mean(x, na.rm = TRUE)
      s <- sd(x, na.rm = TRUE)
      # The coefficient of variation is undefined when the mean is zero or
      # missing; report NA in that case.
      cv_val <- if (isTRUE(m != 0)) s / m else NA_real_
      c(
        Media = round(m, 2),
        CV = round(cv_val, 3),
        Minimo = round(min(x, na.rm = TRUE), 2),
        Massimo = round(max(x, na.rm = TRUE), 2),
        Mediana = round(median(x, na.rm = TRUE), 2),
        IQR = round(IQR(x, na.rm = TRUE), 2),
        Asimmetria = round(e1071::skewness(x, na.rm = TRUE), 2)
      )
    },
    FUN.VALUE = c(
      Media = 0, CV = 0, Minimo = 0, Massimo = 0,
      Mediana = 0, IQR = 0, Asimmetria = 0
    )
  ) %>%
  # vapply() returns indices in rows and variables in columns; transpose so
  # that each variable becomes a row.
  t() %>%
  as.data.frame()

# Render the statistics table.
kable(desc_stats, caption = "Statistiche descrittive delle variabili numeriche (esclusa Dev.Std, incluso CV)") %>%
  kable_styling(bootstrap_options = "striped", full_width = FALSE)

Statistiche descrittive delle variabili numeriche (esclusa Dev.Std, incluso CV)
	Media	CV	Minimo	Massimo	Mediana	IQR	Asimmetria
sales	192.29	0.414	79.0	423.00	175.50	120.00	0.71
volume	37.95	0.467	9.4	93.34	35.18	25.36	0.74
median_price	132665.42	0.171	73800.0	180000.00	134500.00	32750.00	-0.36
listings	1738.02	0.433	743.0	3296.00	1618.50	1029.50	0.65
months_inventory	9.19	0.251	3.4	14.90	8.95	3.15	0.04

# --- 3. ANALISI FREQUENZE ---

# Bin the median price into labelled bands. Intervals are closed on the right
# and the lowest break (70000) is included in the first band.
df <- df %>%
  mutate(
    price_class = cut(
      median_price,
      breaks = c(70000, seq(100000, 200000, by = 25000), Inf),
      labels = c(
        "70k-100k",
        "100k-125k",
        "125k-150k",
        "150k-175k",
        "175k-200k",
        ">200k"
      ),
      include.lowest = TRUE,
      right = TRUE
    )
  )

# Absolute count and percentage share of each price band.
freq_table <- df %>%
  count(price_class) %>%
  mutate(Percentuale = prop.table(n) * 100)

# Render the frequency table.
kable_styling(
  kable(
    freq_table,
    caption = "Distribuzione di frequenza delle fasce di prezzo mediano"
  ),
  bootstrap_options = "striped",
  full_width = FALSE
)

Distribuzione di frequenza delle fasce di prezzo mediano
price_class	n	Percentuale
70k-100k	26	10.83333
100k-125k	55	22.91667
125k-150k	99	41.25000
150k-175k	57	23.75000
175k-200k	3	1.25000

# Bar chart of the absolute frequency of each price band, with the count
# printed just above every bar.
ggplot(freq_table, aes(x = price_class, y = n)) +
  geom_col(fill = "orange", alpha = 0.8) +
  geom_text(aes(label = n), vjust = -0.5, size = 3.5, color = "black") +
  ggtitle("Distribuzione delle fasce di prezzo mediano") +
  xlab("Fasce di prezzo") +
  ylab("Frequenza Assoluta") +
  theme_minimal() +
  theme(
    # Centre the title, tilt the band labels and hide the vertical grid lines.
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank()
  )

# Install DescTools only when it is missing, so that re-running or re-knitting
# the script does not download and reinstall the package every time.
if (!requireNamespace("DescTools", quietly = TRUE)) {
  install.packages("DescTools", repos = "https://cran.rstudio.com/")
}

## 
## The downloaded binary packages are in
##  /var/folders/kp/vjcxcsz15z3209961tytkk8r0000gn/T//RtmpQFq9CC/downloaded_packages

library(DescTools)

## Warning: package 'DescTools' was built under R version 4.3.3

# Gini index of median_price (missing values removed), printed rounded to
# three decimals.
gini_median_price <- Gini(df[["median_price"]], na.rm = TRUE)
print(
  paste(
    "Indice di Gini per median_price:",
    round(gini_median_price, digits = 3)
  )
)

## [1] "Indice di Gini per median_price: 0.097"

# Gini index of sales (missing values removed), printed rounded to three
# decimals.
gini_sales <- Gini(df[["sales"]], na.rm = TRUE)
print(
  paste(
    "Indice di Gini per sales:",
    round(gini_sales, digits = 3)
  )
)

## [1] "Indice di Gini per sales: 0.232"

# --- 4. PROBABILITÀ ---

# Empirical probabilities: the share of rows in df that satisfy each event,
# rounded to three decimals.
probabilities <- data.frame(
  Evento = c(
    "P(Città = Beaumont)",
    "P(Mese = Luglio)",
    "P(Mese = Dicembre e Anno = 2012)"
  ),
  Probabilità = round(
    c(
      mean(df$city == "Beaumont", na.rm = TRUE),
      mean(df$month == "Jul", na.rm = TRUE),
      mean(df$month == "Dec" & df$year == "2012", na.rm = TRUE)
    ),
    3
  )
)

# Render the probability table.
kable_styling(
  kable(probabilities, caption = "Probabilità di eventi nel dataset"),
  bootstrap_options = "striped",
  full_width = FALSE
)

Probabilità di eventi nel dataset
Evento	Probabilità
P(Città = Beaumont)	0.250
P(Mese = Luglio)	0.083
P(Mese = Dicembre e Anno = 2012)	0.017

# --- 5. CREAZIONE NUOVE VARIABILI ---

# Derived indicators. A zero denominator now yields NA explicitly; the previous
# "+ 1e-9" epsilon turned a division by zero into a huge finite value and
# slightly biased every other ratio.
df <- df %>%
  mutate(
    # Sales value per sale, expressed in the same unit as `volume`.
    # NOTE(review): `volume` looks like millions of dollars when compared with
    # median_price, so this is not directly comparable to prices — confirm.
    mean_price_per_sale = if_else(sales != 0, volume / sales, NA_real_),
    # Ratio of sales to listings.
    listing_effectiveness = if_else(listings != 0, sales / listings, NA_real_)
  )

# --- 6. ANALISI CONDIZIONATA ---

# Per-city mean and coefficient of variation (sd / mean) of sales, median
# price and months of inventory; missing values are ignored.
city_stats <- summarise(
  group_by(df, city),
  Media_Vendite = round(mean(sales, na.rm = TRUE), digits = 1),
  CV_Vendite = round(
    sd(sales, na.rm = TRUE) / mean(sales, na.rm = TRUE),
    digits = 3
  ),
  Media_Prezzo = round(mean(median_price, na.rm = TRUE), digits = 0),
  CV_Prezzo = round(
    sd(median_price, na.rm = TRUE) / mean(median_price, na.rm = TRUE),
    digits = 3
  ),
  Media_Inventario = round(mean(months_inventory, na.rm = TRUE), digits = 1),
  CV_Inventario = round(
    sd(months_inventory, na.rm = TRUE) / mean(months_inventory, na.rm = TRUE),
    digits = 3
  ),
  .groups = "drop"
)

# Render the per-city table.
kable_styling(
  kable(city_stats, caption = "Statistiche descrittive per città (media e CV)"),
  bootstrap_options = "striped",
  full_width = FALSE
)

Statistiche descrittive per città (media e CV)
city	Media_Vendite	CV_Vendite	Media_Prezzo	CV_Prezzo	Media_Inventario	CV_Inventario
Beaumont	177.4	0.234	129988	0.078	10.0	0.165
Bryan-College Station	206.0	0.413	157488	0.056	7.7	0.293
Tyler	269.8	0.230	141442	0.066	11.3	0.167
Wichita Falls	116.1	0.191	101743	0.111	7.8	0.100

## Statistiche Descrittive Raggruppate per Anno
# Per-year mean and coefficient of variation (sd / mean) of sales and months
# of inventory; missing values are ignored.
year_stats <- summarise(
  group_by(df, year),
  Media_Vendite = round(mean(sales, na.rm = TRUE), digits = 1),
  CV_Vendite = round(
    sd(sales, na.rm = TRUE) / mean(sales, na.rm = TRUE),
    digits = 3
  ),
  Media_Inventario = round(mean(months_inventory, na.rm = TRUE), digits = 1),
  CV_Inventario = round(
    sd(months_inventory, na.rm = TRUE) / mean(months_inventory, na.rm = TRUE),
    digits = 3
  ),
  .groups = "drop"
)

# Render the per-year table.
kable_styling(
  kable(year_stats, caption = "Statistiche descrittive per anno (media e CV)"),
  bootstrap_options = "striped",
  full_width = FALSE
)

Statistiche descrittive per anno (media e CV)
year	Media_Vendite	CV_Vendite	Media_Inventario	CV_Inventario
2010	168.7	0.359	10.0	0.209
2011	164.1	0.389	10.9	0.190
2012	186.1	0.381	9.9	0.163
2013	211.9	0.396	8.2	0.207
2014	230.6	0.414	7.1	0.248

# --- 7. VISUALIZZAZIONI DESCRITTIVE ---

## Boxplot del Prezzo Mediano per Città

# Box plot of the median price for each city; the red diamond marks the
# city mean.
ggplot(df, aes(x = city, y = median_price, fill = city)) +
  geom_boxplot(alpha = 0.7, na.rm = TRUE) +
  stat_summary(
    fun = mean,
    geom = "point",
    shape = 18,
    size = 3,
    color = "red"
  ) +
  ggtitle(
    "Distribuzione del prezzo mediano per città",
    subtitle = "La media è rappresentata dal punto rosso"
  ) +
  xlab("") +
  ylab("Prezzo mediano ($)") +
  theme_minimal() +
  theme(
    # The fill colour duplicates the x axis, so the legend is hidden.
    legend.position = "none",
    plot.title = element_text(hjust = 0.5),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

## Trend Mensile delle Vendite Medie per Città

# Mean sales for each city and calendar month, averaged over all rows (years)
# in df.
monthly_sales <- df %>%
  group_by(city, month) %>%
  summarise(Media_Vendite = mean(sales, na.rm = TRUE), .groups = "drop")

# One line per city across the months. `linewidth` replaces the `size`
# argument, which ggplot2 3.4.0 deprecated for lines.
ggplot(monthly_sales, aes(x = month, y = Media_Vendite, color = city, group = city)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(title = "Andamento mensile delle vendite medie (2010-2014)",
       x = "Mese",
       y = "Vendite medie") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Trend dei Mesi di Inventario nel Tempo per Città

# Months of inventory per city over time.
inventory_trend <- df %>%
  mutate(
    # Build a Date on the first day of each month. as.numeric(month) is the
    # factor code, i.e. the month number 1..12.
    date = as.Date(paste(year, as.numeric(month), "01", sep = "-"), format = "%Y-%m-%d")
  ) %>%
  group_by(city, date) %>%
  summarise(Inventario_Medio = mean(months_inventory, na.rm = TRUE), .groups = "drop")

# Line chart of the months of inventory over time, one line per city.
# `linewidth` replaces the `size` argument, which ggplot2 3.4.0 deprecated
# for lines.
ggplot(inventory_trend, aes(x = date, y = Inventario_Medio, color = city)) +
  geom_line(linewidth = 1) +
  labs(title = "Andamento dei mesi di inventario (2010-2014)",
       x = "Data",
       y = "Mesi di inventario medio") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

# --- 8. GRAFICI DI CONFRONTO/DISTRIBUZIONE ---

# Total sales per city and month, plus each city's share of that month's
# total. The result is ungrouped at the end so that later operations on
# monthly_dist are not silently applied month by month.
monthly_dist <- df %>%
  group_by(city, month) %>%
  summarise(Totale_Vendite = sum(sales, na.rm = TRUE), .groups = "drop") %>%
  group_by(month) %>%
  mutate(Percentuale = Totale_Vendite / sum(Totale_Vendite, na.rm = TRUE)) %>%
  ungroup()

# 100% stacked bar chart showing each city's percentage contribution to the
# total monthly sales.
ggplot(monthly_dist, aes(x = month, y = Percentuale, fill = city)) +
  geom_col(position = "fill") +
  scale_y_continuous(labels = percent_format()) +
  labs(title = "Distribuzione percentuale mensile delle vendite per città",
       x = "Mese",
       y = "Percentuale di vendite") +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))

## Confronto Volume Vendite per Città e Anno (Boxplot Orizzontale)

# Horizontal box plots of sales volume, grouped by city and split by year.
# labs() names the aesthetics, not the screen axes: `x` is city and `y` is
# volume even after coord_flip(), so the labels must follow the aesthetics
# (they were previously swapped).
ggplot(df, aes(x = city, y = volume, fill = year)) +
  geom_boxplot(position = position_dodge(width = 0.8), alpha = 0.7, na.rm = TRUE) +
  labs(title = "Distribuzione del volume delle vendite per città e anno",
       x = "Città",
       y = "Volume delle vendite (Unità)") +
  theme_minimal() +
  # Flip so that cities run along the vertical axis.
  coord_flip() +
  theme(
    plot.title = element_text(hjust = 0.5),
    axis.text.y = element_text(angle = 0, hjust = 1))

Commento sui Risultati delle Statistiche Descrittive:

Variabilità: Il coefficiente di variazione (CV) evidenzia una maggiore variabilità relativa per le variabili volume (CV=0.467), listings (CV=0.433) e sales (CV=0.414). Al contrario, months_inventory (CV=0.251) e median_price (CV=0.171) mostrano una dispersione relativa minore attorno alla media, suggerendo una maggiore stabilità o omogeneità in questi indicatori. Asimmetria: sales (0.71), volume (0.74) e listings (0.65) presentano un’asimmetria positiva: le loro distribuzioni hanno una coda destra più pronunciata, con valori relativamente alti che “trascinano” la media al di sopra della mediana (ad esempio, per sales la media è 192.29 contro una mediana di 175.50). months_inventory è pressoché simmetrica (0.04), mentre median_price mostra una lieve asimmetria negativa (-0.36), con la media (132665.42) leggermente inferiore alla mediana (134500.00).

#Commenti sull’Analisi delle Frequenze (Fasce di Prezzo Mediano):

Distribuzione dei Prezzi: La tabella di frequenza e il grafico a barre mostrano la distribuzione delle osservazioni nelle diverse fasce di prezzo mediano create. La fascia modale è 125k-150k (99 osservazioni, 41.25%), seguita da 150k-175k (57 osservazioni, 23.75%) e 100k-125k (55 osservazioni, 22.92%); le fasce estreme sono poco popolate (70k-100k: 26 osservazioni; 175k-200k: 3 osservazioni) e nessuna osservazione supera i 200k. Indice di Gini: L’indice di Gini calcolato per median_price è 0.097. Un valore basso suggerisce una distribuzione relativamente equa dei prezzi mediani nel dataset, indicando una bassa disuguaglianza o concentrazione dei prezzi. L’indice di Gini per sales è 0.232. Questo valore, superiore a quello dei prezzi, indica una maggiore disuguaglianza o concentrazione nel numero di vendite tra le diverse osservazioni.

Probabilità Calcolate:
- La probabilità che un’osservazione estratta a caso dal dataset riguardi Beaumont è del 25.0%.
- La probabilità che un’osservazione estratta a caso dal dataset riguardi il mese di luglio è dell’8.3%.
- La probabilità che un’osservazione estratta a caso dal dataset riguardi dicembre 2012 è dell’1.7%.

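Analisi condizionata: - Per Città: - Bryan-College Station ha il CV più alto per le vendite (0.413), indicando una maggiore variabilità, mentre Wichita Falls ha il più basso (0.191). - Bryan-College Station ha anche il prezzo mediano medio più alto (circa 157k) e il CV del prezzo più basso (0.056), suggerendo stabilità nei prezzi; Wichita Falls ha il prezzo più basso (circa 102k) e il CV del prezzo più alto (0.111). - Per Anno: - Il 2014 mostra il CV più alto per le vendite (0.414), mentre il 2010 il più basso (0.359). - Il CV dell’inventario varia tra 0.163 (2012) e 0.248 (2014), mentre l’inventario medio scende da 10.0 mesi nel 2010 a 7.1 mesi nel 2014.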

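Boxplot Prezzo Mediano per Città: Bryan-College Station e Tyler hanno i prezzi mediani più alti (media rispettivamente 157488 e 141442), mentre Beaumont (129988) e soprattutto Wichita Falls (101743) i più bassi. La dispersione relativa dei prezzi è contenuta in tutte le città (CV tra 0.056 e 0.111); eventuali outlier vanno verificati direttamente sul grafico.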
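Trend Mensile Vendite: Le vendite tendono a essere più alte nei mesi estivi (es. giugno, luglio) e più basse in inverno. Le città mostrano pattern simili, con Tyler in testa per numero medio di vendite (269.8), seguita da Bryan-College Station (206.0), Beaumont (177.4) e Wichita Falls (116.1).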
Trend Inventario nel Tempo: L’inventario mostra fluttuazioni nel tempo, con un picco nel 2011 (media annua di 10.9 mesi) e una successiva diminuzione fino a 7.1 mesi nel 2014.
Distribuzione Percentuale Vendite Mensili: Sul totale del periodo Tyler contribuisce maggiormente alle vendite (circa il 35%), seguita da Bryan-College Station (circa il 27%) e Beaumont (circa il 23%); Wichita Falls ha il contributo minore (circa il 15%).
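Boxplot Volume Vendite per Città e Anno: Tyler e Bryan-College Station, che combinano il maggior numero di vendite e i prezzi più alti, sono le città con i volumi più elevati, mentre Wichita Falls presenta i più bassi. Il volume è generalmente aumentato nel tempo, in linea con la crescita delle vendite medie (da 168.7 nel 2010 a 230.6 nel 2014).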

Conclusioni

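Il dataset rivela differenze significative tra le città: Tyler guida il mercato per numero di vendite e Bryan-College Station per livello dei prezzi, mentre Wichita Falls presenta i valori più bassi su entrambi gli indicatori. Le vendite mostrano stagionalità, con picchi estivi, e una generale tendenza all’aumento nel tempo (da 168.7 vendite medie nel 2010 a 230.6 nel 2014), accompagnata da una riduzione dei mesi di inventario. L’asimmetria positiva di sales, volume e listings suggerisce la presenza di osservazioni con valori particolarmente elevati che potrebbero meritare ulteriori indagini.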

Untitled

Chiara Tombolini

2025-04-22