Progetto

#########################################################################
# 1) Importo il dataset "Real Estate Texas.csv"
#########################################################################

file = "~/Desktop/ProfessionAI/statistica-descrittiva-per-datascience-main/Real Estate Texas.csv"
dati <- read.csv(file, sep=",")
# Visualizza le prime 10 righe
head(dati,10)

##        city year month sales volume median_price listings months_inventory
## 1  Beaumont 2010     1    83 14.162       163800     1533              9.5
## 2  Beaumont 2010     2   108 17.690       138200     1586             10.0
## 3  Beaumont 2010     3   182 28.701       122400     1689             10.6
## 4  Beaumont 2010     4   200 26.819       123200     1708             10.6
## 5  Beaumont 2010     5   202 28.833       123100     1771             10.9
## 6  Beaumont 2010     6   189 27.219       122800     1803             11.1
## 7  Beaumont 2010     7   164 22.706       124300     1857             11.7
## 8  Beaumont 2010     8   174 25.237       136800     1830             11.6
## 9  Beaumont 2010     9   124 17.233       121100     1829             11.7
## 10 Beaumont 2010    10   150 23.904       138500     1779             11.5

# Controllo le distribuzioni in city e year
table(dati["city"])

## city
##              Beaumont Bryan-College Station                 Tyler 
##                    60                    60                    60 
##         Wichita Falls 
##                    60

table(dati["year"])

## year
## 2010 2011 2012 2013 2014 
##   48   48   48   48   48

##             city             year            month            sales 
##      "character"        "integer"        "integer"        "integer" 
##           volume     median_price         listings months_inventory 
##        "numeric"        "numeric"        "integer"        "numeric"

## Rows: 240
## Columns: 8
## $ city             <chr> "Beaumont", "Beaumont", "Beaumont", "Beaumont", "Beau…
## $ year             <int> 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010, 2010,…
## $ month            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, 5,…
## $ sales            <int> 83, 108, 182, 200, 202, 189, 164, 174, 124, 150, 150,…
## $ volume           <dbl> 14.162, 17.690, 28.701, 26.819, 28.833, 27.219, 22.70…
## $ median_price     <dbl> 163800, 138200, 122400, 123200, 123100, 122800, 12430…
## $ listings         <int> 1533, 1586, 1689, 1708, 1771, 1803, 1857, 1830, 1829,…
## $ months_inventory <dbl> 9.5, 10.0, 10.6, 10.6, 10.9, 11.1, 11.7, 11.6, 11.7, …

#########################################################################
# 3) Calcola Indici di posizione, variabilità e forma per tutte le variabili per
#    le quali ha senso farlo, per le altre crea una distribuzione di frequenza. 
#    Commenta tutto brevemente.
#########################################################################

# Ha senso stimare indici pos/var/forma per tutte le variabili non qualitative

#--------------------- INDICI DI POSIZIONE/VARIABILITA'/FORMA ----------------------------
# preferisco fare una funzione dove metto dei valori di indici di posizione, variabilità e forma insieme

ind.pos.var <- function(x){
  varname <- deparse(substitute(x))
  print(paste0("Variabile: ", varname))
  q=quantile(x)
  cat("##########################################\n")
  print(paste0("[Minimum, Maximum] = [", min(x),",", max(x),"]"))
  print(paste0("[25th, 75th] percentiles = ", "  [",q[2],",",q[4],"]"))
  print(paste0("IQR = ",IQR(x)))
  print(paste0("Variance = ",var(x)))
  print(paste0("Coeff. di variation = ",sd(x)*100/mean(x)))
  print(paste0("Mean +/- sdt.dev = ", mean(x)," +/- ", sd(x)))
  print(paste0("Median, [25th,75th]Percentile = ", median(x)," [-",median(x)-q[2],",+",q[4]-median(x),"]"))
  print(paste0("Skewness = ",skewness(x)))
  print(paste0("Kurtosis = ",kurtosis(x)-3))
  s <- skewness(x)
  if (s > 0) {cat("Postive asymmetry\n")} else if (s < 0) {cat("Negative asymmetry\n")} else {cat("Symmetry\n")}
  k <- kurtosis(x)-3
  if (k > 0) {cat("Leptocurtica\n")} else if (k < 0) {cat("Platucurtica\n")} else {cat("Mesocurtica\n")}
  cat("##########################################\n\n")
  plot(density(x))
  return(sd(x)*100/mean(x))
}

#------- sales ---------
cv_sales = ind.pos.var(dati$sales)

## [1] "Variabile: dati$sales"
## ##########################################
## [1] "[Minimum, Maximum] = [79,423]"
## [1] "[25th, 75th] percentiles =   [127,247]"
## [1] "IQR = 120"
## [1] "Variance = 6344.29951185495"
## [1] "Coeff. di variation = 41.4220296482492"
## [1] "Mean +/- sdt.dev = 192.291666666667 +/- 79.6511111777792"
## [1] "Median, [25th,75th]Percentile = 175.5 [-48.5,+71.5]"
## [1] "Skewness = 0.718104024884958"
## [1] "Kurtosis = -0.313176409071494"
## Postive asymmetry
## Platucurtica
## ##########################################

#commento:numero totale di vendite compreso tra 79 e 423, con mediana pari a ~175
#         (IQR = 120). Distribuzione moderatamente asimmetrica positiva (skewness ~0.72),
#         con coda verso valori elevati e curtosi negativa (platicurtica).
#------- volume ---------
cv_volume = ind.pos.var(dati$volume)

## [1] "Variabile: dati$volume"
## ##########################################
## [1] "[Minimum, Maximum] = [8.166,83.547]"
## [1] "[25th, 75th] percentiles =   [17.6595,40.893]"
## [1] "IQR = 23.2335"
## [1] "Variance = 277.270692404027"
## [1] "Coeff. di variation = 53.7053586805415"
## [1] "Mean +/- sdt.dev = 31.0051875 +/- 16.6514471564494"
## [1] "Median, [25th,75th]Percentile = 27.0625 [-9.403,+13.8305]"
## [1] "Skewness = 0.884742026325996"
## [1] "Kurtosis = 0.176986997089744"
## Postive asymmetry
## Leptocurtica
## ##########################################

#commento:valore totale delle vendite con mediana pari a ~27 milioni di dollari
#         (IQR ~23). Distribuzione asimmetrica positiva (skewness ~0.88),
#         più concentrata attorno alla media (leptocurtica).
#------- median_price ---------
cv_median_price = ind.pos.var(dati$median_price)

## [1] "Variabile: dati$median_price"
## ##########################################
## [1] "[Minimum, Maximum] = [73800,180000]"
## [1] "[25th, 75th] percentiles =   [117300,150050]"
## [1] "IQR = 32750"
## [1] "Variance = 513572983.089261"
## [1] "Coeff. di variation = 17.0821825732064"
## [1] "Mean +/- sdt.dev = 132665.416666667 +/- 22662.148686505"
## [1] "Median, [25th,75th]Percentile = 134500 [-17200,+15550]"
## [1] "Skewness = -0.364552878177368"
## [1] "Kurtosis = -0.622961820755545"
## Negative asymmetry
## Platucurtica
## ##########################################

#commento: Media dei prezzi mediane di vendita delle case pari a ~132665 dollari
#          (deviazione standard ~22662). Distribuzione leggermente asimmetrica
#          negativa (skewness ~-0.36) e platicurtica.
#------- listings ---------
cv_listings = ind.pos.var(dati$listings)

## [1] "Variabile: dati$listings"
## ##########################################
## [1] "[Minimum, Maximum] = [743,3296]"
## [1] "[25th, 75th] percentiles =   [1026.5,2056]"
## [1] "IQR = 1029.5"
## [1] "Variance = 566568.966091352"
## [1] "Coeff. di variation = 43.3083275909432"
## [1] "Mean +/- sdt.dev = 1738.02083333333 +/- 752.707756098841"
## [1] "Median, [25th,75th]Percentile = 1618.5 [-592,+437.5]"
## [1] "Skewness = 0.649498226273972"
## [1] "Kurtosis = -0.791790033332583"
## Postive asymmetry
## Platucurtica
## ##########################################

#commento: numero di annunci attivi con mediana pari a ~1,619
#          (IQR ~1030). Distribuzione asimmetrica positiva (skewness~0.65),
#         con elevata variabilità e curtosi negativa (platicurtica).
#------- months_inventory ---------
cv_months_inventory = ind.pos.var(dati$months_inventory)

## [1] "Variabile: dati$months_inventory"
## ##########################################
## [1] "[Minimum, Maximum] = [3.4,14.9]"
## [1] "[25th, 75th] percentiles =   [7.8,10.95]"
## [1] "IQR = 3.15"
## [1] "Variance = 5.30688912133891"
## [1] "Coeff. di variation = 25.0603059264982"
## [1] "Mean +/- sdt.dev = 9.1925 +/- 2.30366862229334"
## [1] "Median, [25th,75th]Percentile = 8.95 [-1.15,+2]"
## [1] "Skewness = 0.0409752658710811"
## [1] "Kurtosis = -0.174447541638483"
## Postive asymmetry
## Platucurtica
## ##########################################

#commento: numero medio di mesi necessari per vendere tutte le inserzioni pari a ~9.1925
#           (deviazione standard ~2.3). Distribuzione quasi simmetrica
#           (skewness ~0.04) e leggermente platicurtica.



#---- funzione per distribuzione di frequenza ---- 
freq.tabs <- function(x){
  varname <- deparse(substitute(x))
  N <- length(x)
  
  ni <- table(x)
  fi <- ni / N
  Ni <- cumsum(ni)
  Fi <- Ni / N
  
  cat("Variable:", varname, "\n")
  cat("##########################################\n")
  print(data.frame(
    category = names(ni),
    freq = as.vector(ni),
    rel_freq = as.vector(fi),
    cum_freq = as.vector(Ni),
    cum_rel_freq = as.vector(Fi)
  ))
  cat("##########################################\n\n")}

# frequenze ass e rel per valori in cui non aveva senso fare stima di Indici di posizione, variabilità e forma
freq.tabs(dati$city)

## Variable: dati$city 
## ##########################################
##                category freq rel_freq cum_freq cum_rel_freq
## 1              Beaumont   60     0.25       60         0.25
## 2 Bryan-College Station   60     0.25      120         0.50
## 3                 Tyler   60     0.25      180         0.75
## 4         Wichita Falls   60     0.25      240         1.00
## ##########################################

#commento: il campione è equamente distribuito tra le 4 città considerate
#          (Beaumont, Bryan-College Station, Tyler e Wichita Falls),
#          ciascuna rappresentata dal 25% delle osservazioni.
freq.tabs(dati$year)

## Variable: dati$year 
## ##########################################
##   category freq rel_freq cum_freq cum_rel_freq
## 1     2010   48      0.2       48          0.2
## 2     2011   48      0.2       96          0.4
## 3     2012   48      0.2      144          0.6
## 4     2013   48      0.2      192          0.8
## 5     2014   48      0.2      240          1.0
## ##########################################

#commento: le osservazioni sono uniformemente distribuite nel periodo 2010-2014,
#          con lo stesso numero di dati per ciascun anno (20% per anno).
freq.tabs(dati$month)

## Variable: dati$month 
## ##########################################
##    category freq   rel_freq cum_freq cum_rel_freq
## 1         1   20 0.08333333       20   0.08333333
## 2         2   20 0.08333333       40   0.16666667
## 3         3   20 0.08333333       60   0.25000000
## 4         4   20 0.08333333       80   0.33333333
## 5         5   20 0.08333333      100   0.41666667
## 6         6   20 0.08333333      120   0.50000000
## 7         7   20 0.08333333      140   0.58333333
## 8         8   20 0.08333333      160   0.66666667
## 9         9   20 0.08333333      180   0.75000000
## 10       10   20 0.08333333      200   0.83333333
## 11       11   20 0.08333333      220   0.91666667
## 12       12   20 0.08333333      240   1.00000000
## ##########################################

# commento: le osservazioni sono equamente distribuite nei 12 mesi dell'anno,
#           ciascun mese rappresenta circa l'8.33% del totale,
#           indicando assenza di stagionalità nel campionamento.

#########################################################################
# 4) Qual è la variabile con variabilità più elevata? Come ci sei arrivato? 
#    E quale quella più asimmetrica?
#########################################################################

analisi_variabilita <- function(dataframe, vars){
  cv <- c()
  sk <- c()
  for (v in vars) {
    x <- dataframe[[v]]
    cv[v] <- sd(x) * 100 / mean(x)
    sk[v] <- skewness(x)
  }
  var_max <- names(cv)[which.max(cv)]
  skew_max <- names(sk)[which.max(abs(sk))]   # quella più asimmetrica (in valore assoluto)
  list(
    coeff_var = cv,
    skewness = sk,
    max_variability = list(variable = var_max, cv = cv[var_max]),
    max_skewness = list(variable = skew_max, skewness = sk[skew_max])
  )
}

vars_numeric <- c("sales","volume","median_price","listings","months_inventory")
ris <- analisi_variabilita(dati, vars_numeric)
ris

## $coeff_var
##            sales           volume     median_price         listings 
##         41.42203         53.70536         17.08218         43.30833 
## months_inventory 
##         25.06031 
## 
## $skewness
##            sales           volume     median_price         listings 
##       0.71810402       0.88474203      -0.36455288       0.64949823 
## months_inventory 
##       0.04097527 
## 
## $max_variability
## $max_variability$variable
## [1] "volume"
## 
## $max_variability$cv
##   volume 
## 53.70536 
## 
## 
## $max_skewness
## $max_skewness$variable
## [1] "volume"
## 
## $max_skewness$skewness
##   volume 
## 0.884742

#commento: La variabile con maggiore variabilità relativa è "volume", in quanto presenta
#          il coefficiente di variazione (CV = sd × 100 / media) più elevato ~53.7%).
#          La stessa variabile risulta anche la più asimmetrica, avendo il valore di skewness
#          in valore assoluto più alto (~0.88).

#########################################################################
# 5) Dividi una delle variabili quantitative in classi, scegli tu quale e come, 
# costruisci la distribuzione di frequenze, il grafico a barre corrispondente e 
# infine calcola l’indice di Gini. 
#########################################################################

# Indice di Gini
gini.index <- function(x){
  ni=table(x)
  fi=ni/length(x)
  fi2=fi^2
  J=length(table(x))
  gini = 1 - sum(fi2)
  gini.normalizzato = gini/((J-1)/J)
  return(gini.normalizzato)
}

# suddivisione della variabile quantitativa 'sales' in classi;
# distribuzione di frequenze stratificata per città
sales_class <- cut(dati$sales, 
                   breaks=c(0,100,200,300,400,Inf),
                   labels = c("0–100", "100–200", "200–300", "300-400","400+"),
                   include.lowest = TRUE)

# frequenze relative
tab_city_sales <- table(dati$city, sales_class)
tab_city_sales

##                        sales_class
##                         0–100 100–200 200–300 300-400 400+
##   Beaumont                  1      43      16       0    0
##   Bryan-College Station     3      34      15       6    2
##   Tyler                     0       8      35      16    1
##   Wichita Falls            17      43       0       0    0

barplot(tab_city_sales, beside = TRUE, legend.text = rownames(tab_city_sales),
        col = rainbow(4), main = "Sales distribution by class and city",
        xlab = "Sales classes", ylab = "Frequency")

prop.table(tab_city_sales, margin = 1)

##                        sales_class
##                              0–100    100–200    200–300    300-400       400+
##   Beaumont              0.01666667 0.71666667 0.26666667 0.00000000 0.00000000
##   Bryan-College Station 0.05000000 0.56666667 0.25000000 0.10000000 0.03333333
##   Tyler                 0.00000000 0.13333333 0.58333333 0.26666667 0.01666667
##   Wichita Falls         0.28333333 0.71666667 0.00000000 0.00000000 0.00000000

cat("INDICE DI GINI:")

## INDICE DI GINI:

#commento:  questo indice di Gini complessivo sulle classi
gini.index(sales_class)

## [1] 0.7796441

#commento:  Indice di Gini per città (concentrazione delle frequenze nelle classi)
gini_city <- tapply(sales_class, dati$city, gini.index)
gini_city

##              Beaumont Bryan-College Station                 Tyler 
##             0.5187500             0.7534722             0.7131944 
##         Wichita Falls 
##             0.5076389

#########################################################################
# 6) Indovina l’indice di gini per la variabile city.
#########################################################################


f = table(dati$city)/length(dati$city)
J <- length(f)
gini <- 1 - ( (1 - sum(f^2)) / ((J - 1) / J) )
print("commento: dato che tutte le città hanno rel_freq = 25%, e con l'indice di Gini atteso è 0.")

## [1] "commento: dato che tutte le città hanno rel_freq = 25%, e con l'indice di Gini atteso è 0."

#########################################################################
# 7) Qual è la probabilità che presa una riga a caso di questo dataset essa 
#    riporti la città “Beaumont”? E la probabilità che riporti il mese di Luglio? 
#    E la probabilità che riporti il mese di dicembre 2012?
#########################################################################

n <- nrow(dati)
#PROB: città “Beaumont”
Probab_Beaumont <- sum(dati$city == "Beaumont") / n # probabilità del 25 %

#PROB: mese “Luglio”
Prob_Luglio <- sum(dati$month == 7) / n # probabilità del 8.3 %

#PROB: mese “Dicembre 2012”
Prob_Dec2012 <- sum(dati$year == 2012 & dati$month == 12) / n # probabilità del 1.7 %

#commento: Le probabilità sono stimate come frequenze relative delle osservazioni
#          nel dataset, assumendo una riga scelta a caso.

#########################################################################
# 8) Esiste una colonna col prezzo mediano, creane una che indica invece il prezzo medio, 
#    utilizzando le altre variabili che hai a disposizione
#########################################################################

head(dati,10)

##        city year month sales volume median_price listings months_inventory
## 1  Beaumont 2010     1    83 14.162       163800     1533              9.5
## 2  Beaumont 2010     2   108 17.690       138200     1586             10.0
## 3  Beaumont 2010     3   182 28.701       122400     1689             10.6
## 4  Beaumont 2010     4   200 26.819       123200     1708             10.6
## 5  Beaumont 2010     5   202 28.833       123100     1771             10.9
## 6  Beaumont 2010     6   189 27.219       122800     1803             11.1
## 7  Beaumont 2010     7   164 22.706       124300     1857             11.7
## 8  Beaumont 2010     8   174 25.237       136800     1830             11.6
## 9  Beaumont 2010     9   124 17.233       121100     1829             11.7
## 10 Beaumont 2010    10   150 23.904       138500     1779             11.5

library(dplyr)
#valore totale delle vendite in milioni di dollari (1e6) diviso numero totale di vendite
dati <- dati %>% mutate(mean_price = volume * 1e6 / sales)

plot(density(dati$mean_price),
     main = "Distribution of the average selling price",
     xlab = "Average price (dollar)")

#########################################################################
# 9) Prova a creare un’altra colonna che dia un’idea di “efficacia” degli annunci di 
#    vendita. Riesci a fare qualche considerazione?
#########################################################################

# Efficacia = numero totale di vendite / numero totale di annunci attivi
# Efficacia -> 0 == poche vendite nonostante tanti annunci
# Efficacia -> 1 == tant vendite nonostante i pochi annunci
dati <- dati %>% mutate(efficacia_annunci = sales  / listings)

mean(dati$efficacia_annunci)

## [1] 0.1187449

# In media gli annunci hanno un'efficienza di 0.12, ovvero su 100 annunci vengono vendute 12 case.
plot(density(dati$efficacia_annunci),
     main = "Efficency of sales as a function of number of ads",
     xlab = "Sales/Listings")

# controllo indici di posizione e variabilità per sales_to_lists
cv_sales = ind.pos.var(dati$efficacia_annunci)

## [1] "Variabile: dati$efficacia_annunci"
## ##########################################
## [1] "[Minimum, Maximum] = [0.050140252454418,0.387127761767531]"
## [1] "[25th, 75th] percentiles =   [0.0898017413734923,0.134922462941847]"
## [1] "IQR = 0.0451207215683549"
## [1] "Variance = 0.00219951628782874"
## [1] "Coeff. di variation = 39.4955845199724"
## [1] "Mean +/- sdt.dev = 0.118744921731651 +/- 0.0468990009256993"
## [1] "Median, [25th,75th]Percentile = 0.109626835334067 [-0.0198250939605743,+0.0252956276077805]"
## [1] "Skewness = 2.08931813879356"
## [1] "Kurtosis = 6.88174747715348"
## Postive asymmetry
## Leptocurtica
## ##########################################

# CONSIDERAZIONI
# È stato definito un indicatore di efficacia degli annunci come rapporto tra il numero di vendite e il numero di annunci attivi.
# La distribuzione presenta elevata variabilità relativa (CV ~39.5%) ed è fortemente asimmetrica positiva 
# (skewness ~2.09) e leptocurtica, con pochi valori elevati.
# Pertanto l’indicatore risulta meglio descritto tramite mediana  (~0.11) e intervallo interquartile ([0.09, 0.13]).

#########################################################################
# 10) Prova a creare dei summary(), o semplicemente media e deviazione standard, 
#     di alcune variabili a tua scelta, condizionatamente alla città, agli anni
#     e ai mesi. Puoi utilizzare il linguaggio R di base oppure essere un vero 
#     Pro con il pacchetto dplyr. Ti lascio un suggerimento in pseudocodice, 
#     oltre al cheatsheet nel materiale:
#########################################################################

library(dplyr)

# posso raggruppare le variabili in classi in funzione della sola città
dati %>%
  group_by(city) %>% 
  summarise(
    mean_sales = mean(sales),
    sd_sales   = sd(sales),
    mean_price = mean(median_price),
    sd_price   = sd(median_price),
    mean_eff = mean(efficacia_annunci),
    sd_eff   = sd(efficacia_annunci),
  )

## # A tibble: 4 × 7
##   city                  mean_sales sd_sales mean_price sd_price mean_eff sd_eff
##   <chr>                      <dbl>    <dbl>      <dbl>    <dbl>    <dbl>  <dbl>
## 1 Beaumont                    177.     41.5    129988.   10105.   0.106  0.0267
## 2 Bryan-College Station       206.     85.0    157488.    8852.   0.147  0.0729
## 3 Tyler                       270.     62.0    141442.    9337.   0.0935 0.0235
## 4 Wichita Falls               116.     22.2    101743.   11320.   0.128  0.0247

# ... l'anno, ...
dati %>%
  group_by(year) %>% 
  summarise(
    mean_sales = mean(sales),
    sd_sales   = sd(sales),
    mean_price = mean(median_price),
    sd_price   = sd(median_price),
    mean_eff = mean(efficacia_annunci),
    sd_eff   = sd(efficacia_annunci))

## # A tibble: 5 × 7
##    year mean_sales sd_sales mean_price sd_price mean_eff sd_eff
##   <int>      <dbl>    <dbl>      <dbl>    <dbl>    <dbl>  <dbl>
## 1  2010       169.     60.5    130192.   21822.   0.0997 0.0337
## 2  2011       164.     63.9    127854.   21318.   0.0927 0.0232
## 3  2012       186.     70.9    130077.   21432.   0.110  0.0281
## 4  2013       212.     84.0    135723.   21708.   0.135  0.0448
## 5  2014       231.     95.5    139481.   25625.   0.157  0.0618

# ... i mesi ....
dati %>%
  group_by(month) %>% 
  summarise(
    mean_sales = mean(sales),
    sd_price   = sd(median_price),
    median_price = median(median_price),
    mean_eff = mean(efficacia_annunci),
    sd_eff   = sd(efficacia_annunci))

## # A tibble: 12 × 6
##    month mean_sales sd_price median_price mean_eff sd_eff
##    <int>      <dbl>    <dbl>        <dbl>    <dbl>  <dbl>
##  1     1       127.   25151.       128400   0.0831 0.0230
##  2     2       141.   22823.       132000   0.0878 0.0219
##  3     3       189.   23442.       129800   0.116  0.0346
##  4     4       212.   21458.       132300   0.125  0.0380
##  5     5       239.   18796.       135300   0.141  0.0503
##  6     6       244.   19231.       137350   0.142  0.0576
##  7     7       236.   21945.       137450   0.143  0.0740
##  8     8       231.   22488.       143850   0.142  0.0526
##  9     9       182.   24344.       133100   0.112  0.0348
## 10    10       180.   26358.       136150   0.112  0.0360
## 11    11       157.   24691.       140050   0.102  0.0293
## 12    12       169.   22810.       134250   0.117  0.0379

# Da qui in poi utilizza ggplot2 per creare grafici fantastici! 

library(ggplot2)

# numero totale di vendite per mese, anno e città
ggplot(dati, aes(x = month, y = sales, color = city)) +
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun = mean, geom = "point") +
  facet_wrap(~ year) +
  theme_minimal()

# Efficacia annunci per vendita
ggplot(dati, aes(x = month, y = efficacia_annunci, color = city)) +
  stat_summary(fun = mean, geom = "line") +
  stat_summary(fun = mean, geom = "point") +
  facet_wrap(~ year) +
  theme_minimal()

#commento: nelle città di Tyler e Bryan-College Station, gli annunci
#          sono stati più efficaci verso giugno-luglio ogni anno
#          per le altre città l'andamento è meno piccato e più flat

###############################################################################
# 1) Utilizza i boxplot per confrontare la distribuzione del prezzo mediano delle 
#    case tra le varie città. Commenta il risultato
###############################################################################


dati %>%
  group_by(city) %>%
  summarise(mp = median_price,.groups = "drop") %>%
  ggplot(aes(x = city, y = mp), col="blue") +
  geom_boxplot(aes(x=city, y=mp),col="blue",lwd=1) +
  labs(x = "City",y = "Median Price Houses")+ theme_bw()

## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in
## dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()`
##   always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning in fortify(data, ...): Arguments in `...` must be used.
## ✖ Problematic argument:
## • col = "blue"
## ℹ Did you misspell an argument name?

# COMMENTO: Il prezzo mediano delle case è maggiore in Bryan-College Station
# ad una distanza di circa due quantili delle distribuzioni troviamo al secondo posto il prezzo mediano delle
# case. aTyler, e al terzo posito quello in Beaumont. In Wichita Falls il prezzo mediano delle case
# è significativamente minore con valori massimi (escludendo un singolo outlier) che raggiungono a malapena
# i primi quantili della distribuzione dei prezzi in Beaumont.

###############################################################################
# 2) Utilizza i boxplot o qualche variante per confrontare la distribuzione del 
#    valore totale delle vendite tra le varie città ma anche tra i vari anni. 
#    Qualche considerazione da fare?
###############################################################################

ggplot(dati, aes(x = factor(year), y = volume, fill = city)) +
  geom_boxplot(position = position_dodge(0.8), outlier.alpha = 0.4) +
  labs(x = "Year",y = "Total sales value [Mln]",fill = "City",
    title = "Distribution of sales volume by city and year") +
  theme_bw()

# CONSIDERAZIONE:
# Tyler e Bryan-College Station mostrano un chiaro aumento del totale delle
# vendite nel corso degli anni, con uno spostamento verso l'alto della mediana.
# (si dovrebbe stimare il gradiente per capire il tasso di aumento).
# Wichita Falls  presenta invece una distribuzione stabile nel tempo, con variazioni  entro IQR  
# Beaumont mostra una lieve crescita delle vendite, pur mantenendo una 
# variabilità relativamente costante. Inoltre, la distribuzione delle 
# vendite mensili risulta significativamente più variabile a Bryan-College 
# Station rispetto a Tyler, come indicato da box e baffi più allungati (lungo l'asse delle ordinate)

###############################################################################
# 3) Usa un grafico a barre sovrapposte per confrontare il totale delle vendite 
#    nei vari mesi, sempre considerando le città. Prova a commentare ciò che viene fuori. 
#    Già che ci sei prova anche il grafico a barre normalizzato. 
#    Consiglio: Stai attento alla differenza tra geom_bar() e geom_col().
#    PRO LEVEL: cerca un modo intelligente per inserire ANCHE la variabile Year allo stesso blocco di codice, senza però creare accrocchi nel grafico.
###############################################################################

# faccio una cosa carina con i nomi dei mesi invece che i numeri (per graficarli in x devo ruotarli)
dati$month_name <- factor(dati$month,levels = 1:12,labels = month.name)

dati %>%
  group_by(city, month_name) %>%
  summarise(
    venditetot = sum(volume), #, sum(volume, na.rm = TRUE) nel caso ci siano NaN, ma non ci sono..
    .groups = "drop") %>%
  ggplot(aes(x = month_name, y = venditetot, fill = city)) +
  geom_col(position = "stack", color = "black") +
  labs(x = "Month", y = "Total Sales Price [Mln]", fill = "City",
    title = "Total sales price by city and month") + 
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

# COMMENTO:
# Il grafico mostra un picco delle vendite totali (in tutte le città) nei mesi da Maggio a Agosto.
# Tyler e Bryan-College Station contribuiscono maggiormente alle vendite e ne determinano le principali variazioni
# Beaumont e  Wichita Falls mostrano andamenti più stabili e un contributo inferiore. 

# normalizzato
dati %>%
  group_by(city, month_name) %>%
  summarise(venditetot = sum(volume),.groups = "drop") %>%
  ggplot(aes(x = month_name, y = venditetot, fill = city)) +
  geom_col(position = "fill", color = "black") +
  labs(x = "Month", y = "Total Sales Price [Mln]", fill = "City",
       title = "Total sales price by city and month") + 
  theme_bw() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

# se voglio mettere anche l'anno posso creare 5 plot diversi per ogni anno  

dati %>%
  group_by(city, year, month_name) %>%  
  summarise(venditetot = sum(volume),.groups = "drop") %>%
  ggplot(aes(x = month_name, y = venditetot, fill = city)) +
  geom_bar(stat = "identity", position = "stack", color = "black") +
  facet_wrap(~ year, nrow = 2) +        
  labs(x = "Month", y = "Total sales value [Mln]",fill = "City",
    title = "Total sales value by city, month and year") +
  theme_bw() +
  theme(legend.position = "bottom",axis.text.x = element_text(angle = 75, hjust = 1))

###############################################################################
# 4) Prova a creare un line chart di una variabile a tua scelta per fare confronti 
#    commentati fra città e periodi storici. Ti avviso che probabilmente all’inizio ti 
#    verranno fuori linee storte e poco chiare, ma non demordere. Consigli: Prova inserendo
#    una variabile per volta. Prova a usare variabili esterne al dataset, tipo vettori creati 
#    da te appositamente.
###############################################################################


dati %>%
  group_by(city, year) %>%
  summarise(mean_sales = mean(sales),.groups = "drop") %>%
  ggplot(aes(x = year, y = mean_sales, col = city)) +
  geom_line(aes(x=year, y=mean_sales,col=city),lwd=3) +
  labs(x = "Year",y = "Mean of the total number of sales",col = "City",
    title = "Mean of the total number of sales by City and Year") + theme_bw()

dati %>%
  group_by(city, year) %>%
  summarise(mean_sales = mean(efficacia_annunci),.groups = "drop") %>%
  ggplot(aes(x = year, y = mean_sales, col = city)) +
  geom_line(aes(x=year, y=mean_sales,col=city),lwd=3) +
  labs(x = "Year",y = "Mean ad effectiveness",col = "City",
       title = "effectiveness of home and sales ads by City and Year") + theme_bw()

# Nell'ultimo anno (2014) l'efficacia degli annunci in Bryan-Colege Station è stata 
# più del doppio rispetto le altre! Impressionante! Ecco come ottenere tanto con poco!

dati %>%
  group_by(city, year) %>%
  summarise(mean_volume = sum(volume),.groups = "drop") %>%
  ggplot(aes(x = year, y = mean_volume, col = city)) +
  geom_line(aes(x=year, y=mean_volume,col=city),lwd=3) +
  labs(x= "Year",y= "Total sales [Mln]",col = "City",
    title = "Total sales by City nd Year")+ theme_bw()

dati %>%
  group_by(city, year) %>%
  summarise(tot_listings = sum(listings),.groups = "drop") %>%
  ggplot(aes(x= year, y= tot_listings, col= city)) +
  geom_line(aes(x=year, y=tot_listings,col=city),lwd=3) +
  labs(x = "Year",y = "Total number of ads",col = "City",
    title = "Total number of ads by City and Year")+ theme_bw()

# per sicurezza ricreo lista month_name
dati$month_name <- factor(dati$month,levels = 1:12,labels = month.name)
dati %>%
  group_by(city, year, month) %>%
  summarise(inv = mean(months_inventory),list = mean(listings),.groups = "drop") %>%
  mutate(time= year + (month - 1) / 12) %>% #costruire una variabile temporale continua
  ggplot(aes(x = time, y = inv, color = city, size = list)) +
  geom_point(alpha = 0.7) +
  geom_line(aes(group = city), linewidth = 1) +
  labs(x = "Year",y = "Months of inventory",size = "Active ads in this period",color = "City",
    title = "Months of inventory over time") +
  theme_bw()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# COMMENTO: osserviamo una riduzione nel tempo del numero di mesi per vendita do tutte le case in annunci
# per tutte le città, indicando un aumento della domanda immobiliare.
# Bryan-College Station mostra il calo più marcato. 
# Tyler mantiene valori più elevati, suggerendo una maggiore disponibilità di offerta.
# La dimensione dei punti conferma la relazione positiva tra numero di annunci e mesi per le vendite.



dati2 <- dati %>%
  arrange(year, month) %>%
  mutate(t_index = (year - min(year)) * 12 + month) %>%
  group_by(city, t_index) %>%
  summarise(eff_list = mean(sales / listings),.groups = "drop")

ggplot(dati2, aes(x = t_index, y = eff_list,color = city)) +
  geom_line(linewidth = 1) + geom_point(size = 1.5) +
  scale_x_continuous(breaks = seq(1, 60, 2)) +
  labs(x = "Time",y = "Efficiency (Sales/Ads)",color = "City",
    title = "Efficiency evolution by city (60 months)") + 
  theme_bw() + theme(legend.position = "bottom")

# COMMENTO
# In media, per tutte le città, l'efficienza delle vendite (vendite e annunci attivi) 
# mostra un aumento nel tempo, indicando un progressivo miglioramento della capacità
# del mercato di assorbire l'offerta. 
# Wichita Falls presenta valori relativamente elevati nella fase iniziale del periodo analizzato, 
# seguiti da una crescita più moderata e stabile.
# Bryan-College Station evidenzia invece un andamento ciclico, con oscillazioni stagionali caratterizzate
# da picchi e minimi che aumentano progressivamente  nel tempo, suggerendo un rafforzamento strutturale del mercato locale.

Progetto_AR.R

Andrea Travascio

2025-12-14