Analisi immobiliare del mercato del Texas

Carico le librerie, le funzioni e il dataset

Installazione dei packages usati e verifica della versione

Librerie

# Load the libraries
library(ggplot2)

## Warning: il pacchetto 'ggplot2' è stato creato con R versione 4.4.1

# Packages verification installation
packageVersion("ggplot2")

## [1] '3.5.1'

########
# Load the moments package for skewness and kurtosis calculations
library(moments)
# Packages verification installation
packageVersion("moments")

## [1] '0.14.1'

########
# Load the dplyr package for data manipulation (group_by, summarise)
library(dplyr)

## Warning: il pacchetto 'dplyr' è stato creato con R versione 4.4.1

## 
## Caricamento pacchetto: 'dplyr'

## I seguenti oggetti sono mascherati da 'package:stats':
## 
##     filter, lag

## I seguenti oggetti sono mascherati da 'package:base':
## 
##     intersect, setdiff, setequal, union

# Packages verification installation
packageVersion("dplyr")

## [1] '1.1.4'

########
# Load the stringr package for string handling (str_count)
library(stringr)

## Warning: il pacchetto 'stringr' è stato creato con R versione 4.4.1

# Packages verification installation
packageVersion("stringr")

## [1] '1.5.1'

Funzioni

############################################################################
# Position indexes for every numeric column of a data frame:
# mean, median, mode, min, max, first and third quartile.
#
# df: a data frame; non-numeric columns are ignored.
# Returns a data frame with one row per numeric column and one column
# per index (Mean, Median, Mode, Min, Max, Qtl.25, Qtl.75).
############################################################################
calculate_index_pos <- function(df) {
  # Keep the numeric columns only
  is_num <- sapply(df, is.numeric)
  numeric_df <- df[is_num]

  # Apply one summary function to each numeric column
  per_column <- function(fun, ...) {
    sapply(numeric_df, fun, ...)
  }

  stats <- data.frame(
    Mean   = per_column(mean, na.rm = TRUE),
    Median = per_column(median, na.rm = TRUE),
    Mode   = per_column(getmode),
    Min    = per_column(min, na.rm = TRUE),
    Max    = per_column(max, na.rm = TRUE),
    Qtl.25 = per_column(quantile, probs = 0.25, na.rm = TRUE),
    Qtl.75 = per_column(quantile, probs = 0.75, na.rm = TRUE)
  )
  return(stats)
}
############################################################################



############################################################################
# Variability indexes for every numeric column of a data frame:
# range, interquartile range, variance, standard deviation and
# coefficient of variation (in percent).
#
# df: a data frame; non-numeric columns are ignored.
# Returns a data frame with one row per numeric column.
############################################################################

calculate_variance_index <- function(df) {
  # Keep the numeric columns only
  is_num <- sapply(df, is.numeric)
  numeric_df <- df[is_num]

  # Apply one summary function to each numeric column
  per_column <- function(fun, ...) {
    sapply(numeric_df, fun, ...)
  }

  stats <- data.frame(
    Range    = per_column(getrange),
    IQR      = per_column(IQR, na.rm = TRUE),
    variance = per_column(var, na.rm = TRUE),
    st.dev   = per_column(sd, na.rm = TRUE),
    CV       = per_column(CV)
  )
  return(stats)
}
#
############################################################################



############################################################################
# Shape indexes (skewness and kurtosis) for every numeric column of a
# data frame, computed with the skewness() and kurtosis() functions
# loaded above from the moments package.
#
# df: a data frame; non-numeric columns are ignored.
# Returns a data frame with columns Asym_ind and Kurtosis, one row per
# numeric column.
############################################################################
calculate_shape <- function(df) {
  # Keep the numeric columns only
  is_num <- sapply(df, is.numeric)
  numeric_df <- df[is_num]

  # One shape index per column
  asymmetry <- sapply(numeric_df, skewness, na.rm = TRUE)
  tail_weight <- sapply(numeric_df, kurtosis, na.rm = TRUE)

  stats <- data.frame(
    Asym_ind = asymmetry,
    Kurtosis = tail_weight
  )
  return(stats)
}
############################################################################


############################################################################
# Mode of a vector: the most frequent value.
# When several values share the highest frequency, the one that appears
# first in the vector is returned.
############################################################################
getmode <- function(v) {
  distinct_values <- unique(v)
  # Position of each element among the distinct values, then count them
  positions <- match(v, distinct_values)
  counts <- tabulate(positions)
  distinct_values[which.max(counts)]
}
#
############################################################################




############################################################################
# Range of a numeric vector: max - min.
#
# v:     numeric vector.
# na.rm: drop missing values before computing the range? Defaults to
#        FALSE, so a vector containing NA yields NA as before.
# Returns a single number.
############################################################################
getrange <- function(v, na.rm = FALSE) {
  # The result is returned directly; no local variable named `range`,
  # which would shadow base::range()
  max(v, na.rm = na.rm) - min(v, na.rm = na.rm)
}
############################################################################




############################################################################
# Coefficient of variation, in percent: sd(x) / mean(x) * 100.
#
# x:     numeric vector.
# na.rm: drop missing values before computing? Defaults to FALSE, so a
#        vector containing NA yields NA as before.
# Returns a single number.
############################################################################
CV <- function(x, na.rm = FALSE) {
  # Return the value visibly. The previous `return = (...)` assigned to a
  # local variable called `return`, so the result was returned invisibly
  # and nothing was printed when CV() was called at the console.
  sd(x, na.rm = na.rm) / mean(x, na.rm = na.rm) * 100
}
############################################################################




############################################################################
# Normalized Gini heterogeneity index of a vector.
#
# x: vector of observations; missing values are ignored by table().
# Returns a number in [0, 1]: 0 when all observations fall in a single
# category, 1 when they are spread evenly over all observed categories.
############################################################################
gini.index <- function(x) {
  ni <- table(x)   # absolute frequencies (NA values are not counted)
  J <- length(ni)  # number of categories

  # With fewer than two categories there is no heterogeneity; the
  # normalizing factor (J - 1) / J would be 0 and produce NaN.
  if (J < 2) {
    return(0)
  }

  # Use the number of counted observations as denominator so that the
  # relative frequencies sum to 1 even when x contains NA.
  fi <- ni / sum(ni)

  gini <- 1 - sum(fi^2)
  gini / ((J - 1) / J)
}
#
############################################################################



############################################################################
# Class builder and bar chart.
# Splits x into classes, prints a bar chart of the absolute frequency of
# each class and returns the frequency distribution table.
#
# x:                              numeric vector to split into classes
# seq_start, seq_end, seq_steps:  define the class breaks via seq()
# title_desc:                     plot title
# y_start, y_end, y_steps:        define the y-axis breaks via seq()
#
# Returns a data frame with one row per class and the columns ni, fi,
# Ni and Fi (absolute, relative, cumulative absolute and cumulative
# relative frequency; relative frequencies are divided by length(x)).
############################################################################
freq_grap <- function(x,
                      seq_start,
                      seq_end,
                      seq_steps,
                      title_desc,
                      y_start,
                      y_end,
                      y_steps) {
  # Assign every observation to its class
  class_breaks <- seq(seq_start, seq_end, seq_steps)
  x_cl <- cut(x, class_breaks)
  x_n <- length(x)

  # Frequency distribution table, built from a single tabulation
  ni <- table(x_cl)
  fi <- ni / x_n
  distr_freq <- as.data.frame(
    cbind(
      ni = ni,
      fi = fi,
      Ni = cumsum(ni),
      Fi = cumsum(fi)
    )
  )

  # Bar chart of the absolute frequencies
  y_breaks <- seq(y_start, y_end, y_steps)
  class_plot <- ggplot(data = as.data.frame(x_cl)) +
    geom_bar(
      aes(x = x_cl),
      position = "stack",
      stat = "count",
      col = "black",
      fill = "lightblue"
    ) +
    labs(
      title = title_desc,
      x = "Class intervals",
      y = "Absolute frequency"
    ) +
    scale_y_continuous(breaks = y_breaks) +
    theme_classic() +
    theme(legend.position = "bottom")

  # Print explicitly: a plot built inside a function is not auto-printed
  print(class_plot)

  return(distr_freq)
}
#
############################################################################




############################################################################
# Word counter.
# Counts the case-insensitive, whole-word occurrences of `word` across
# all the elements of `text_vector`.
#
# word:        the word to look for. It is inserted into a regular
#              expression as is, so regex metacharacters keep their
#              special meaning.
# text_vector: vector of texts (non-character input is converted by
#              tolower()).
# Returns the total number of occurrences; 0 for an empty text_vector.
############################################################################
count_word <- function(word, text_vector) {
  # Lowercase both sides for a case-insensitive search
  patterns <- paste0("\\b", tolower(word), "\\b")
  text_vector <- tolower(text_vector)

  # str_count() is vectorized over the texts, so no per-element sapply()
  # is needed. The previous sum(sapply(...)) failed on an empty
  # text_vector because sapply() returns an empty list in that case.
  per_pattern <- vapply(
    patterns,
    function(p) sum(str_count(text_vector, p)),
    integer(1)
  )

  sum(per_pattern)
}
#
############################################################################



############################################################################
# Row counter on two columns.
# Counts the rows of `df` where `word_A` appears as a whole word in
# `column_A` AND `word_B` appears as a whole word in `column_B`
# (case-insensitive).
#
# df:                 data frame
# column_A, column_B: column names (or positions)
# word_A, word_B:     words to look for; they are inserted into a regular
#                     expression as is
# Returns the number of matching rows; 0 for a data frame with no rows.
############################################################################
count_words_in_columns <- function(df, column_A, word_A, column_B, word_B) {
  # Fail early on a misspelled column name: `[[` would silently return NULL
  for (col in list(column_A, column_B)) {
    if (is.character(col) && !(col %in% names(df))) {
      stop("Column not found in data frame: ", col, call. = FALSE)
    }
  }

  # Whole-word patterns, consistent with count_word(). Plain substring
  # matching would make "2" match "12" and "1" match "10", "11" and "12".
  pattern_A <- paste0("\\b", tolower(word_A), "\\b")
  pattern_B <- paste0("\\b", tolower(word_B), "\\b")

  # Lowercase the column contents for a case-insensitive comparison
  text_A <- tolower(as.character(df[[column_A]]))
  text_B <- tolower(as.character(df[[column_B]]))

  # Vectorized over the rows; missing values never match
  both_match <- grepl(pattern_A, text_A, perl = TRUE) &
    grepl(pattern_B, text_B, perl = TRUE)

  # as.numeric() keeps the double return type of the original counter
  as.numeric(sum(both_match))
}
#     
############################################################################




############################################################################
# Line chart by group.
# Sums `data_col` for every combination of `df_filter_1` and
# `df_filter_2`, prints the summary table and returns a line chart with
# `df_filter_1` on the x axis and one line per value of `df_filter_2`.
#
# dataframe:                data frame holding the columns
# data_col:                 unquoted column to sum
# df_filter_1, df_filter_2: unquoted grouping columns (x axis, line group)
# x_start, x_end, x_steps:  x-axis breaks via seq(); used only when
#                           x_start > 0
# title_desc, Y_desc, X_desc: plot title and axis labels
# y_start, y_end, y_steps:  y-axis breaks via seq()
#
# Returns the ggplot object.
############################################################################
line_chart_group <- function(dataframe,
                             data_col,
                             df_filter_1,
                             df_filter_2,
                             x_start,
                             x_end,
                             x_steps,
                             title_desc,
                             Y_desc,
                             X_desc,
                             y_start,
                             y_end,
                             y_steps) {
  # {{ }} forwards the unquoted column names to dplyr and ggplot2.
  # .groups = "drop" returns an ungrouped table without the summarise()
  # regrouping message.
  df_A <- dataframe %>%
    group_by({{ df_filter_1 }}, {{ df_filter_2 }}) %>%
    summarise(tot_sum = sum({{ data_col }}, na.rm = TRUE), .groups = "drop")

  # Show the summary table the chart is built from
  print(df_A)

  chart <- ggplot(
    data = df_A,
    aes(
      x = {{ df_filter_1 }},
      y = tot_sum,
      color = {{ df_filter_2 }},
      group = {{ df_filter_2 }}
    )
  ) +
    # `linewidth` replaces the `size` argument deprecated for lines
    # since ggplot2 3.4.0
    geom_line(linewidth = 1) +
    theme_minimal() +
    # Title and axis names
    labs(
      title = title_desc,
      x = X_desc,
      y = Y_desc
    ) +
    scale_y_continuous(breaks = seq(y_start, y_end, y_steps))

  # Custom x breaks are added only when a positive start is supplied
  if (x_start > 0) {
    chart <- chart + scale_x_continuous(breaks = seq(x_start, x_end, x_steps))
  }

  chart
}
############################################################################

Import dei dati

# Read the real-estate dataset from the CSV file
dati_re_texas <- read.csv(
  "realestate_texas.csv",
  header = TRUE,
  sep = ",",
  fileEncoding = "ISO-8859-1"
)

# Number of observations (rows) in the dataset
n <- nrow(dati_re_texas)

# Show the first rows to check that the data frame was loaded
head(dati_re_texas)

##       city year month sales volume median_price listings months_inventory
## 1 Beaumont 2010     1    83 14.162       163800     1533              9.5
## 2 Beaumont 2010     2   108 17.690       138200     1586             10.0
## 3 Beaumont 2010     3   182 28.701       122400     1689             10.6
## 4 Beaumont 2010     4   200 26.819       123200     1708             10.6
## 5 Beaumont 2010     5   202 28.833       123100     1771             10.9
## 6 Beaumont 2010     6   189 27.219       122800     1803             11.1

Il dataset rappresenta le seguenti variabili:

Città dove viene venduta la casa - variabile categorica
Anno di vendita - variabile tempo + var.discreta
Mese di vendita - variabile tempo + var.discreta
Numero delle vendite - variabile quantitativa continua - distr.freq ok
Volume di affari delle vendite - variabile quantitativa continua - distr.freq ok
Prezzo medio di vendita - variabile quantitativa continua - distr.freq ok
Listings , numero totale di annunci
Months inventory , mesi necessari per vendere la casa

PUNTO 1: Analisi delle variabili tempo

Le variabili che sottointendono una dimensione tempo sono :

year - anno di vendita
month - mese di vendita
months_inventory - quantità di tempo per vendere una casa

L’analisi che faremo servirà a capire l’andamento delle vendite in un dato periodo, sia all’interno dello stesso anno sia negli anni oggetto della ricerca, per individuarne il trend.

Anche la variabile months_inventory potrebbe darci un’indicazione di quanto possa essere difficile vendere una casa

PUNTO 1: Analisi delle variabile “month” ( mese )

# Range of the month variable, used for the x-axis ticks
min_month <- min(dati_re_texas$month)
max_month <- max(dati_re_texas$month)

# geom_bar() counts rows: each bar is the number of observations recorded
# for that month, not the number of houses sold. The axis labels say so
# (the x axis was previously labelled "Anno" although it shows months).
ggplot(data = dati_re_texas) +
  geom_bar(
    aes(x = month),
    col = "black",
    fill = "lightblue"
  ) +
  # Axis names
  labs(
    title = "Visualizzazione delle vendite durante l'anno",
    x = "Mese",
    y = "Numero di rilevazioni"
  ) +
  # One tick per month on the x axis
  scale_x_continuous(breaks = seq(min_month, max_month, 1)) +
  theme_classic()

PUNTO 2: Analisi delle variabile “year” ( anno )

# Range of the year variable, used for the x-axis ticks
min_year <- min(dati_re_texas$year)
max_year <- max(dati_re_texas$year)

# geom_bar() counts rows: each bar is the number of observations recorded
# for that year, not the number of houses sold, so the y axis is labelled
# accordingly.
ggplot(data = dati_re_texas) +
  geom_bar(
    aes(x = year),
    col = "black",
    fill = "lightblue"
  ) +
  # Axis names
  labs(
    title = "Visualizzazione delle vendite per anno",
    x = "Anno",
    y = "Numero di rilevazioni"
  ) +
  # One tick per year on the x axis
  scale_x_continuous(breaks = seq(min_year, max_year, 1)) +
  theme_classic()

PUNTO 3: Analisi delle variabile “months_inventory” ( tempo necessario per vendere la casa )

# Range of months_inventory, used for the x-axis ticks
min_sale_time <- min(dati_re_texas$months_inventory)
max_sale_time <- max(dati_re_texas$months_inventory)

# months_inventory is on the x axis and geom_bar() counts the rows for
# each value, so the month range belongs to the x scale. It was
# previously applied to the y (count) axis, and the two axis labels were
# swapped.
ggplot(data = dati_re_texas) +
  geom_bar(
    aes(x = months_inventory),
    col = "black",
    fill = "lightblue"
  ) +
  # Axis names
  labs(
    title = "Visualizzazione del tempo[mesi] per eseguire una vendita",
    x = "Mesi necessari per vendere una casa",
    y = "Numero di rilevazioni"
  ) +
  # One tick per whole month; floor/ceiling keep the ticks on integers
  scale_x_continuous(
    breaks = seq(floor(min_sale_time), ceiling(max_sale_time), 1)
  ) +
  theme_classic()

CONCLUSIONI: Le variabili “year” e “month” ci dicono che le rilevazioni sono costanti e che quindi andranno valutate in combinazione con altre variabili come “sales”, “volume” o “city”.

La variabile months_inventory ha invece una sua distribuzione, concentrata in due fasce: da 6,5 a 10 e da 10 a 13. Vedremo più avanti di scoprirne il motivo.

PUNTO 2: Indici di posizione,variabilità e forma

All’interno del mio dataframe ho il nome delle città quindi per calcolare gli indici di posizione questa colonna verrà eliminata.

Costruisco un nuovo dataframe senza la colonna “city”

GLi indici di posizione che ho scelto sono: media, mediana, moda, minimo e massimo e percentili

# Drop the first column (city), which is not numeric
dati_re_texas_no_city <- dati_re_texas[, -1]

Rimuoviamo anche le variabili “year” e “month” per i motivi visti sopra.

# Drop the two time variables in a single step, keeping all other columns
dati_re_texas_no_city <- dati_re_texas_no_city[
  ,
  !colnames(dati_re_texas_no_city) %in% c("year", "month")
]

# Show the first rows of the reduced data frame
head(dati_re_texas_no_city)

##   sales volume median_price listings months_inventory
## 1    83 14.162       163800     1533              9.5
## 2   108 17.690       138200     1586             10.0
## 3   182 28.701       122400     1689             10.6
## 4   200 26.819       123200     1708             10.6
## 5   202 28.833       123100     1771             10.9
## 6   189 27.219       122800     1803             11.1

Abbiamo creato un dataframe con le variabili che abbiamo deciso di analizzare

====================================================================

Calcolo gli indici di posizione per le variabili: sales,volume,median_price,listings,month_inventory

# Position indexes (mean, median, mode, min, max, quartiles) of each column
calculate_index_pos(dati_re_texas_no_city)

##                          Mean      Median       Mode       Min        Max
## sales               192.29167    175.5000    124.000    79.000    423.000
## volume               31.00519     27.0625     35.335     8.166     83.547
## median_price     132665.41667 134500.0000 130000.000 73800.000 180000.000
## listings           1738.02083   1618.5000   1581.000   743.000   3296.000
## months_inventory      9.19250      8.9500      8.100     3.400     14.900
##                       Qtl.25     Qtl.75
## sales               127.0000    247.000
## volume               17.6595     40.893
## median_price     117300.0000 150050.000
## listings           1026.5000   2056.000
## months_inventory      7.8000     10.950

====================================================================

Calcolo la variabilità per le seguenti variabili: sales,volume,median_price,listings,month_inventory

# Variability indexes (range, IQR, variance, st. dev., CV) of each column
calculate_variance_index(dati_re_texas_no_city)

##                       Range        IQR     variance       st.dev       CV
## sales               344.000   120.0000 6.344300e+03    79.651111 41.42203
## volume               75.381    23.2335 2.772707e+02    16.651447 53.70536
## median_price     106200.000 32750.0000 5.135730e+08 22662.148687 17.08218
## listings           2553.000  1029.5000 5.665690e+05   752.707756 43.30833
## months_inventory     11.500     3.1500 5.306889e+00     2.303669 25.06031

Tutte le variabili esaminate hanno medie molto diverse tra loro. Per favorire l’analisi bisognerà riportarle su una scala comune.

====================================================================

Calcola la forma per le seguenti variabili: sales,volume,median_price,listings,month_inventory-

# Shape indexes (skewness and kurtosis) of each column
calculate_shape(dati_re_texas_no_city)

##                     Asym_ind Kurtosis
## sales             0.71810402 2.686824
## volume            0.88474203 3.176987
## median_price     -0.36455288 2.377038
## listings          0.64949823 2.208210
## months_inventory  0.04097527 2.825552

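Si evidenzia che tutte le distribuzioni hanno indice di asimmetria positivo (sono più frequenti i valori bassi), ad eccezione del prezzo mediano di vendita che ha invece indice negativo (sono più frequenti i valori alti). Per quanto riguarda la curtosi, la funzione kurtosis del pacchetto moments restituisce la curtosi non centrata, che per la distribuzione normale vale 3: il confronto va quindi fatto con 3 e non con 0. Solo “volume” (3.18) supera 3 ed è leggermente leptocurtica, mentre “sales”, “median_price”, “listings” e “months_inventory” sono sotto 3 e risultano platicurtiche, cioè più appiattite rispetto alla normale. Di seguito la distribuzione in classi di “months_inventory”: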

# Class distribution of months_inventory: classes of width 2 from 0 to 20
distr_freq <- freq_grap(
  x = dati_re_texas_no_city$months_inventory,
  seq_start = 0,
  seq_end = 20,
  seq_steps = 2,
  title_desc = "Tempo per vendere un immobile [mesi] ",
  y_start = 0,
  y_end = 300,
  y_steps = 10
)

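La distribuzione di “months_inventory” ha curtosi 2.83, di poco inferiore a 3: è quindi leggermente platicurtica, non leptocurtica.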

====================================================================

PUNTO 3: Identificazione variabili

La variabile con maggiore asimmetria è “volume” da punto 2.calcolo forma. Asym_ind

La variabile con maggiore variabilità è: “volume” da punto 2.calcolo variabilità indice CV

PUNTO 4: Creazione di classi per una variabile quantitativa

Distribuzione di frequenza in classi per la variabile “sales”

# Class distribution of sales: classes of width 50 from 50 to 450
distr_freq <- freq_grap(
  x = dati_re_texas_no_city$sales,
  seq_start = 50,
  seq_end = 450,
  seq_steps = 50,
  title_desc = "Distribuzione in classi della variabile sales",
  y_start = 0,
  y_end = 100,
  y_steps = 10
)

Distribuzione in frequenza tabellare

# Print the frequency table returned by freq_grap()
distr_freq

##           ni         fi  Ni        Fi
## (50,100]  21 0.08750000  21 0.0875000
## (100,150] 72 0.30000000  93 0.3875000
## (150,200] 56 0.23333333 149 0.6208333
## (200,250] 32 0.13333333 181 0.7541667
## (250,300] 34 0.14166667 215 0.8958333
## (300,350] 13 0.05416667 228 0.9500000
## (350,400]  9 0.03750000 237 0.9875000
## (400,450]  3 0.01250000 240 1.0000000

Calcolo l’indice di eterogeneità di Gini per la variabile “sales”

# Normalized Gini heterogeneity index of the sales values
gini.index(dati_re_texas_no_city$sales)

## [1] 0.998379

La distribuzione ha eterogeneità quasi massima (i valori sono pressoché equidistribuiti)

====================================================================

PUNTO 5: Calcolo delle probabilità

La probabilità che presa una riga a caso , ci sia ” Beaumont ”

# Rows mentioning Beaumont over the total number of rows
col_count <- count_word("Beaumont", dati_re_texas$city)
col_length <- length(dati_re_texas$city)

prob_city <- col_count / col_length
prob_city

## [1] 0.25

La probabilità che presa una riga a caso, ci sia ” luglio ”

# Rows recorded in July (month 7) over the total number of rows
col_count <- count_word("7", dati_re_texas$month)
col_length <- length(dati_re_texas$month)

prob_month <- col_count / col_length
prob_month

## [1] 0.08333333

La probabilità che presa una riga a caso , ci sia ” dicembre ” e anno 2012

# Rows recorded in December (month 12) of 2012 over the total number of rows
count_res_columns <- count_words_in_columns(
  df = dati_re_texas,
  column_A = "year",
  word_A = "2012",
  column_B = "month",
  word_B = "12"
)

count_length <- length(dati_re_texas$year)

print(count_res_columns / count_length)

## [1] 0.01666667

PUNTO 6: Creazione di nuove variabili

CONSEGNA: Crea una nuova colonna che calcoli il prezzo medio degli immobili utilizzando le variabili disponibili - Prova a creare una colonna che misuri l’efficacia degli annunci di vendita. Commenta e discuti i risultati.

Creo una nuova variabile prezzo medio di vendita

# Work on a copy of the data frame. The preliminary `= NULL` assignment
# was redundant and has been removed.
dati_re_texas_sales <- dati_re_texas

# Average sale price: total volume (multiplied by 1,000,000 to convert it
# to single currency units) divided by the number of sales
dati_re_texas_sales$avg_sale <-
  (dati_re_texas$volume * 1000000) / dati_re_texas$sales

# Listing effectiveness: number of sales per active listing
dati_re_texas_sales$gain_force_1 <-
  dati_re_texas$sales / dati_re_texas$listings

# Class distribution of the sales/listings ratio. The title previously
# contained a long accidental run of spaces that was rendered in the plot.
distr_freq <- freq_grap(
  x = dati_re_texas_sales$gain_force_1,
  seq_start = 0.0,
  seq_end = 0.5,
  seq_steps = 0.05,
  title_desc = "Distribuzione in classi della variabile rapporto tra vendite e annunci",
  y_start = 0,
  y_end = 130,
  y_steps = 10
)

La distribuzione si concentra sulle prime classi, cioè fino a 0.15 come rapporto tra case vendute e annunci attivi. Notiamo anche che la zona di Bryan-College Station si classifica come la migliore per rapporto vendite/annunci attivi. Per fare questo controllo ho preso il nuovo dataframe sales creato e l’ho ordinato per la variabile gain_force_1: scorrendo la parte finale ho scoperto che c’è un’alta concentrazione di case vendute in questa zona.

Per fare la verifica uso un grafico a barre affiancate dove ogni classe avrà anche le 4 città

#################################################
# Class builder and dodged bar chart.
# Splits df_class into classes, prints a bar chart of the absolute
# frequency of each class with one side-by-side bar per value of
# group_tbl, and returns the frequency distribution table.
#
# x:                              data frame passed to ggplot()
# df_class:                       numeric vector to split into classes
# group_tbl:                      grouping vector used as fill colour
# seq_start, seq_end, seq_steps:  define the class breaks via seq()
# title_desc:                     plot title
# y_start, y_end, y_steps:        define the y-axis breaks via seq()
#
# Returns a data frame with one row per class and the columns ni, fi,
# Ni and Fi (absolute, relative, cumulative absolute and cumulative
# relative frequency; relative frequencies are divided by
# length(df_class)).
freq_grap_dodge <- function(x,
                            df_class,
                            group_tbl,
                            seq_start,
                            seq_end,
                            seq_steps,
                            title_desc,
                            y_start,
                            y_end,
                            y_steps) {
  # Assign every observation to its class
  class_breaks <- seq(seq_start, seq_end, seq_steps)
  x_cl <- cut(df_class, class_breaks)
  x_n <- length(df_class)

  # Frequency distribution table, built from a single tabulation
  ni <- table(x_cl)
  fi <- ni / x_n
  distr_freq <- as.data.frame(
    cbind(
      ni = ni,
      fi = fi,
      Ni = cumsum(ni),
      Fi = cumsum(fi)
    )
  )

  # Bar chart: "dodge" places the bars of each group side by side
  # ("stack" would pile them up)
  y_breaks <- seq(y_start, y_end, y_steps)
  dodge_plot <- ggplot(data = x) +
    geom_bar(
      aes(x = x_cl, fill = group_tbl),
      position = "dodge",
      stat = "count",
      col = "black"
    ) +
    # Axis names
    labs(
      title = title_desc,
      x = "classi ",
      y = "Frequenza assolute"
    ) +
    # y-axis ticks
    scale_y_continuous(breaks = y_breaks) +
    # Theme
    theme_classic() +
    theme(legend.position = "bottom")

  # Print explicitly: a plot built inside a function is not auto-printed
  print(dodge_plot)

  return(distr_freq)
}

# Class distribution of the sales/listings ratio, one bar per city
gain_df <- freq_grap_dodge(
  x = dati_re_texas_sales,
  df_class = dati_re_texas_sales$gain_force_1,
  group_tbl = dati_re_texas_sales$city,
  seq_start = 0,
  seq_end = 0.5,
  seq_steps = 0.05,
  title_desc = "Distribuzione in classi della variabile rapporto tra vendite e annunci divisi per citta",
  y_start = 0,
  y_end = 130,
  y_steps = 10
)

L’efficacia degli annunci di vendita (quindi valore alto, colore verde) si nota soprattutto nella zona di Bryan-College Station

====================================================================

PUNTO 7: Analisi condizionata

CONSEGNA: Usa il pacchetto dplyr o il linguaggio base di R per effettuare analisi statistiche condizionate per città, anno e mese. Genera dei summary (media, deviazione standard) e rappresenta graficamente i risultati

Per fare questo confronteremo le 3 variabili con il vettore sales ( vendite totali ) del dataframe

# Class distribution of sales, one bar per city
sales_year_df <- freq_grap_dodge(
  x = dati_re_texas,
  df_class = dati_re_texas$sales,
  group_tbl = dati_re_texas$city,
  seq_start = 50,
  seq_end = 500,
  seq_steps = 50,
  title_desc = "Distribuzione in classi della variabile sales ottimizzata per le varie città",
  y_start = 0,
  y_end = 100,
  y_steps = 10
)

La città di Tyler ha la più alta concentrazione di vendite seguita dal Bryan College.

===================================================================

#################################################
# Grouped summary with bar chart.
# Groups `dataframe` by `filter_col`, computes the sum, mean and standard
# deviation of `data_col` for each group, prints the summary table and a
# bar chart of the sums, and returns the summary table.
#
# dataframe:               data frame holding the columns
# filter_col:              unquoted grouping column
# data_col:                unquoted column to summarise
# title_desc:              plot title
# y_desc, x_desc:          axis labels
# y_start, y_end, y_steps: y-axis breaks via seq()
# x_start, x_end, x_steps: x-axis breaks via seq(); used only when
#                          x_start > 0
#
# Returns the summary table with the grouping column plus total_sum,
# mean_value and sd_value.
#################################################
sum_column_flt_graph <- function(dataframe,
                                 filter_col,
                                 data_col,
                                 title_desc,
                                 y_desc,
                                 x_desc,
                                 y_start,
                                 y_end,
                                 y_steps,
                                 x_start,
                                 x_end,
                                 x_steps) {
  # {{ }} forwards the unquoted column names to dplyr and ggplot2
  by_group <- group_by(dataframe, {{ filter_col }})
  result <- summarise(
    by_group,
    total_sum  = sum({{ data_col }}, na.rm = TRUE),
    mean_value = mean({{ data_col }}, na.rm = TRUE),
    sd_value   = sd({{ data_col }}, na.rm = TRUE)
  )
  print(result)

  # Bar chart of the total per group
  bars <- ggplot(data = result) +
    geom_col(
      aes(x = {{ filter_col }}, y = total_sum),
      col = "black",
      fill = "lightblue"
    ) +
    # Titles and labels
    labs(
      title = title_desc,
      x = x_desc,
      y = y_desc
    ) +
    theme_classic() +
    scale_y_continuous(breaks = seq(y_start, y_end, y_steps))

  # Custom x breaks are added only when a positive start is supplied
  if (x_start > 0) {
    bars <- bars + scale_x_continuous(breaks = seq(x_start, x_end, x_steps))
  }

  # Print explicitly: a plot built inside a function is not auto-printed
  print(bars)

  return(result)
}
#
#################################################

# Total, mean and standard deviation of sales for each city.
# NOTE: despite its name, sales_by_year holds the per-city summary here;
# the name is kept because later code may rely on it.
# The y label was a leftover placeholder (" Prova asse Y ") and now
# describes the plotted quantity. x_start = 0 skips the numeric x breaks,
# since city is not numeric.
sales_by_year <- sum_column_flt_graph(
  dati_re_texas,
  city,
  sales,
  " Totali vendita per città",
  " Totale vendite",
  " Città osservata",
  0,
  20000,
  2000,
  0,
  0,
  0
)

## # A tibble: 4 × 4
##   city                  total_sum mean_value sd_value
##   <chr>                     <int>      <dbl>    <dbl>
## 1 Beaumont                  10643       177.     41.5
## 2 Bryan-College Station     12358       206.     85.0
## 3 Tyler                     16185       270.     62.0
## 4 Wichita Falls              6964       116.     22.2

La città dove sono state fatte più vendite è Tyler

=================================================================

# Total, mean and standard deviation of sales for each month.
# Title and axis labels were copied from the per-city chart (title and
# x label mentioned the city, the y label was a placeholder); they now
# describe the month chart.
sales_by_month <- sum_column_flt_graph(
  dati_re_texas,
  month,
  sales,
  " Totali vendita per mese",
  " Totale vendite",
  " Mese osservato",
  0,
  20000,
  2000,
  1,
  12,
  1
)

## # A tibble: 12 × 4
##    month total_sum mean_value sd_value
##    <int>     <int>      <dbl>    <dbl>
##  1     1      2548       127.     43.4
##  2     2      2817       141.     51.1
##  3     3      3789       189.     59.2
##  4     4      4234       212.     65.4
##  5     5      4777       239.     83.1
##  6     6      4871       244.     95.0
##  7     7      4715       236.     96.3
##  8     8      4629       231.     79.2
##  9     9      3647       182.     72.5
## 10    10      3598       180.     75.0
## 11    11      3137       157.     55.5
## 12    12      3388       169.     60.7

Il mese dove si vende di più è giugno

===================================================================

# Total, mean and standard deviation of sales for each year.
# Title and axis labels were copied from the per-city chart (title and
# x label mentioned the city, the y label was a placeholder); they now
# describe the year chart.
sales_by_year <- sum_column_flt_graph(
  dati_re_texas,
  year,
  sales,
  " Totali vendita per anno",
  " Totale vendite",
  " Anno osservato",
  0,
  20000,
  2000,
  2010,
  2014,
  1
)

## # A tibble: 5 × 4
##    year total_sum mean_value sd_value
##   <int>     <int>      <dbl>    <dbl>
## 1  2010      8096       169.     60.5
## 2  2011      7878       164.     63.9
## 3  2012      8935       186.     70.9
## 4  2013     10172       212.     84.0
## 5  2014     11069       231.     95.5

Il trend di vendita migliora anno dopo anno e il 2014 risulta essere l’anno dove si è venduto di più

====================================================================

PUNTO 8: Creazione di vis.con ggplot2

CONSEGNA: Utilizza ggplot2 per creare grafici personalizzati.

Boxplot per confrontare la distribuzione del prezzo mediano tra le città. -

# Box plot of the median price, one box per city
ggplot(data = dati_re_texas) +
  geom_boxplot(
    aes(x = city, y = median_price),
    fill = "lightblue"
  ) +
  labs(
    title = " Box plot del prezzo mediano per città di osservazione ",
    x = " Città",
    y = " Prezzo mediano"
  )

La città di Bryan-College station ha la distribuzione più alta fra tutte.

====================================================================

# Build the classes for 'sales' with cut(), in steps of 70 starting at 70.
# The upper break is derived from the data: the previous fixed breaks
# seq(70, 421, 70) stopped at 420, so sales above 420 (the maximum is 423)
# fell outside every class and became NA.
dati_re_texas$sales_cl <- cut(
  dati_re_texas$sales,
  breaks = seq(70, 70 * ceiling(max(dati_re_texas$sales) / 70), 70),
  include.lowest = TRUE
)

# Box plot of the median price by city, split by sales class
ggplot(data = dati_re_texas) +
  geom_boxplot(aes(
    x = city,
    y = median_price,
    fill = sales_cl
  )) +
  labs(
    title = "Box plot del prezzo mediano per città di osservazione",
    x = "Città",
    y = "Prezzo mediano"
  ) +
  scale_fill_brewer(palette = "Set3") # Optional colour palette

La zona di Bryan-College Station si colloca al gradino più alto sia per quanto riguarda il prezzo mediano sia per quanto riguarda il numero di vendite. La variabile sales_cl è la divisione in classi del numero di case vendute nella zona; si osserva che sia Bryan-College Station sia Tyler possono vantare un numero alto di case vendute nonostante il prezzo mediano sia alto.

====================================================================

# Build a class variable from 'year', one level per calendar year.
# factor() is used instead of cut(): with breaks 2010, 2011, ..., 2014 and
# include.lowest = TRUE, cut() produces only four intervals and puts both 2010
# and 2011 into the first one, "[2010,2011]".
dati_re_texas$year_cl <- factor(dati_re_texas$year)

# Box plot of the sales volume per city, split by year
ggplot(data = dati_re_texas) +
    geom_boxplot(aes(x = city,
                     y = volume,
                     fill = year_cl)) +
    labs(title = "Box plot del volume di vendite per città e anno di osservazione",
         x = "Città",
         y = "Volume di vendita in milioni di dollari",
         fill = "Anno") +
    scale_fill_brewer(palette = "Set3")  # Optional: colour palette for the years

Notiamo che le due città, Bryan-College Station e Tyler, hanno performato molto bene per quanto riguarda i volumi.

Inoltre c’è stato un incremento nei vari anni, che per quanto riguarda Tyler è più regolare, mentre per Bryan-College Station si nota un vero gap nel 2010 (probabilmente la pandemia potrebbe aver influenzato la zona, essendo sede di un college).

====================================================================

# Build a class variable from 'median_price' using classes 25000 wide.
# The break grid is derived from the observed range so that no price falls
# outside the outermost breaks (cut() would turn those values into NA).
price_step <- 25000
price_breaks <- seq(
  floor(min(dati_re_texas$median_price, na.rm = TRUE) / price_step) * price_step,
  ceiling(max(dati_re_texas$median_price, na.rm = TRUE) / price_step) * price_step,
  by = price_step
)
dati_re_texas$median_price_cl <- cut(dati_re_texas$median_price,
                                     breaks = price_breaks,
                                     include.lowest = TRUE)

# Box plot of the median price per city, split by price class.
# The axis labels follow the aesthetics: city on x, median price on y
# (they were swapped before).
ggplot(data = dati_re_texas) +
    geom_boxplot(aes(x = city,
                     y = median_price,
                     fill = median_price_cl)) +
    labs(title = "Box plot del prezzo mediano diviso in classi per città",
         x = "Città di riferimento",
         y = "Prezzo mediano di vendita") +
    scale_fill_brewer(palette = "Set3")  # Optional: colour palette for the classes

Bryan-college risulta essere il migliore per quanto riguarda il prezzo mediano di vendita.

====================================================================

Grafici a barre per confrontare il totale delle vendite per mese e città.

# Total sales for every (city, month) pair.
#
# Args:
#   data: data frame containing at least the columns `city`, `month`
#         and `sales`.
#
# Returns:
#   An ungrouped data frame with one row per (city, month) pair and a
#   `total_sales` column holding the sum of `sales` (NA values are ignored).
#
# Raises:
#   An error listing the missing columns when any required column is absent.
divide_sales_by_month_city <- function(data) {
  required_cols <- c("city", "month", "sales")
  missing_cols <- required_cols[!required_cols %in% colnames(data)]
  if (length(missing_cols) > 0) {
    stop(
      "Data frame must contain 'city', 'month', and 'sales' columns. Missing: ",
      paste(missing_cols, collapse = ", "),
      call. = FALSE
    )
  }

  # `.groups = "drop"` returns an ungrouped result directly and silences the
  # "summarise() has grouped output by ..." message.
  data %>%
    group_by(city, month) %>%
    summarise(total_sales = sum(sales, na.rm = TRUE), .groups = "drop")
}

# Usage example: aggregate the sales of dati_re_texas by city and month.
# The result is stored in `result`, which the bar charts below read.
result <- divide_sales_by_month_city(dati_re_texas)

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

print(result)

## # A tibble: 48 × 3
##    city     month total_sales
##    <chr>    <int>       <int>
##  1 Beaumont     1         608
##  2 Beaumont     2         677
##  3 Beaumont     3         855
##  4 Beaumont     4         948
##  5 Beaumont     5        1034
##  6 Beaumont     6        1025
##  7 Beaumont     7         927
##  8 Beaumont     8        1087
##  9 Beaumont     9         870
## 10 Beaumont    10         946
## # ℹ 38 more rows

# Bar chart with absolute values: total sales per month, stacked by city.
# geom_col() always plots the y values as given and has no `stat` parameter,
# so the former `stat = "count"` argument was only producing an
# "Ignoring unknown parameters" warning and has been removed.
ggplot(data = result) +
  geom_col(aes(x = month,
               y = total_sales,
               fill = city),
           # "dodge" places the bars side by side, "stack" piles them up
           position = "stack",
           col = "black"
           )

## Warning in geom_col(aes(x = month, y = total_sales, fill = city), position =
## "stack", : Ignoring unknown parameters: `stat`

# Normalized bar chart: share of the monthly sales taken by each city.
# position = "fill" rescales every bar to the 0-1 range, so the percent
# labels must use the default scale (x100); with `scale = 1` the axis would
# read 0%-1% instead of 0%-100%.
ggplot(data = result) +
  geom_col(aes(x = month,
               y = total_sales,
               fill = city),
           position = "fill",
           col = "black") +
  labs(x = "Mese",
       y = "Percentuale Vendite (%)",
       title = "Percentuale di Vendite Totali per Città e Mese") +
  # Show percentages on the Y axis
  scale_y_continuous(labels = scales::percent_format()) +
  # Show the month index from 1 to 12 in steps of 1
  scale_x_continuous(breaks = seq(1, 12, 1)) +
  theme_minimal()

Notiamo due casi particolari:

Bryan college: nei mesi estivi le vendite aumentano in modo considerevole

Usando un grafico normalizzato si conferma che Bryan college performa molto bene durante i mesi estivi

====================================================================

Line charts per confrontare l’andamento delle vendite in periodi storici differenti.

# Line chart of `sales` built by the project helper line_chart_group(),
# using `month` and `year` as the two grouping/filter columns.
line_chart_group(
  dataframe   = dati_re_texas,
  data_col    = sales,
  df_filter_1 = month,
  df_filter_2 = year,
  x_start     = 1,
  x_end       = 12,
  x_steps     = 1,
  title_desc  = "Andamento vendite suddiviso per anni ",
  Y_desc      = "Vendite totali",
  X_desc      = "Mese/anno",
  y_start     = 0,
  y_end       = 5000,
  y_steps     = 100
)

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

## # A tibble: 60 × 3
##    month  year tot_sum
##    <int> <int>   <int>
##  1     1  2010     421
##  2     1  2011     425
##  3     1  2012     499
##  4     1  2013     576
##  5     1  2014     627
##  6     2  2010     487
##  7     2  2011     469
##  8     2  2012     574
##  9     2  2013     593
## 10     2  2014     694
## # ℹ 50 more rows

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Come si evidenzia dal grafico, la tendenza delle vendite è sempre molto positiva: dal 2010 al 2014 le vendite sono sempre aumentate. All’inizio dell’anno c’è sempre una contrazione, così come alla fine dello stesso, ma durante i mesi caldi le vendite generalmente aumentano di molto.

# Line chart of `months_inventory` built by the project helper
# line_chart_group(), using `month` and `year` as the two grouping/filter
# columns.
line_chart_group(
  dataframe   = dati_re_texas,
  data_col    = months_inventory,
  df_filter_1 = month,
  df_filter_2 = year,
  x_start     = 1,
  x_end       = 12,
  x_steps     = 1,
  title_desc  = "Mesi per concludere una vendita ",
  Y_desc      = "Mesi",
  X_desc      = "Mese/anno",
  y_start     = 0,
  y_end       = 60,
  y_steps     = 3
)

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

## # A tibble: 60 × 3
##    month  year tot_sum
##    <int> <int>   <dbl>
##  1     1  2010    35  
##  2     1  2011    39.8
##  3     1  2012    39.8
##  4     1  2013    34.2
##  5     1  2014    28  
##  6     2  2010    36.6
##  7     2  2011    40.8
##  8     2  2012    40.6
##  9     2  2013    34.8
## 10     2  2014    28.4
## # ℹ 50 more rows

Un’altra cosa interessante emerge da questo grafico: nonostante il fatturato sia aumentato negli anni, il tempo necessario per effettuare una vendita è diminuito. All’inizio dell’anno è sempre più difficile vendere, mentre verso la fine dell’anno i mesi necessari per vendere sono tendenzialmente sempre di meno.

TODO: inserire una label per facilitare la comprensione della linea.

===================================================================

# Line chart of `sales` built by the project helper line_chart_group(),
# using `city` and `year` as the two grouping/filter columns.
line_chart_group(
  dataframe   = dati_re_texas,
  data_col    = sales,
  df_filter_1 = city,
  df_filter_2 = year,
  x_start     = 0,
  x_end       = 0,
  x_steps     = 0,
  title_desc  = "Vendite divise per citta e anno ",
  Y_desc      = "Vendite",
  X_desc      = "Città / anno",
  y_start     = 0,
  y_end       = 5000,
  y_steps     = 500
)

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

## # A tibble: 20 × 3
##    city                   year tot_sum
##    <chr>                 <int>   <int>
##  1 Beaumont               2010    1874
##  2 Beaumont               2011    1728
##  3 Beaumont               2012    2063
##  4 Beaumont               2013    2414
##  5 Beaumont               2014    2564
##  6 Bryan-College Station  2010    2011
##  7 Bryan-College Station  2011    2009
##  8 Bryan-College Station  2012    2361
##  9 Bryan-College Station  2013    2854
## 10 Bryan-College Station  2014    3123
## 11 Tyler                  2010    2730
## 12 Tyler                  2011    2866
## 13 Tyler                  2012    3162
## 14 Tyler                  2013    3449
## 15 Tyler                  2014    3978
## 16 Wichita Falls          2010    1481
## 17 Wichita Falls          2011    1275
## 18 Wichita Falls          2012    1349
## 19 Wichita Falls          2013    1455
## 20 Wichita Falls          2014    1404

Dal grafico si vede che le vendite seguono lo stesso schema durante i 5 anni: mediamente sono tutte aumentate in maniera proporzionale, e la città di Tyler ha sempre la quantità di vendite maggiore.

# Line chart of `gain_force_1` by city and year, built by the project helper
# line_chart_group(). Title and Y label now name the plotted variable (they
# had been copied from the sales chart), and the Y range is sized for the
# yearly totals printed by the helper, which stay below 3 (the previous
# 0-450 range was inherited from another chart).
# NOTE(review): y_start / y_end / y_steps are assumed to control the Y axis
# range and tick spacing -- confirm against line_chart_group().
line_chart_group(
  dataframe   = dati_re_texas_sales,
  data_col    = gain_force_1,
  df_filter_1 = city,
  df_filter_2 = year,
  x_start     = 0,
  x_end       = 0,
  x_steps     = 0,
  title_desc  = "Gain force (vendite / annunci attivi) per città e anno ",
  Y_desc      = "Gain force",
  X_desc      = "Città / anno",
  y_start     = 0,
  y_end       = 3,
  y_steps     = 0.25
)

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

## # A tibble: 20 × 3
##    city                   year tot_sum
##    <chr>                 <int>   <dbl>
##  1 Beaumont               2010   1.08 
##  2 Beaumont               2011   0.987
##  3 Beaumont               2012   1.22 
##  4 Beaumont               2013   1.47 
##  5 Beaumont               2014   1.61 
##  6 Bryan-College Station  2010   1.27 
##  7 Bryan-College Station  2011   1.23 
##  8 Bryan-College Station  2012   1.46 
##  9 Bryan-College Station  2013   2.05 
## 10 Bryan-College Station  2014   2.83 
## 11 Tyler                  2010   0.894
## 12 Tyler                  2011   0.927
## 13 Tyler                  2012   1.08 
## 14 Tyler                  2013   1.21 
## 15 Tyler                  2014   1.49 
## 16 Wichita Falls          2010   1.55 
## 17 Wichita Falls          2011   1.30 
## 18 Wichita Falls          2012   1.51 
## 19 Wichita Falls          2013   1.73 
## 20 Wichita Falls          2014   1.60

Questo nuovo parametro, il “gain_force”, indica il rapporto tra il numero di vendite effettuate e il numero di annunci attivi. A Bryan-College Station la strategia è molto vincente e il valore è inoltre aumentato nel corso degli anni.

# Line chart of `months_inventory` by city and year, built by the project
# helper line_chart_group(). Title and Y label now name the plotted variable
# (they had been copied from the sales chart); the Y range is tightened to
# the yearly totals printed by the helper, which stay below 200.
# NOTE(review): y_start / y_end / y_steps are assumed to control the Y axis
# range and tick spacing -- confirm against line_chart_group().
line_chart_group(
  dataframe   = dati_re_texas_sales,
  data_col    = months_inventory,
  df_filter_1 = city,
  df_filter_2 = year,
  x_start     = 0,
  x_end       = 0,
  x_steps     = 0,
  title_desc  = "Mesi per concludere una vendita per città e anno ",
  Y_desc      = "Mesi",
  X_desc      = "Città / anno",
  y_start     = 0,
  y_end       = 200,
  y_steps     = 20
)

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

## # A tibble: 20 × 3
##    city                   year tot_sum
##    <chr>                 <int>   <dbl>
##  1 Beaumont               2010   131. 
##  2 Beaumont               2011   141. 
##  3 Beaumont               2012   129. 
##  4 Beaumont               2013   105. 
##  5 Beaumont               2014    91.7
##  6 Bryan-College Station  2010   104  
##  7 Bryan-College Station  2011   118. 
##  8 Bryan-College Station  2012   107. 
##  9 Bryan-College Station  2013    78  
## 10 Bryan-College Station  2014    52.6
## 11 Tyler                  2010   152. 
## 12 Tyler                  2011   162. 
## 13 Tyler                  2012   139  
## 14 Tyler                  2013   122. 
## 15 Tyler                  2014   105. 
## 16 Wichita Falls          2010    92.2
## 17 Wichita Falls          2011   103. 
## 18 Wichita Falls          2012    98.5
## 19 Wichita Falls          2013    85.6
## 20 Wichita Falls          2014    89.3

Il miglior venditore sembra essere proprio quello di Bryan-College Station: anche per la quantità di mesi necessari a vendere l’immobile, che si traduce in velocità di vendita, la città migliore è sempre Bryan-College Station.

# Line chart of `avg_sale` by city and year, built by the project helper
# line_chart_group(). Title and Y label now name the plotted variable (they
# had been copied from the sales chart), and the Y range is sized for the
# yearly totals printed by the helper, which lie between roughly 1.3 and 2.5
# million (the previous 0-450 range was inherited from another chart).
# NOTE(review): y_start / y_end / y_steps are assumed to control the Y axis
# range and tick spacing -- confirm against line_chart_group().
line_chart_group(
  dataframe   = dati_re_texas_sales,
  data_col    = avg_sale,
  df_filter_1 = city,
  df_filter_2 = year,
  x_start     = 0,
  x_end       = 0,
  x_steps     = 0,
  title_desc  = "Prezzo medio di vendita per città e anno ",
  Y_desc      = "Prezzo medio di vendita",
  X_desc      = "Città / anno",
  y_start     = 0,
  y_end       = 2500000,
  y_steps     = 250000
)

## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.

## # A tibble: 20 × 3
##    city                   year  tot_sum
##    <chr>                 <int>    <dbl>
##  1 Beaumont               2010 1758989.
##  2 Beaumont               2011 1751063.
##  3 Beaumont               2012 1697711.
##  4 Beaumont               2013 1800948.
##  5 Beaumont               2014 1789712.
##  6 Bryan-College Station  2010 2095222.
##  7 Bryan-College Station  2011 2084268.
##  8 Bryan-College Station  2012 2152327.
##  9 Bryan-College Station  2013 2247790.
## 10 Bryan-College Station  2014 2432452.
## 11 Tyler                  2010 1914451.
## 12 Tyler                  2011 1922976.
## 13 Tyler                  2012 1986396.
## 14 Tyler                  2013 2094022.
## 15 Tyler                  2014 2142762.
## 16 Wichita Falls          2010 1440390.
## 17 Wichita Falls          2011 1357723.
## 18 Wichita Falls          2012 1406703.
## 19 Wichita Falls          2013 1475092.
## 20 Wichita Falls          2014 1485892.

Anche il prezzo medio e mediano indicano come città di riferimento Bryan-College

PUNTO 9: Considerazioni finali

Considerando i vari grafici, abbiamo capito che la zona di Tyler è la migliore per quantità di vendite, ma se analizziamo i vari grafici e soprattutto la colonna che abbiamo creato, “Gain_force_1”, e il “months_inventory”, capiamo che il venditore di Bryan-College Station risulta il migliore. Le case vengono vendute più velocemente e il fattore case vendute rapportato agli annunci è nettamente il migliore, oltre ad avere prezzi medi e mediani superiori agli altri.

Consiglierei all’immobiliare di tracciare tecniche operative e di marketing basate sul venditore di Bryan college.

Analisi immobiliare

Forgiarini David

2024-10-28

Analisi immobiliare del mercato del Texas

Librerie

Funzioni

Import dei dati

PUNTO 1: Analisi delle variabili tempo

PUNTO 2: Indici di posizione,variabilità e forma

PUNTO 3: Identificazione variabili

PUNTO 4: Creazioni di classi per una var.qualitativa

PUNTO 5: Calcolo delle probabilità

PUNTO 6: Creazione di nuove variabili

PUNTO 7: Analisi condizionata

PUNTO 8: Creazione di vis.con ggplot2

PUNTO 9: Considerazioni finali