L’azienda Texas Realty Insights desidera analizzare le tendenze del mercato immobiliare nello stato del Texas, sfruttando i dati storici relativi alle vendite di immobili. L’obiettivo è fornire insight statistici e visivi che supportino le decisioni strategiche di vendita e ottimizzazione delle inserzioni immobiliari.

Load necessary libraries (i had already installed them while following the course)

library(dplyr)
library(ggplot2)
library(moments) # For skewness and kurtosis

1) Variable Analysis

setwd("~/File eseguibili R")
data = read.csv("realestate_texas.csv", sep = ',')
summary(data)
##      city                year          month           sales      
##  Length:240         Min.   :2010   Min.   : 1.00   Min.   : 79.0  
##  Class :character   1st Qu.:2011   1st Qu.: 3.75   1st Qu.:127.0  
##  Mode  :character   Median :2012   Median : 6.50   Median :175.5  
##                     Mean   :2012   Mean   : 6.50   Mean   :192.3  
##                     3rd Qu.:2013   3rd Qu.: 9.25   3rd Qu.:247.0  
##                     Max.   :2014   Max.   :12.00   Max.   :423.0  
##      volume        median_price       listings    months_inventory
##  Min.   : 8.166   Min.   : 73800   Min.   : 743   Min.   : 3.400  
##  1st Qu.:17.660   1st Qu.:117300   1st Qu.:1026   1st Qu.: 7.800  
##  Median :27.062   Median :134500   Median :1618   Median : 8.950  
##  Mean   :31.005   Mean   :132665   Mean   :1738   Mean   : 9.193  
##  3rd Qu.:40.893   3rd Qu.:150050   3rd Qu.:2056   3rd Qu.:10.950  
##  Max.   :83.547   Max.   :180000   Max.   :3296   Max.   :14.900

Looking at the summary and at the actual data in the environment tab we can see that:

2) Statistical Analysis (Position, Variability, and Shape)

attach(data)

# sales feature
sales_summary <- data %>%
  summarise(
    # Position
    median_sales = median(sales),
    min_sales = min(sales),
    Q1_sales = quantile(sales, 0.25), # 1st Quartile
    mean_sales = mean(sales),
    Q3_sales = quantile(sales, 0.75), # 3rd Quartile
    max_sales = max(sales),
    
    # Variability
    sd_sales = sd(sales), # Standard Deviation
    iqr_sales = IQR(sales), # Interquartile Range
    cv_sales = (sd_sales / mean_sales) * 100, # Coefficient of Variation in percentage

    # Shape
    skew_sales = skewness(sales),
    kurt_sales = kurtosis(sales)-3
  )


boxplot(sales~city)

# volume feature 
volume_summary <- data %>%
  summarise(
    # Position
    median_volume = median(volume),
    min_volume = min(volume),
    Q1_volume = quantile(volume, 0.25), # 1st Quartile
    mean_volume = mean(volume),
    Q3_volume = quantile(volume, 0.75), # 3rd Quartile
    max_volume = max(volume),
    
    # Variability
    sd_volume = sd(volume), # Standard Deviation
    iqr_volume = IQR(volume), # Interquartile Range
    cv_volume = (sd_volume / mean_volume) * 100, # Coefficient of Variation in percentage
    
    # Shape
    skew_volume = skewness(volume),
    kurt_volume = kurtosis(volume)-3
  )
boxplot(volume~city)

# median_price feature
median_price_summary <- data %>%
  summarise(
    # Position
    median_median_price = median(median_price),
    min_median_price = min(median_price),
    Q1_median_price = quantile(median_price, 0.25), # 1st Quartile
    mean_median_price = mean(median_price),
    Q3_median_price = quantile(median_price, 0.75), # 3rd Quartile
    max_median_price = max(median_price),
    
    # Variability
    sd_median_price = sd(median_price), # Standard Deviation
    iqr_median_price = IQR(median_price), # Interquartile Range
    cv_median_price = (sd_median_price / mean_median_price) * 100, # Coefficient of Variation in percentage
    
    # Shape
    skew_median_price = skewness(median_price),
    kurt_median_price = kurtosis(median_price)-3
  )

boxplot(median_price~city)

# listings feature
listings_summary <- data %>%
  summarise(
    # Position
    median_listings = median(listings),
    min_listings = min(listings),
    Q1_listings = quantile(listings, 0.25), # 1st Quartile
    mean_listings = mean(listings),
    Q3_listings = quantile(listings, 0.75), # 3rd Quartile
    max_listings = max(listings),
    
    # Variability
    sd_listings = sd(listings), # Standard Deviation
    iqr_listings = IQR(listings), # Interquartile Range
    cv_listings = (sd_listings / mean_listings) * 100, # Coefficient of Variation in percentage
    
    # Shape
    skew_listings = skewness(listings),
    kurt_listings = kurtosis(listings)-3
  )
boxplot(listings~city)

# months_inventory feature ##
months_inventory_summary <- data %>%
  summarise(
    # Position
    median_months_inventory = median(months_inventory),
    min_months_inventory = min(months_inventory),
    Q1_months_inventory = quantile(months_inventory, 0.25), # 1st Quartile
    mean_months_inventory = mean(months_inventory),
    Q3_months_inventory = quantile(months_inventory, 0.75), # 3rd Quartile
    max_months_inventory = max(months_inventory),
    
    # Variability
    sd_months_inventory = sd(months_inventory), # Standard Deviation
    iqr_months_inventory = IQR(months_inventory), # Interquartile Range
    cv_months_inventory = (sd_months_inventory / mean_months_inventory) * 100, # Coefficient of Variation in percentage
    
    # Shape
    skew_months_inventory = skewness(months_inventory),
    kurt_months_inventory = kurtosis(months_inventory)-3
  )
boxplot(months_inventory~city)

# View results
sales_summary
##   median_sales min_sales Q1_sales mean_sales Q3_sales max_sales sd_sales
## 1        175.5        79      127   192.2917      247       423 79.65111
##   iqr_sales cv_sales skew_sales kurt_sales
## 1       120 41.42203   0.718104 -0.3131764
volume_summary
##   median_volume min_volume Q1_volume mean_volume Q3_volume max_volume sd_volume
## 1       27.0625      8.166   17.6595    31.00519    40.893     83.547  16.65145
##   iqr_volume cv_volume skew_volume kurt_volume
## 1    23.2335  53.70536    0.884742    0.176987
median_price_summary
##   median_median_price min_median_price Q1_median_price mean_median_price
## 1              134500            73800          117300          132665.4
##   Q3_median_price max_median_price sd_median_price iqr_median_price
## 1          150050           180000        22662.15            32750
##   cv_median_price skew_median_price kurt_median_price
## 1        17.08218        -0.3645529        -0.6229618
listings_summary
##   median_listings min_listings Q1_listings mean_listings Q3_listings
## 1          1618.5          743      1026.5      1738.021        2056
##   max_listings sd_listings iqr_listings cv_listings skew_listings kurt_listings
## 1         3296    752.7078       1029.5    43.30833     0.6494982      -0.79179
months_inventory_summary
##   median_months_inventory min_months_inventory Q1_months_inventory
## 1                    8.95                  3.4                 7.8
##   mean_months_inventory Q3_months_inventory max_months_inventory
## 1                9.1925               10.95                 14.9
##   sd_months_inventory iqr_months_inventory cv_months_inventory
## 1            2.303669                 3.15            25.06031
##   skew_months_inventory kurt_months_inventory
## 1            0.04097527            -0.1744475

From the printed summaries we can conclude that:

Analysis of the other variables:

# other variables --> frequency distribution
n <- nrow(data)

# Frequency distribution for 'city'
city_table <- table(data$city)
distr_freq_city <- as.data.frame(
  cbind(
    ni = city_table, # Absolute frequency
    fi = city_table / n, # Relative frequency
    Ni = cumsum(city_table), # Cumulative absolute frequency
    Fi = cumsum(city_table / n) # Cumulative relative frequency
  )
)

# Frequency distribution for 'year'
year_table <- table(data$year)
distr_freq_year <- as.data.frame(
  cbind(
    ni = year_table, # Absolute frequency
    fi = year_table / n, # Relative frequency
    Ni = cumsum(year_table), # Cumulative absolute frequency
    Fi = cumsum(year_table / n) # Cumulative relative frequency
  )
)

# Frequency distribution for 'month'
month_table <- table(data$month)
distr_freq_month <- as.data.frame(
  cbind(
    ni = month_table, # Absolute frequency
    fi = month_table / n, # Relative frequency
    Ni = cumsum(month_table), # Cumulative absolute frequency
    Fi = cumsum(month_table / n) # Cumulative relative frequency
  )
)

# Print results
# Frequency Distribution for City
print(distr_freq_city)
##                       ni   fi  Ni   Fi
## Beaumont              60 0.25  60 0.25
## Bryan-College Station 60 0.25 120 0.50
## Tyler                 60 0.25 180 0.75
## Wichita Falls         60 0.25 240 1.00
freq_city <- table(data$city)
freq_city
## 
##              Beaumont Bryan-College Station                 Tyler 
##                    60                    60                    60 
##         Wichita Falls 
##                    60
# Frequency Distribution for Year
print(distr_freq_year)
##      ni  fi  Ni  Fi
## 2010 48 0.2  48 0.2
## 2011 48 0.2  96 0.4
## 2012 48 0.2 144 0.6
## 2013 48 0.2 192 0.8
## 2014 48 0.2 240 1.0
freq_year <- table(data$year)
freq_year
## 
## 2010 2011 2012 2013 2014 
##   48   48   48   48   48
# Frequency Distribution for Month
print(distr_freq_month)
##    ni         fi  Ni         Fi
## 1  20 0.08333333  20 0.08333333
## 2  20 0.08333333  40 0.16666667
## 3  20 0.08333333  60 0.25000000
## 4  20 0.08333333  80 0.33333333
## 5  20 0.08333333 100 0.41666667
## 6  20 0.08333333 120 0.50000000
## 7  20 0.08333333 140 0.58333333
## 8  20 0.08333333 160 0.66666667
## 9  20 0.08333333 180 0.75000000
## 10 20 0.08333333 200 0.83333333
## 11 20 0.08333333 220 0.91666667
## 12 20 0.08333333 240 1.00000000
freq_month <- table(data$month)
freq_month
## 
##  1  2  3  4  5  6  7  8  9 10 11 12 
## 20 20 20 20 20 20 20 20 20 20 20 20

I can already understand from this results that city, year and month are like the primary keys of this table and are used to identify a single record. We have 5 years of data record every month per 4 geographical areas. Hence, if used by themselves, these variables give no statistical information.

3) Identification variables with greater variability and asymmetry

4) Classes creation for a quantitative variable

# subdivision `median_price` in classes
n= length(data$median_price)
breaks <- seq(70000, 180000, length.out = 12)
data$Price_Class <- cut(median_price, breaks = breaks)

# frequency distribution
price_freq <- table(data$Price_Class)/n
labels <- c("70k-80k","80k-90k","90k-100k","100k-110k","110k-120k","120k-130k","130k-140k","140k-150k","150k-160k","160k-170k","170k-180k")
barplot(price_freq, main = "Distribution Median Price", col = "blue",
        names.arg = labels,
        las = 2, 
        cex.names = 0.6, 
        xlab = "Price (in $)", 
        ylab = "Relative Frequency")

# Gini Index
gini.index = function(x){
  ni=table(x)
  fi=ni/length(x)
  fi2=fi^2
  J = length(table(x))
  
  gini = 1-sum(fi2)
  gini.normalizzato = gini/((J-1)/J)
  
  return (gini.normalizzato)
}

gini_index = gini.index(data$Price_Class)
gini_index
## [1] 0.9586042

5) Probability calculations

p_beaumont <- mean(city == "Beaumont") 
# i'm using the mean on a boolean vector, which gives as a result the number of TRUE elements divided by the length of the vector, which is exactly the probability of extracting Beaumont taking a random row in the dataset
p_july <- mean(month == "7")
p_dec_2012 <- mean(month == "12" & year == 2012)

p_beaumont
## [1] 0.25
p_july
## [1] 0.08333333
p_dec_2012
## [1] 0.01666667

6) Creation new variables

data <- data %>%
  mutate(
    Avg_Price = volume / sales * 1000000,  
    Effectiveness = sales / listings
  )

summary(data$Avg_Price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   97010  132939  156588  154320  173915  213234
summary(data$Effectiveness)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.05014 0.08980 0.10963 0.11874 0.13492 0.38713
boxplot(data$Avg_Price~city)

boxplot(data$Effectiveness~city)

7-8) Conditional Analysis and visualization using ggplot

# analysis of the sales and price over time, considering all the cities together
# Create a combined "Year-Month" column for grouping
data <- data %>%
  mutate(Year_Month = paste0(year, sprintf("%02d", month)))

# Aggregate data by year and month
aggregated_data <- data %>%
  group_by(Year_Month) %>%
  summarise(
    Total_Sales = sum(sales),
    Avg_Median_Price = mean(median_price)
  ) %>%
  arrange(Year_Month) # Sort by date

# Line chart total sales over time
ggplot(aggregated_data, aes(x = Year_Month)) +
  geom_line(aes(y = Total_Sales, color = "Total Sales", group = 1), linewidth = 1) +
  labs(
    title = "Trends in Total Sales Over Time",
    x = "Time (Year-Month)",
    y = "Value",
    color = "Legend"
  ) +
  scale_x_discrete(breaks = c("201001","201101","201201","201301","201401")) + 
  scale_y_continuous(sec.axis = sec_axis(~ ., name = "Median Price (USD)")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Line chart average price over time
ggplot(aggregated_data, aes(x = Year_Month)) +
  geom_line(aes(y = Avg_Median_Price, color = "Median Price", group = 1), linewidth = 1) +
  labs(
    title = "Trends in Median Price Over Time",
    x = "Time (Year-Month)",
    y = "Value",
    color = "Legend"
  ) +
  scale_x_discrete(breaks = c("201001","201101","201201","201301","201401")) +
  scale_y_continuous(sec.axis = sec_axis(~ ., name = "Median Price (USD)")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# analysis of the sales and price over time, considering distinct cities and grouping by month to have a better look at the seasonality effects
# Filter for the cities and group by city and month
aggregated_data <- data %>%
  filter(city %in% c("Beaumont", "Bryan-College Station", "Tyler", "Wichita Falls")) %>%
  group_by(city, month) %>%
  summarise(
    Total_Sales = sum(sales),
    Avg_Median_Price = mean(median_price)
  ) %>%
  ungroup()
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
# plot
ggplot(aggregated_data, aes(x = month)) +
  geom_line(aes(y = Total_Sales, color = "Total Sales"), linewidth = 1) +
  facet_wrap(~ city) + # Create one plot per city
  labs(
    title = "Trends in Total Sales by Month",
    x = "Month",
    y = "Value",
    color = "Legend"
  ) +
  scale_x_continuous(breaks = 1:12, labels = month.name) + # Label months by name
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right"
  )

# plot
ggplot(aggregated_data, aes(x = month)) +
  geom_line(aes(y = Avg_Median_Price, color = "Median Price"), linewidth = 1) +
  facet_wrap(~ city) + # Create one plot per city
  labs(
    title = "Trends in median price by Month",
    x = "Month",
    y = "Value",
    color = "Legend"
  ) +
  scale_x_continuous(breaks = 1:12, labels = month.name) + # Label months by name
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "right"
  )

9) Conclusions

Summing up we can conclude that:

  1. Wichita Falls has on average the cheapest houses, while bryan-college station the most expensive
  2. Despite of Bryan-college station having the most expensive houses, the overall volumes are a bit higher for Tyler city, and this is due to a massive advertising policy for this city which causes a big number of houses sold
  3. The huge advertising policy in Tyler is not justified from the results, since we have seen that the effectiveness of this ads policy is not so good (especially compared to bryan-college station)
  4. A clear seasonality effect is evident in the sales done (and consequently in the volumes sold), with valleys in January and peaks which varies a little from year to year (but anyway in between april-august time span)
  5. Through the five year of analysis we could see a progressive increase in the average price of the houses and on the total amount of houses sold