L’azienda Texas Realty Insights desidera analizzare le tendenze del mercato immobiliare nello stato del Texas, sfruttando i dati storici relativi alle vendite di immobili. L’obiettivo è fornire insight statistici e visivi che supportino le decisioni strategiche di vendita e ottimizzazione delle inserzioni immobiliari.
library(dplyr)
library(ggplot2)
library(moments) # For skewness and kurtosis
setwd("~/File eseguibili R")
data = read.csv("realestate_texas.csv", sep = ',')
summary(data)
## city year month sales
## Length:240 Min. :2010 Min. : 1.00 Min. : 79.0
## Class :character 1st Qu.:2011 1st Qu.: 3.75 1st Qu.:127.0
## Mode :character Median :2012 Median : 6.50 Median :175.5
## Mean :2012 Mean : 6.50 Mean :192.3
## 3rd Qu.:2013 3rd Qu.: 9.25 3rd Qu.:247.0
## Max. :2014 Max. :12.00 Max. :423.0
## volume median_price listings months_inventory
## Min. : 8.166 Min. : 73800 Min. : 743 Min. : 3.400
## 1st Qu.:17.660 1st Qu.:117300 1st Qu.:1026 1st Qu.: 7.800
## Median :27.062 Median :134500 Median :1618 Median : 8.950
## Mean :31.005 Mean :132665 Mean :1738 Mean : 9.193
## 3rd Qu.:40.893 3rd Qu.:150050 3rd Qu.:2056 3rd Qu.:10.950
## Max. :83.547 Max. :180000 Max. :3296 Max. :14.900
Looking at the summary and at the actual data in the environment tab we can see that:
attach(data)
# sales feature
sales_summary <- data %>%
summarise(
# Position
median_sales = median(sales),
min_sales = min(sales),
Q1_sales = quantile(sales, 0.25), # 1st Quartile
mean_sales = mean(sales),
Q3_sales = quantile(sales, 0.75), # 3rd Quartile
max_sales = max(sales),
# Variability
sd_sales = sd(sales), # Standard Deviation
iqr_sales = IQR(sales), # Interquartile Range
cv_sales = (sd_sales / mean_sales) * 100, # Coefficient of Variation in percentage
# Shape
skew_sales = skewness(sales),
kurt_sales = kurtosis(sales)-3
)
boxplot(sales~city)
# volume feature
volume_summary <- data %>%
summarise(
# Position
median_volume = median(volume),
min_volume = min(volume),
Q1_volume = quantile(volume, 0.25), # 1st Quartile
mean_volume = mean(volume),
Q3_volume = quantile(volume, 0.75), # 3rd Quartile
max_volume = max(volume),
# Variability
sd_volume = sd(volume), # Standard Deviation
iqr_volume = IQR(volume), # Interquartile Range
cv_volume = (sd_volume / mean_volume) * 100, # Coefficient of Variation in percentage
# Shape
skew_volume = skewness(volume),
kurt_volume = kurtosis(volume)-3
)
boxplot(volume~city)
# median_price feature
median_price_summary <- data %>%
summarise(
# Position
median_median_price = median(median_price),
min_median_price = min(median_price),
Q1_median_price = quantile(median_price, 0.25), # 1st Quartile
mean_median_price = mean(median_price),
Q3_median_price = quantile(median_price, 0.75), # 3rd Quartile
max_median_price = max(median_price),
# Variability
sd_median_price = sd(median_price), # Standard Deviation
iqr_median_price = IQR(median_price), # Interquartile Range
cv_median_price = (sd_median_price / mean_median_price) * 100, # Coefficient of Variation in percentage
# Shape
skew_median_price = skewness(median_price),
kurt_median_price = kurtosis(median_price)-3
)
boxplot(median_price~city)
# listings feature
listings_summary <- data %>%
summarise(
# Position
median_listings = median(listings),
min_listings = min(listings),
Q1_listings = quantile(listings, 0.25), # 1st Quartile
mean_listings = mean(listings),
Q3_listings = quantile(listings, 0.75), # 3rd Quartile
max_listings = max(listings),
# Variability
sd_listings = sd(listings), # Standard Deviation
iqr_listings = IQR(listings), # Interquartile Range
cv_listings = (sd_listings / mean_listings) * 100, # Coefficient of Variation in percentage
# Shape
skew_listings = skewness(listings),
kurt_listings = kurtosis(listings)-3
)
boxplot(listings~city)
# months_inventory feature ##
months_inventory_summary <- data %>%
summarise(
# Position
median_months_inventory = median(months_inventory),
min_months_inventory = min(months_inventory),
Q1_months_inventory = quantile(months_inventory, 0.25), # 1st Quartile
mean_months_inventory = mean(months_inventory),
Q3_months_inventory = quantile(months_inventory, 0.75), # 3rd Quartile
max_months_inventory = max(months_inventory),
# Variability
sd_months_inventory = sd(months_inventory), # Standard Deviation
iqr_months_inventory = IQR(months_inventory), # Interquartile Range
cv_months_inventory = (sd_months_inventory / mean_months_inventory) * 100, # Coefficient of Variation in percentage
# Shape
skew_months_inventory = skewness(months_inventory),
kurt_months_inventory = kurtosis(months_inventory)-3
)
boxplot(months_inventory~city)
# View results
sales_summary
## median_sales min_sales Q1_sales mean_sales Q3_sales max_sales sd_sales
## 1 175.5 79 127 192.2917 247 423 79.65111
## iqr_sales cv_sales skew_sales kurt_sales
## 1 120 41.42203 0.718104 -0.3131764
volume_summary
## median_volume min_volume Q1_volume mean_volume Q3_volume max_volume sd_volume
## 1 27.0625 8.166 17.6595 31.00519 40.893 83.547 16.65145
## iqr_volume cv_volume skew_volume kurt_volume
## 1 23.2335 53.70536 0.884742 0.176987
median_price_summary
## median_median_price min_median_price Q1_median_price mean_median_price
## 1 134500 73800 117300 132665.4
## Q3_median_price max_median_price sd_median_price iqr_median_price
## 1 150050 180000 22662.15 32750
## cv_median_price skew_median_price kurt_median_price
## 1 17.08218 -0.3645529 -0.6229618
listings_summary
## median_listings min_listings Q1_listings mean_listings Q3_listings
## 1 1618.5 743 1026.5 1738.021 2056
## max_listings sd_listings iqr_listings cv_listings skew_listings kurt_listings
## 1 3296 752.7078 1029.5 43.30833 0.6494982 -0.79179
months_inventory_summary
## median_months_inventory min_months_inventory Q1_months_inventory
## 1 8.95 3.4 7.8
## mean_months_inventory Q3_months_inventory max_months_inventory
## 1 9.1925 10.95 14.9
## sd_months_inventory iqr_months_inventory cv_months_inventory
## 1 2.303669 3.15 25.06031
## skew_months_inventory kurt_months_inventory
## 1 0.04097527 -0.1744475
From the printed summaries we can conclude that:
Analysis of the other variables:
# other variables --> frequency distribution
n <- nrow(data)
# Frequency distribution for 'city'
city_table <- table(data$city)
distr_freq_city <- as.data.frame(
cbind(
ni = city_table, # Absolute frequency
fi = city_table / n, # Relative frequency
Ni = cumsum(city_table), # Cumulative absolute frequency
Fi = cumsum(city_table / n) # Cumulative relative frequency
)
)
# Frequency distribution for 'year'
year_table <- table(data$year)
distr_freq_year <- as.data.frame(
cbind(
ni = year_table, # Absolute frequency
fi = year_table / n, # Relative frequency
Ni = cumsum(year_table), # Cumulative absolute frequency
Fi = cumsum(year_table / n) # Cumulative relative frequency
)
)
# Frequency distribution for 'month'
month_table <- table(data$month)
distr_freq_month <- as.data.frame(
cbind(
ni = month_table, # Absolute frequency
fi = month_table / n, # Relative frequency
Ni = cumsum(month_table), # Cumulative absolute frequency
Fi = cumsum(month_table / n) # Cumulative relative frequency
)
)
# Print results
# Frequency Distribution for City
print(distr_freq_city)
## ni fi Ni Fi
## Beaumont 60 0.25 60 0.25
## Bryan-College Station 60 0.25 120 0.50
## Tyler 60 0.25 180 0.75
## Wichita Falls 60 0.25 240 1.00
freq_city <- table(data$city)
freq_city
##
## Beaumont Bryan-College Station Tyler
## 60 60 60
## Wichita Falls
## 60
# Frequency Distribution for Year
print(distr_freq_year)
## ni fi Ni Fi
## 2010 48 0.2 48 0.2
## 2011 48 0.2 96 0.4
## 2012 48 0.2 144 0.6
## 2013 48 0.2 192 0.8
## 2014 48 0.2 240 1.0
freq_year <- table(data$year)
freq_year
##
## 2010 2011 2012 2013 2014
## 48 48 48 48 48
# Frequency Distribution for Month
print(distr_freq_month)
## ni fi Ni Fi
## 1 20 0.08333333 20 0.08333333
## 2 20 0.08333333 40 0.16666667
## 3 20 0.08333333 60 0.25000000
## 4 20 0.08333333 80 0.33333333
## 5 20 0.08333333 100 0.41666667
## 6 20 0.08333333 120 0.50000000
## 7 20 0.08333333 140 0.58333333
## 8 20 0.08333333 160 0.66666667
## 9 20 0.08333333 180 0.75000000
## 10 20 0.08333333 200 0.83333333
## 11 20 0.08333333 220 0.91666667
## 12 20 0.08333333 240 1.00000000
freq_month <- table(data$month)
freq_month
##
## 1 2 3 4 5 6 7 8 9 10 11 12
## 20 20 20 20 20 20 20 20 20 20 20 20
I can already understand from this results that city, year and month are like the primary keys of this table and are used to identify a single record. We have 5 years of data record every month per 4 geographical areas. Hence, if used by themselves, these variables give no statistical information.
# subdivision `median_price` in classes
n= length(data$median_price)
breaks <- seq(70000, 180000, length.out = 12)
data$Price_Class <- cut(median_price, breaks = breaks)
# frequency distribution
price_freq <- table(data$Price_Class)/n
labels <- c("70k-80k","80k-90k","90k-100k","100k-110k","110k-120k","120k-130k","130k-140k","140k-150k","150k-160k","160k-170k","170k-180k")
barplot(price_freq, main = "Distribution Median Price", col = "blue",
names.arg = labels,
las = 2,
cex.names = 0.6,
xlab = "Price (in $)",
ylab = "Relative Frequency")
# Gini Index
gini.index = function(x){
ni=table(x)
fi=ni/length(x)
fi2=fi^2
J = length(table(x))
gini = 1-sum(fi2)
gini.normalizzato = gini/((J-1)/J)
return (gini.normalizzato)
}
gini_index = gini.index(data$Price_Class)
gini_index
## [1] 0.9586042
p_beaumont <- mean(city == "Beaumont")
# i'm using the mean on a boolean vector, which gives as a result the number of TRUE elements divided by the length of the vector, which is exactly the probability of extracting Beaumont taking a random row in the dataset
p_july <- mean(month == "7")
p_dec_2012 <- mean(month == "12" & year == 2012)
p_beaumont
## [1] 0.25
p_july
## [1] 0.08333333
p_dec_2012
## [1] 0.01666667
data <- data %>%
mutate(
Avg_Price = volume / sales * 1000000,
Effectiveness = sales / listings
)
summary(data$Avg_Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 97010 132939 156588 154320 173915 213234
summary(data$Effectiveness)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.05014 0.08980 0.10963 0.11874 0.13492 0.38713
boxplot(data$Avg_Price~city)
boxplot(data$Effectiveness~city)
# analysis of the sales and price over time, considering all the cities together
# Create a combined "Year-Month" column for grouping
data <- data %>%
mutate(Year_Month = paste0(year, sprintf("%02d", month)))
# Aggregate data by year and month
aggregated_data <- data %>%
group_by(Year_Month) %>%
summarise(
Total_Sales = sum(sales),
Avg_Median_Price = mean(median_price)
) %>%
arrange(Year_Month) # Sort by date
# Line chart total sales over time
ggplot(aggregated_data, aes(x = Year_Month)) +
geom_line(aes(y = Total_Sales, color = "Total Sales", group = 1), linewidth = 1) +
labs(
title = "Trends in Total Sales Over Time",
x = "Time (Year-Month)",
y = "Value",
color = "Legend"
) +
scale_x_discrete(breaks = c("201001","201101","201201","201301","201401")) +
scale_y_continuous(sec.axis = sec_axis(~ ., name = "Median Price (USD)")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Line chart average price over time
ggplot(aggregated_data, aes(x = Year_Month)) +
geom_line(aes(y = Avg_Median_Price, color = "Median Price", group = 1), linewidth = 1) +
labs(
title = "Trends in Median Price Over Time",
x = "Time (Year-Month)",
y = "Value",
color = "Legend"
) +
scale_x_discrete(breaks = c("201001","201101","201201","201301","201401")) +
scale_y_continuous(sec.axis = sec_axis(~ ., name = "Median Price (USD)")) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# analysis of the sales and price over time, considering distinct cities and grouping by month to have a better look at the seasonality effects
# Filter for the cities and group by city and month
aggregated_data <- data %>%
filter(city %in% c("Beaumont", "Bryan-College Station", "Tyler", "Wichita Falls")) %>%
group_by(city, month) %>%
summarise(
Total_Sales = sum(sales),
Avg_Median_Price = mean(median_price)
) %>%
ungroup()
## `summarise()` has grouped output by 'city'. You can override using the
## `.groups` argument.
# plot
ggplot(aggregated_data, aes(x = month)) +
geom_line(aes(y = Total_Sales, color = "Total Sales"), linewidth = 1) +
facet_wrap(~ city) + # Create one plot per city
labs(
title = "Trends in Total Sales by Month",
x = "Month",
y = "Value",
color = "Legend"
) +
scale_x_continuous(breaks = 1:12, labels = month.name) + # Label months by name
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right"
)
# plot
ggplot(aggregated_data, aes(x = month)) +
geom_line(aes(y = Avg_Median_Price, color = "Median Price"), linewidth = 1) +
facet_wrap(~ city) + # Create one plot per city
labs(
title = "Trends in median price by Month",
x = "Month",
y = "Value",
color = "Legend"
) +
scale_x_continuous(breaks = 1:12, labels = month.name) + # Label months by name
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "right"
)
Summing up we can conclude that: